Merge pull request #749 from egaburov/nvptx_clean
Experimental support for PTX with examples
This commit is contained in:
43
LICENSE.txt
43
LICENSE.txt
@@ -141,3 +141,46 @@ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
---------------------------------------------------------------------------
|
||||
|
||||
The ptxtools use parts of the PTX parser code from GPU Ocelot project
|
||||
(https://code.google.com/p/gpuocelot/), which is covered by the following
|
||||
license:
|
||||
|
||||
Copyright 2011
|
||||
GEORGIA TECH RESEARCH CORPORATION
|
||||
ALL RIGHTS RESERVED
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimers.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimers in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of GEORGIA TECH RESEARCH CORPORATION nor the
|
||||
names of its contributors may be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY GEORGIA TECH RESEARCH CORPORATION ''AS IS''
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
||||
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GEORGIA TECH RESEARCH
|
||||
CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You agree that the Software will not be shipped, transferred, exported,
|
||||
or re-exported directly into any country prohibited by the United States
|
||||
Export Administration Act and the regulations thereunder nor will be
|
||||
used for any purpose prohibited by the Act.
|
||||
|
||||
|
||||
|
||||
21
Makefile
21
Makefile
@@ -73,6 +73,10 @@ endif
|
||||
# To enable: make ARM_ENABLED=1
|
||||
ARM_ENABLED=0
|
||||
|
||||
# Disable NVPTX by request
|
||||
# To disable: make NVPTX_ENABLED=0
|
||||
NVPTX_ENABLED=1
|
||||
|
||||
# Add llvm bin to the path so any scripts run will go to the right llvm-config
|
||||
LLVM_BIN= $(shell $(LLVM_CONFIG) --bindir)
|
||||
export PATH:=$(LLVM_BIN):$(PATH)
|
||||
@@ -89,7 +93,7 @@ LLVM_CXXFLAGS=$(shell $(LLVM_CONFIG) --cppflags)
|
||||
LLVM_VERSION=LLVM_$(shell $(LLVM_CONFIG) --version | sed -e 's/svn//' -e 's/\./_/' -e 's/\..*//')
|
||||
LLVM_VERSION_DEF=-D$(LLVM_VERSION)
|
||||
|
||||
LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker
|
||||
LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker
|
||||
# Component "option" was introduced in 3.3 and starting with 3.4 it is required for the link step.
|
||||
# We check if it's available before adding it (to not break 3.2 and earlier).
|
||||
ifeq ($(shell $(LLVM_CONFIG) --components |grep -c option), 1)
|
||||
@@ -98,6 +102,9 @@ endif
|
||||
ifneq ($(ARM_ENABLED), 0)
|
||||
LLVM_COMPONENTS+=arm
|
||||
endif
|
||||
ifneq ($(NVPTX_ENABLED), 0)
|
||||
LLVM_COMPONENTS+=nvptx
|
||||
endif
|
||||
LLVM_LIBS=$(shell $(LLVM_CONFIG) --libs $(LLVM_COMPONENTS))
|
||||
|
||||
CLANG=clang
|
||||
@@ -160,6 +167,9 @@ endif
|
||||
ifneq ($(ARM_ENABLED), 0)
|
||||
CXXFLAGS+=-DISPC_ARM_ENABLED
|
||||
endif
|
||||
ifneq ($(NVPTX_ENABLED), 0)
|
||||
CXXFLAGS+=-DISPC_NVPTX_ENABLED
|
||||
endif
|
||||
|
||||
LDFLAGS=
|
||||
ifeq ($(ARCH_OS),Linux)
|
||||
@@ -184,6 +194,9 @@ TARGETS=avx2-i64x4 avx11-i64x4 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-
|
||||
ifneq ($(ARM_ENABLED), 0)
|
||||
TARGETS+=neon-32 neon-16 neon-8
|
||||
endif
|
||||
ifneq ($(NVPTX_ENABLED), 0)
|
||||
TARGETS+=nvptx
|
||||
endif
|
||||
# These files need to be compiled in two versions - 32 and 64 bits.
|
||||
BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS)))
|
||||
# These are files to be compiled in single version.
|
||||
@@ -289,15 +302,15 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
|
||||
@echo Compiling $<
|
||||
@$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
|
||||
objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
|
||||
@echo Creating C++ source from builtins definition file $<
|
||||
@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX $< | python bitcode2cpp.py $< > $@
|
||||
|
||||
objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
|
||||
objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
|
||||
@echo Creating C++ source from builtins definition file $< \(32 bit version\)
|
||||
@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=32 $< | python bitcode2cpp.py $< 32bit > $@
|
||||
|
||||
objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
|
||||
objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
|
||||
@echo Creating C++ source from builtins definition file $< \(64 bit version\)
|
||||
@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=64 $< | python bitcode2cpp.py $< 64bit > $@
|
||||
|
||||
|
||||
112
builtins.cpp
112
builtins.cpp
@@ -342,11 +342,17 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__all",
|
||||
"__any",
|
||||
"__aos_to_soa3_float",
|
||||
//#ifdef ISPC_NVPTX_ENABLED
|
||||
"__aos_to_soa3_float1",
|
||||
//#endif /* ISPC_NVPTX_ENABLED */
|
||||
"__aos_to_soa3_float16",
|
||||
"__aos_to_soa3_float4",
|
||||
"__aos_to_soa3_float8",
|
||||
"__aos_to_soa3_int32",
|
||||
"__aos_to_soa4_float",
|
||||
//#ifdef ISPC_NVPTX_ENABLED
|
||||
"__aos_to_soa4_float1",
|
||||
//#endif /* ISPC_NVPTX_ENABLED */
|
||||
"__aos_to_soa4_float16",
|
||||
"__aos_to_soa4_float4",
|
||||
"__aos_to_soa4_float8",
|
||||
@@ -395,6 +401,38 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__atomic_xor_int64_global",
|
||||
"__atomic_xor_uniform_int32_global",
|
||||
"__atomic_xor_uniform_int64_global",
|
||||
//#ifdef ISPC_NVPTX_ENABLED
|
||||
"__atomic_add_varying_int32_global",
|
||||
"__atomic_add_varying_int64_global",
|
||||
"__atomic_and_varying_int32_global",
|
||||
"__atomic_and_varying_int64_global",
|
||||
"__atomic_compare_exchange_varying_double_global",
|
||||
"__atomic_compare_exchange_varying_float_global",
|
||||
"__atomic_compare_exchange_varying_int32_global",
|
||||
"__atomic_compare_exchange_varying_int64_global",
|
||||
"__atomic_max_varying_int32_global",
|
||||
"__atomic_max_varying_int64_global",
|
||||
"__atomic_min_varying_int32_global",
|
||||
"__atomic_min_varying_int64_global",
|
||||
"__atomic_or_varying_int32_global",
|
||||
"__atomic_or_varying_int64_global",
|
||||
"__atomic_sub_varying_int32_global",
|
||||
"__atomic_sub_varying_int64_global",
|
||||
"__atomic_swap_varying_double_global",
|
||||
"__atomic_swap_varying_float_global",
|
||||
"__atomic_swap_varying_int32_global",
|
||||
"__atomic_swap_varying_int64_global",
|
||||
"__atomic_umax_varying_uint32_global",
|
||||
"__atomic_umax_varying_uint64_global",
|
||||
"__atomic_umin_varying_uint32_global",
|
||||
"__atomic_umin_varying_uint64_global",
|
||||
"__atomic_xor_uniform_int32_global",
|
||||
"__atomic_xor_uniform_int64_global",
|
||||
"__atomic_xor_varying_int32_global",
|
||||
"__atomic_xor_varying_int64_global",
|
||||
"__atomic_xor_varying_int32_global",
|
||||
"__atomic_xor_varying_int64_global",
|
||||
//#endif /* ISPC_NVPTX_ENABLED */
|
||||
"__broadcast_double",
|
||||
"__broadcast_float",
|
||||
"__broadcast_i16",
|
||||
@@ -417,6 +455,9 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__do_assert_uniform",
|
||||
"__do_assert_varying",
|
||||
"__do_print",
|
||||
//#ifdef ISPC_NVPTX_ENABLED
|
||||
"__do_print_nvptx",
|
||||
//#endif /* ISPC_NVPTX_ENABLED */
|
||||
"__doublebits_uniform_int64",
|
||||
"__doublebits_varying_int64",
|
||||
"__exclusive_scan_add_double",
|
||||
@@ -431,6 +472,10 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__extract_int32",
|
||||
"__extract_int64",
|
||||
"__extract_int8",
|
||||
//#ifdef ISPC_NVPTX_ENABLED
|
||||
"__extract_float",
|
||||
"__extract_double",
|
||||
//#endif /* ISPC_NVPTX_ENABLED */
|
||||
"__fastmath",
|
||||
"__float_to_half_uniform",
|
||||
"__float_to_half_varying",
|
||||
@@ -447,6 +492,10 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__insert_int32",
|
||||
"__insert_int64",
|
||||
"__insert_int8",
|
||||
//#ifdef ISPC_NVPTX_ENABLED
|
||||
"__insert_float",
|
||||
"__insert_double",
|
||||
//#endif /* ISPC_NVPTX_ENABLED */
|
||||
"__intbits_uniform_double",
|
||||
"__intbits_uniform_float",
|
||||
"__intbits_varying_double",
|
||||
@@ -483,6 +532,9 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__min_varying_uint32",
|
||||
"__min_varying_uint64",
|
||||
"__movmsk",
|
||||
//#ifdef ISPC_NVPTX_ENABLED
|
||||
"__movmsk_ptx",
|
||||
//#endif /* ISPC_NVPTX_ENABLED */
|
||||
"__new_uniform_32rt",
|
||||
"__new_uniform_64rt",
|
||||
"__new_varying32_32rt",
|
||||
@@ -581,6 +633,10 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__soa_to_aos3_float8",
|
||||
"__soa_to_aos3_int32",
|
||||
"__soa_to_aos4_float",
|
||||
//#ifdef ISPC_NVPTX_ENABLED
|
||||
"__soa_to_aos3_float1",
|
||||
"__soa_to_aos4_float1",
|
||||
//#endif /* ISPC_NVPTX_ENABLED */
|
||||
"__soa_to_aos4_float16",
|
||||
"__soa_to_aos4_float4",
|
||||
"__soa_to_aos4_float8",
|
||||
@@ -681,6 +737,26 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__vec4_add_float",
|
||||
"__vec4_add_int32",
|
||||
"__vselect_float",
|
||||
//#ifdef ISPC_NVPTX_ENABLED
|
||||
"__program_index",
|
||||
"__program_count",
|
||||
"__warp_index",
|
||||
"__task_index0",
|
||||
"__task_index1",
|
||||
"__task_index2",
|
||||
"__task_index",
|
||||
"__task_count0",
|
||||
"__task_count1",
|
||||
"__task_count2",
|
||||
"__task_count",
|
||||
"__cvt_loc2gen",
|
||||
"__cvt_loc2gen_var",
|
||||
"__cvt_const2gen",
|
||||
"__puts_nvptx",
|
||||
"ISPCAlloc",
|
||||
"ISPCLaunch",
|
||||
"ISPCSync",
|
||||
//#endif /* ISPC_NVPTX_ENABLED */
|
||||
"__vselect_i32"
|
||||
};
|
||||
|
||||
@@ -759,6 +835,9 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
|
||||
g->target->getISA() != Target::NEON16 &&
|
||||
g->target->getISA() != Target::NEON8)
|
||||
#endif // !__arm__
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
if (g->target->getISA() != Target::NVPTX)
|
||||
#endif /* ISPC_NVPTX_ENABLED */
|
||||
{
|
||||
Assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
|
||||
mTriple.getArch() == bcTriple.getArch());
|
||||
@@ -954,6 +1033,19 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
// Next, add the target's custom implementations of the various needed
|
||||
// builtin functions (e.g. __masked_store_32(), etc).
|
||||
switch (g->target->getISA()) {
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
case Target::NVPTX:
|
||||
{
|
||||
if (runtime32) {
|
||||
fprintf(stderr, "Unfortunatly 32bit targets are not supported at the moment .. \n");
|
||||
assert(0);
|
||||
}
|
||||
else {
|
||||
EXPORT_MODULE(builtins_bitcode_nvptx_64bit);
|
||||
}
|
||||
break;
|
||||
};
|
||||
#endif /* ISPC_NVPTX_ENABLED */
|
||||
|
||||
#ifdef ISPC_ARM_ENABLED
|
||||
case Target::NEON8: {
|
||||
@@ -1224,7 +1316,18 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
}
|
||||
|
||||
// define the 'programCount' builtin variable
|
||||
lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable);
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
if (g->target->getISA() == Target::NVPTX)
|
||||
{
|
||||
lDefineConstantInt("programCount", 32, module, symbolTable);
|
||||
}
|
||||
else
|
||||
{
|
||||
#endif /* ISPC_NVPTX_ENABLED */
|
||||
lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable);
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
}
|
||||
#endif /* ISPC_NVPTX_ENABLED */
|
||||
|
||||
// define the 'programIndex' builtin
|
||||
lDefineProgramIndex(module, symbolTable);
|
||||
@@ -1256,6 +1359,13 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
lDefineConstantInt("__have_native_rcpd", g->target->hasRcpd(),
|
||||
module, symbolTable);
|
||||
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
lDefineConstantInt("__is_nvptx_target", (int)(g->target->getISA() == Target::NVPTX),
|
||||
module, symbolTable);
|
||||
#else
|
||||
lDefineConstantInt("__is_nvptx_target", (int)0, module, symbolTable);
|
||||
#endif /* ISPC_NVPTX_ENABLED */
|
||||
|
||||
if (g->forceAlignment != -1) {
|
||||
llvm::GlobalVariable *alignment = module->getGlobalVariable("memory_alignment", true);
|
||||
alignment->setInitializer(LLVMInt32(g->forceAlignment));
|
||||
|
||||
130
builtins/__do_print_nvptx.cu
Normal file
130
builtins/__do_print_nvptx.cu
Normal file
@@ -0,0 +1,130 @@
|
||||
#include <cstdio>
|
||||
|
||||
#define PRINT_BUF_SIZE 4096
|
||||
#define uint64_t unsigned long long
|
||||
|
||||
/* Device-side replacement for strlen(): counts the characters that precede
 * the terminating NUL of `str`.  `str` must be NUL-terminated. */
static __device__ size_t d_strlen(const char *str)
{
    size_t len = 0;

    while (str[len] != '\0')
        ++len;
    return len;
}
|
||||
|
||||
/* Device-side replacement for strncat(): appends at most `n` characters of
 * `src` to the NUL-terminated string in `dest`, then writes a terminating
 * NUL.  Up to n + 1 bytes are written after the old end of `dest`, so the
 * caller must provide that much room.  Returns `dest`. */
static __device__ char* d_strncat(char *dest, const char *src, size_t n)
{
    char *out = dest + d_strlen(dest);
    size_t copied = 0;

    while (copied < n && src[copied] != '\0') {
        *out++ = src[copied];
        ++copied;
    }
    *out = '\0';

    return dest;
}
|
||||
|
||||
/* Appends the NUL-terminated string `str` at the write cursor `bufp` of the
 * enclosing function's `printString` buffer and jumps to the `done` label
 * once the buffer is full.  Expects `printString`, `bufp` and a `done:` label
 * to be in scope at the expansion site.
 *
 * The cursor is advanced by the number of characters actually copied rather
 * than by strlen(str): d_strncat() truncates at the remaining capacity, and
 * advancing by the untruncated length would leave `bufp` pointing past the
 * end of `printString`. */
#define APPEND(str)                                                     \
    do {                                                                \
        int offset = bufp - &printString[0];                            \
        *bufp = '\0';                                                   \
        d_strncat(bufp, str, PRINT_BUF_SIZE-offset);                    \
        /* bufp was NUL-terminated above, so this is the copied length */ \
        bufp += d_strlen(bufp);                                         \
        if (bufp >= &printString[PRINT_BUF_SIZE])                       \
            goto done;                                                  \
    } while (0) /* eat semicolon */
|
||||
|
||||
|
||||
/* Body of one `case` label for a uniform (scalar) argument: formats the value
 * that `ptr` points at, read as `type`, with printf-style format `fmt` into
 * `tmpBuf`, appends the text to the output buffer and leaves the switch.
 * Expects `ptr` and `tmpBuf` in scope; the expansion ends in a bare `break`,
 * so the use site supplies the trailing semicolon. */
#define PRINT_SCALAR(fmt, type)                 \
    sprintf(tmpBuf, fmt, *(type *)ptr);         \
    APPEND(tmpBuf);                             \
    break
|
||||
|
||||
/* Body of one `case` label for a varying (vector) argument: emits
 * "[v,v,...,v]" with one entry per lane in [0, width).  Lanes whose bit is
 * clear in `mask` are shown as "(( * )) ".  Every lane reads the same
 * location, *ptr; the pointer is not indexed by lane.  Expects `ptr`,
 * `tmpBuf`, `bufp`, `printString`, `width` and `mask` in scope; the expansion
 * ends in a bare `break`. */
#define PRINT_VECTOR(fmt, type)                                         \
    *bufp++ = '[';                                                      \
    if (bufp == &printString[PRINT_BUF_SIZE]) break;                    \
    for (int lane = 0; lane < width; ++lane) {                          \
        type val = *((type*)ptr);                                       \
        /* only print the value if the current lane is executing */     \
        if ((mask & (1ull<<lane)) == 0)                                 \
            sprintf(tmpBuf, "(( * )) ");                                \
        else                                                            \
            sprintf(tmpBuf, fmt, val);                                  \
        APPEND(tmpBuf);                                                 \
        *bufp++ = (lane == width-1 ? ']' : ',');                        \
    }                                                                   \
    break
|
||||
|
||||
/**
 * Device-side formatter behind print() for the PTX target.
 *
 * Walks `format`, copying ordinary characters into a local buffer.  Each '%'
 * consumes the next type code from `types` and the next pointer from `args`
 * and appends the formatted value; a '%' with no type code left produces no
 * output.  Lower-case codes format a single value, upper-case codes format a
 * bracketed list of `width` entries in which lanes whose bit is clear in
 * `mask` are replaced by a placeholder:
 *   b/B bool, i/I int, u/U unsigned int, f/F float, l/L long long,
 *   v/V unsigned long long, d/D double, p/P pointer.
 * In the vector cases every lane reads the same location, *ptr.
 *
 * @param format  text with single '%' placeholders
 * @param types   one type code per placeholder, NUL-terminated
 * @param width   number of lanes printed for vector codes
 * @param mask    bit i set means lane i is printed
 * @param args    one pointer per placeholder, pointing at the value
 *
 * NOTE(review): the assembled string is only built in the local buffer; no
 * output call is made in this function.
 */
extern "C"
__device__ void __do_print_nvptx(const char *format, const char *types, int width, uint64_t mask,
                                 void **args) {
    char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
    char *bufp = &printString[0];
    char tmpBuf[256];
    const char trueBuf[] = "true";
    const char falseBuf[] = "false";

    int argCount = 0;
    while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
        // Format strings are just single percent signs.
        if (*format != '%') {
            *bufp++ = *format;
        }
        else {
            if (*types) {
                void *ptr = args[argCount++];
                // Based on the encoding in the types string, cast the
                // value appropriately and print it with a reasonable
                // printf() formatting string.
                switch (*types) {
                case 'b': {
                    const char *tmpBuf1 = *((bool *)ptr) ? trueBuf : falseBuf;
                    APPEND(tmpBuf1);
                    break;
                }
                case 'B': {
                    *bufp++ = '[';
                    if (bufp == &printString[PRINT_BUF_SIZE])
                        break;
                    for (int i = 0; i < width; ++i) {
                        bool val = *((bool*)ptr);
                        if (mask & (1ull << i)) {
                            const char *tmpBuf1 = val ? trueBuf : falseBuf;
                            APPEND(tmpBuf1);
                        }
                        else
                            APPEND("_________");
                        *bufp++ = (i != width-1) ? ',' : ']';
                    }
                    break;
                }
                case 'i': PRINT_SCALAR("%d", int);
                case 'I': PRINT_VECTOR("%d", int);
                case 'u': PRINT_SCALAR("%u", unsigned int);
                case 'U': PRINT_VECTOR("%u", unsigned int);
                case 'f': PRINT_SCALAR("%f", float);
                case 'F': PRINT_VECTOR("%f", float);
                case 'l': PRINT_SCALAR("%lld", long long);
                case 'L': PRINT_VECTOR("%lld", long long);
                case 'v': PRINT_SCALAR("%llu", unsigned long long);
                case 'V': PRINT_VECTOR("%llu", unsigned long long);
                case 'd': PRINT_SCALAR("%f", double);
                case 'D': PRINT_VECTOR("%f", double);
                case 'p': PRINT_SCALAR("%p", void *);
                case 'P': PRINT_VECTOR("%p", void *);
                default:
                    APPEND("UNKNOWN TYPE ");
                    *bufp++ = *types;
                }
                ++types;
            }
        }
        ++format;
    }

 done:
    // Two more bytes are written below ('\n' and NUL).  The array has
    // PRINT_BUF_SIZE+1 elements, so the cursor must not be beyond index
    // PRINT_BUF_SIZE-1; it can be when the buffer filled up or when APPEND
    // jumped here after a truncated copy.
    if (bufp > &printString[PRINT_BUF_SIZE-1])
        bufp = &printString[PRINT_BUF_SIZE-1];
    *bufp++ = '\n';
    *bufp = '\0';
}
|
||||
@@ -185,6 +185,81 @@ void __do_print(const char *format, const char *types, int width, uint64_t mask,
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
/* this is print for PTX target only */
|
||||
int __puts_nvptx(const char *);
|
||||
/* Print entry point for the PTX target.
 *
 * The formatter below is compiled out (#if 0); the only active statement
 * passes a fixed notice to __puts_nvptx(), so `format`, `types`, `width`,
 * `mask` and `args` are currently unused.  The disabled code relies on the
 * APPEND / PRINT_SCALAR / PRINT_VECTOR macros and PRINT_BUF_SIZE being
 * defined earlier in this file. */
void __do_print_nvptx(const char *format, const char *types, int width, uint64_t mask,
                      void **args) {
#if 0
    char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
    char *bufp = &printString[0];
    char tmpBuf[256];

    int argCount = 0;
    while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
        // Format strings are just single percent signs.
        if (*format != '%') {
            *bufp++ = *format;
        }
        else {
            if (*types) {
                void *ptr = args[argCount++];
                // Based on the encoding in the types string, cast the
                // value appropriately and print it with a reasonable
                // printf() formatting string.
                switch (*types) {
                case 'b': {
                    sprintf(tmpBuf, "%s", *((Bool *)ptr) ? "true" : "false");
                    APPEND(tmpBuf);
                    break;
                }
                case 'B': {
                    *bufp++ = '[';
                    if (bufp == &printString[PRINT_BUF_SIZE])
                        break;
                    for (int i = 0; i < width; ++i) {
                        if (mask & (1ull << i)) {
                            sprintf(tmpBuf, "%s", ((Bool *)ptr)[i] ? "true" : "false");
                            APPEND(tmpBuf);
                        }
                        else
                            APPEND("_________");
                        *bufp++ = (i != width-1) ? ',' : ']';
                    }
                    break;
                }
                case 'i': PRINT_SCALAR("%d", int);
                case 'I': PRINT_VECTOR("%d", int);
                case 'u': PRINT_SCALAR("%u", unsigned int);
                case 'U': PRINT_VECTOR("%u", unsigned int);
                case 'f': PRINT_SCALAR("%f", float);
                case 'F': PRINT_VECTOR("%f", float);
                case 'l': PRINT_SCALAR("%lld", long long);
                case 'L': PRINT_VECTOR("%lld", long long);
                case 'v': PRINT_SCALAR("%llu", unsigned long long);
                case 'V': PRINT_VECTOR("%llu", unsigned long long);
                case 'd': PRINT_SCALAR("%f", double);
                case 'D': PRINT_VECTOR("%f", double);
                case 'p': PRINT_SCALAR("%p", void *);
                case 'P': PRINT_VECTOR("%p", void *);
                default:
                    APPEND("UNKNOWN TYPE ");
                    *bufp++ = *types;
                }
                ++types;
            }
        }
        ++format;
    }

 done:
    *bufp = '\n'; bufp++;
    *bufp = '\0';
    __puts_nvptx(printString);
#else
    /* Formatting is disabled; emit a fixed notice instead. */
    __puts_nvptx("---nvptx printing is not supported---\n");
#endif
}
|
||||
|
||||
|
||||
int __num_cores() {
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
|
||||
@@ -289,4 +289,5 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||
;; int8/int16 builtins
|
||||
|
||||
define_avgs()
|
||||
declare_nvptx()
|
||||
|
||||
|
||||
@@ -42,6 +42,7 @@ packed_load_and_store()
|
||||
scans()
|
||||
int64minmax()
|
||||
aossoa()
|
||||
declare_nvptx()
|
||||
saturation_arithmetic_novec()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
@@ -382,6 +382,7 @@ declare void @__prefetch_read_varying_nt_native(i8 * %base, i32 %scale, <WIDTH x
|
||||
;; int8/int16 builtins
|
||||
|
||||
define_avgs()
|
||||
declare_nvptx()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; reciprocals in double precision, if supported
|
||||
|
||||
@@ -344,3 +344,4 @@ packed_load_and_store(4)
|
||||
;; prefetch
|
||||
|
||||
define_prefetches()
|
||||
declare_nvptx()
|
||||
|
||||
2340
builtins/target-nvptx.ll
Normal file
2340
builtins/target-nvptx.ll
Normal file
File diff suppressed because it is too large
Load Diff
@@ -274,3 +274,4 @@ define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
|
||||
|
||||
define_avgs()
|
||||
|
||||
declare_nvptx()
|
||||
|
||||
@@ -278,3 +278,5 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||
%call = call i64 @llvm.ctpop.i64(i64 %0)
|
||||
ret i64 %call
|
||||
}
|
||||
|
||||
declare_nvptx()
|
||||
|
||||
3492
builtins/util-nvptx.m4
Normal file
3492
builtins/util-nvptx.m4
Normal file
File diff suppressed because it is too large
Load Diff
@@ -4964,6 +4964,62 @@ declare double @__rcp_uniform_double(double)
|
||||
declare <WIDTH x double> @__rcp_varying_double(<WIDTH x double>)
|
||||
')
|
||||
|
||||
define(`declare_nvptx',
|
||||
`
|
||||
declare i32 @__program_index() nounwind readnone alwaysinline
|
||||
declare i32 @__program_count() nounwind readnone alwaysinline
|
||||
declare i32 @__warp_index() nounwind readnone alwaysinline
|
||||
declare i32 @__task_index0() nounwind readnone alwaysinline
|
||||
declare i32 @__task_index1() nounwind readnone alwaysinline
|
||||
declare i32 @__task_index2() nounwind readnone alwaysinline
|
||||
declare i32 @__task_index() nounwind readnone alwaysinline
|
||||
declare i32 @__task_count0() nounwind readnone alwaysinline
|
||||
declare i32 @__task_count1() nounwind readnone alwaysinline
|
||||
declare i32 @__task_count2() nounwind readnone alwaysinline
|
||||
declare i32 @__task_count() nounwind readnone alwaysinline
|
||||
declare i64* @__cvt_loc2gen(i64 addrspace(3)*) nounwind readnone alwaysinline
|
||||
declare i64* @__cvt_const2gen(i64 addrspace(4)*) nounwind readnone alwaysinline
|
||||
declare i64* @__cvt_loc2gen_var(i64 addrspace(3)*) nounwind readnone alwaysinline
|
||||
declare i64 @__movmsk_ptx(<WIDTH x i1>) nounwind readnone alwaysinline;
|
||||
')
|
||||
|
||||
define(`global_atomic_varying',`
|
||||
declare <$1 x $3> @__atomic_$2_varying_$4_global(<$1 x i64> %ptr, <$1 x $3> %val, <$1 x MASK> %maskv) nounwind alwaysinline
|
||||
')
|
||||
|
||||
define(`global_atomic_cas_varying',`
|
||||
declare <$1 x $3> @__atomic_$2_varying_$4_global(<$1 x i64> %ptr, <$1 x $3> %cmp, <$1 x $3> %val, <$1 x MASK> %maskv) nounwind alwaysinline
|
||||
')
|
||||
|
||||
global_atomic_cas_varying(WIDTH, compare_exchange, i32, int32)
|
||||
global_atomic_cas_varying(WIDTH, compare_exchange, i64, int64)
|
||||
global_atomic_cas_varying(WIDTH, compare_exchange, float, float)
|
||||
global_atomic_cas_varying(WIDTH, compare_exchange, double, double)
|
||||
|
||||
global_atomic_varying(WIDTH, swap, i32, int32)
|
||||
global_atomic_varying(WIDTH, swap, i64, int64)
|
||||
global_atomic_varying(WIDTH, swap, float, float)
|
||||
global_atomic_varying(WIDTH, swap, double, double)
|
||||
|
||||
global_atomic_varying(WIDTH, add, i32, int32)
|
||||
global_atomic_varying(WIDTH, sub, i32, int32)
|
||||
global_atomic_varying(WIDTH, and, i32, int32)
|
||||
global_atomic_varying(WIDTH, or, i32, int32)
|
||||
global_atomic_varying(WIDTH, xor, i32, int32)
|
||||
global_atomic_varying(WIDTH, min, i32, int32)
|
||||
global_atomic_varying(WIDTH, max, i32, int32)
|
||||
global_atomic_varying(WIDTH, umin, i32, uint32)
|
||||
global_atomic_varying(WIDTH, umax, i32, uint32)
|
||||
|
||||
global_atomic_varying(WIDTH, add, i64, int64)
|
||||
global_atomic_varying(WIDTH, sub, i64, int64)
|
||||
global_atomic_varying(WIDTH, and, i64, int64)
|
||||
global_atomic_varying(WIDTH, or, i64, int64)
|
||||
global_atomic_varying(WIDTH, xor, i64, int64)
|
||||
global_atomic_varying(WIDTH, min, i64, int64)
|
||||
global_atomic_varying(WIDTH, max, i64, int64)
|
||||
global_atomic_varying(WIDTH, umin, i64, uint64)
|
||||
global_atomic_varying(WIDTH, umax, i64, uint64)
|
||||
|
||||
define(`transcendetals_decl',`
|
||||
declare float @__log_uniform_float(float) nounwind readnone
|
||||
|
||||
233
ctx.cpp
233
ctx.cpp
@@ -57,6 +57,10 @@
|
||||
#include <llvm/IR/Instructions.h>
|
||||
#include <llvm/IR/DerivedTypes.h>
|
||||
#endif
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
#include <llvm/Support/raw_ostream.h>
|
||||
#include <llvm/Support/FormattedStream.h>
|
||||
#endif /* ISPC_NVPTX_ENABLED */
|
||||
|
||||
/** This is a small utility structure that records information related to one
|
||||
level of nested control flow. It's mostly used in correctly restoring
|
||||
@@ -1383,10 +1387,17 @@ FunctionEmitContext::None(llvm::Value *mask) {
|
||||
|
||||
llvm::Value *
|
||||
FunctionEmitContext::LaneMask(llvm::Value *v) {
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
/* this makes mandelbrot example slower with "nvptx" target.
|
||||
* Needs further investigation. */
|
||||
const char *__movmsk = g->target->getISA() == Target::NVPTX ? "__movmsk_ptx" : "__movmsk";
|
||||
#else
|
||||
const char *__movmsk = "__movmsk";
|
||||
#endif
|
||||
// Call the target-dependent movmsk function to turn the vector mask
|
||||
// into an i64 value
|
||||
std::vector<Symbol *> mm;
|
||||
m->symbolTable->LookupFunction("__movmsk", &mm);
|
||||
m->symbolTable->LookupFunction(__movmsk, &mm);
|
||||
if (g->target->getMaskBitCount() == 1)
|
||||
AssertPos(currentPos, mm.size() == 1);
|
||||
else
|
||||
@@ -1398,13 +1409,78 @@ FunctionEmitContext::LaneMask(llvm::Value *v) {
|
||||
return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk"));
|
||||
}
|
||||
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
/** Appends the element-type suffix matching the type of `vector` to
    `funcName`: "_int8", "_int16", "_int32", "_int64", "_float" or "_double"
    for the corresponding LLVMTypes::*VectorType.

    @return true when a suffix was appended; false (with `funcName` left
            unchanged) when the type is none of the six listed above.
 */
bool lAppendInsertExtractName(llvm::Value *vector, std::string &funcName)
{
    llvm::Type *const vecType = vector->getType();
    const char *suffix = NULL;

    if (vecType == LLVMTypes::Int8VectorType)
        suffix = "_int8";
    else if (vecType == LLVMTypes::Int16VectorType)
        suffix = "_int16";
    else if (vecType == LLVMTypes::Int32VectorType)
        suffix = "_int32";
    else if (vecType == LLVMTypes::Int64VectorType)
        suffix = "_int64";
    else if (vecType == LLVMTypes::FloatVectorType)
        suffix = "_float";
    else if (vecType == LLVMTypes::DoubleVectorType)
        suffix = "_double";

    if (suffix == NULL)
        return false;

    funcName += suffix;
    return true;
}
|
||||
|
||||
/** Emits a call to the module function "__insert" + <element-type suffix>
    (for example "__insert_int32") with the arguments (vector, lane, scalar),
    placed in the current basic block.

    @param vector  vector value; its type selects the function suffix
    @param lane    lane index; must have type LLVMTypes::Int32Type
    @param scalar  value passed as the third argument
    @return the value of the emitted call instruction
 */
llvm::Value*
FunctionEmitContext::Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar)
{
    std::string funcName = "__insert";
    // The helper mutates funcName, so it must not live inside assert():
    // with NDEBUG the call would vanish and the lookup below would search
    // for the bare name "__insert".
    const bool knownVectorType = lAppendInsertExtractName(vector, funcName);
    assert(knownVectorType);
    (void)knownVectorType;  // unused when assertions are compiled out
    assert(lane->getType() == LLVMTypes::Int32Type);

    llvm::Function *func = m->module->getFunction(funcName.c_str());
    assert(func != NULL);
    std::vector<llvm::Value *> args;
    args.push_back(vector);
    args.push_back(lane);
    args.push_back(scalar);
    llvm::Value *ret = llvm::CallInst::Create(func, args, LLVMGetName(vector, funcName.c_str()), GetCurrentBasicBlock());
    return ret;
}
|
||||
|
||||
/** Emits a call to the module function "__extract" + <element-type suffix>
    (for example "__extract_float") with the arguments (vector, lane), placed
    in the current basic block.

    @param vector  vector value; its type selects the function suffix
    @param lane    lane index; must have type LLVMTypes::Int32Type
    @return the value of the emitted call instruction
 */
llvm::Value*
FunctionEmitContext::Extract(llvm::Value *vector, llvm::Value *lane)
{
    std::string funcName = "__extract";
    // The helper mutates funcName, so it must not live inside assert():
    // with NDEBUG the call would vanish and the lookup below would search
    // for the bare name "__extract".
    const bool knownVectorType = lAppendInsertExtractName(vector, funcName);
    assert(knownVectorType);
    (void)knownVectorType;  // unused when assertions are compiled out
    assert(lane->getType() == LLVMTypes::Int32Type);

    llvm::Function *func = m->module->getFunction(funcName.c_str());
    assert(func != NULL);
    std::vector<llvm::Value *> args;
    args.push_back(vector);
    args.push_back(lane);
    llvm::Value *ret = llvm::CallInst::Create(func, args, LLVMGetName(vector, funcName.c_str()), GetCurrentBasicBlock());
    return ret;
}
|
||||
#endif /* ISPC_NVPTX_ENABLED */
|
||||
|
||||
|
||||
llvm::Value *
|
||||
FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
if (g->target->getISA() == Target::NVPTX)
|
||||
{
|
||||
// Compare the two masks to get a vector of i1s
|
||||
llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
|
||||
v1, v2, "v1==v2");
|
||||
return ExtractInst(cmp, 0); /* this works without calling All(..) in PTX. Why ?!? */
|
||||
}
|
||||
#endif /* ISPC_NVPTX_ENABLED */
|
||||
|
||||
#if 0
|
||||
// Compare the two masks to get a vector of i1s
|
||||
llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
|
||||
v1, v2, "v1==v2");
|
||||
v1, v2, "v1==v2");
|
||||
// Turn that into a bool vector type (often i32s)
|
||||
cmp = I1VecToBoolVec(cmp);
|
||||
// And see if it's all on
|
||||
@@ -1413,7 +1489,7 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
|
||||
llvm::Value *mm1 = LaneMask(v1);
|
||||
llvm::Value *mm2 = LaneMask(v2);
|
||||
return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mm1, mm2,
|
||||
LLVMGetName("equal", v1, v2));
|
||||
LLVMGetName("equal", v1, v2));
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -1421,8 +1497,8 @@ llvm::Value *
|
||||
FunctionEmitContext::ProgramIndexVector(bool is32bits) {
|
||||
llvm::SmallVector<llvm::Constant*, 16> array;
|
||||
for (int i = 0; i < g->target->getVectorWidth() ; ++i) {
|
||||
llvm::Constant *C = is32bits ? LLVMInt32(i) : LLVMInt64(i);
|
||||
array.push_back(C);
|
||||
llvm::Constant *C = is32bits ? LLVMInt32(i) : LLVMInt64(i);
|
||||
array.push_back(C);
|
||||
}
|
||||
|
||||
llvm::Constant* index = llvm::ConstantVector::get(array);
|
||||
@@ -1430,6 +1506,20 @@ FunctionEmitContext::ProgramIndexVector(bool is32bits) {
|
||||
return index;
|
||||
}
|
||||
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
/** PTX variant of ProgramIndexVector(): calls the module function
    "__program_index" and inserts the returned scalar into element 0 of an
    otherwise undefined value of type LLVMTypes::Int32VectorType.

    @param is32bits  currently has no effect; the 64-bit widening step is
                     compiled out below
    @return the resulting vector value
 */
llvm::Value *
FunctionEmitContext::ProgramIndexVectorPTX(bool is32bits) {
    llvm::Function *programIndexFn = m->module->getFunction("__program_index");

    // The builtin takes no arguments.
    const std::vector<llvm::Value*> noArgs;
    llvm::Value *laneIndex = CallInst(programIndexFn, NULL, noArgs, "foreach__program_indexS");

    llvm::Value *undefVector = llvm::UndefValue::get(LLVMTypes::Int32VectorType);
    llvm::Value *index = InsertInst(undefVector, laneIndex, 0, "foreach__program_indexV");
#if 0
    // NOTE(review): disabled; "Int64VectandType" is presumably a typo for
    // Int64VectorType -- confirm before re-enabling.
    if (!is32bits)
        index = ZExtInst(index, LLVMTypes::Int64VectandType);
#endif
    return index;
}
|
||||
#endif /* ISPC_NVPTX_ENABLED */
|
||||
|
||||
|
||||
llvm::Value *
|
||||
FunctionEmitContext::GetStringPtr(const std::string &str) {
|
||||
@@ -3555,31 +3645,117 @@ llvm::Value *
|
||||
FunctionEmitContext::LaunchInst(llvm::Value *callee,
|
||||
std::vector<llvm::Value *> &argVals,
|
||||
llvm::Value *launchCount[3]){
|
||||
if (callee == NULL) {
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
if (g->target->getISA() == Target::NVPTX)
|
||||
{
|
||||
if (callee == NULL) {
|
||||
AssertPos(currentPos, m->errorCount > 0);
|
||||
return NULL;
|
||||
}
|
||||
launchedTasks = true;
|
||||
|
||||
AssertPos(currentPos, llvm::isa<llvm::Function>(callee));
|
||||
std::vector<llvm::Type*> argTypes;
|
||||
|
||||
llvm::Function *F = llvm::dyn_cast<llvm::Function>(callee);
|
||||
const unsigned int nArgs = F->arg_size();
|
||||
llvm::Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
|
||||
for (; I != E; ++I)
|
||||
argTypes.push_back(I->getType());
|
||||
llvm::Type *st = llvm::StructType::get(*g->ctx, argTypes);
|
||||
llvm::StructType *argStructType = static_cast<llvm::StructType *>(st);
|
||||
llvm::Value *structSize = g->target->SizeOf(argStructType, bblock);
|
||||
if (structSize->getType() != LLVMTypes::Int64Type)
|
||||
structSize = ZExtInst(structSize, LLVMTypes::Int64Type,
|
||||
"struct_size_to_64");
|
||||
|
||||
const int align = 8;
|
||||
llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
|
||||
AssertPos(currentPos, falloc != NULL);
|
||||
std::vector<llvm::Value *> allocArgs;
|
||||
allocArgs.push_back(launchGroupHandlePtr);
|
||||
allocArgs.push_back(structSize);
|
||||
allocArgs.push_back(LLVMInt32(align));
|
||||
llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr");
|
||||
llvm::Value *voidi64 = PtrToIntInst(voidmem, "args_i64");
|
||||
llvm::BasicBlock* if_true = CreateBasicBlock("if_true");
|
||||
llvm::BasicBlock* if_false = CreateBasicBlock("if_false");
|
||||
|
||||
/* check if the pointer returned by ISPCAlloc is not NULL
|
||||
* --------------
|
||||
* this is a workaround for not checking the value of programIndex
|
||||
* because ISPCAlloc will return NULL pointer for all programIndex > 0
|
||||
* of course, if ISPCAlloc fails to get parameter buffer, the pointer for programIndex = 0
|
||||
* will also be NULL
|
||||
* This check must be added, and also rewrite the code to make it less opaque
|
||||
*/
|
||||
llvm::Value* cmp1 = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, voidi64, LLVMInt64(0), "cmp1");
|
||||
BranchInst(if_true, if_false, cmp1);
|
||||
|
||||
/**********************/
|
||||
bblock = if_true;
|
||||
|
||||
// label_if_then block:
|
||||
llvm::Type *pt = llvm::PointerType::getUnqual(st);
|
||||
llvm::Value *argmem = BitCastInst(voidmem, pt);
|
||||
for (unsigned int i = 0; i < argVals.size(); ++i)
|
||||
{
|
||||
llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg");
|
||||
// don't need to do masked store here, I think
|
||||
StoreInst(argVals[i], ptr);
|
||||
}
|
||||
if (nArgs == argVals.size() + 1) {
|
||||
// copy in the mask
|
||||
llvm::Value *mask = GetFullMask();
|
||||
llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL,
|
||||
"funarg_mask");
|
||||
StoreInst(mask, ptr);
|
||||
}
|
||||
BranchInst(if_false);
|
||||
|
||||
/**********************/
|
||||
bblock = if_false;
|
||||
|
||||
llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
|
||||
llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
|
||||
AssertPos(currentPos, flaunch != NULL);
|
||||
std::vector<llvm::Value *> args;
|
||||
args.push_back(launchGroupHandlePtr);
|
||||
args.push_back(fptr);
|
||||
args.push_back(voidmem);
|
||||
args.push_back(launchCount[0]);
|
||||
args.push_back(launchCount[1]);
|
||||
args.push_back(launchCount[2]);
|
||||
llvm::Value *ret = CallInst(flaunch, NULL, args, "");
|
||||
return ret;
|
||||
}
|
||||
#endif /* ISPC_NVPTX_ENABLED */
|
||||
|
||||
if (callee == NULL) {
|
||||
AssertPos(currentPos, m->errorCount > 0);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
launchedTasks = true;
|
||||
|
||||
AssertPos(currentPos, llvm::isa<llvm::Function>(callee));
|
||||
llvm::Type *argType =
|
||||
(llvm::dyn_cast<llvm::Function>(callee))->arg_begin()->getType();
|
||||
(llvm::dyn_cast<llvm::Function>(callee))->arg_begin()->getType();
|
||||
AssertPos(currentPos, llvm::PointerType::classof(argType));
|
||||
llvm::PointerType *pt =
|
||||
llvm::dyn_cast<llvm::PointerType>(argType);
|
||||
llvm::dyn_cast<llvm::PointerType>(argType);
|
||||
AssertPos(currentPos, llvm::StructType::classof(pt->getElementType()));
|
||||
llvm::StructType *argStructType =
|
||||
static_cast<llvm::StructType *>(pt->getElementType());
|
||||
static_cast<llvm::StructType *>(pt->getElementType());
|
||||
|
||||
llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
|
||||
AssertPos(currentPos, falloc != NULL);
|
||||
llvm::Value *structSize = g->target->SizeOf(argStructType, bblock);
|
||||
if (structSize->getType() != LLVMTypes::Int64Type)
|
||||
// ISPCAlloc expects the size as an uint64_t, but on 32-bit
|
||||
// targets, SizeOf returns a 32-bit value
|
||||
structSize = ZExtInst(structSize, LLVMTypes::Int64Type,
|
||||
"struct_size_to_64");
|
||||
// ISPCAlloc expects the size as an uint64_t, but on 32-bit
|
||||
// targets, SizeOf returns a 32-bit value
|
||||
structSize = ZExtInst(structSize, LLVMTypes::Int64Type,
|
||||
"struct_size_to_64");
|
||||
int align = 4 * RoundUpPow2(g->target->getNativeVectorWidth());
|
||||
|
||||
std::vector<llvm::Value *> allocArgs;
|
||||
@@ -3592,17 +3768,17 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
|
||||
// Copy the values of the parameters into the appropriate place in
|
||||
// the argument block
|
||||
for (unsigned int i = 0; i < argVals.size(); ++i) {
|
||||
llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg");
|
||||
// don't need to do masked store here, I think
|
||||
StoreInst(argVals[i], ptr);
|
||||
llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg");
|
||||
// don't need to do masked store here, I think
|
||||
StoreInst(argVals[i], ptr);
|
||||
}
|
||||
|
||||
if (argStructType->getNumElements() == argVals.size() + 1) {
|
||||
// copy in the mask
|
||||
llvm::Value *mask = GetFullMask();
|
||||
llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL,
|
||||
"funarg_mask");
|
||||
StoreInst(mask, ptr);
|
||||
// copy in the mask
|
||||
llvm::Value *mask = GetFullMask();
|
||||
llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL,
|
||||
"funarg_mask");
|
||||
StoreInst(mask, ptr);
|
||||
}
|
||||
|
||||
// And emit the call to the user-supplied task launch function, passing
|
||||
@@ -3624,6 +3800,21 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
|
||||
|
||||
void
|
||||
FunctionEmitContext::SyncInst() {
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
if (g->target->getISA() == Target::NVPTX)
|
||||
{
|
||||
llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr);
|
||||
llvm::Value *nullPtrValue =
|
||||
llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
|
||||
llvm::Function *fsync = m->module->getFunction("ISPCSync");
|
||||
if (fsync == NULL)
|
||||
FATAL("Couldn't find ISPCSync declaration?!");
|
||||
CallInst(fsync, NULL, launchGroupHandle, "");
|
||||
StoreInst(nullPtrValue, launchGroupHandlePtr);
|
||||
return;
|
||||
}
|
||||
#endif /* ISPC_NVPTX_ENABLED */
|
||||
|
||||
llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr);
|
||||
llvm::Value *nullPtrValue =
|
||||
llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
|
||||
|
||||
10
ctx.h
10
ctx.h
@@ -302,9 +302,17 @@ public:
|
||||
that indicates whether the two masks are equal. */
|
||||
llvm::Value *MasksAllEqual(llvm::Value *mask1, llvm::Value *mask2);
|
||||
|
||||
/** Generate ConstantVector, which contains ProgramIndex, i.e.
|
||||
/** generate constantvector, which contains programindex, i.e.
|
||||
< i32 0, i32 1, i32 2, i32 3> */
|
||||
llvm::Value *ProgramIndexVector(bool is32bits = true);
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
llvm::Value *ProgramIndexVectorPTX(bool is32bits = true);
|
||||
|
||||
/** Issues a call to __insert_int8/int16/int32/int64/float/double */
|
||||
llvm::Value* Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar);
|
||||
/** Issues a call to __extract_int8/int16/int32/int64/float/double */
|
||||
llvm::Value* Extract(llvm::Value *vector, llvm::Value *lane);
|
||||
#endif
|
||||
|
||||
/** Given a string, create an anonymous global variable to hold its
|
||||
value and return the pointer to the string. */
|
||||
|
||||
18
decl.cpp
18
decl.cpp
@@ -168,6 +168,15 @@ DeclSpecs::GetBaseType(SourcePos pos) const {
|
||||
retType = lApplyTypeQualifiers(typeQualifiers, retType, pos);
|
||||
|
||||
if (soaWidth > 0) {
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
#if 0 /* see stmt.cpp in DeclStmt::EmitCode for work-around of SOAType Declaration */
|
||||
if (g->target->getISA() == Target::NVPTX)
|
||||
{
|
||||
Error(pos, "\"soa\" data types are currently not supported with \"nvptx\" target.");
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
#endif /* ISPC_NVPTX_ENABLED */
|
||||
const StructType *st = CastType<StructType>(retType);
|
||||
|
||||
if (st == NULL) {
|
||||
@@ -402,6 +411,15 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
#if 0 /* NVPTX */
|
||||
if (baseType->IsUniformType())
|
||||
{
|
||||
fprintf(stderr, " detected uniform array of size= %d array= %s\n" ,arraySize,
|
||||
baseType->IsArrayType() ? " true " : " false ");
|
||||
}
|
||||
#endif
|
||||
#endif /* ISPC_NVPTX_ENABLED */
|
||||
const Type *arrayType = new ArrayType(baseType, arraySize);
|
||||
if (child != NULL) {
|
||||
child->InitFromType(arrayType, ds);
|
||||
|
||||
130
docs/ispc.rst
130
docs/ispc.rst
@@ -178,6 +178,13 @@ Contents:
|
||||
+ `Data Alignment and Aliasing`_
|
||||
+ `Restructuring Existing Programs to Use ISPC`_
|
||||
|
||||
* `Experimental support for PTX`_
|
||||
|
||||
+ `Overview`_
|
||||
+ `Compiling For The NVIDIA Kepler GPU`_
|
||||
+ `Hints`_
|
||||
+ `Limitations & known issues`_
|
||||
|
||||
* `Disclaimer and Legal Information`_
|
||||
|
||||
* `Optimization Notice`_
|
||||
@@ -4936,6 +4943,129 @@ program instances improves performance.
|
||||
.. _ispc Performance Tuning Guide: http://ispc.github.com/perfguide.html
|
||||
|
||||
|
||||
Experimental support for PTX
|
||||
============================
|
||||
``ispc`` provides experimental support for PTX code generation which currently
|
||||
targets NVIDIA GPUs with compute capability 3.5 or higher [Kepler GPUs with support for
|
||||
dynamic parallelism]. Due to its nature, the PTX backend currently imposes
|
||||
several restrictions on the ``ispc`` program, which will be described below.
|
||||
|
||||
Overview
|
||||
--------
|
||||
SPMD programming in ``ispc`` is similar to a warp-synchronous CUDA programming.
|
||||
Namely, program instances in a gang are equivalent of CUDA threads in a single
|
||||
warp. Hence, to run efficiently on a GPU ``ispc`` program must use tasking
|
||||
functionality via ``launch`` keyword to ensure multiple number of warps are
|
||||
executed concurrently on the GPU.
|
||||
|
||||
``export`` functions are equipped with a CUDA C wrapper which schedules a
|
||||
single warp--a thread-block with a total of 32 threads. In contrast to CPU
|
||||
programming, this exported function, either directly or otherwise, should
|
||||
utilize ``launch`` keyword to schedule work on a GPU.
|
||||
|
||||
At the PTX level, ``launch`` keyword is mapped to CUDA Dynamic Parallelism and
|
||||
it schedules a grid of thread-blocks each 4 warps-wide (128 threads). As a
|
||||
result, ``ispc`` has a tasking-granularity of 4 tasks with PTX target; this
|
||||
restriction will be eliminated in future.
|
||||
|
||||
When passing pointers to an ``export`` function, it is important that they
|
||||
remain legal when they are accessed from the GPU. Prior to CUDA 6.0, such pointers were
|
||||
holding an address that is only accessible from the GPU. With the release of
|
||||
CUDA 6.0, it is possible to pass a pointer to a unified memory allocated with
|
||||
``cudaMallocManaged``. The examples provide rudimentary wrapper functions that
|
||||
call CUDA API for managed memory allocations, allowing the programmers to avoid
|
||||
explicit memory copies.
|
||||
|
||||
|
||||
|
||||
Compiling For The NVIDIA Kepler GPU
|
||||
-----------------------------------
|
||||
Compilation for NVIDIA Kepler GPU is a several step procedure.
|
||||
|
||||
First, we need to generate a LLVM assembly from ``ispc`` source file (``ispc``
|
||||
generates LLVM assembly instead of bitcode when ``nvptx`` target is chosen):
|
||||
|
||||
::
|
||||
|
||||
$ISPC_HOME/ispc foo.ispc --emit-llvm --target=nvptx -o foo.ll
|
||||
|
||||
|
||||
This LLVM assembly can immediately be compiled into PTX with the help of
|
||||
``ptxgen`` tool; this tool uses ``libNVVM`` which is a part of a CUDA Toolkit.
|
||||
|
||||
::
|
||||
|
||||
$ISPC_HOME/ptxtools/ptxgen --use_fast_math foo.ll -o foo.ptx
|
||||
|
||||
.. If ``ispc`` is compiled with LLVM >3.2, the resulting bitcode must first be
|
||||
.. decompiled with the ``llvm-dis`` from LLVM 3.2 distribution; this "trick" is
|
||||
.. required to generate an IR compatible with libNVVM:
|
||||
|
||||
.. ::
|
||||
..
|
||||
.. $LLVM32/bin/llvm-dis foo.bc -o foo.ll
|
||||
.. $ISPC_HOME/ptxtools/ptxgen --use_fast_math foo.ll -o foo.ptx
|
||||
|
||||
This PTX is ready for execution on a GPU, for example via CUDA
|
||||
Driver API. Alternatively, we also provide a simple ``ptxcc`` tool, which
|
||||
compiles the resulting PTX code into an object file:
|
||||
|
||||
::
|
||||
|
||||
$ISPC_HOME/ptxtools/ptxcc foo.ptx -o foo_cu.o -Xnvcc="--maxrregcount=64
|
||||
-Xptxas=-v"
|
||||
|
||||
This object file can be linked with the main program via ``nvcc``:
|
||||
|
||||
::
|
||||
|
||||
nvcc foo_cu.o foo_main.o -o foo
|
||||
|
||||
|
||||
Hints
|
||||
-----
|
||||
- ``uniform`` arrays in a function scope are statically allocated in
|
||||
``__shared__`` memory, with all ensuing consequences. For example, if more
|
||||
than the available shared memory per SMX is allocated, a link- or runtime-error will occur
|
||||
- If ``uniform`` arrays of large size are desired, we recommend to use
|
||||
``uniform new uniform T[size]`` for their allocation, ideally outside the
|
||||
tasking function (see ``deferred/kernels.ispc`` in the deferred shading example)
|
||||
|
||||
Examples that produce executables for CPU, Xeon Phi and Kepler GPU display
|
||||
several tuning approaches that can benefit GPU performance.
|
||||
``ispc`` may also generate performance warnings that, if followed, may improve
|
||||
GPU application performance.
|
||||
|
||||
Limitations & known issues
|
||||
--------------------------
|
||||
Due to its experimental form, PTX code generation is known to impose several
|
||||
limitations on the ``ispc`` program, which are documented in the following list:
|
||||
|
||||
- Must use ``ispc`` tasking functionality to run efficiently on GPU
|
||||
- Must use ``new/delete`` and/or ``ispc_malloc``/``ispc_free``/``ispc_memset``/``ispc_memcpy`` to allocate/free/set/copy memory that is visible to GPU
|
||||
- ``export`` functions must have ``void`` return type.
|
||||
- ``task``/``export`` functions do not accept varying data-types
|
||||
- ``new``/``delete`` currently only works with ``uniform`` data-types
|
||||
- ``aossoa``/``soaaos`` is not yet supported
|
||||
- ``sizeof(varying)`` is not yet supported
|
||||
- Function pointers do not work yet (may or may not cause a compilation failure)
|
||||
- ``memset``/``memcpy``/``memmove`` is not yet supported
|
||||
- ``uniform`` arrays in global scope are mapped to global memory
|
||||
- ``varying`` arrays in global scope are not yet supported
|
||||
- ``uniform`` arrays in local scope are mapped to shared memory
|
||||
- ``varying`` arrays in local scope are mapped to local memory
|
||||
- ``const uniform/varying`` arrays are mapped to local memory
|
||||
- ``const static uniform`` arrays are mapped to constant memory
|
||||
- ``const static varying`` arrays are mapped to global memory
|
||||
- ``static`` data types in local scope are not allowed; compilation will fail
|
||||
- Best performance is obtained with libNVVM (LLVM PTX backend can also be used but it requires libdevice.compute_35.10.bc that comes with libNVVM)
|
||||
|
||||
|
||||
Likely there are more... which, together with some of the above-mentioned
|
||||
issues, will be fixed in due time.
|
||||
|
||||
|
||||
|
||||
Disclaimer and Legal Information
|
||||
================================
|
||||
|
||||
|
||||
2
examples/portable/aobench/.gitignore
vendored
Normal file
2
examples/portable/aobench/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
ao
|
||||
*.ppm
|
||||
8
examples/portable/aobench/Makefile_cpu
Normal file
8
examples/portable/aobench/Makefile_cpu
Normal file
@@ -0,0 +1,8 @@
|
||||
|
||||
EXAMPLE=ao
|
||||
CPP_SRC=ao.cpp
|
||||
ISPC_SRC=ao.ispc
|
||||
ISPC_IA_TARGETS=avx1-i32x8
|
||||
ISPC_ARM_TARGETS=neon
|
||||
|
||||
include ../common_cpu.mk
|
||||
7
examples/portable/aobench/Makefile_knc
Normal file
7
examples/portable/aobench/Makefile_knc
Normal file
@@ -0,0 +1,7 @@
|
||||
EXAMPLE=ao
|
||||
CXX_SRC=ao.cpp
|
||||
ISPC_SRC=ao.ispc
|
||||
ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h
|
||||
ISPC_TARGET=generic-16
|
||||
|
||||
include ../common_knc.mk
|
||||
14
examples/portable/aobench/Makefile_ptx
Normal file
14
examples/portable/aobench/Makefile_ptx
Normal file
@@ -0,0 +1,14 @@
|
||||
PROG=ao
|
||||
ISPC_SRC=ao.ispc
|
||||
CU_SRC=ao.cu
|
||||
CXX_SRC=ao.cpp
|
||||
PTXCC_REGMAX=64
|
||||
#ISPC_FLAGS= --opt=disable-uniform-control-flow
|
||||
|
||||
#LLVM_GPU=1
|
||||
NVVM_GPU=1
|
||||
|
||||
include ../common_ptx.mk
|
||||
|
||||
|
||||
|
||||
152
examples/portable/aobench/ao.cpp
Normal file
152
examples/portable/aobench/ao.cpp
Normal file
@@ -0,0 +1,152 @@
|
||||
/*
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define NOMINMAX
|
||||
#pragma warning (disable: 4244)
|
||||
#pragma warning (disable: 4305)
|
||||
#endif
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <cassert>
|
||||
#ifdef __linux__
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
#include <math.h>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <sys/types.h>
|
||||
|
||||
#include "ao_ispc.h"
|
||||
|
||||
#include "timing.h"
|
||||
#include "ispc_malloc.h"
|
||||
|
||||
#define NSUBSAMPLES 2
|
||||
|
||||
static unsigned int test_iterations[] = {3, 7, 1};
|
||||
static unsigned int width, height;
|
||||
static unsigned char *img;
|
||||
static float *fimg;
|
||||
|
||||
|
||||
/* Scale a float intensity by 255.5, truncate it to an integer and
   saturate the result to the 8-bit range [0, 255]. */
static unsigned char
clamp(float f)
{
    const int scaled = (int)(f * 255.5);

    if (scaled < 0)
        return 0;
    if (scaled > 255)
        return 255;
    return (unsigned char)scaled;
}
|
||||
|
||||
|
||||
static void
|
||||
savePPM(const char *fname, int w, int h)
|
||||
{
|
||||
for (int y = 0; y < h; y++) {
|
||||
for (int x = 0; x < w; x++) {
|
||||
img[3 * (y * w + x) + 0] = clamp(fimg[3 *(y * w + x) + 0]);
|
||||
img[3 * (y * w + x) + 1] = clamp(fimg[3 *(y * w + x) + 1]);
|
||||
img[3 * (y * w + x) + 2] = clamp(fimg[3 *(y * w + x) + 2]);
|
||||
}
|
||||
}
|
||||
|
||||
FILE *fp = fopen(fname, "wb");
|
||||
if (!fp) {
|
||||
perror(fname);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
fprintf(fp, "P6\n");
|
||||
fprintf(fp, "%d %d\n", w, h);
|
||||
fprintf(fp, "255\n");
|
||||
fwrite(img, w * h * 3, 1, fp);
|
||||
fclose(fp);
|
||||
printf("Wrote image file %s\n", fname);
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
if (argc < 3) {
|
||||
printf ("%s\n", argv[0]);
|
||||
printf ("Usage: ao [width] [height] [ispc iterations] [tasks iterations] [serial iterations]\n");
|
||||
getchar();
|
||||
exit(-1);
|
||||
}
|
||||
else {
|
||||
if (argc == 6) {
|
||||
for (int i = 0; i < 3; i++) {
|
||||
test_iterations[i] = atoi(argv[3 + i]);
|
||||
}
|
||||
}
|
||||
width = atoi (argv[1]);
|
||||
height = atoi (argv[2]);
|
||||
}
|
||||
|
||||
// Allocate space for output images
|
||||
img = new unsigned char[width * height * 3];
|
||||
fimg = new float[width * height * 3];
|
||||
|
||||
//
|
||||
// Run the ispc + tasks path, test_iterations times, and report the
|
||||
// minimum time for any of them.
|
||||
//
|
||||
double minTimeISPCTasks = 1e30;
|
||||
for (unsigned int i = 0; i < test_iterations[1]; i++) {
|
||||
ispc_memset(fimg, 0, sizeof(float) * width * height * 3);
|
||||
assert(NSUBSAMPLES == 2);
|
||||
|
||||
reset_and_start_timer();
|
||||
ispc::ao_ispc_tasks(width, height, NSUBSAMPLES, fimg);
|
||||
double t = get_elapsed_msec();
|
||||
printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", t);
|
||||
minTimeISPCTasks = std::min(minTimeISPCTasks, t);
|
||||
}
|
||||
|
||||
// Report results and save image
|
||||
printf("[aobench ispc + tasks]:\t\t[%.3f] msec (%d x %d image)\n",
|
||||
minTimeISPCTasks, width, height);
|
||||
savePPM("ao-ispc-tasks.ppm", width, height);
|
||||
|
||||
delete img;
|
||||
delete fimg;
|
||||
|
||||
return 0;
|
||||
}
|
||||
447
examples/portable/aobench/ao.cu
Normal file
447
examples/portable/aobench/ao.cu
Normal file
@@ -0,0 +1,447 @@
|
||||
// -*- mode: c++ -*-
|
||||
/*
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
/*
|
||||
Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
|
||||
*/
|
||||
|
||||
#include "cuda_helpers.cuh"
|
||||
|
||||
#define NAO_SAMPLES 8
|
||||
//#define M_PI 3.1415926535f
|
||||
|
||||
#define vec Float3
|
||||
struct Float3
|
||||
{
|
||||
float x,y,z;
|
||||
|
||||
__device__ friend Float3 operator+(const Float3 a, const Float3 b)
|
||||
{
|
||||
Float3 c;
|
||||
c.x = a.x+b.x;
|
||||
c.y = a.y+b.y;
|
||||
c.z = a.z+b.z;
|
||||
return c;
|
||||
}
|
||||
__device__ friend Float3 operator-(const Float3 a, const Float3 b)
|
||||
{
|
||||
Float3 c;
|
||||
c.x = a.x-b.x;
|
||||
c.y = a.y-b.y;
|
||||
c.z = a.z-b.z;
|
||||
return c;
|
||||
}
|
||||
__device__ friend Float3 operator/(const Float3 a, const Float3 b)
|
||||
{
|
||||
Float3 c;
|
||||
c.x = a.x/b.x;
|
||||
c.y = a.y/b.y;
|
||||
c.z = a.z/b.z;
|
||||
return c;
|
||||
}
|
||||
__device__ friend Float3 operator/(const float a, const Float3 b)
|
||||
{
|
||||
Float3 c;
|
||||
c.x = a/b.x;
|
||||
c.y = a/b.y;
|
||||
c.z = a/b.z;
|
||||
return c;
|
||||
}
|
||||
__device__ friend Float3 operator*(const Float3 a, const Float3 b)
|
||||
{
|
||||
Float3 c;
|
||||
c.x = a.x*b.x;
|
||||
c.y = a.y*b.y;
|
||||
c.z = a.z*b.z;
|
||||
return c;
|
||||
}
|
||||
__device__ friend Float3 operator*(const Float3 a, const float b)
|
||||
{
|
||||
Float3 c;
|
||||
c.x = a.x*b;
|
||||
c.y = a.y*b;
|
||||
c.z = a.z*b;
|
||||
return c;
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// RNG stuff
|
||||
|
||||
/* State of the pseudo-random generator: four 32-bit words that
   random() updates on every call and seed_rng() initializes. */
struct RNGState {
    unsigned int z1;
    unsigned int z2;
    unsigned int z3;
    unsigned int z4;
};
|
||||
|
||||
/* Advance the four state words by one shift/mask/xor step each and
   return the xor of the updated words as a 32-bit pseudo-random value.
   NOTE(review): the shift and mask constants look like L'Ecuyer's
   LFSR113 combined Tausworthe generator -- confirm before relying on
   its statistical properties. */
__device__
static inline unsigned int random(RNGState * state)
{
    unsigned int z1 = state->z1;
    unsigned int z2 = state->z2;
    unsigned int z3 = state->z3;
    unsigned int z4 = state->z4;

    const unsigned int b1 = ((z1 << 6) ^ z1) >> 13;
    z1 = ((z1 & 0xFFFFFFFEU) << 18) ^ b1;

    const unsigned int b2 = ((z2 << 2) ^ z2) >> 27;
    z2 = ((z2 & 0xFFFFFFF8U) << 2) ^ b2;

    const unsigned int b3 = ((z3 << 13) ^ z3) >> 21;
    z3 = ((z3 & 0xFFFFFFF0U) << 7) ^ b3;

    const unsigned int b4 = ((z4 << 3) ^ z4) >> 12;
    z4 = ((z4 & 0xFFFFFF80U) << 13) ^ b4;

    state->z1 = z1;
    state->z2 = z2;
    state->z3 = z3;
    state->z4 = z4;
    return z1 ^ z2 ^ z3 ^ z4;
}
|
||||
|
||||
|
||||
/* Draw a pseudo-random float: the low 23 bits of random() are used as
   the mantissa of a float whose upper bits are 0x3F800000 (the bit
   pattern of 1.0f), and 1.0f is subtracted from the result. */
__device__
static inline float frandom(RNGState * state)
{
    const unsigned int mantissa = random(state) & ((1ul<<23)-1);
    const float oneToTwo = __int_as_float(0x3F800000 | mantissa);
    return oneToTwo - 1.0f;
}
|
||||
|
||||
/* Initialize all four generator words from a single 32-bit seed:
   z1 is the seed itself, z2 the seed xor'ed with a constant, z3 the
   seed with its 16-bit halves swapped and z4 the seed with its four
   bytes in reverse order. */
__device__
static inline void seed_rng(RNGState * state,
                            unsigned int seed) {
    const unsigned int halvesSwapped = ((seed & 0xffffu) << 16) | (seed >> 16);
    const unsigned int bytesReversed = ((seed & 0x000000ffu) << 24) |
                                       ((seed & 0x0000ff00u) << 8)  |
                                       ((seed & 0x00ff0000u) >> 8)  |
                                       ((seed & 0xff000000u) >> 24);

    state->z1 = seed;
    state->z2 = seed ^ 0xbeeff00d;
    state->z3 = halvesSwapped;
    state->z4 = bytesReversed;
}
|
||||
|
||||
|
||||
|
||||
struct Isect {
|
||||
float t;
|
||||
vec p;
|
||||
vec n;
|
||||
int hit;
|
||||
};
|
||||
|
||||
struct Sphere {
|
||||
vec center;
|
||||
float radius;
|
||||
};
|
||||
|
||||
struct Plane {
|
||||
vec p;
|
||||
vec n;
|
||||
};
|
||||
|
||||
struct Ray {
|
||||
vec org;
|
||||
vec dir;
|
||||
};
|
||||
|
||||
/* Dot product of two vectors. */
__device__
static inline float dot(vec a, vec b) {
    const float xx = a.x * b.x;
    const float yy = a.y * b.y;
    const float zz = a.z * b.z;
    return xx + yy + zz;
}
|
||||
|
||||
/* Cross product v0 x v1. */
__device__
static inline vec vcross(vec v0, vec v1) {
    vec c = { v0.y * v1.z - v0.z * v1.y,
              v0.z * v1.x - v0.x * v1.z,
              v0.x * v1.y - v0.y * v1.x };
    return c;
}
|
||||
|
||||
/* Scale `v` in place by the reciprocal square root of its squared
   length. There is no guard for a zero-length input. */
__device__
static inline void vnormalize(vec &v) {
    const float invLength = rsqrt(dot(v, v));
    v = v * invLength;
}
|
||||
|
||||
|
||||
/* Intersect `ray` with `plane` and update `isect` if the hit is in
   front of the ray origin and closer than the one already recorded.
   Rays whose direction is (numerically) parallel to the plane are
   ignored. */
__device__
static inline void
ray_plane_intersect(Isect &isect,const Ray &ray, const Plane &plane) {
    const float planeOffset = -dot(plane.p, plane.n);
    const float denom       = dot(ray.dir, plane.n);

    // Parallel ray: no usable intersection.
    if (abs(denom) <= 1.0e-17)
        return;

    const float t = -(dot(ray.org, plane.n) + planeOffset) / denom;
    const bool closerHit = (t > 0.0) && (t < isect.t);
    if (!closerHit)
        return;

    isect.t   = t;
    isect.hit = 1;
    isect.p   = ray.org + ray.dir * t;
    isect.n   = plane.n;
}
|
||||
|
||||
|
||||
/* Intersect `ray` with `sphere` and update `isect` if the nearer root
   is in front of the ray origin and closer than the recorded hit. The
   stored normal is the normalized vector from the sphere center to the
   hit point.
   NOTE(review): the discriminant B*B - C has no |dir|^2 term, so this
   presumably expects ray.dir to be unit length -- confirm with callers. */
__device__
static inline void
ray_sphere_intersect(Isect &isect,const Ray &ray, const Sphere &sphere) {
    const vec toOrigin = ray.org - sphere.center;

    const float B = dot(toOrigin, ray.dir);
    const float C = dot(toOrigin, toOrigin) - sphere.radius * sphere.radius;
    const float discriminant = B * B - C;

    // No real (or only a tangent) solution: the ray misses the sphere.
    if (discriminant <= 0.0f)
        return;

    const float t = -B - sqrt(discriminant);
    const bool closerHit = (t > 0.0) && (t < isect.t);
    if (!closerHit)
        return;

    isect.t   = t;
    isect.hit = 1;
    isect.p   = ray.org + ray.dir * t;
    isect.n   = isect.p - sphere.center;
    vnormalize(isect.n);
}
|
||||
|
||||
|
||||
/* Build three basis vectors around `n`: basis[2] is `n` itself,
   basis[0] is the normalized cross product of a helper axis with `n`,
   and basis[1] is the normalized cross product of `n` with basis[0].
   The helper axis is the first coordinate axis (x, then y, then z)
   whose component of `n` lies strictly inside (-0.6, 0.6); x is used
   when none qualifies. */
__device__
static inline void
orthoBasis(vec basis[3], vec n) {
    vec helper = { 0.0f, 0.0f, 0.0f };

    if ((n.x < 0.6f) && (n.x > -0.6f))
        helper.x = 1.0f;
    else if ((n.y < 0.6f) && (n.y > -0.6f))
        helper.y = 1.0f;
    else if ((n.z < 0.6f) && (n.z > -0.6f))
        helper.z = 1.0f;
    else
        helper.x = 1.0f;

    basis[2] = n;

    basis[0] = vcross(helper, n);
    vnormalize(basis[0]);

    basis[1] = vcross(n, basis[0]);
    vnormalize(basis[1]);
}
|
||||
|
||||
|
||||
/* Estimate the ambient occlusion at the hit described by `isect`.

   NAO_SAMPLES x NAO_SAMPLES rays are shot from a point slightly offset
   along the hit normal, in random directions expressed in the basis
   built around that normal, and tested against the three spheres and
   the plane.

   isect     hit whose position `p` and normal `n` are sampled around
   plane     ground plane of the scene
   spheres   the three spheres of the scene
   rngstate  random generator state; advanced by two draws per ray

   Returns the fraction of sample rays that hit nothing. */
__device__
static inline float
ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3],
                  RNGState &rngstate) {
    float eps = 0.0001f;
    vec p;
    vec basis[3];
    float occlusion = 0.0f;

    // Offset the origin along the normal so that the sample rays do not
    // immediately re-hit the surface they start from.
    p = isect.p + isect.n * eps;

    orthoBasis(basis, isect.n);

    const int ntheta = NAO_SAMPLES;
    const int nphi   = NAO_SAMPLES;
    for ( int j = 0; j < ntheta; j++) {
        for ( int i = 0; i < nphi; i++) {
            Ray ray;
            Isect occIsect;

            float theta = sqrt(frandom(&rngstate));
            float phi   = 2.0f * M_PI * frandom(&rngstate);
            float x = cos(phi) * theta;
            float y = sin(phi) * theta;
            float z = sqrtf(1.0f - theta * theta);

            // local . global
            float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
            float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y;
            float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z;

            ray.org = p;
            ray.dir.x = rx;
            ray.dir.y = ry;
            ray.dir.z = rz;

            // "Infinitely far" initial distance, matching the primary-ray
            // setup in ao_tiles. The previous `1.0f+17` evaluated to
            // 18.0f and silently discarded every occluder beyond t = 18.
            occIsect.t   = 1.0e+17f;
            occIsect.hit = 0;

            for ( int snum = 0; snum < 3; ++snum)
                ray_sphere_intersect(occIsect, ray, spheres[snum]);
            ray_plane_intersect (occIsect, ray, plane);

            if (occIsect.hit) occlusion += 1.0f;
        }
    }

    // Convert the hit count into the unoccluded fraction.
    occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi);
    return occlusion;
}
|
||||
|
||||
|
||||
/* Render the pixel rectangle [x0,x1) x [y0,y1) of an image that is w pixels
   wide and h pixels high.

   Every program instance starts at column programIndex+x0 and advances by
   programCount, so the instances share each scanline between them.  Each
   pixel is supersampled on an nsubsamples x nsubsamples grid and the averaged
   ambient-occlusion value is stored into three consecutive floats of `image`.
 */
__device__
static inline void ao_tiles(
    int x0, int x1,
    int y0, int y1,
    int w, int h,
    int nsubsamples,
    float image[])
{
    // Fixed scene: one ground plane and three spheres.
    const Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
    const Sphere spheres[3] = {
        { { -2.0f, 0.0f, -3.5f }, 0.5f },
        { { -0.5f, 0.0f, -3.0f }, 0.5f },
        { {  1.0f, 0.0f, -2.2f }, 0.5f } };

    RNGState rngstate;
    seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));

    const float invSamples = 1.f / nsubsamples;
    const float halfW = w / 2.0f;
    const float halfH = h / 2.0f;

    for (int y = y0; y < y1; y++)
        for (int x = programIndex+x0; x < x1; x += programCount)
        {
            const int offset = 3 * (y * w + x);
            float pixel = 0.0f;

            for (int u = 0; u < nsubsamples; u++)
                for (int v = 0; v < nsubsamples; v++)
                {
                    const float du = (float)u * invSamples;
                    const float dv = (float)v * invSamples;

                    // Sub-pixel position mapped into [-1,1] x [-1,1]
                    const float px =  (x + du - halfW) / halfW;
                    const float py = -(y + dv - halfH) / halfH;

                    // Primary ray from the origin; poor man's perspective projection
                    Ray ray;
                    ray.org.x = 0.0f;
                    ray.org.y = 0.0f;
                    ray.org.z = 0.0f;
                    ray.dir.x = px;
                    ray.dir.y = py;
                    ray.dir.z = -1.0;
                    vnormalize(ray.dir);

                    Isect isect;
                    isect.t   = 1.0e+17;
                    isect.hit = 0;

                    for (int snum = 0; snum < 3; ++snum)
                        ray_sphere_intersect(isect, ray, spheres[snum]);
                    ray_plane_intersect(isect, ray, plane);

                    // Coherent test: the rays traced together tend to all hit
                    // or all miss, so shade only when at least one of them hit.
                    // Multiplying by isect.hit zeroes the instances that missed.
                    if (any(isect.hit)) {
                        float shade = isect.hit*ambient_occlusion(isect, plane, spheres, rngstate);
                        shade *= invSamples * invSamples;
                        pixel += shade;
                    }
                }

            if (x < x1)
            {
                image[offset  ] = pixel;
                image[offset+1] = pixel;
                image[offset+2] = pixel;
            }
        }
}
|
||||
|
||||
|
||||
|
||||
#define TILEX 64
|
||||
#define TILEY 4
|
||||
|
||||
// Kernel for one task: renders the TILEX x TILEY tile selected by
// (taskIndex0, taskIndex1), clipped to the image bounds.
extern "C"
__global__
void ao_task( int width, int height,
              int nsubsamples, float image[])
{
    // Tasks outside the launched grid have nothing to do.
    if (taskIndex0 >= taskCount0 || taskIndex1 >= taskCount1)
        return;

    const int tileX0 = taskIndex0 * TILEX;
    const int tileY0 = taskIndex1 * TILEY;
    const int tileX1 = min(tileX0 + TILEX, width);
    const int tileY1 = min(tileY0 + TILEY, height);

    ao_tiles(tileX0, tileX1, tileY0, tileY1, width, height, nsubsamples, image);
}
|
||||
|
||||
// Device-side entry point: splits the w x h image into TILEX x TILEY tiles
// (rounding up), launches one ao_task per tile and waits for them to finish.
extern "C"
__global__
void ao_ispc_tasks___export(
    int w, int h, int nsubsamples,
    float image[])
{
    const int tilesAcross = (w+TILEX-1)/TILEX;
    const int tilesDown   = (h+TILEY-1)/TILEY;

    launch(tilesAcross,tilesDown,1,ao_task)(w,h,nsubsamples,image);
    cudaDeviceSynchronize();
}
|
||||
|
||||
// Host-side wrapper: starts ao_ispc_tasks___export with a single block of
// 32 threads and blocks until the device has finished.
extern "C"
__host__ void ao_ispc_tasks(
    int w, int h, int nsubsamples,
    float image[])
{
    ao_ispc_tasks___export<<<1,32>>>(w, h, nsubsamples, image);
    cudaDeviceSynchronize();
}
|
||||
340
examples/portable/aobench/ao.ispc
Normal file
340
examples/portable/aobench/ao.ispc
Normal file
@@ -0,0 +1,340 @@
|
||||
// -*- mode: c++ -*-
|
||||
/*
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
/*
|
||||
Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
|
||||
*/
|
||||
|
||||
#define NAO_SAMPLES 8
|
||||
#define M_PI 3.1415926535f
|
||||
|
||||
typedef float<3> vec;
|
||||
|
||||
#if 1
|
||||
#define __inline inline
|
||||
#else
|
||||
#define __inline
|
||||
#endif
|
||||
|
||||
// Result of intersecting a ray with the scene.
struct Isect {
    float t;    // ray parameter of the closest hit found so far
    vec   p;    // hit position
    vec   n;    // surface normal at the hit
    int   hit;  // 0 until an intersection routine records a hit, then 1
};
|
||||
|
||||
// Sphere described by its center point and radius.
struct Sphere {
    vec   center;
    float radius;
};
|
||||
|
||||
// Infinite plane described by a point on it and its normal.
struct Plane {
    vec p;  // any point on the plane
    vec n;  // plane normal
};
|
||||
|
||||
// Ray described by its origin and direction.
struct Ray {
    vec org;
    vec dir;
};
|
||||
|
||||
// Dot product of two 3-vectors.
static inline float dot(vec a, vec b) {
    const float xx = a.x * b.x;
    const float yy = a.y * b.y;
    const float zz = a.z * b.z;
    return xx + yy + zz;
}
|
||||
|
||||
// Cross product v0 x v1.
static inline vec vcross(vec v0, vec v1) {
    vec c;
    c.x = v0.y * v1.z - v0.z * v1.y;
    c.y = v0.z * v1.x - v0.x * v1.z;
    c.z = v0.x * v1.y - v0.y * v1.x;
    return c;
}
|
||||
|
||||
// Scale v in place by the reciprocal square root of its squared length.
// There is no special handling for a zero-length input.
static inline void vnormalize(vec &v) {
    const float invLength = rsqrt(dot(v, v));
    v *= invLength;
}
|
||||
|
||||
|
||||
// Intersect `ray` with `plane`.  If the hit lies in front of the ray origin
// and is closer than the one already stored in `isect`, overwrite `isect`
// with it; otherwise leave `isect` untouched.
__inline
static void
ray_plane_intersect(Isect &isect, Ray &ray, const Plane &plane) {
    const float planeOffset = -dot(plane.p, plane.n);
    const float denom       = dot(ray.dir, plane.n);

    // Ray is (numerically) parallel to the plane: nothing to record.
    cif (abs(denom) <= 1.0e-17)
        return;

    const float dist = -(dot(ray.org, plane.n) + planeOffset) / denom;
    cif ((dist > 0.0) && (dist < isect.t)) {
        isect.t   = dist;
        isect.hit = 1;
        isect.p   = ray.org + ray.dir * dist;
        isect.n   = plane.n;
    }
}
|
||||
|
||||
|
||||
// Intersect `ray` with `sphere`.  Only the nearer root of the quadratic is
// considered; if it lies in front of the ray origin and is closer than the
// hit already stored in `isect`, `isect` is overwritten with it.
static inline void
ray_sphere_intersect(Isect &isect, Ray &ray, const Sphere &sphere) {
    const vec toOrigin = ray.org - sphere.center;

    const float b    = dot(toOrigin, ray.dir);
    const float c    = dot(toOrigin, toOrigin) - sphere.radius * sphere.radius;
    const float disc = b * b - c;

    // Non-positive discriminant: the ray misses (or only grazes) the sphere.
    cif (disc <= 0.0f)
        return;

    const float tNear = -b - sqrt(disc);
    cif ((tNear > 0.0) && (tNear < isect.t)) {
        isect.t   = tNear;
        isect.hit = 1;
        isect.p   = ray.org + tNear * ray.dir;
        isect.n   = isect.p - sphere.center;
        vnormalize(isect.n);
    }
}
|
||||
|
||||
|
||||
// Build a basis around `n`: basis[2] is n itself, basis[0] and basis[1] are
// normalized cross products derived from n and a helper axis.
__inline
static void
orthoBasis(vec basis[3], vec n) {
    basis[2] = n;

    // Helper axis: the first coordinate axis along which n has a component
    // of magnitude below 0.6 (falling back to x if there is none).
    basis[1] = 0.0;
    if (abs(n.x) < 0.6) {
        basis[1].x = 1.0;
    } else if (abs(n.y) < 0.6) {
        basis[1].y = 1.0;
    } else if (abs(n.z) < 0.6) {
        basis[1].z = 1.0;
    } else {
        basis[1].x = 1.0;
    }

    basis[0] = vcross(basis[1], basis[2]);
    vnormalize(basis[0]);

    // Replace the helper axis with the final third basis vector.
    basis[1] = vcross(basis[2], basis[0]);
    vnormalize(basis[1]);
}
|
||||
|
||||
|
||||
// Estimate how open the surface point in `isect` is: shoot
// NAO_SAMPLES * NAO_SAMPLES random rays into the hemisphere around isect.n
// and return the fraction that hit neither a sphere nor the plane.
// `rngstate` is advanced by two draws per ray.
__inline
static float
ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3],
                  RNGState &rngstate) {
    float eps = 0.0001f;
    float occlusion = 0.0;

    // Start slightly above the surface so rays do not re-hit it.
    vec origin = isect.p + eps * isect.n;

    vec basis[3];
    orthoBasis(basis, isect.n);

    static const uniform int ntheta = NAO_SAMPLES;
    static const uniform int nphi   = NAO_SAMPLES;
    for (uniform int j = 0; j < ntheta; j++) {
        for (uniform int i = 0; i < nphi; i++) {
            // Random point on the unit disc, lifted onto the hemisphere.
            float theta = sqrt(frandom(&rngstate));
            float phi   = 2.0f * M_PI * frandom(&rngstate);
            float x = cos(phi) * theta;
            float y = sin(phi) * theta;
            float z = sqrt(1.0 - theta * theta);

            // local -> global
            Ray ray;
            ray.org = origin;
            ray.dir = x * basis[0] + y * basis[1] + z * basis[2];

            Isect occIsect;
            occIsect.t   = 1.0e+17;
            occIsect.hit = 0;

            for (uniform int snum = 0; snum < 3; ++snum)
                ray_sphere_intersect(occIsect, ray, spheres[snum]);
            ray_plane_intersect (occIsect, ray, plane);

            if (occIsect.hit) occlusion += 1.0;
        }
    }

    // Turn the hit count into the unoccluded fraction.
    occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi);
    return occlusion;
}
|
||||
|
||||
// Render the pixel rectangle [x0,x1) x [y0,y1) of a w x h image.  Each pixel
// is supersampled on an nsubsamples x nsubsamples grid; the averaged
// ambient-occlusion value is written to three consecutive floats of `image`.
static inline void ao_tiles(
    uniform int x0, uniform int x1,
    uniform int y0, uniform int y1,
    uniform int w, uniform int h,
    uniform int nsubsamples,
    uniform float image[])
{
    // Fixed scene: one ground plane and three spheres.
    const Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
    const Sphere spheres[3] = {
        { { -2.0f, 0.0f, -3.5f }, 0.5f },
        { { -0.5f, 0.0f, -3.0f }, 0.5f },
        { {  1.0f, 0.0f, -2.2f }, 0.5f } };

    RNGState rngstate;
    seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));

    float invSamples = 1.f / nsubsamples;
    const uniform float halfW = w / 2.0f;
    const uniform float halfH = h / 2.0f;

    foreach_tiled (y = y0 ... y1, x = x0 ... x1)
    {
        const int offset = 3 * (y * w + x);
        float pixel = 0.0f;

        for (uniform int u = 0; u < nsubsamples; u++)
            for (uniform int v = 0; v < nsubsamples; v++)
            {
                float du = (float)u * invSamples;
                float dv = (float)v * invSamples;

                // Sub-pixel position mapped into [-1,1] x [-1,1]
                float px =  (x + du - halfW) / halfW;
                float py = -(y + dv - halfH) / halfH;

                // Primary ray from the origin; poor man's perspective projection
                Ray ray;
                ray.org = 0.f;
                ray.dir.x = px;
                ray.dir.y = py;
                ray.dir.z = -1.0;
                vnormalize(ray.dir);

                Isect isect;
                isect.t   = 1.0e+17;
                isect.hit = 0;

                for (uniform int snum = 0; snum < 3; ++snum)
                    ray_sphere_intersect(isect, ray, spheres[snum]);
                ray_plane_intersect(isect, ray, plane);

                // Coherent test: the rays traced together tend to all hit or
                // all miss, so shade only when at least one of them hit.
                // Multiplying by isect.hit zeroes the lanes that missed.
                if (any(isect.hit))
                {
                    float shade = isect.hit*ambient_occlusion(isect, plane, spheres, rngstate);
                    shade *= invSamples * invSamples;
                    pixel += shade;
                }
            }

        image[offset  ] = pixel;
        image[offset+1] = pixel;
        image[offset+2] = pixel;
    }
}
|
||||
|
||||
#define TILEX max(64,programCount*2)
|
||||
#define TILEY 4
|
||||
|
||||
// Single-task entry point: renders the whole w x h image in one ao_tiles call.
export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
                    uniform float image[]) {
    ao_tiles(0, w, 0, h, w, h, nsubsamples, image);
}
|
||||
|
||||
// Task body: renders the TILEX x TILEY tile selected by
// (taskIndex0, taskIndex1), clipped to the image bounds.
void task ao_task(uniform int width, uniform int height,
                  uniform int nsubsamples, uniform float image[])
{
    // Tasks outside the launched grid have nothing to do.
    if (taskIndex0 >= taskCount0 || taskIndex1 >= taskCount1)
        return;

    const uniform int tileX0 = taskIndex0 * TILEX;
    const uniform int tileY0 = taskIndex1 * TILEY;
    const uniform int tileX1 = min(tileX0 + TILEX, width);
    const uniform int tileY1 = min(tileY0 + TILEY, height);

    ao_tiles(tileX0, tileX1, tileY0, tileY1, width, height, nsubsamples, image);
}
|
||||
|
||||
|
||||
// Multi-task entry point: splits the image into TILEX x TILEY tiles (rounding
// up), launches one ao_task per tile and waits for all of them.
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
                          uniform float image[])
{
    const uniform int tilesAcross = (w+TILEX-1)/TILEX;
    const uniform int tilesDown   = (h+TILEY-1)/TILEY;

    launch[tilesAcross,tilesDown] ao_task(w, h, nsubsamples, image);
    sync;
}
|
||||
122
examples/portable/common_cpu.mk
Normal file
122
examples/portable/common_cpu.mk
Normal file
@@ -0,0 +1,122 @@
|
||||
|
||||
TASK_CXX=../omp_tasksys.cpp ../../util/ispc_malloc.cpp
|
||||
TASK_LIB=-lpthread
|
||||
TASK_OBJ=objs/omp_tasksys.o objs/ispc_malloc.o
|
||||
|
||||
CXX=clang++
|
||||
CXX=icc -openmp
|
||||
CXXFLAGS+=-Iobjs/ -O2 -I../../ -I../../util
|
||||
CXXFLAGS+=-DISPC_USE_OMP
|
||||
CC=clang
|
||||
CC=icc -openmp
|
||||
CCFLAGS+=-Iobjs/ -O2 -I../../ -I../../util
|
||||
CCFLAGS+=-DISPC_USE_OMP
|
||||
|
||||
LIBS=-lm $(TASK_LIB) -lstdc++
|
||||
ISPC=ispc
|
||||
ISPC_FLAGS+=-O2
|
||||
ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h)
|
||||
|
||||
ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/)
|
||||
|
||||
ifeq ($(ARCH),x86)
|
||||
ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o)
|
||||
COMMA=,
|
||||
ifneq (,$(findstring $(COMMA),$(ISPC_IA_TARGETS)))
|
||||
#$(info multi-target detected: $(ISPC_IA_TARGETS))
|
||||
ifneq (,$(findstring sse2,$(ISPC_IA_TARGETS)))
|
||||
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse2.o)
|
||||
endif
|
||||
ifneq (,$(findstring sse4,$(ISPC_IA_TARGETS)))
|
||||
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse4.o)
|
||||
endif
|
||||
ifneq (,$(findstring avx1-,$(ISPC_IA_TARGETS)))
|
||||
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx.o)
|
||||
endif
|
||||
ifneq (,$(findstring avx1.1,$(ISPC_IA_TARGETS)))
|
||||
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx11.o)
|
||||
endif
|
||||
ifneq (,$(findstring avx2,$(ISPC_IA_TARGETS)))
|
||||
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx2.o)
|
||||
endif
|
||||
endif
|
||||
ISPC_TARGETS=$(ISPC_IA_TARGETS)
|
||||
ARCH_BIT:=$(shell getconf LONG_BIT)
|
||||
ifeq ($(ARCH_BIT),32)
|
||||
ISPC_FLAGS += --arch=x86
|
||||
CXXFLAGS += -m32
|
||||
CCFLAGS += -m32
|
||||
else
|
||||
ISPC_FLAGS += --arch=x86-64
|
||||
CXXFLAGS += -m64
|
||||
CCFLAGS += -m64
|
||||
endif
|
||||
else ifeq ($(ARCH),arm)
|
||||
ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=_ispc.o))
|
||||
ISPC_TARGETS=$(ISPC_ARM_TARGETS)
|
||||
else
|
||||
$(error Unknown architecture $(ARCH) from uname -m)
|
||||
endif
|
||||
|
||||
CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o))
|
||||
CC_OBJS=$(addprefix objs/, $(CC_SRC:.c=.o))
|
||||
OBJS=$(CPP_OBJS) $(CC_OBJS) $(TASK_OBJ) $(ISPC_OBJS)
|
||||
|
||||
default: $(EXAMPLE)
|
||||
|
||||
all: $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 $(EXAMPLE)-scalar
|
||||
|
||||
.PHONY: dirs clean
|
||||
|
||||
dirs:
|
||||
/bin/mkdir -p objs/
|
||||
|
||||
objs/%.cpp objs/%.o objs/%.h: dirs
|
||||
|
||||
# Remove the object directory, editor backups and every binary that the
# `default` and `all` targets can produce (including the -scalar variant).
clean:
	/bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 $(EXAMPLE)-scalar ref test
|
||||
|
||||
$(EXAMPLE): $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
|
||||
|
||||
objs/%.o: %.cpp dirs $(ISPC_HEADER)
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%.o: %.c dirs $(ISPC_HEADER)
|
||||
$(CC) $< $(CCFLAGS) -c -o $@
|
||||
|
||||
objs/%.o: ../%.cpp dirs
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
objs/%.o: ../../%.cpp dirs
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
objs/%.o: ../../util/%.cpp dirs
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h dirs
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o objs/%_ispc_avx11.o objs/%_ispc_avx2.o: %.ispc dirs
|
||||
$(ISPC) $(ISPC_FLAGS) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
|
||||
objs/$(ISPC_SRC:.ispc=)_sse4.cpp: $(ISPC_SRC)
|
||||
$(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h
|
||||
|
||||
objs/$(ISPC_SRC:.ispc=)_sse4.o: objs/$(ISPC_SRC:.ispc=)_sse4.cpp
|
||||
$(CXX) -I../../intrinsics -msse4.2 $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
$(EXAMPLE)-sse4: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_sse4.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
|
||||
|
||||
objs/$(ISPC_SRC:.ispc=)_generic16.cpp: $(ISPC_SRC)
|
||||
$(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h
|
||||
|
||||
objs/$(ISPC_SRC:.ispc=)_generic16.o: objs/$(ISPC_SRC:.ispc=)_generic16.cpp
|
||||
$(CXX) -I../../intrinsics $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
$(EXAMPLE)-generic16: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_generic16.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
|
||||
|
||||
objs/$(ISPC_SRC:.ispc=)_scalar.o: $(ISPC_SRC)
|
||||
$(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-1
|
||||
|
||||
$(EXAMPLE)-scalar: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_scalar.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
|
||||
52
examples/portable/common_knc.mk
Normal file
52
examples/portable/common_knc.mk
Normal file
@@ -0,0 +1,52 @@
|
||||
TASK_CXX=../omp_tasksys.cpp ../../util/ispc_malloc.cpp
|
||||
TASK_OBJ=objs_knc/omp_tasksys.o objs_knc/ispc_malloc.o
|
||||
TASK_LIB=-openmp
|
||||
|
||||
CXX=icc -openmp -mmic
|
||||
CXXFLAGS+=-Iobjs_knc/ -O2 -I../../ -I../../util -I./
|
||||
CXXFLAGS+= -DISPC_USE_OMP
|
||||
CC=icc -openmp -mmic
|
||||
CCFLAGS+= -Iobjs_knc/ -O2 -I../../ -I../../util -I./
|
||||
CCFLAGS+=-DISPC_USE_OMP
|
||||
|
||||
LD=icc -mmic -openmp
|
||||
|
||||
LIBS=-lm $(TASK_LIB) -lstdc++
|
||||
ISPC=ispc
|
||||
ISPC_FLAGS+=-O2
|
||||
ISPC_FLAGS+= --target=$(ISPC_TARGET) --c++-include-file=$(ISPC_INTRINSICS)
|
||||
|
||||
ISPC_HEADERS=$(ISPC_SRC:%.ispc=objs_knc/%_ispc.h)
|
||||
ISPC_OBJ=$(ISPC_SRC:%.ispc=objs_knc/%_ispc.o)
|
||||
CXX_OBJ=$(CXX_SRC:%.cpp=objs_knc/%.o)
|
||||
CXX_OBJ+=$(TASK_OBJ)
|
||||
|
||||
PROG=$(EXAMPLE)_knc
|
||||
|
||||
all: dirs $(PROG)
|
||||
|
||||
dirs:
|
||||
/bin/mkdir -p objs_knc/
|
||||
|
||||
objs_knc/%.cpp objs_knc/%.o objs_knc/%.h: dirs
|
||||
|
||||
clean:
|
||||
/bin/rm -rf $(PROG) objs_knc
|
||||
|
||||
$(PROG): $(ISPC_OBJ) $(CXX_OBJ)
|
||||
$(LD) -o $@ $^ $(LDFLAGS)
|
||||
|
||||
objs_knc/%.o: %.cpp
|
||||
$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
objs_knc/%.o: ../%.cpp
|
||||
$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
objs_knc/%.o: ../../%.cpp
|
||||
$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
objs_knc/%.o: ../../util/%.cpp
|
||||
$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
objs_knc/%_ispc.o: %.ispc
|
||||
$(ISPC) $(ISPC_FLAGS) --emit-c++ -o objs_knc/$*_ispc_zmm.cpp -h objs_knc/$*_ispc.h $<
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs_knc/$*_ispc_zmm.cpp -c
|
||||
|
||||
136
examples/portable/common_ptx.mk
Normal file
136
examples/portable/common_ptx.mk
Normal file
@@ -0,0 +1,136 @@
|
||||
NVCC_SRC=../../util/nvcc_helpers.cu
|
||||
NVCC_OBJS=objs_ptx/nvcc_helpers_nvcc.o
|
||||
#
|
||||
CXX=g++ -ffast-math
|
||||
CXXFLAGS=-O3 -I$(CUDATK)/include -Iobjs_ptx/ -D_CUDA_ -I../../util -I../../
|
||||
#
|
||||
NVCC=nvcc
|
||||
NVCC_FLAGS+=-O3 -arch=sm_35 -D_CUDA_ -I../../util -Xptxas=-v -Iobjs_ptx/
|
||||
ifdef PTXCC_REGMAX
|
||||
NVCC_FLAGS += --maxrregcount=$(PTXCC_REGMAX)
|
||||
endif
|
||||
NVCC_FLAGS+=--use_fast_math
|
||||
#
|
||||
LD=nvcc
|
||||
LDFLAGS=-lcudart -lcudadevrt -arch=sm_35
|
||||
#
|
||||
PTXCC=$(ISPC_HOME)/ptxtools/ptxcc
|
||||
PTXCC_FLAGS+= -Xptxas=-v
|
||||
ifdef PTXCC_REGMAX
|
||||
PTXCC_FLAGS += -maxrregcount=$(PTXCC_REGMAX)
|
||||
endif
|
||||
|
||||
#
|
||||
ISPC=$(ISPC_HOME)/ispc
|
||||
ISPC_FLAGS+=-O3 --math-lib=fast --target=nvptx --opt=fast-math
|
||||
#
|
||||
#
|
||||
#
|
||||
ISPC_LLVM_OBJS=$(ISPC_SRC:%.ispc=objs_ptx/%_llvm_ispc.o)
|
||||
ISPC_NVVM_OBJS=$(ISPC_SRC:%.ispc=objs_ptx/%_nvvm_ispc.o)
|
||||
#ISPC_BCS=$(ISPC_SRC:%.ispc=objs_ptx/%_ispc.bc)
|
||||
ISPC_LLS=$(ISPC_SRC:%.ispc=objs_ptx/%_ispc.ll)
|
||||
ISPC_LLVM_PTX=$(ISPC_SRC:%.ispc=objs_ptx/%_llvm_ispc.ptx)
|
||||
ISPC_NVVM_PTX=$(ISPC_SRC:%.ispc=objs_ptx/%_nvvm_ispc.ptx)
|
||||
ISPC_HEADERS=$(ISPC_SRC:%.ispc=objs_ptx/%_ispc.h)
|
||||
CXX_OBJS=$(CXX_SRC:%.cpp=objs_ptx/%_gcc.o)
|
||||
CU_OBJS=$(CU_SRC:%.cu=objs_ptx/%_cu.o)
|
||||
#NVCC_OBJS=$(NVCC_SRC:%.cu=objs_ptx/%_nvcc.o)
|
||||
|
||||
CXX_SRC+=ispc_malloc.cpp
|
||||
CXX_OBJS+=objs_ptx/ispc_malloc_gcc.o
|
||||
|
||||
PTXGEN = $(ISPC_HOME)/ptxtools/ptxgen
|
||||
PTXGEN += --use_fast_math
|
||||
|
||||
#LLVM32=$(HOME)/usr/local/llvm/bin-3.2
|
||||
#LLVM32DIS=$(LLVM32)/bin/llvm-dis
|
||||
|
||||
LLC=$(LLVM_ROOT)/bin/llc
|
||||
LLC_FLAGS=-march=nvptx64 -mcpu=sm_35
|
||||
|
||||
# .SUFFIXES: .bc .o .cu .ll
|
||||
|
||||
ifdef LLVM_GPU
|
||||
OBJSptx_llvm=$(ISPC_LLVM_OBJS) $(CXX_OBJS) $(NVCC_OBJS)
|
||||
PROGptx_llvm=$(PROG)_llvm_ptx
|
||||
else
|
||||
ISPC_LLVM_PTX=
|
||||
endif
|
||||
|
||||
|
||||
ifdef NVVM_GPU
|
||||
OBJSptx_nvvm=$(ISPC_NVVM_OBJS) $(CXX_OBJS) $(NVCC_OBJS) $(ISPC_LVVM_PTX)
|
||||
PROGptx_nvvm=$(PROG)_nvvm_ptx
|
||||
else
|
||||
ISPC_NVVM_PTX=
|
||||
endif
|
||||
|
||||
ifdef CU_SRC
|
||||
OBJScu=$(CU_OBJS) $(CXX_OBJS) $(NVCC_OBJS)
|
||||
PROGcu=$(PROG)_cu
|
||||
endif
|
||||
|
||||
|
||||
all: dirs \
|
||||
$(PROGptx_nvvm) \
|
||||
$(PROGptx_llvm) \
|
||||
$(PROGcu) $(ISPC_BCS) $(ISPC_LLS) $(ISPC_HEADERS) $(ISPC_NVVM_PTX) $(ISPC_LLVM_PTX)
|
||||
|
||||
dirs:
|
||||
/bin/mkdir -p objs_ptx/
|
||||
|
||||
objs_ptx/%.cpp objs_ptx/%.o objs_ptx/%.h: dirs
|
||||
|
||||
clean:
|
||||
/bin/rm -rf $(PROGptx_nvvm) $(PROGptx_llvm) $(PROGcu) objs_ptx
|
||||
|
||||
# generate binaries
|
||||
$(PROGptx_llvm): $(OBJSptx_llvm)
|
||||
$(LD) -o $@ $^ $(LDFLAGS)
|
||||
$(PROGptx_nvvm): $(OBJSptx_nvvm)
|
||||
$(LD) -o $@ $^ $(LDFLAGS)
|
||||
$(PROGcu): $(OBJScu)
|
||||
$(LD) -o $@ $^ $(LDFLAGS)
|
||||
|
||||
# compile C++ code
|
||||
objs_ptx/%_gcc.o: %.cpp $(ISPC_HEADERS)
|
||||
$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
objs_ptx/%_gcc.o: ../../util/%.cpp
|
||||
$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
# CUDA helpers
|
||||
objs_ptx/%_cu.o: %.cu $(ISPC_HEADERS)
|
||||
$(NVCC) $(NVCC_FLAGS) -o $@ -dc $<
|
||||
|
||||
# compile CUDA code
|
||||
objs_ptx/%_nvcc.o: ../../util/%.cu
|
||||
$(NVCC) $(NVCC_FLAGS) -o $@ -c $<
|
||||
objs_ptx/%_nvcc.o: %.cu
|
||||
$(NVCC) $(NVCC_FLAGS) -o $@ -c $<
|
||||
|
||||
# compile ISPC to LLVM BC
|
||||
#objs_ptx/%_ispc.h objs_ptx/%_ispc.bc: %.ispc
|
||||
# $(ISPC) $(ISPC_FLAGS) --emit-llvm -h objs_ptx/$*_ispc.h -o objs_ptx/$*_ispc.bc $<
|
||||
objs_ptx/%_ispc.h objs_ptx/%_ispc.ll: %.ispc
|
||||
$(ISPC) $(ISPC_FLAGS) --emit-llvm -h objs_ptx/$*_ispc.h -o objs_ptx/$*_ispc.ll $<
|
||||
|
||||
# generate PTX from LLVM BC
|
||||
#objs_ptx/%_llvm_ispc.ptx: objs_ptx/%_ispc.bc
|
||||
# $(LLC) $(LLC_FLAGS) -o $@ $<
|
||||
objs_ptx/%_llvm_ispc.ptx: objs_ptx/%_ispc.ll
|
||||
$(LLC) $(LLC_FLAGS) -o $@ $<
|
||||
#objs_ptx/%_nvvm_ispc.ptx: objs_ptx/%_ispc.bc
|
||||
# $(LLVM32DIS) $< -o objs_ptx/$*_ispc-ll32.ll
|
||||
# $(PTXGEN) objs_ptx/$*_ispc-ll32.ll -o $@
|
||||
objs_ptx/%_nvvm_ispc.ptx: objs_ptx/%_ispc.ll
|
||||
$(PTXGEN) $< -o $@
|
||||
|
||||
# generate an object file from PTX
|
||||
objs_ptx/%_ispc.o: objs_ptx/%_ispc.ptx
|
||||
$(PTXCC) $< -Xnvcc="$(PTXCC_FLAGS)" -o $@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
10
examples/portable/deferred/Makefile_cpu
Normal file
10
examples/portable/deferred/Makefile_cpu
Normal file
@@ -0,0 +1,10 @@
|
||||
|
||||
EXAMPLE=deferred_shading
|
||||
CPP_SRC=common.cpp main.cpp dynamic_c.cpp
|
||||
# CPP_SRC+=dynamic_cilk.cpp
|
||||
ISPC_SRC=kernels.ispc
|
||||
ISPC_IA_TARGETS=avx1-i32x16
|
||||
ISPC_ARM_TARGETS=neon
|
||||
ISPC_FLAGS=--opt=fast-math
|
||||
|
||||
include ../common_cpu.mk
|
||||
8
examples/portable/deferred/Makefile_knc
Normal file
8
examples/portable/deferred/Makefile_knc
Normal file
@@ -0,0 +1,8 @@
|
||||
EXAMPLE=deferred_shading
|
||||
CXX_SRC=common.cpp main.cpp dynamic_c.cpp
|
||||
ISPC_SRC=kernels.ispc
|
||||
ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h
|
||||
ISPC_TARGET=generic-16
|
||||
ISPC_FLAGS=--opt=fast-math
|
||||
|
||||
include ../common_knc.mk
|
||||
13
examples/portable/deferred/Makefile_ptx
Normal file
13
examples/portable/deferred/Makefile_ptx
Normal file
@@ -0,0 +1,13 @@
|
||||
PROG=deferred_shading
|
||||
ISPC_SRC=kernels.ispc
|
||||
CU_SRC=kernels.cu
|
||||
CXX_SRC=common.cpp main.cpp
|
||||
PTXCC_REGMAX=64
|
||||
|
||||
NVVM_GPU=1
|
||||
#LLVM_GPU=1
|
||||
|
||||
include ../common_ptx.mk
|
||||
|
||||
|
||||
|
||||
222
examples/portable/deferred/common.cpp
Normal file
222
examples/portable/deferred/common.cpp
Normal file
@@ -0,0 +1,222 @@
|
||||
/*
|
||||
Copyright (c) 2011-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define ISPC_IS_WINDOWS
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include <stdint.h>
|
||||
#include <algorithm>
|
||||
#include <assert.h>
|
||||
#include <vector>
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#ifdef ISPC_IS_LINUX
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
#include "deferred.h"
|
||||
#include "timing.h"
|
||||
#include "ispc_malloc.h"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Allocate `size` bytes.
//
// In _CUDA_ builds the memory comes from ispc_malloc() and `alignment` is
// not used.  Otherwise the platform's aligned allocator is used and the
// result is aligned to `alignment` bytes.  Release the memory with
// lAlignedFree().
static void *
lAlignedMalloc(size_t size, int32_t alignment) {
#ifndef _CUDA_
#ifdef ISPC_IS_WINDOWS
    return _aligned_malloc(size, alignment);
#endif
#ifdef ISPC_IS_LINUX
    return memalign(alignment, size);
#endif
#ifdef ISPC_IS_APPLE
    // Hand-rolled aligned allocation: over-allocate, step past a slot that
    // stores the pointer malloc() returned, then round up to the alignment.
    // The padding added below is between 1 and `alignment` bytes, so the
    // block must reserve the full `alignment` (reserving only alignment-1
    // left the last byte out of bounds whenever the padding was maximal).
    void *mem = malloc(size + alignment + sizeof(void*));
    if (mem == NULL)
        return NULL;
    char *amem = ((char*)mem) + sizeof(void*);
    amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
                                        (alignment - 1)));
    ((void**)amem)[-1] = mem;   // read back by lAlignedFree()
    return amem;
#endif
#else
    void *ptr;
    ispc_malloc(&ptr, size);
    return ptr;
#endif
}
|
||||
|
||||
|
||||
// Release memory obtained from lAlignedMalloc(), using the deallocator that
// matches the allocator chosen there.
static void
lAlignedFree(void *ptr) {
#ifdef _CUDA_
    ispc_free(ptr);
#else
#ifdef ISPC_IS_WINDOWS
    _aligned_free(ptr);
#endif
#ifdef ISPC_IS_LINUX
    free(ptr);
#endif
#ifdef ISPC_IS_APPLE
    // The original malloc() pointer is stored just before the aligned block.
    free(((void**)ptr)[-1]);
#endif
#endif
}
|
||||
|
||||
|
||||
// Allocate three separate one-byte-per-pixel planes (r, g, b) for a
// width x height image.  The planes are not cleared here; see clear().
Framebuffer::Framebuffer(int width, int height) {
    nPixels = width*height;

    r = static_cast<uint8_t *>(lAlignedMalloc(nPixels, ALIGNMENT_BYTES));
    g = static_cast<uint8_t *>(lAlignedMalloc(nPixels, ALIGNMENT_BYTES));
    b = static_cast<uint8_t *>(lAlignedMalloc(nPixels, ALIGNMENT_BYTES));
}
|
||||
|
||||
|
||||
// Release the three color planes allocated by the constructor.
Framebuffer::~Framebuffer() {
    uint8_t *const planes[3] = { r, g, b };
    for (int i = 0; i < 3; ++i)
        lAlignedFree(planes[i]);
}
|
||||
|
||||
|
||||
void
|
||||
Framebuffer::clear() {
|
||||
memset(r, 0, nPixels);
|
||||
memset(g, 0, nPixels);
|
||||
memset(b, 0, nPixels);
|
||||
}
|
||||
|
||||
|
||||
// Load a scene description from the binary file at `path`.
//
// The file consists of an ispc::InputHeader followed by one data chunk of
// header.inputDataChunkSize bytes.  The chunk is read into aligned memory
// and the pointers in input->arrays are set to the offsets listed in
// header.inputDataArrayOffsets.
//
// Returns a newly allocated InputData on success.  Returns NULL if the file
// cannot be opened, is too short, or memory cannot be allocated; in that
// case everything acquired so far (file handle, InputData, chunk) is
// released before returning.
InputData *
CreateInputDataFromFile(const char *path) {
    FILE *in = fopen(path, "rb");
    if (!in) return 0;

    InputData *input = new InputData;

    // Load header
    if (fread(&input->header, sizeof(ispc::InputHeader), 1, in) != 1) {
        fprintf(stderr, "Premature EOF reading file \"%s\"\n", path);
        fclose(in);
        delete input;
        return NULL;
    }

    // Load data chunk and update pointers
    input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize,
                                             ALIGNMENT_BYTES);
    if (!input->chunk) {
        fprintf(stderr, "Out of memory reading file \"%s\"\n", path);
        fclose(in);
        delete input;
        return NULL;
    }
    if (fread(input->chunk, input->header.inputDataChunkSize, 1, in) != 1) {
        fprintf(stderr, "Premature EOF reading file \"%s\"\n", path);
        lAlignedFree(input->chunk);
        fclose(in);
        delete input;
        return NULL;
    }

    // Everything needed is in memory now; the file is no longer required.
    fclose(in);

    input->arrays.zBuffer =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaZBuffer]];
    input->arrays.normalEncoded_x =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_x]];
    input->arrays.normalEncoded_y =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_y]];
    input->arrays.specularAmount =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularAmount]];
    input->arrays.specularPower =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularPower]];
    input->arrays.albedo_x =
        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_x]];
    input->arrays.albedo_y =
        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_y]];
    input->arrays.albedo_z =
        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_z]];
    input->arrays.lightPositionView_x =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_x]];
    input->arrays.lightPositionView_y =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_y]];
    input->arrays.lightPositionView_z =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_z]];
    input->arrays.lightAttenuationBegin =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationBegin]];
    input->arrays.lightColor_x =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_x]];
    input->arrays.lightColor_y =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_y]];
    input->arrays.lightColor_z =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_z]];
    input->arrays.lightAttenuationEnd =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationEnd]];

    return input;
}
|
||||
|
||||
|
||||
// Releases the data chunk referenced by `input`.
// Only `input->chunk` is freed here; the InputData object itself is not
// touched by this function.
void DeleteInputData(InputData *input) {
    void *chunkMemory = input->chunk;
    lAlignedFree(chunkMemory);
}
|
||||
|
||||
|
||||
void WriteFrame(const char *filename, const InputData *input,
|
||||
const Framebuffer &framebuffer) {
|
||||
// Deswizzle and copy to RGBA output
|
||||
// Doesn't need to be fast... only happens once
|
||||
size_t imageBytes = 3 * input->header.framebufferWidth *
|
||||
input->header.framebufferHeight;
|
||||
uint8_t* framebufferAOS = (uint8_t *)lAlignedMalloc(imageBytes, ALIGNMENT_BYTES);
|
||||
memset(framebufferAOS, 0, imageBytes);
|
||||
|
||||
for (int i = 0; i < input->header.framebufferWidth *
|
||||
input->header.framebufferHeight; ++i) {
|
||||
framebufferAOS[3 * i + 0] = framebuffer.r[i];
|
||||
framebufferAOS[3 * i + 1] = framebuffer.g[i];
|
||||
framebufferAOS[3 * i + 2] = framebuffer.b[i];
|
||||
}
|
||||
|
||||
// Write out simple PPM file
|
||||
FILE *out = fopen(filename, "wb");
|
||||
fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth,
|
||||
input->header.framebufferHeight);
|
||||
fwrite(framebufferAOS, imageBytes, 1, out);
|
||||
fclose(out);
|
||||
|
||||
lAlignedFree(framebufferAOS);
|
||||
}
|
||||
1
examples/portable/deferred/data
Symbolic link
1
examples/portable/deferred/data
Symbolic link
@@ -0,0 +1 @@
|
||||
../../deferred/data
|
||||
108
examples/portable/deferred/deferred.h
Normal file
108
examples/portable/deferred/deferred.h
Normal file
@@ -0,0 +1,108 @@
|
||||
/*
|
||||
Copyright (c) 2011-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef DEFERRED_H
|
||||
#define DEFERRED_H
|
||||
|
||||
// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
|
||||
#define MIN_TILE_WIDTH 64
|
||||
#define MIN_TILE_HEIGHT 16
|
||||
#define MAX_LIGHTS 1024
|
||||
|
||||
// Indices of the individual data arrays stored in an input file. Each value
// selects one entry of the header's inputDataArrayOffsets[] table; the order
// therefore must not change. idaNum is the number of arrays.
enum InputDataArraysEnum {
    // G-buffer arrays
    idaZBuffer = 0,
    idaNormalEncoded_x,
    idaNormalEncoded_y,
    idaSpecularAmount,
    idaSpecularPower,
    idaAlbedo_x,
    idaAlbedo_y,
    idaAlbedo_z,

    // Light arrays
    idaLightPositionView_x,
    idaLightPositionView_y,
    idaLightPositionView_z,
    idaLightAttenuationBegin,
    idaLightColor_x,
    idaLightColor_y,
    idaLightColor_z,
    idaLightAttenuationEnd,

    // Count of the entries above (keep last)
    idaNum
};
|
||||
|
||||
#ifndef ISPC
|
||||
|
||||
#include <stdint.h>
|
||||
#include "kernels_ispc.h"
|
||||
|
||||
#define ALIGNMENT_BYTES 64
|
||||
|
||||
#define MAX_LIGHTS 1024
|
||||
|
||||
#define VISUALIZE_LIGHT_COUNT 0
|
||||
|
||||
struct InputData
|
||||
{
|
||||
ispc::InputHeader header;
|
||||
ispc::InputDataArrays arrays;
|
||||
uint8_t *chunk;
|
||||
};
|
||||
|
||||
|
||||
// Output image stored as three separate 8-bit planes (r, g, b).
//
// The object has a user-provided destructor and exposes raw plane pointers,
// so copying is disabled: the copy constructor and copy assignment operator
// are declared private and intentionally left undefined.
struct Framebuffer {
    Framebuffer(int width, int height);
    ~Framebuffer();

    void clear();

    // Per-channel planes, one byte per pixel.
    uint8_t *r, *g, *b;

private:
    int nPixels;

    // Not implemented (copying is forbidden). The assignment operator must
    // take a reference: a pointer parameter would not replace the implicit
    // copy assignment operator, leaving the struct silently copy-assignable.
    Framebuffer(const Framebuffer &);
    Framebuffer &operator=(const Framebuffer &);
};
|
||||
|
||||
|
||||
// Builds an InputData from the file at `path`; the array pointers of the
// returned object refer into its `chunk` buffer.
InputData *CreateInputDataFromFile(const char *path);
|
||||
// Frees input->chunk. The InputData object itself is not released here.
void DeleteInputData(InputData *input);
|
||||
// Writes `framebuffer` to `filename` as a binary PPM image whose size is
// taken from input->header.
void WriteFrame(const char *filename, const InputData *input,
|
||||
const Framebuffer &framebuffer);
|
||||
// Allocates the min/max-Z tree used by the dynamic C implementation, sized
// from the framebuffer dimensions in input->header.
void InitDynamicC(InputData *input);
|
||||
// NOTE(review): defined elsewhere; presumably the Cilk counterpart of
// InitDynamicC -- confirm.
void InitDynamicCilk(InputData *input);
|
||||
// NOTE(review): defined elsewhere; presumably renders `input` into
// `framebuffer` with the dynamic C implementation -- confirm.
void DispatchDynamicC(InputData *input, Framebuffer *framebuffer);
|
||||
// NOTE(review): defined elsewhere; presumably the Cilk counterpart of
// DispatchDynamicC -- confirm.
void DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer);
|
||||
|
||||
#endif // !ISPC
|
||||
|
||||
#endif // DEFERRED_H
|
||||
874
examples/portable/deferred/dynamic_c.cpp
Normal file
874
examples/portable/deferred/dynamic_c.cpp
Normal file
@@ -0,0 +1,874 @@
|
||||
/*
|
||||
Copyright (c) 2011-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "deferred.h"
#include "kernels_ispc.h"
#include <algorithm>
#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define ISPC_IS_WINDOWS
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#ifdef ISPC_IS_LINUX
|
||||
#include <malloc.h>
|
||||
#endif // ISPC_IS_LINUX
|
||||
|
||||
// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
|
||||
#ifndef MIN_TILE_WIDTH
|
||||
#define MIN_TILE_WIDTH 16
|
||||
#endif
|
||||
#ifndef MIN_TILE_HEIGHT
|
||||
#define MIN_TILE_HEIGHT 16
|
||||
#endif
|
||||
|
||||
|
||||
#define DYNAMIC_TREE_LEVELS 5
|
||||
// If this is set to 1 then the result will be identical to the static version
|
||||
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
|
||||
|
||||
// Allocates `size` bytes at an address that is a multiple of `alignment`.
// `alignment` is expected to be a power of two (the manual path below masks
// with alignment - 1). Returns 0 if the underlying allocation fails. The
// result must be released with lAlignedFree().
//
// Windows and Linux use their native aligned allocators. Every other
// platform (including Apple) takes the malloc()-based path, so the function
// always returns a value.
static void *
lAlignedMalloc(size_t size, int32_t alignment) {
#if defined(ISPC_IS_WINDOWS)
    return _aligned_malloc(size, alignment);
#elif defined(ISPC_IS_LINUX)
    return memalign(alignment, size);
#else
    // Over-allocate: worst-case alignment slack plus one pointer-sized slot
    // in which the address returned by malloc() is stored for the free path.
    void *mem = malloc(size + (alignment - 1) + sizeof(void *));
    if (mem == 0)
        return 0;

    char *amem = ((char *)mem) + sizeof(void *);

    // Advance only as far as needed to reach the next boundary. An address
    // that is already aligned stays put, so the block never extends past the
    // (alignment - 1) bytes of slack reserved above.
    const uintptr_t misalignment =
        reinterpret_cast<uintptr_t>(amem) & (uintptr_t)(alignment - 1);
    if (misalignment != 0)
        amem += (uintptr_t)alignment - misalignment;

    // Remember the original malloc() pointer just in front of the block.
    ((void **)amem)[-1] = mem;
    return amem;
#endif
}
|
||||
|
||||
|
||||
// Releases a block obtained from lAlignedMalloc(). Passing a null pointer is
// a no-op on every platform.
//
// Windows and Linux hand the pointer to their native free routines. On any
// other platform (including Apple) the block is assumed to come from the
// malloc()-based path of lAlignedMalloc(), which stores the pointer returned
// by malloc() in the slot immediately before the aligned address.
static void
lAlignedFree(void *ptr) {
#if defined(ISPC_IS_WINDOWS)
    _aligned_free(ptr);
#elif defined(ISPC_IS_LINUX)
    free(ptr);
#else
    // Guard the look-behind read: a null pointer has no stored slot.
    if (ptr != 0)
        free(((void **)ptr)[-1]);
#endif
}
|
||||
|
||||
|
||||
// Finds the smallest and largest view-space depth among the pixels of the
// rectangle [tileStartX, tileEndX) x [tileStartY, tileEndY).
//
// Each depth-buffer value z is unprojected as cameraProj_43 / (z - cameraProj_33).
// Only depths in [cameraNear, cameraFar) take part, which leaves out
// skybox/background or otherwise invalid pixels. When no pixel qualifies the
// results are *minZ == cameraFar and *maxZ == cameraNear.
static void
ComputeZBounds(int tileStartX, int tileEndX,
               int tileStartY, int tileEndY,
               // G-buffer data
               float zBuffer[],
               int gBufferWidth,
               // Camera data
               float cameraProj_33, float cameraProj_43,
               float cameraNear, float cameraFar,
               // Output
               float *minZ, float *maxZ)
{
    float nearest = cameraFar;
    float farthest = cameraNear;

    for (int row = tileStartY; row < tileEndY; ++row) {
        for (int col = tileStartX; col < tileEndX; ++col) {
            const float stored = zBuffer[(row * gBufferWidth + col)];
            const float depth = cameraProj_43 / (stored - cameraProj_33);

            // Reject samples outside [cameraNear, cameraFar)
            if (!(depth < cameraFar) || !(depth >= cameraNear))
                continue;

            if (depth < nearest)
                nearest = depth;
            if (farthest < depth)
                farthest = depth;
        }
    }

    *minZ = nearest;
    *maxZ = farthest;
}
|
||||
|
||||
|
||||
static void
|
||||
ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight,
|
||||
int numTilesX, int numTilesY,
|
||||
// G-buffer data
|
||||
float zBuffer[],
|
||||
int gBufferWidth,
|
||||
// Camera data
|
||||
float cameraProj_33, float cameraProj_43,
|
||||
float cameraNear, float cameraFar,
|
||||
// Output
|
||||
float minZArray[],
|
||||
float maxZArray[])
|
||||
{
|
||||
for (int tileX = 0; tileX < numTilesX; ++tileX) {
|
||||
float minZ, maxZ;
|
||||
ComputeZBounds(tileX * tileWidth, tileX * tileWidth + tileWidth,
|
||||
tileY * tileHeight, tileY * tileHeight + tileHeight,
|
||||
zBuffer, gBufferWidth, cameraProj_33, cameraProj_43,
|
||||
cameraNear, cameraFar, &minZ, &maxZ);
|
||||
minZArray[tileX] = minZ;
|
||||
maxZArray[tileX] = maxZ;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class MinMaxZTree
|
||||
{
|
||||
public:
|
||||
// Currently (min) tile dimensions must divide gBuffer dimensions evenly
|
||||
// Levels must be small enough that neither dimension goes below one tile
|
||||
MinMaxZTree(
|
||||
int tileWidth, int tileHeight, int levels,
|
||||
int gBufferWidth, int gBufferHeight)
|
||||
: mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
|
||||
{
|
||||
mNumTilesX = gBufferWidth / mTileWidth;
|
||||
mNumTilesY = gBufferHeight / mTileHeight;
|
||||
|
||||
// Allocate arrays
|
||||
mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||
mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||
for (int i = 0; i < mLevels; ++i) {
|
||||
int x = NumTilesX(i);
|
||||
int y = NumTilesY(i);
|
||||
assert(x > 0);
|
||||
assert(y > 0);
|
||||
// NOTE: If the following two asserts fire it probably means that
|
||||
// the base tile dimensions do not evenly divide the G-buffer dimensions
|
||||
assert(x * (mTileWidth << i) >= gBufferWidth);
|
||||
assert(y * (mTileHeight << i) >= gBufferHeight);
|
||||
mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
|
||||
mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
|
||||
}
|
||||
}
|
||||
|
||||
void Update(float *zBuffer, int gBufferPitchInElements,
|
||||
float cameraProj_33, float cameraProj_43,
|
||||
float cameraNear, float cameraFar)
|
||||
{
|
||||
for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
|
||||
ComputeZBoundsRow(tileY, mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
|
||||
zBuffer, gBufferPitchInElements,
|
||||
cameraProj_33, cameraProj_43, cameraNear, cameraFar,
|
||||
mMinZArrays[0] + (tileY * mNumTilesX),
|
||||
mMaxZArrays[0] + (tileY * mNumTilesX));
|
||||
}
|
||||
|
||||
// Generate other levels
|
||||
for (int level = 1; level < mLevels; ++level) {
|
||||
int destTilesX = NumTilesX(level);
|
||||
int destTilesY = NumTilesY(level);
|
||||
int srcLevel = level - 1;
|
||||
int srcTilesX = NumTilesX(srcLevel);
|
||||
int srcTilesY = NumTilesY(srcLevel);
|
||||
for (int y = 0; y < destTilesY; ++y) {
|
||||
for (int x = 0; x < destTilesX; ++x) {
|
||||
int srcX = x << 1;
|
||||
int srcY = y << 1;
|
||||
// NOTE: Ugly branches to deal with non-multiple dimensions at some levels
|
||||
// TODO: SSE branchless min/max is probably better...
|
||||
float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||
float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||
if (srcX + 1 < srcTilesX) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
if (srcY + 1 < srcTilesY) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
}
|
||||
}
|
||||
if (srcY + 1 < srcTilesY) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX )]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX )]);
|
||||
}
|
||||
mMinZArrays[level][y * destTilesX + x] = minZ;
|
||||
mMaxZArrays[level][y * destTilesX + x] = maxZ;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
~MinMaxZTree() {
|
||||
for (int i = 0; i < mLevels; ++i) {
|
||||
lAlignedFree(mMinZArrays[i]);
|
||||
lAlignedFree(mMaxZArrays[i]);
|
||||
}
|
||||
lAlignedFree(mMinZArrays);
|
||||
lAlignedFree(mMaxZArrays);
|
||||
}
|
||||
|
||||
int Levels() const { return mLevels; }
|
||||
|
||||
// These round UP, so beware that the last tile for a given level may not be completely full
|
||||
// TODO: Verify this...
|
||||
int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
|
||||
int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
|
||||
int TileWidth(int level = 0) const { return (mTileWidth << level); }
|
||||
int TileHeight(int level = 0) const { return (mTileHeight << level); }
|
||||
|
||||
float MinZ(int level, int tileX, int tileY) const {
|
||||
return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
|
||||
}
|
||||
float MaxZ(int level, int tileX, int tileY) const {
|
||||
return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
|
||||
}
|
||||
|
||||
private:
|
||||
int mTileWidth;
|
||||
int mTileHeight;
|
||||
int mLevels;
|
||||
int mNumTilesX;
|
||||
int mNumTilesY;
|
||||
|
||||
// One array for each "level" in the tree
|
||||
float **mMinZArrays;
|
||||
float **mMaxZArrays;
|
||||
};
|
||||
|
||||
static MinMaxZTree *gMinMaxZTree = 0;
|
||||
|
||||
void InitDynamicC(InputData *input) {
|
||||
gMinMaxZTree =
|
||||
new MinMaxZTree(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
|
||||
input->header.framebufferWidth,
|
||||
input->header.framebufferHeight);
|
||||
}
|
||||
|
||||
|
||||
/* Distributes a tile's lights over its four sub-tiles.

   The tile is split at pixel (tileMidX, tileMidY). Sub-tiles are numbered
   00, 10, 01, 11 (x is the low bit). A light is kept for a sub-tile when it
   passes that sub-tile's depth range (widened by the light's attenuation end)
   and is not entirely on the far side of the X / Y splitting plane.

   Output: the kept light indices of sub-tile s are written to
   subtileIndices[s * subtileIndicesPitch ...] and their count to
   subtileNumLights[s]. */
static void
SplitTileMinMax(
    int tileMidX, int tileMidY,
    // Subtile data (00, 10, 01, 11)
    float subtileMinZ[],
    float subtileMaxZ[],
    // G-buffer data
    int gBufferWidth, int gBufferHeight,
    // Camera data
    float cameraProj_11, float cameraProj_22,
    // Light Data
    int lightIndices[],
    int numLights,
    float light_positionView_x_array[],
    float light_positionView_y_array[],
    float light_positionView_z_array[],
    float light_attenuationEnd_array[],
    // Outputs
    int subtileIndices[],
    int subtileIndicesPitch,
    int subtileNumLights[]
    )
{
    const float halfWidth = 0.5f * (float)gBufferWidth;
    const float halfHeight = 0.5f * (float)gBufferHeight;

    // Splitting planes: element 0 is the X plane, element 1 the Y plane.
    float planeXY[2] = { -(cameraProj_11 * halfWidth),
                         (cameraProj_22 * halfHeight) };
    float planeZ[2] = { tileMidX - halfWidth,
                        tileMidY - halfHeight };

    for (int axis = 0; axis < 2; ++axis) {
        // Normalize
        const float invLength = 1.f / sqrtf(planeXY[axis] * planeXY[axis] +
                                            planeZ[axis] * planeZ[axis]);
        planeXY[axis] *= invLength;
        planeZ[axis] *= invLength;
    }

    // Next write position of every sub-tile's list
    int writePos[4];
    for (int s = 0; s < 4; ++s)
        writePos[s] = s * subtileIndicesPitch;

    for (int n = 0; n < numLights; ++n) {
        const int lightIndex = lightIndices[n];

        const float lightX = light_positionView_x_array[lightIndex];
        const float lightY = light_positionView_y_array[lightIndex];
        const float lightZ = light_positionView_z_array[lightIndex];
        const float reach = light_attenuationEnd_array[lightIndex];
        const float negReach = -reach;

        // Test lights again against subtile z bounds
        bool keep[4];
        for (int s = 0; s < 4; ++s) {
            keep[s] = (lightZ - subtileMinZ[s] >= negReach) &&
                      (subtileMaxZ[s] - lightZ >= negReach);
        }

        // Signed distances to the two splitting planes
        const float dx = lightZ * planeZ[0] +
                         lightX * planeXY[0];
        const float dy = lightZ * planeZ[1] +
                         lightY * planeXY[1];

        if (fabsf(dx) > reach) {
            // Entirely on one side of the X plane: sub-tiles 00/01 need the
            // positive side, sub-tiles 10/11 the negative side.
            const bool positiveX = dx > 0.0f;
            for (int s = 0; s < 4; ++s) {
                const bool wantsPositive = ((s & 1) == 0);
                keep[s] = keep[s] && (wantsPositive ? positiveX : !positiveX);
            }
        }
        if (fabsf(dy) > reach) {
            // Entirely on one side of the Y plane: sub-tiles 00/10 need the
            // positive side, sub-tiles 01/11 the negative side.
            const bool positiveY = dy > 0.0f;
            for (int s = 0; s < 4; ++s) {
                const bool wantsPositive = (s < 2);
                keep[s] = keep[s] && (wantsPositive ? positiveY : !positiveY);
            }
        }

        for (int s = 0; s < 4; ++s) {
            if (keep[s])
                subtileIndices[writePos[s]++] = lightIndex;
        }
    }

    for (int s = 0; s < 4; ++s)
        subtileNumLights[s] = writePos[s] - s * subtileIndicesPitch;
}
|
||||
|
||||
|
||||
// Dot product of the 3-vectors (x, y, z) and (a, b, c).
static inline float
dot3(float x, float y, float z, float a, float b, float c) {
    float sum = x * a;
    sum += y * b;
    sum += z * c;
    return sum;
}
|
||||
|
||||
|
||||
// Scales (x, y, z) by the reciprocal of its length and stores the result in
// (ox, oy, oz). The inputs are taken by value, so the outputs may be the same
// variables as the inputs. A zero-length input is not special-cased.
static inline void
normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
    const float lengthSquared = x*x + y*y + z*z;
    const float invLength = 1.f / sqrtf(lengthSquared);
    ox = x * invLength;
    oy = y * invLength;
    oz = z * invLength;
}
|
||||
|
||||
|
||||
// Maps an 8-bit value 0..255 to a float by multiplying with 1/255.
static inline float
Unorm8ToFloat32(uint8_t u) {
    const float invMax = (1.0f / 255.0f);
    const float asFloat = (float)u;
    return asFloat * invMax;
}
|
||||
|
||||
|
||||
// Scales `f` by 255 and converts the product to uint8_t (the fractional part
// is truncated). No clamping is done here.
static inline uint8_t
Float32ToUnorm8(float f) {
    const float scaled = f * 255.0f;
    return (uint8_t)scaled;
}
|
||||
|
||||
|
||||
// Converts a 16-bit half-precision bit pattern to float by re-biasing the
// exponent and widening the mantissa.
//
// "Fast" means the exponent field is never special-cased: exponent values 0
// and 31 go through the same arithmetic as every other value (for example an
// all-zero input yields 2^-15 rather than 0).
//
// The final bit pattern is copied into the float with memcpy; reading it
// through a reinterpret_cast'ed float pointer would break the strict-aliasing
// rules.
static inline float
half_to_float_fast(uint16_t h) {
    uint32_t hs = h & 0x8000u;  // Pick off sign bit
    uint32_t he = h & 0x7C00u;  // Pick off exponent bits
    uint32_t hm = h & 0x03FFu;  // Pick off mantissa bits

    // sign
    uint32_t xs = hs << 16;
    // Exponent: unbias the halfp, then bias the single
    int32_t xes = ((int32_t) (he >> 10)) - 15 + 127;
    // Exponent
    uint32_t xe = (uint32_t) (xes << 23);
    // Mantissa
    uint32_t xm = hm << 13;

    uint32_t bits = (xs | xe | xm);
    float result;
    memcpy(&result, &bits, sizeof(result));
    return result;
}
|
||||
|
||||
|
||||
static void
|
||||
ShadeTileC(
|
||||
int32_t tileStartX, int32_t tileEndX,
|
||||
int32_t tileStartY, int32_t tileEndY,
|
||||
int32_t gBufferWidth, int32_t gBufferHeight,
|
||||
const ispc::InputDataArrays &inputData,
|
||||
// Camera data
|
||||
float cameraProj_11, float cameraProj_22,
|
||||
float cameraProj_33, float cameraProj_43,
|
||||
// Light list
|
||||
int32_t tileLightIndices[],
|
||||
int32_t tileNumLights,
|
||||
// UI
|
||||
bool visualizeLightCount,
|
||||
// Output
|
||||
uint8_t framebuffer_r[],
|
||||
uint8_t framebuffer_g[],
|
||||
uint8_t framebuffer_b[]
|
||||
)
|
||||
{
|
||||
if (tileNumLights == 0 || visualizeLightCount) {
|
||||
uint8_t c = (uint8_t)(std::min(tileNumLights << 2, 255));
|
||||
for (int32_t y = tileStartY; y < tileEndY; ++y) {
|
||||
for (int32_t x = tileStartX; x < tileEndX; ++x) {
|
||||
int32_t framebufferIndex = (y * gBufferWidth + x);
|
||||
framebuffer_r[framebufferIndex] = c;
|
||||
framebuffer_g[framebufferIndex] = c;
|
||||
framebuffer_b[framebufferIndex] = c;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
float twoOverGBufferWidth = 2.0f / gBufferWidth;
|
||||
float twoOverGBufferHeight = 2.0f / gBufferHeight;
|
||||
|
||||
for (int32_t y = tileStartY; y < tileEndY; ++y) {
|
||||
float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
|
||||
|
||||
for (int32_t x = tileStartX; x < tileEndX; ++x) {
|
||||
int32_t gBufferOffset = y * gBufferWidth + x;
|
||||
|
||||
// Reconstruct position and (negative) view vector from G-buffer
|
||||
float surface_positionView_x, surface_positionView_y, surface_positionView_z;
|
||||
float Vneg_x, Vneg_y, Vneg_z;
|
||||
|
||||
float z = inputData.zBuffer[gBufferOffset];
|
||||
|
||||
// Compute screen/clip-space position
|
||||
// NOTE: Mind DX11 viewport transform and pixel center!
|
||||
float positionScreen_x = (0.5f + (float)(x)) *
|
||||
twoOverGBufferWidth - 1.0f;
|
||||
|
||||
// Unproject depth buffer Z value into view space
|
||||
surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
|
||||
surface_positionView_x = positionScreen_x * surface_positionView_z /
|
||||
cameraProj_11;
|
||||
surface_positionView_y = positionScreen_y * surface_positionView_z /
|
||||
cameraProj_22;
|
||||
|
||||
// We actually end up with a vector pointing *at* the
|
||||
// surface (i.e. the negative view vector)
|
||||
normalize3(surface_positionView_x, surface_positionView_y,
|
||||
surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
|
||||
|
||||
// Reconstruct normal from G-buffer
|
||||
float surface_normal_x, surface_normal_y, surface_normal_z;
|
||||
float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
|
||||
float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
|
||||
|
||||
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
|
||||
float m = sqrtf(4.0f * f - 1.0f);
|
||||
|
||||
surface_normal_x = m * (4.0f * normal_x - 2.0f);
|
||||
surface_normal_y = m * (4.0f * normal_y - 2.0f);
|
||||
surface_normal_z = 3.0f - 8.0f * f;
|
||||
|
||||
// Load other G-buffer parameters
|
||||
float surface_specularAmount =
|
||||
half_to_float_fast(inputData.specularAmount[gBufferOffset]);
|
||||
float surface_specularPower =
|
||||
half_to_float_fast(inputData.specularPower[gBufferOffset]);
|
||||
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
|
||||
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
|
||||
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
|
||||
|
||||
float lit_x = 0.0f;
|
||||
float lit_y = 0.0f;
|
||||
float lit_z = 0.0f;
|
||||
for (int32_t tileLightIndex = 0; tileLightIndex < tileNumLights;
|
||||
++tileLightIndex) {
|
||||
int32_t lightIndex = tileLightIndices[tileLightIndex];
|
||||
|
||||
// Gather light data relevant to initial culling
|
||||
float light_positionView_x =
|
||||
inputData.lightPositionView_x[lightIndex];
|
||||
float light_positionView_y =
|
||||
inputData.lightPositionView_y[lightIndex];
|
||||
float light_positionView_z =
|
||||
inputData.lightPositionView_z[lightIndex];
|
||||
float light_attenuationEnd =
|
||||
inputData.lightAttenuationEnd[lightIndex];
|
||||
|
||||
// Compute light vector
|
||||
float L_x = light_positionView_x - surface_positionView_x;
|
||||
float L_y = light_positionView_y - surface_positionView_y;
|
||||
float L_z = light_positionView_z - surface_positionView_z;
|
||||
|
||||
float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
|
||||
|
||||
// Clip at end of attenuation
|
||||
float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
|
||||
|
||||
if (distanceToLight2 < light_attenutaionEnd2) {
|
||||
float distanceToLight = sqrtf(distanceToLight2);
|
||||
|
||||
float distanceToLightRcp = 1.f / distanceToLight;
|
||||
L_x *= distanceToLightRcp;
|
||||
L_y *= distanceToLightRcp;
|
||||
L_z *= distanceToLightRcp;
|
||||
|
||||
// Start computing brdf
|
||||
float NdotL = dot3(surface_normal_x, surface_normal_y,
|
||||
surface_normal_z, L_x, L_y, L_z);
|
||||
|
||||
// Clip back facing
|
||||
if (NdotL > 0.0f) {
|
||||
float light_attenuationBegin =
|
||||
inputData.lightAttenuationBegin[lightIndex];
|
||||
|
||||
// Light distance attenuation (linstep)
|
||||
float lightRange = (light_attenuationEnd - light_attenuationBegin);
|
||||
float falloffPosition = (light_attenuationEnd - distanceToLight);
|
||||
float attenuation = std::min(falloffPosition / lightRange, 1.0f);
|
||||
|
||||
float H_x = (L_x - Vneg_x);
|
||||
float H_y = (L_y - Vneg_y);
|
||||
float H_z = (L_z - Vneg_z);
|
||||
normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
|
||||
|
||||
float NdotH = dot3(surface_normal_x, surface_normal_y,
|
||||
surface_normal_z, H_x, H_y, H_z);
|
||||
NdotH = std::max(NdotH, 0.0f);
|
||||
|
||||
float specular = powf(NdotH, surface_specularPower);
|
||||
float specularNorm = (surface_specularPower + 2.0f) *
|
||||
(1.0f / 8.0f);
|
||||
float specularContrib = surface_specularAmount *
|
||||
specularNorm * specular;
|
||||
|
||||
float k = attenuation * NdotL * (1.0f + specularContrib);
|
||||
|
||||
float light_color_x = inputData.lightColor_x[lightIndex];
|
||||
float light_color_y = inputData.lightColor_y[lightIndex];
|
||||
float light_color_z = inputData.lightColor_z[lightIndex];
|
||||
|
||||
float lightContrib_x = surface_albedo_x * light_color_x;
|
||||
float lightContrib_y = surface_albedo_y * light_color_y;
|
||||
float lightContrib_z = surface_albedo_z * light_color_z;
|
||||
|
||||
lit_x += lightContrib_x * k;
|
||||
lit_y += lightContrib_y * k;
|
||||
lit_z += lightContrib_z * k;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Gamma correct
|
||||
float gamma = 1.0 / 2.2f;
|
||||
lit_x = powf(std::min(std::max(lit_x, 0.0f), 1.0f), gamma);
|
||||
lit_y = powf(std::min(std::max(lit_y, 0.0f), 1.0f), gamma);
|
||||
lit_z = powf(std::min(std::max(lit_z, 0.0f), 1.0f), gamma);
|
||||
|
||||
framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
|
||||
framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
|
||||
framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
|
||||
int *lightIndices, int numLights,
|
||||
Framebuffer *framebuffer) {
|
||||
const MinMaxZTree *minMaxZTree = gMinMaxZTree;
|
||||
|
||||
// If we few enough lights or this is the base case (last level), shade
|
||||
// this full tile directly
|
||||
if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
int startX = tileX * width;
|
||||
int startY = tileY * height;
|
||||
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||
|
||||
// Skip entirely offscreen tiles
|
||||
if (endX > startX && endY > startY) {
|
||||
ShadeTileC(startX, endX, startY, endY,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->arrays,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
|
||||
framebuffer->r, framebuffer->g, framebuffer->b);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Otherwise, subdivide and 4-way recurse using X and Y splitting planes
|
||||
// Move down a level in the tree
|
||||
--level;
|
||||
tileX <<= 1;
|
||||
tileY <<= 1;
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
|
||||
// Work out splitting coords
|
||||
int midX = (tileX + 1) * width;
|
||||
int midY = (tileY + 1) * height;
|
||||
|
||||
// Read subtile min/max data
|
||||
// NOTE: We must be sure to handle out-of-bounds access here since
|
||||
// sometimes we'll only have 1 or 2 subtiles for non-pow-2
|
||||
// framebuffer sizes.
|
||||
bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
|
||||
bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
|
||||
|
||||
// NOTE: Order is 00, 10, 01, 11
|
||||
// Set defaults up to cull all lights if the tile doesn't exist (offscreen)
|
||||
float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
|
||||
input->header.cameraFar, input->header.cameraFar};
|
||||
float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
|
||||
input->header.cameraNear, input->header.cameraNear};
|
||||
|
||||
minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
|
||||
maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
|
||||
if (rightTileExists) {
|
||||
minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
|
||||
maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
|
||||
if (bottomTileExists) {
|
||||
minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
|
||||
maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
|
||||
}
|
||||
}
|
||||
if (bottomTileExists) {
|
||||
minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
|
||||
maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
|
||||
}
|
||||
|
||||
// Cull lights into subtile lists
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
#endif
|
||||
int subtileLightIndices[4][MAX_LIGHTS]
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
__attribute__ ((aligned(ALIGNMENT_BYTES)))
|
||||
#endif
|
||||
;
|
||||
int subtileNumLights[4];
|
||||
SplitTileMinMax(midX, midY, minZ, maxZ,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
lightIndices, numLights, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
input->arrays.lightAttenuationEnd,
|
||||
subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
|
||||
|
||||
// Recurse into subtiles
|
||||
ShadeDynamicTileRecurse(input, level, tileX , tileY,
|
||||
subtileLightIndices[0], subtileNumLights[0],
|
||||
framebuffer);
|
||||
ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
|
||||
subtileLightIndices[1], subtileNumLights[1],
|
||||
framebuffer);
|
||||
ShadeDynamicTileRecurse(input, level, tileX , tileY + 1,
|
||||
subtileLightIndices[2], subtileNumLights[2],
|
||||
framebuffer);
|
||||
ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
|
||||
subtileLightIndices[3], subtileNumLights[3],
|
||||
framebuffer);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Culls lights against one screen-space tile.
//
// Each of the first numLights lights is treated as a sphere centred at its
// view-space position with radius light_attenuationEnd.  A light is kept
// when the sphere reaches into the tile's view-space depth range
// [minZ, maxZ] and is not entirely outside any of the four side planes
// built from the tile's pixel rectangle.
//
// The indices of the surviving lights are packed, in ascending order, into
// tileLightIndices (which must have room for numLights entries).  Returns
// the number of indices written.
static int
IntersectLightsWithTileMinMax(
    int tileStartX, int tileEndX,
    int tileStartY, int tileEndY,
    // Tile data
    float minZ,
    float maxZ,
    // G-buffer data
    int gBufferWidth, int gBufferHeight,
    // Camera data
    float cameraProj_11, float cameraProj_22,
    // Light Data
    int numLights,
    float light_positionView_x_array[],
    float light_positionView_y_array[],
    float light_positionView_z_array[],
    float light_attenuationEnd_array[],
    // Output
    int tileLightIndices[]
    )
{
    const float halfWidth  = 0.5f * (float)gBufferWidth;
    const float halfHeight = 0.5f * (float)gBufferHeight;

    // Side planes pass through the eye, so each one is described by two
    // coefficients: one applied to x (planes 0,1) or y (planes 2,3), and
    // one applied to z.
    float planeXY[4] = { -(cameraProj_11 * halfWidth),
                          (cameraProj_11 * halfWidth),
                          (cameraProj_22 * halfHeight),
                         -(cameraProj_22 * halfHeight) };

    float planeZ[4] = {  tileEndX   - halfWidth,
                        -tileStartX + halfWidth,
                         tileEndY   - halfHeight,
                        -tileStartY + halfHeight };

    // Normalise the plane equations so the dot products below are distances.
    for (int p = 0; p < 4; ++p) {
        const float invLength = 1.f / sqrtf(planeXY[p] * planeXY[p] +
                                            planeZ[p] * planeZ[p]);
        planeXY[p] *= invLength;
        planeZ[p]  *= invLength;
    }

    int keptCount = 0;

    for (int lightIndex = 0; lightIndex < numLights; ++lightIndex) {
        const float posZ      = light_positionView_z_array[lightIndex];
        const float negRadius = -light_attenuationEnd_array[lightIndex];

        // Depth slab test first: it is the cheapest rejection.
        const bool reachesNear = (posZ - minZ >= negRadius);
        const bool reachesFar  = (maxZ - posZ >= negRadius);
        if (!(reachesNear && reachesFar))
            continue;

        const float posX = light_positionView_x_array[lightIndex];
        const float posY = light_positionView_y_array[lightIndex];

        // Planes 0 and 1 bound the tile in x, planes 2 and 3 in y.
        bool inside = true;
        for (int p = 0; p < 4; ++p) {
            const float lateral = (p < 2) ? posX : posY;
            const float distance = posZ * planeZ[p] +
                                   lateral * planeXY[p];
            inside = inside && (distance >= negRadius);
        }

        // Pack and store intersecting lights
        if (inside)
            tileLightIndices[keptCount++] = lightIndex;
    }

    return keptCount;
}
|
||||
|
||||
|
||||
void
|
||||
ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
|
||||
Framebuffer *framebuffer) {
|
||||
const MinMaxZTree *minMaxZTree = gMinMaxZTree;
|
||||
|
||||
// Get Z min/max for this tile
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
float minZ = minMaxZTree->MinZ(level, tileX, tileY);
|
||||
float maxZ = minMaxZTree->MaxZ(level, tileX, tileY);
|
||||
|
||||
int startX = tileX * width;
|
||||
int startY = tileY * height;
|
||||
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||
|
||||
// This is a root tile, so first do a full 6-plane cull
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
#endif
|
||||
int lightIndices[MAX_LIGHTS]
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
__attribute__ ((aligned(ALIGNMENT_BYTES)))
|
||||
#endif
|
||||
;
|
||||
int numLights = IntersectLightsWithTileMinMax(
|
||||
startX, endX, startY, endY, minZ, maxZ,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
MAX_LIGHTS, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
input->arrays.lightAttenuationEnd, lightIndices);
|
||||
|
||||
// Now kick off the recursive process for this tile
|
||||
ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
|
||||
numLights, framebuffer);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
DispatchDynamicC(InputData *input, Framebuffer *framebuffer)
|
||||
{
|
||||
MinMaxZTree *minMaxZTree = gMinMaxZTree;
|
||||
|
||||
// Update min/max Z tree
|
||||
minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
input->header.cameraNear, input->header.cameraFar);
|
||||
|
||||
int rootLevel = minMaxZTree->Levels() - 1;
|
||||
int rootTilesX = minMaxZTree->NumTilesX(rootLevel);
|
||||
int rootTilesY = minMaxZTree->NumTilesY(rootLevel);
|
||||
int rootTiles = rootTilesX * rootTilesY;
|
||||
for (int g = 0; g < rootTiles; ++g) {
|
||||
uint32_t tileY = g / rootTilesX;
|
||||
uint32_t tileX = g % rootTilesX;
|
||||
ShadeDynamicTile(input, rootLevel, tileX, tileY, framebuffer);
|
||||
}
|
||||
}
|
||||
398
examples/portable/deferred/dynamic_cilk.cpp
Normal file
398
examples/portable/deferred/dynamic_cilk.cpp
Normal file
@@ -0,0 +1,398 @@
|
||||
/*
|
||||
Copyright (c) 2011-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef __cilk
|
||||
|
||||
#include "deferred.h"
|
||||
#include "kernels_ispc.h"
|
||||
#include <algorithm>
|
||||
#include <assert.h>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define ISPC_IS_WINDOWS
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#ifdef ISPC_IS_LINUX
|
||||
#include <malloc.h>
|
||||
#endif // ISPC_IS_LINUX
|
||||
|
||||
// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
|
||||
#define MIN_TILE_WIDTH 16
|
||||
#define MIN_TILE_HEIGHT 16
|
||||
|
||||
|
||||
#define DYNAMIC_TREE_LEVELS 5
|
||||
// If this is set to 1 then the result will be identical to the static version
|
||||
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
|
||||
|
||||
// Allocates `size` bytes at an address that is a multiple of `alignment`.
//
// `alignment` must be a power of two (the mask arithmetic below relies on
// it) and, on the manual path, at least sizeof(void*).  Returns NULL when
// the underlying allocation fails.  The result must be released with
// lAlignedFree(), never with plain free().
static void *
lAlignedMalloc(size_t size, int32_t alignment) {
#if defined(ISPC_IS_WINDOWS)
    return _aligned_malloc(size, alignment);
#elif defined(ISPC_IS_LINUX)
    return memalign(alignment, size);
#else
    // Apple, and any platform without a native aligned allocator:
    // over-allocate, round the address up, and stash the pointer that
    // malloc() really returned in the slot just below the aligned block so
    // lAlignedFree() can find it.
    void *mem = malloc(size + (alignment-1) + sizeof(void*));
    if (mem == NULL)
        return NULL;

    char *amem = ((char*)mem) + sizeof(void*);
    const uint64_t misalignment =
        reinterpret_cast<uint64_t>(amem) & (uint64_t)(alignment - 1);
    // Only advance when the address is not aligned yet.  Advancing by a
    // full `alignment` in the already-aligned case would push the end of
    // the block one byte past the end of the allocation.
    if (misalignment != 0)
        amem += (uint64_t)alignment - misalignment;

    ((void**)amem)[-1] = mem;
    return amem;
#endif
}


// Releases memory obtained from lAlignedMalloc().  Passing NULL is a no-op.
static void
lAlignedFree(void *ptr) {
#if defined(ISPC_IS_WINDOWS)
    _aligned_free(ptr);
#elif defined(ISPC_IS_LINUX)
    free(ptr);
#else
    // The original malloc() pointer lives immediately below the block.
    if (ptr != NULL)
        free(((void**)ptr)[-1]);
#endif
}
|
||||
|
||||
|
||||
class MinMaxZTreeCilk
|
||||
{
|
||||
public:
|
||||
// Currently (min) tile dimensions must divide gBuffer dimensions evenly
|
||||
// Levels must be small enough that neither dimension goes below one tile
|
||||
MinMaxZTreeCilk(
|
||||
int tileWidth, int tileHeight, int levels,
|
||||
int gBufferWidth, int gBufferHeight)
|
||||
: mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
|
||||
{
|
||||
mNumTilesX = gBufferWidth / mTileWidth;
|
||||
mNumTilesY = gBufferHeight / mTileHeight;
|
||||
|
||||
// Allocate arrays
|
||||
mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||
mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||
for (int i = 0; i < mLevels; ++i) {
|
||||
int x = NumTilesX(i);
|
||||
int y = NumTilesY(i);
|
||||
assert(x > 0);
|
||||
assert(y > 0);
|
||||
// NOTE: If the following two asserts fire it probably means that
|
||||
// the base tile dimensions do not evenly divide the G-buffer dimensions
|
||||
assert(x * (mTileWidth << i) >= gBufferWidth);
|
||||
assert(y * (mTileHeight << i) >= gBufferHeight);
|
||||
mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
|
||||
mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
|
||||
}
|
||||
}
|
||||
|
||||
void Update(float *zBuffer, int gBufferPitchInElements,
|
||||
float cameraProj_33, float cameraProj_43,
|
||||
float cameraNear, float cameraFar)
|
||||
{
|
||||
// Compute level 0 in parallel. Outer loops is here since we use Cilk
|
||||
_Cilk_for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
|
||||
ispc::ComputeZBoundsRow(tileY,
|
||||
mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
|
||||
zBuffer, gBufferPitchInElements,
|
||||
cameraProj_33, cameraProj_43, cameraNear, cameraFar,
|
||||
mMinZArrays[0] + (tileY * mNumTilesX),
|
||||
mMaxZArrays[0] + (tileY * mNumTilesX));
|
||||
}
|
||||
|
||||
// Generate other levels
|
||||
// NOTE: We currently don't use ispc here since it's sort of an
|
||||
// awkward gather-based reduction Using SSE odd pack/unpack
|
||||
// instructions might actually work here when we need to optimize
|
||||
for (int level = 1; level < mLevels; ++level) {
|
||||
int destTilesX = NumTilesX(level);
|
||||
int destTilesY = NumTilesY(level);
|
||||
int srcLevel = level - 1;
|
||||
int srcTilesX = NumTilesX(srcLevel);
|
||||
int srcTilesY = NumTilesY(srcLevel);
|
||||
_Cilk_for (int y = 0; y < destTilesY; ++y) {
|
||||
for (int x = 0; x < destTilesX; ++x) {
|
||||
int srcX = x << 1;
|
||||
int srcY = y << 1;
|
||||
// NOTE: Ugly branches to deal with non-multiple dimensions at some levels
|
||||
// TODO: SSE branchless min/max is probably better...
|
||||
float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||
float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||
if (srcX + 1 < srcTilesX) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
if (srcY + 1 < srcTilesY) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
}
|
||||
}
|
||||
if (srcY + 1 < srcTilesY) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX )]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX )]);
|
||||
}
|
||||
mMinZArrays[level][y * destTilesX + x] = minZ;
|
||||
mMaxZArrays[level][y * destTilesX + x] = maxZ;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
~MinMaxZTreeCilk() {
|
||||
for (int i = 0; i < mLevels; ++i) {
|
||||
lAlignedFree(mMinZArrays[i]);
|
||||
lAlignedFree(mMaxZArrays[i]);
|
||||
}
|
||||
lAlignedFree(mMinZArrays);
|
||||
lAlignedFree(mMaxZArrays);
|
||||
}
|
||||
|
||||
int Levels() const { return mLevels; }
|
||||
|
||||
// These round UP, so beware that the last tile for a given level may not be completely full
|
||||
// TODO: Verify this...
|
||||
int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
|
||||
int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
|
||||
int TileWidth(int level = 0) const { return (mTileWidth << level); }
|
||||
int TileHeight(int level = 0) const { return (mTileHeight << level); }
|
||||
|
||||
float MinZ(int level, int tileX, int tileY) const {
|
||||
return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
|
||||
}
|
||||
float MaxZ(int level, int tileX, int tileY) const {
|
||||
return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
|
||||
}
|
||||
|
||||
private:
|
||||
int mTileWidth;
|
||||
int mTileHeight;
|
||||
int mLevels;
|
||||
int mNumTilesX;
|
||||
int mNumTilesY;
|
||||
|
||||
// One array for each "level" in the tree
|
||||
float **mMinZArrays;
|
||||
float **mMaxZArrays;
|
||||
};
|
||||
|
||||
static MinMaxZTreeCilk *gMinMaxZTreeCilk = 0;
|
||||
|
||||
void InitDynamicCilk(InputData *input) {
|
||||
gMinMaxZTreeCilk =
|
||||
new MinMaxZTreeCilk(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
|
||||
input->header.framebufferWidth,
|
||||
input->header.framebufferHeight);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
|
||||
int *lightIndices, int numLights,
|
||||
Framebuffer *framebuffer) {
|
||||
const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
|
||||
|
||||
// If we few enough lights or this is the base case (last level), shade
|
||||
// this full tile directly
|
||||
if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
int startX = tileX * width;
|
||||
int startY = tileY * height;
|
||||
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||
|
||||
// Skip entirely offscreen tiles
|
||||
if (endX > startX && endY > startY) {
|
||||
ispc::ShadeTile(
|
||||
startX, endX, startY, endY,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
&input->arrays,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
|
||||
framebuffer->r, framebuffer->g, framebuffer->b);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Otherwise, subdivide and 4-way recurse using X and Y splitting planes
|
||||
// Move down a level in the tree
|
||||
--level;
|
||||
tileX <<= 1;
|
||||
tileY <<= 1;
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
|
||||
// Work out splitting coords
|
||||
int midX = (tileX + 1) * width;
|
||||
int midY = (tileY + 1) * height;
|
||||
|
||||
// Read subtile min/max data
|
||||
// NOTE: We must be sure to handle out-of-bounds access here since
|
||||
// sometimes we'll only have 1 or 2 subtiles for non-pow-2
|
||||
// framebuffer sizes.
|
||||
bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
|
||||
bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
|
||||
|
||||
// NOTE: Order is 00, 10, 01, 11
|
||||
// Set defaults up to cull all lights if the tile doesn't exist (offscreen)
|
||||
float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
|
||||
input->header.cameraFar, input->header.cameraFar};
|
||||
float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
|
||||
input->header.cameraNear, input->header.cameraNear};
|
||||
|
||||
minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
|
||||
maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
|
||||
if (rightTileExists) {
|
||||
minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
|
||||
maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
|
||||
if (bottomTileExists) {
|
||||
minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
|
||||
maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
|
||||
}
|
||||
}
|
||||
if (bottomTileExists) {
|
||||
minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
|
||||
maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
|
||||
}
|
||||
|
||||
// Cull lights into subtile lists
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
#endif
|
||||
int subtileLightIndices[4][MAX_LIGHTS]
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
__attribute__ ((aligned(ALIGNMENT_BYTES)))
|
||||
#endif
|
||||
;
|
||||
int subtileNumLights[4];
|
||||
ispc::SplitTileMinMax(midX, midY, minZ, maxZ,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
lightIndices, numLights, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
input->arrays.lightAttenuationEnd,
|
||||
subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
|
||||
|
||||
// Recurse into subtiles
|
||||
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX , tileY,
|
||||
subtileLightIndices[0], subtileNumLights[0],
|
||||
framebuffer);
|
||||
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
|
||||
subtileLightIndices[1], subtileNumLights[1],
|
||||
framebuffer);
|
||||
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX , tileY + 1,
|
||||
subtileLightIndices[2], subtileNumLights[2],
|
||||
framebuffer);
|
||||
ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
|
||||
subtileLightIndices[3], subtileNumLights[3],
|
||||
framebuffer);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
|
||||
Framebuffer *framebuffer) {
|
||||
const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
|
||||
|
||||
// Get Z min/max for this tile
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
float minZ = minMaxZTree->MinZ(level, tileX, tileY);
|
||||
float maxZ = minMaxZTree->MaxZ(level, tileX, tileY);
|
||||
|
||||
int startX = tileX * width;
|
||||
int startY = tileY * height;
|
||||
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||
|
||||
// This is a root tile, so first do a full 6-plane cull
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
#endif
|
||||
int lightIndices[MAX_LIGHTS]
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
__attribute__ ((aligned(ALIGNMENT_BYTES)))
|
||||
#endif
|
||||
;
|
||||
int numLights = ispc::IntersectLightsWithTileMinMax(
|
||||
startX, endX, startY, endY, minZ, maxZ,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
MAX_LIGHTS, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
input->arrays.lightAttenuationEnd, lightIndices);
|
||||
|
||||
// Now kick off the recursive process for this tile
|
||||
ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
|
||||
numLights, framebuffer);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer)
|
||||
{
|
||||
MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
|
||||
|
||||
// Update min/max Z tree
|
||||
minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
input->header.cameraNear, input->header.cameraFar);
|
||||
|
||||
// Launch the "root" tiles. Ideally these should at least fill the
|
||||
// machine... at the moment we have a static number of "levels" to the
|
||||
// mip tree but it might make sense to compute it based on the width of
|
||||
// the machine.
|
||||
int rootLevel = minMaxZTree->Levels() - 1;
|
||||
int rootTilesX = minMaxZTree->NumTilesX(rootLevel);
|
||||
int rootTilesY = minMaxZTree->NumTilesY(rootLevel);
|
||||
int rootTiles = rootTilesX * rootTilesY;
|
||||
_Cilk_for (int g = 0; g < rootTiles; ++g) {
|
||||
uint32_t tileY = g / rootTilesX;
|
||||
uint32_t tileX = g % rootTilesX;
|
||||
ShadeDynamicTile(input, rootLevel, tileX, tileY, framebuffer);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // __cilk
|
||||
778
examples/portable/deferred/kernels.cu
Normal file
778
examples/portable/deferred/kernels.cu
Normal file
@@ -0,0 +1,778 @@
|
||||
/*
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
#include "deferred.h"
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
|
||||
#define programCount 32
|
||||
#define programIndex (threadIdx.x & 31)
|
||||
#define taskIndex (blockIdx.x*4 + (threadIdx.x >> 5))
|
||||
#define taskCount (gridDim.x*4)
|
||||
#define warpIdx (threadIdx.x >> 5)
|
||||
|
||||
#define int32 int
|
||||
#define int16 short
|
||||
#define int8 char
|
||||
|
||||
// Limits v to the range [low, high]: values below `low` become `low`,
// values above `high` become `high`.
__device__ static inline float clamp(float v, float low, float high)
{
  const float raised = max(v, low);
  return min(raised, high);
}
|
||||
|
||||
// Pointers to the per-pixel G-buffer channels and the per-light attribute
// arrays.  Member order is part of the layout and must not change.
struct InputDataArrays
{
  // --- G-buffer channels ---
  float *zBuffer;
  unsigned int16 *normalEncoded_x; // half float
  unsigned int16 *normalEncoded_y; // half float
  unsigned int16 *specularAmount;  // half float
  unsigned int16 *specularPower;   // half float
  unsigned int8 *albedo_x;         // unorm8
  unsigned int8 *albedo_y;         // unorm8
  unsigned int8 *albedo_z;         // unorm8

  // --- Light attributes, one entry per light ---
  float *lightPositionView_x;
  float *lightPositionView_y;
  float *lightPositionView_z;
  float *lightAttenuationBegin;
  float *lightColor_x;
  float *lightColor_y;
  float *lightColor_z;
  float *lightAttenuationEnd;
};
|
||||
|
||||
// Fixed-size header describing the camera, the framebuffer and the packed
// input data.  Member order is part of the layout and must not change.
struct InputHeader
{
  // Camera
  float cameraProj[4][4];
  float cameraNear;
  float cameraFar;

  // Framebuffer size and light count
  int32 framebufferWidth;
  int32 framebufferHeight;
  int32 numLights;

  // Description of the data chunk: its size and one offset per array
  int32 inputDataChunkSize;
  int32 inputDataArrayOffsets[idaNum];
};
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Common utility routines
|
||||
|
||||
// Dot product of the 3-vectors (x, y, z) and (a, b, c).
__device__ static inline float
dot3(float x, float y, float z, float a, float b, float c)
{
  return x*a + y*b + z*c;
}
|
||||
|
||||
|
||||
#if 0
|
||||
static __shared__ int shdata_full[128];
|
||||
template<typename T, int N>
|
||||
struct Uniform
|
||||
{
|
||||
T data[(N+programCount-1)/programCount];
|
||||
volatile T *shdata;
|
||||
|
||||
__device__ inline Uniform()
|
||||
{
|
||||
shdata = ((T*)shdata_full) + warpIdx*32;
|
||||
}
|
||||
|
||||
__device__ inline int2 get_chunk(const int i) const
|
||||
{
|
||||
const int elem = i & (programCount - 1);
|
||||
const int chunk = i >> 5;
|
||||
shdata[programIndex] = chunk;
|
||||
shdata[ elem] = chunk;
|
||||
return make_int2(shdata[programIndex], elem);
|
||||
}
|
||||
|
||||
__device__ inline const T get(const int i) const
|
||||
{
|
||||
const int2 idx = get_chunk(i);
|
||||
return __shfl(data[idx.x], idx.y);
|
||||
}
|
||||
|
||||
__device__ inline void set(const bool active, const int i, T value)
|
||||
{
|
||||
const int2 idx = get_chunk(i);
|
||||
const int chunkIdx = idx.x;
|
||||
const int elemIdx = idx.y;
|
||||
shdata[programIndex] = data[chunkIdx];
|
||||
if (active) shdata[elemIdx] = value;
|
||||
data[chunkIdx] = shdata[programIndex];
|
||||
}
|
||||
};
|
||||
#elif 1
|
||||
// Array of N elements of type T shared by the 32 lanes identified by
// programIndex.
//
// Lane 0 allocates the storage with malloc(); the resulting pointer is then
// copied to the other lanes by shuffling it as two 32-bit words, which is
// why `data` is overlaid with `ptr`.
// NOTE(review): the two-word overlay presumably relies on pointers being
// 64 bits wide, and the malloc() result is not checked -- confirm both are
// acceptable for the targets this is built for.
template<typename T, int N>
struct Uniform
{
  union
  {
    T *data;           // the shared array
    int32_t ptr[2];    // same bits as `data`, viewed as two 32-bit words
  };

  // Allocates on lane 0 and broadcasts the pointer from lane 0.
  __device__ inline Uniform()
  {
    const bool isLeader = (programIndex == 0);
    if (isLeader)
      data = (T*)malloc(N*sizeof(T));
    for (int word = 0; word < 2; ++word)
      ptr[word] = __shfl(ptr[word], 0);
  }

  // Only lane 0 frees, matching the single allocation in the constructor.
  __device__ inline ~Uniform()
  {
    const bool isLeader = (programIndex == 0);
    if (isLeader)
      free(data);
  }

  // Returns a copy of element i.
  __device__ inline const T get(const int i) const
  {
    return data[i];
  }

  // Returns the address of element i.
  __device__ inline T* get_ptr(const int i) { return data + i; }

  // Stores `value` into element i, but only on lanes where `active` is true.
  __device__ inline void set(const bool active, const int i, T value)
  {
    if (!active)
      return;
    data[i] = value;
  }
};
|
||||
|
||||
#else
|
||||
__shared__ int shdata_full[4*MAX_LIGHTS];
|
||||
template<typename T, int N>
|
||||
struct Uniform
|
||||
{
|
||||
/* volatile */ T *shdata;
|
||||
|
||||
__device__ Uniform()
|
||||
{
|
||||
shdata = (T*)&shdata_full[warpIdx*MAX_LIGHTS];
|
||||
}
|
||||
|
||||
__device__ inline const T get(const int i) const
|
||||
{
|
||||
return shdata[i];
|
||||
}
|
||||
|
||||
__device__ inline void set(const bool active, const int i, T value)
|
||||
{
|
||||
if (active)
|
||||
shdata[i] = value;
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
// Scales the vector (x, y, z) by the reciprocal square root of its squared
// length and writes the result to (ox, oy, oz).
__device__ static inline void
normalize3(float x, float y, float z, float &ox, float &oy, float &oz)
{
  const float invLength = rsqrt(x*x + y*y + z*z);
  ox = x * invLength;
  oy = y * invLength;
  oz = z * invLength;
}
|
||||
|
||||
// Minimum of `value` over the 32 lanes of a warp.  Each step exchanges
// values with the lane whose index differs in one bit (16, 8, 4, 2, 1) and
// keeps the smaller one.
__device__ inline
static float reduce_min(float value)
{
#pragma unroll
  for (int laneBit = 16; laneBit > 0; laneBit >>= 1)
    value = fminf(value, __shfl_xor(value, laneBit, 32));
  return value;
}
|
||||
// Maximum of `value` over the 32 lanes of a warp.  Each step exchanges
// values with the lane whose index differs in one bit (16, 8, 4, 2, 1) and
// keeps the larger one.
__device__ inline
static float reduce_max(float value)
{
#pragma unroll
  for (int laneBit = 16; laneBit > 0; laneBit >>= 1)
    value = fmaxf(value, __shfl_xor(value, laneBit, 32));
  return value;
}
|
||||
|
||||
#if 0
|
||||
__device__ inline
|
||||
static int reduce_sum(int value)
|
||||
{
|
||||
#pragma unroll
|
||||
for (int i = 4; i >=0; i--)
|
||||
value += __shfl_xor(value, 1<<i, 32);
|
||||
return value;
|
||||
}
|
||||
static __device__ __forceinline__ uint shfl_scan_add_step(uint partial, uint up_offset)
|
||||
{
|
||||
uint result;
|
||||
asm(
|
||||
"{.reg .u32 r0;"
|
||||
".reg .pred p;"
|
||||
"shfl.up.b32 r0|p, %1, %2, 0;"
|
||||
"@p add.u32 r0, r0, %3;"
|
||||
"mov.u32 %0, r0;}"
|
||||
: "=r"(result) : "r"(partial), "r"(up_offset), "r"(partial));
|
||||
return result;
|
||||
}
|
||||
static __device__ __forceinline__ int inclusive_scan_warp(const int value)
|
||||
{
|
||||
uint sum = value;
|
||||
#pragma unroll
|
||||
for(int i = 0; i < 5; ++i)
|
||||
sum = shfl_scan_add_step(sum, 1 << i);
|
||||
return sum - value;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
// Returns the value of the PTX special register %lanemask_lt.
static __device__ __forceinline__ int lanemask_lt()
{
  int lowerLanes;
  asm("mov.u32 %0, %lanemask_lt;" : "=r" (lowerLanes));
  return lowerLanes;
}
|
||||
// Ballots the predicate `p` and returns two bit counts of the result:
//   .x = number of set bits in the whole ballot,
//   .y = number of set bits that are also set in lanemask_lt().
static __device__ __forceinline__ int2 warpBinExclusiveScan(const bool p)
{
  const int votes  = __ballot(p);
  const int total  = __popc(votes);
  const int before = __popc(votes & lanemask_lt());
  return make_int2(total, before);
}
|
||||
// Compacting store.  Lanes with `active` set write `value` to
// ptr[slot], where slot is the .y count from warpBinExclusiveScan(active).
// Returns the .x count from the same scan on every lane.
__device__ static inline
int packed_store_active(bool active, int* ptr, int value)
{
  const int2 scan = warpBinExclusiveScan(active);
  const int activeCount = scan.x;
  const int slot = scan.y;
  if (active)
    ptr[slot] = value;
  return activeCount;
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// Converts an 8-bit unsigned normalized value to float by scaling with
// 1/255, so 0 maps to 0.0 and 255 maps to 1.0.
__device__ static inline float
Unorm8ToFloat32(unsigned int8 u)
{
  const float scale = 1.0f / 255.0f;
  return (float)u * scale;
}
|
||||
|
||||
|
||||
// Converts a float to an 8-bit unsigned normalized value by multiplying
// with 255 and casting.  The input is not clamped and the cast does not
// round to nearest.
__device__ static inline unsigned int8
Float32ToUnorm8(float f)
{
  const float scaled = f * 255.0f;
  return (unsigned int8)scaled;
}
|
||||
|
||||
|
||||
// Computes the view-space depth range of the pixels in
// [tileStartX, tileEndX) x [tileStartY, tileEndY).
//
// Every lane walks the rows of the tile, handling the pixels whose column
// is tileStartX + programIndex plus a multiple of programCount, and keeps a
// private min/max.  The per-lane results are then combined with
// reduce_min / reduce_max and written to minZ / maxZ.
//
// Pixels whose unprojected depth falls outside [cameraNear, cameraFar) are
// ignored; if no pixel qualifies, minZ is cameraFar and maxZ is cameraNear.
__device__
static inline void
ComputeZBounds(
    int32 tileStartX, int32 tileEndX,
    int32 tileStartY, int32 tileEndY,
    // G-buffer data
    float zBuffer[],
    int32 gBufferWidth,
    // Camera data
    float cameraProj_33, float cameraProj_43,
    float cameraNear, float cameraFar,
    // Output
    float &minZ,
    float &maxZ
    )
{
  // Start from an empty range.
  float nearest  = cameraFar;
  float farthest = cameraNear;

  for (int32 row = tileStartY; row < tileEndY; ++row) {
    const int rowBase = row * gBufferWidth;
    for (int col = tileStartX + programIndex; col < tileEndX; col += programCount) {
      // Unproject depth buffer Z value into view space
      const float depth = zBuffer[rowBase + col];
      const float viewSpaceZ = cameraProj_43 / (depth - cameraProj_33);

      // Avoid considering skybox/background or otherwise invalid pixels
      const bool usable = (viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear);
      if (usable) {
        nearest  = min(nearest, viewSpaceZ);
        farthest = max(farthest, viewSpaceZ);
      }
    }
  }

  // Combine the per-lane bounds.
  minZ = reduce_min(nearest);
  maxZ = reduce_max(farthest);
}
|
||||
|
||||
|
||||
// Builds the list of lights that may affect a tile.
//
// Four normalised side planes are derived from the tile rectangle and the
// projection terms; each light (position + attenuationEnd) is then tested
// against the [minZ, maxZ] depth slab and against the four planes.  Indices
// of surviving lights are compacted into tileLightIndices.
//
// Returns the number of light indices written.
__device__
static inline int32
IntersectLightsWithTileMinMax(
    int32 tileStartX, int32 tileEndX,
    int32 tileStartY, int32 tileEndY,
    // Tile data
    float minZ,
    float maxZ,
    // G-buffer data
    int32 gBufferWidth, int32 gBufferHeight,
    // Camera data
    float cameraProj_11, float cameraProj_22,
    // Light Data
    int32 numLights,
    float light_positionView_x_array[],
    float light_positionView_y_array[],
    float light_positionView_z_array[],
    float light_attenuationEnd_array[],
    // Output
    Uniform<int,MAX_LIGHTS> &tileLightIndices
    )
{
  const float halfWidth  = 0.5f * (float)gBufferWidth;
  const float halfHeight = 0.5f * (float)gBufferHeight;

  // Plane i is described by (planes_xy[i], planes_z[i]); planes 0/1 are
  // evaluated against the light's x, planes 2/3 against its y.
  float planes_xy[4] = {
    -(cameraProj_11 * halfWidth),
     (cameraProj_11 * halfWidth),
     (cameraProj_22 * halfHeight),
    -(cameraProj_22 * halfHeight) };
  float planes_z[4] = {
     tileEndX - halfWidth,
    -tileStartX + halfWidth,
     tileEndY - halfHeight,
    -tileStartY + halfHeight };

  // Normalise each plane
  for (int p = 0; p < 4; ++p)
  {
    const float invLen = rsqrt(planes_xy[p] * planes_xy[p] +
                               planes_z[p] * planes_z[p]);
    planes_xy[p] *= invLen;
    planes_z[p]  *= invLen;
  }

  int32 lightCount = 0;

  for (int base = 0; base < numLights; base += programCount)
  {
    const int lightIndex = base + programIndex;
    if (lightIndex >= numLights) break;

    const float lightZ           = light_positionView_z_array[lightIndex];
    const float negAttenuationEnd = -light_attenuationEnd_array[lightIndex];

    // Depth-slab test
    bool inFrustum = (lightZ - minZ >= negAttenuationEnd);
    inFrustum = inFrustum && (maxZ - lightZ >= negAttenuationEnd);

    // Greedy early-out: skip the plane tests only when no participating
    // lane has a candidate.  Nothing below needs masking, so this is
    // cheaper than a per-lane continue.
    if (!(__ballot(inFrustum) > 0))
      continue;

    const float lightX = light_positionView_x_array[lightIndex];
    const float lightY = light_positionView_y_array[lightIndex];

    const float d0 = lightZ * planes_z[0] + lightX * planes_xy[0];
    const float d1 = lightZ * planes_z[1] + lightX * planes_xy[1];
    const float d2 = lightZ * planes_z[2] + lightY * planes_xy[2];
    const float d3 = lightZ * planes_z[3] + lightY * planes_xy[3];

    inFrustum = inFrustum
             && (d0 >= negAttenuationEnd)
             && (d1 >= negAttenuationEnd)
             && (d2 >= negAttenuationEnd)
             && (d3 >= negAttenuationEnd);

    // Pack and store intersecting lights
    const bool active = inFrustum && lightIndex < numLights;
#if 0
    if (__ballot(active) > 0)
      lightCount += packed_store_active(active, tileLightIndices.get_ptr(lightCount), lightIndex);
#else
    if (__ballot(active) > 0)
    {
      const int2 scan = warpBinExclusiveScan(active);
      tileLightIndices.set(active, lightCount + scan.y, lightIndex);
      lightCount += scan.x;
    }
#endif
  }

  return lightCount;
}
|
||||
|
||||
|
||||
// Computes the tile's view-space depth bounds and then culls the lights
// against the tile (see ComputeZBounds / IntersectLightsWithTileMinMax).
//
//   numLights  number of entries to consider in the light arrays; values
//              above MAX_LIGHTS are clamped because tileLightIndices holds
//              at most MAX_LIGHTS entries
//
// Returns the number of light indices stored in tileLightIndices.
__device__
static inline int32
IntersectLightsWithTile(
    int32 tileStartX, int32 tileEndX,
    int32 tileStartY, int32 tileEndY,
    int32 gBufferWidth, int32 gBufferHeight,
    // G-buffer data
    float zBuffer[],
    // Camera data
    float cameraProj_11, float cameraProj_22,
    float cameraProj_33, float cameraProj_43,
    float cameraNear, float cameraFar,
    // Light Data
    int32 numLights,
    float light_positionView_x_array[],
    float light_positionView_y_array[],
    float light_positionView_z_array[],
    float light_attenuationEnd_array[],
    // Output
    Uniform<int,MAX_LIGHTS> &tileLightIndices
    )
{
  float minZ, maxZ;
  ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY,
      zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, cameraNear, cameraFar,
      minZ, maxZ);

  // Honour the caller-supplied light count (previously ignored in favour of
  // a hard-coded MAX_LIGHTS), but never exceed the index list's capacity.
  const int32 lightsToTest = (numLights < MAX_LIGHTS) ? numLights : MAX_LIGHTS;

  int32 tileNumLights = IntersectLightsWithTileMinMax(
      tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
      gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
      lightsToTest, light_positionView_x_array, light_positionView_y_array,
      light_positionView_z_array, light_attenuationEnd_array,
      tileLightIndices);

  return tileNumLights;
}
|
||||
|
||||
|
||||
// Shades every pixel of one tile and writes the result to the framebuffer.
//
// Two modes:
//  * tileNumLights == 0 or visualizeLightCount set: the tile is filled with
//    a grey level of min(tileNumLights << 2, 255).
//  * otherwise: for each pixel the view-space position and normal are
//    reconstructed from the G-buffer, the contribution of every light in
//    tileLightIndices[0 .. tileNumLights) is accumulated, and the
//    gamma-corrected colour is stored.
//
// Lanes whose x falls at or beyond tileEndX never write.  In the lit path
// they are kept inside the loop (NOTE(review): tileLightIndices.get() may be
// a warp-collective operation -- confirm against Uniform<>), but they load
// from the tile's last valid pixel instead of reading outside the tile.
__device__
static inline void
ShadeTile(
    int32 tileStartX, int32 tileEndX,
    int32 tileStartY, int32 tileEndY,
    int32 gBufferWidth, int32 gBufferHeight,
    const InputDataArrays &inputData,
    // Camera data
    float cameraProj_11, float cameraProj_22,
    float cameraProj_33, float cameraProj_43,
    // Light list
    Uniform<int,MAX_LIGHTS> &tileLightIndices,
    int32 tileNumLights,
    // UI
    bool visualizeLightCount,
    // Output
    unsigned int8 framebuffer_r[],
    unsigned int8 framebuffer_g[],
    unsigned int8 framebuffer_b[]
    )
{
  if (tileNumLights == 0 || visualizeLightCount) {
    unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
    for ( int32 y = tileStartY; y < tileEndY; ++y) {
      for ( int xb = tileStartX ; xb < tileEndX; xb += programCount)
      {
        const int x = xb + programIndex;
        if (x >= tileEndX) continue;
        int32 framebufferIndex = (y * gBufferWidth + x);
        framebuffer_r[framebufferIndex] = c;
        framebuffer_g[framebufferIndex] = c;
        framebuffer_b[framebufferIndex] = c;
      }
    }
  } else {
    float twoOverGBufferWidth = 2.0f / gBufferWidth;
    float twoOverGBufferHeight = 2.0f / gBufferHeight;

    for ( int32 y = tileStartY; y < tileEndY; ++y) {
      float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);

      for ( int xb = tileStartX ; xb < tileEndX; xb += programCount)
      {
        const int x = xb + programIndex;

        // Lanes past the tile edge must neither read nor write pixels that
        // belong to another tile (or lie past the end of the buffers).
        // They are not dropped from the loop; instead they re-read the last
        // pixel of this tile row and skip the final stores.
        // (tileEndX - 1 >= tileStartX is guaranteed by the loop condition.)
        const bool inTile = (x < tileEndX);
        int32 gBufferOffset = y * gBufferWidth + (inTile ? x : (tileEndX - 1));

        // Reconstruct position and (negative) view vector from G-buffer
        float surface_positionView_x, surface_positionView_y, surface_positionView_z;
        float Vneg_x, Vneg_y, Vneg_z;

        float z = inputData.zBuffer[gBufferOffset];

        // Compute screen/clip-space position
        // NOTE: Mind DX11 viewport transform and pixel center!
        float positionScreen_x = (0.5f + (float)(x)) *
          twoOverGBufferWidth - 1.0f;

        // Unproject depth buffer Z value into view space
        surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
        surface_positionView_x = positionScreen_x * surface_positionView_z /
          cameraProj_11;
        surface_positionView_y = positionScreen_y * surface_positionView_z /
          cameraProj_22;

        // We actually end up with a vector pointing *at* the
        // surface (i.e. the negative view vector)
        normalize3(surface_positionView_x, surface_positionView_y,
            surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);

        // Reconstruct normal from G-buffer
        float surface_normal_x, surface_normal_y, surface_normal_z;
        asm("// half2float //");
        float normal_x = __half2float(inputData.normalEncoded_x[gBufferOffset]);
        float normal_y = __half2float(inputData.normalEncoded_y[gBufferOffset]);
        asm("// half2float //");

        float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
        float m = sqrt(4.0f * f - 1.0f);

        surface_normal_x = m * (4.0f * normal_x - 2.0f);
        surface_normal_y = m * (4.0f * normal_y - 2.0f);
        surface_normal_z = 3.0f - 8.0f * f;

        // Load other G-buffer parameters
        float surface_specularAmount =
          __half2float(inputData.specularAmount[gBufferOffset]);
        float surface_specularPower =
          __half2float(inputData.specularPower[gBufferOffset]);
        float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
        float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
        float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);

        float lit_x = 0.0f;
        float lit_y = 0.0f;
        float lit_z = 0.0f;
        for ( int32 tileLightIndex = 0; tileLightIndex < tileNumLights;
            ++tileLightIndex) {
          int32 lightIndex = tileLightIndices.get(tileLightIndex);

          // Gather light data relevant to initial culling
          float light_positionView_x =
            __ldg(&inputData.lightPositionView_x[lightIndex]);
          float light_positionView_y =
            __ldg(&inputData.lightPositionView_y[lightIndex]);
          float light_positionView_z =
            __ldg(&inputData.lightPositionView_z[lightIndex]);
          float light_attenuationEnd =
            __ldg(&inputData.lightAttenuationEnd[lightIndex]);

          // Compute light vector
          float L_x = light_positionView_x - surface_positionView_x;
          float L_y = light_positionView_y - surface_positionView_y;
          float L_z = light_positionView_z - surface_positionView_z;

          float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);

          // Clip at end of attenuation
          float light_attenuationEnd2 = light_attenuationEnd * light_attenuationEnd;

          if (distanceToLight2 < light_attenuationEnd2) {
            float distanceToLight = sqrt(distanceToLight2);

            // HLSL "rcp" is allowed to be fairly inaccurate
            float distanceToLightRcp = 1.0f/distanceToLight;
            L_x *= distanceToLightRcp;
            L_y *= distanceToLightRcp;
            L_z *= distanceToLightRcp;

            // Start computing brdf
            float NdotL = dot3(surface_normal_x, surface_normal_y,
                surface_normal_z, L_x, L_y, L_z);

            // Clip back facing
            if (NdotL > 0.0f) {
              float light_attenuationBegin =
                inputData.lightAttenuationBegin[lightIndex];

              // Light distance attenuation (linstep)
              float lightRange = (light_attenuationEnd - light_attenuationBegin);
              float falloffPosition = (light_attenuationEnd - distanceToLight);
              float attenuation = min(falloffPosition / lightRange, 1.0f);

              // Half vector between the light direction and the view direction
              float H_x = (L_x - Vneg_x);
              float H_y = (L_y - Vneg_y);
              float H_z = (L_z - Vneg_z);
              normalize3(H_x, H_y, H_z, H_x, H_y, H_z);

              float NdotH = dot3(surface_normal_x, surface_normal_y,
                  surface_normal_z, H_x, H_y, H_z);
              NdotH = max(NdotH, 0.0f);

              float specular = pow(NdotH, surface_specularPower);
              float specularNorm = (surface_specularPower + 2.0f) *
                (1.0f / 8.0f);
              float specularContrib = surface_specularAmount *
                specularNorm * specular;

              float k = attenuation * NdotL * (1.0f + specularContrib);

              float light_color_x = inputData.lightColor_x[lightIndex];
              float light_color_y = inputData.lightColor_y[lightIndex];
              float light_color_z = inputData.lightColor_z[lightIndex];

              float lightContrib_x = surface_albedo_x * light_color_x;
              float lightContrib_y = surface_albedo_y * light_color_y;
              float lightContrib_z = surface_albedo_z * light_color_z;

              lit_x += lightContrib_x * k;
              lit_y += lightContrib_y * k;
              lit_z += lightContrib_z * k;
            }
          }
        }

        // Gamma correct
        // These pows are pretty slow right now, but we can do
        // something faster if really necessary to squeeze every
        // last bit of performance out of it
        float gamma = 1.0 / 2.2f;
        lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
        lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
        lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);

        // Only lanes that map to a pixel of this tile may store.
        if (inTile) {
          framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
          framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
          framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
        }
      }
    }
  }
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Static decomposition
|
||||
|
||||
// Kernel: renders one MIN_TILE_WIDTH x MIN_TILE_HEIGHT tile per task.
//
// taskIndex selects the tile (row = taskIndex / num_groups_x,
// column = taskIndex % num_groups_x).  The tile's light list is built with
// IntersectLightsWithTile and the tile is then shaded with ShadeTile.
// Tasks with taskIndex >= taskCount exit immediately.
__global__ void
RenderTile( int num_groups_x, int num_groups_y,
    const InputHeader *inputHeaderPtr,
    const InputDataArrays *inputDataPtr,
    int visualizeLightCount,
    // Output
    unsigned int8 framebuffer_r[],
    unsigned int8 framebuffer_g[],
    unsigned int8 framebuffer_b[])
{
  if (taskIndex >= taskCount)
    return;

  // Work on local copies of the header and the array table.
  const InputHeader header = *inputHeaderPtr;
  const InputDataArrays arrays = *inputDataPtr;

  // Tile coordinates in units of tiles ...
  const int32 tileRow = taskIndex / num_groups_x;
  const int32 tileCol = taskIndex % num_groups_x;

  // ... and the tile's half-open pixel rectangle.
  const int32 x0 = tileCol * MIN_TILE_WIDTH;
  const int32 y0 = tileRow * MIN_TILE_HEIGHT;
  const int32 x1 = x0 + MIN_TILE_WIDTH;
  const int32 y1 = y0 + MIN_TILE_HEIGHT;

  const int fbWidth  = header.framebufferWidth;
  const int fbHeight = header.framebufferHeight;

  // Projection terms, named here with 0-based row/column indices.
  const float proj00 = header.cameraProj[0][0];
  const float proj11 = header.cameraProj[1][1];
  const float proj22 = header.cameraProj[2][2];
  const float proj32 = header.cameraProj[3][2];

  // Light intersection: figure out which lights illuminate this tile.
  Uniform<int,MAX_LIGHTS> tileLightIndices;  // Light list for the tile
  const int lightsInTile =
    IntersectLightsWithTile(x0, x1, y0, y1,
        fbWidth, fbHeight,
        arrays.zBuffer,
        proj00, proj11, proj22, proj32,
        header.cameraNear, header.cameraFar,
        MAX_LIGHTS,
        arrays.lightPositionView_x,
        arrays.lightPositionView_y,
        arrays.lightPositionView_z,
        arrays.lightAttenuationEnd,
        tileLightIndices);

  // And now shade the tile, using the lights in tileLightIndices
  ShadeTile(x0, x1, y0, y1,
      fbWidth, fbHeight, arrays,
      proj00, proj11, proj22, proj32,
      tileLightIndices, lightsInTile, visualizeLightCount,
      framebuffer_r, framebuffer_g, framebuffer_b);
}
|
||||
|
||||
|
||||
// Device-side entry point for the static tile decomposition.
//
// Splits the framebuffer into MIN_TILE_WIDTH x MIN_TILE_HEIGHT tiles
// (rounding the tile counts up) and launches RenderTile from the lane with
// programIndex == 0, then waits for the child grid to finish.
extern "C" __global__ void
RenderStatic___export( InputHeader inputHeaderPtr[],
    InputDataArrays inputDataPtr[],
    int visualizeLightCount,
    // Output
    unsigned int8 framebuffer_r[],
    unsigned int8 framebuffer_g[],
    unsigned int8 framebuffer_b[]) {

  // Only the header is needed here; the array table is forwarded by pointer
  // and read inside RenderTile, so no local copy of it is made.
  const InputHeader inputHeader = *inputHeaderPtr;

  const int num_groups_x = (inputHeader.framebufferWidth +
      MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
  const int num_groups_y = (inputHeader.framebufferHeight +
      MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
  const int num_groups = num_groups_x * num_groups_y;

  // Launch geometry: 4 tasks per 128-thread block.
  // NOTE(review): presumably one 32-thread warp per task -- confirm against
  // the definition of taskIndex.
  const int tasksPerBlock   = 4;
  const int threadsPerBlock = 128;
  const int numBlocks       = (num_groups + tasksPerBlock - 1) / tasksPerBlock;

  // Launch a task to render each tile, each of which is MIN_TILE_WIDTH
  // by MIN_TILE_HEIGHT pixels.
  if (programIndex == 0)
    RenderTile<<<numBlocks, threadsPerBlock>>>(num_groups_x, num_groups_y,
        inputHeaderPtr, inputDataPtr, visualizeLightCount,
        framebuffer_r, framebuffer_g, framebuffer_b);
  cudaDeviceSynchronize();
}
|
||||
// Host-side entry point: launches RenderStatic___export on a single block
// of 32 threads, forwarding all arguments unchanged, and blocks until the
// device has finished.
extern "C" __host__ void
RenderStatic( InputHeader inputHeaderPtr[],
    InputDataArrays inputDataPtr[],
    int visualizeLightCount,
    // Output
    unsigned int8 framebuffer_r[],
    unsigned int8 framebuffer_g[],
    unsigned int8 framebuffer_b[])
{
  RenderStatic___export<<<1,32>>>(
      inputHeaderPtr, inputDataPtr, visualizeLightCount,
      // Output
      framebuffer_r, framebuffer_g, framebuffer_b);

  cudaDeviceSynchronize();
}
|
||||
717
examples/portable/deferred/kernels.ispc
Normal file
717
examples/portable/deferred/kernels.ispc
Normal file
@@ -0,0 +1,717 @@
|
||||
/*
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "deferred.h"
|
||||
|
||||
// uniform_t selects the rate qualifier for the frustum-plane temporaries in
// IntersectLightsWithTileMinMax: `varying` when compiling with __NVPTX__
// defined, `uniform` otherwise.
#ifdef __NVPTX__
|
||||
#define uniform_t varying
|
||||
#else
|
||||
#define uniform_t uniform
|
||||
#endif
|
||||
|
||||
// Table of pointers to the input arrays used by the kernels in this file.
// The G-buffer channels (zBuffer, normalEncoded_*, specular*, albedo_*) are
// indexed per pixel as y * gBufferWidth + x; the light* arrays are indexed
// by light index.
struct InputDataArrays
|
||||
{
|
||||
// Depth-buffer value per pixel (unprojected to view space by the kernels)
float *zBuffer;
|
||||
unsigned int16 *normalEncoded_x; // half float
|
||||
unsigned int16 *normalEncoded_y; // half float
|
||||
unsigned int16 *specularAmount; // half float
|
||||
unsigned int16 *specularPower; // half float
|
||||
unsigned int8 *albedo_x; // unorm8
|
||||
unsigned int8 *albedo_y; // unorm8
|
||||
unsigned int8 *albedo_z; // unorm8
|
||||
// Per-light view-space position
float *lightPositionView_x;
|
||||
float *lightPositionView_y;
|
||||
float *lightPositionView_z;
|
||||
// Per-light attenuation start (lightRange = end - begin in ShadeTile)
float *lightAttenuationBegin;
|
||||
// Per-light colour
float *lightColor_x;
|
||||
float *lightColor_y;
|
||||
float *lightColor_z;
|
||||
// Per-light attenuation end; lights farther away than this are skipped
float *lightAttenuationEnd;
|
||||
};
|
||||
|
||||
// Per-frame header: camera parameters, framebuffer size and bookkeeping
// for the input data arrays.
struct InputHeader
|
||||
{
|
||||
// Camera projection matrix
float cameraProj[4][4];
|
||||
// Depth range; view-space Z outside [cameraNear, cameraFar) is ignored
// by ComputeZBounds
float cameraNear;
|
||||
float cameraFar;
|
||||
|
||||
// Framebuffer / G-buffer dimensions in pixels
int32 framebufferWidth;
|
||||
int32 framebufferHeight;
|
||||
int32 numLights;
|
||||
int32 inputDataChunkSize;
|
||||
// One entry per input array (idaNum entries)
int32 inputDataArrayOffsets[idaNum];
|
||||
};
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Common utility routines
|
||||
|
||||
// Dot product of the 3-vectors (x, y, z) and (a, b, c).
static inline float
dot3(float x, float y, float z, float a, float b, float c) {
    float xa = x * a;
    float yb = y * b;
    float zc = z * c;
    return (xa + yb + zc);
}
|
||||
|
||||
|
||||
// Scales (x, y, z) by the reciprocal square root of its squared length and
// writes the result to (ox, oy, oz).  The inputs are taken by value, so the
// outputs may refer to the same variables as the inputs.
static inline void
normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
    float lengthSq = x*x + y*y + z*z;
    float invLength = rsqrt(lengthSq);
    ox = x * invLength;
    oy = y * invLength;
    oz = z * invLength;
}
|
||||
|
||||
|
||||
// Converts an 8-bit unsigned-normalised value to float by scaling with 1/255.
static inline float
Unorm8ToFloat32(unsigned int8 u) {
    float asFloat = (float)u;
    return asFloat * (1.0f / 255.0f);
}
|
||||
|
||||
|
||||
// Converts a float to an 8-bit unsigned-normalised value: scales by 255 and
// converts with a plain cast (no rounding or clamping is applied here).
static inline unsigned int8
Float32ToUnorm8(float f) {
    float scaled = f * 255.0f;
    return (unsigned int8)scaled;
}
|
||||
|
||||
|
||||
// Computes the view-space depth range covered by one screen tile.
//
// Every pixel of the tile is unprojected from the depth buffer to view
// space; samples outside [cameraNear, cameraFar) are ignored.  Per-lane
// extrema are combined with reduce_min / reduce_max into minZ / maxZ.
#if 1
inline
#endif
static void
ComputeZBounds(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    // G-buffer data
    uniform float zBuffer[],
    uniform int32 gBufferWidth,
    // Camera data
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Output
    uniform float &minZ,
    uniform float &maxZ
    )
{
    // Start from an empty range: min at the far plane, max at the near plane.
    float nearestZ  = cameraFar;
    float farthestZ = cameraNear;

    for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
        uniform int32 rowStart = y * gBufferWidth;
        foreach (x = tileStartX ... tileEndX) {
            // Unproject depth buffer Z value into view space
            float depth = zBuffer[rowStart + x];
            float viewZ = cameraProj_43 / (depth - cameraProj_33);

            // Skip skybox/background or otherwise invalid pixels
            bool usable = (viewZ < cameraFar) && (viewZ >= cameraNear);
            if (usable) {
                nearestZ  = min(nearestZ, viewZ);
                farthestZ = max(farthestZ, viewZ);
            }
        }
    }

    minZ = reduce_min(nearestZ);
    maxZ = reduce_max(farthestZ);
}
|
||||
|
||||
// Builds the list of lights that may affect a tile.
//
// Four normalised side planes are derived from the tile rectangle and the
// projection terms; each light (position + attenuationEnd) is then tested
// against the [minZ, maxZ] depth slab and against the four planes.  Indices
// of surviving lights are packed into tileLightIndices.
//
// Returns the number of light indices written.
#if 1
inline
#endif
#ifndef __NVPTX__
export
#endif
uniform int32
IntersectLightsWithTileMinMax(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    // Tile data
    uniform float minZ,
    uniform float maxZ,
    // G-buffer data
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    // Light Data
    uniform int32 numLights,
    uniform float light_positionView_x_array[],
    uniform float light_positionView_y_array[],
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Output
    uniform int32 tileLightIndices[]
    )
{
    uniform float halfWidth  = 0.5f * (float)gBufferWidth;
    uniform float halfHeight = 0.5f * (float)gBufferHeight;

    // Plane i is described by (planes_xy[i], planes_z[i]); planes 0/1 are
    // evaluated against the light's x, planes 2/3 against its y.
    uniform_t float planes_xy[4] = {
        -(cameraProj_11 * halfWidth),
         (cameraProj_11 * halfWidth),
         (cameraProj_22 * halfHeight),
        -(cameraProj_22 * halfHeight) };
    uniform_t float planes_z[4] = {
         tileEndX - halfWidth,
        -tileStartX + halfWidth,
         tileEndY - halfHeight,
        -tileStartY + halfHeight };

    // Normalise each plane
    for (uniform int p = 0; p < 4; ++p) {
        uniform_t float invLen = rsqrt(planes_xy[p] * planes_xy[p] +
                                       planes_z[p] * planes_z[p]);
        planes_xy[p] *= invLen;
        planes_z[p]  *= invLen;
    }

    uniform int32 lightCount = 0;

    foreach (lightIndex = 0 ... numLights) {
        float lightZ = light_positionView_z_array[lightIndex];
        float negAttenuationEnd = -light_attenuationEnd_array[lightIndex];

        // Depth-slab test
        bool inFrustum = (lightZ - minZ >= negAttenuationEnd);
        inFrustum = inFrustum && (maxZ - lightZ >= negAttenuationEnd);

        // Greedy early-out: the plane tests are skipped only when no lane
        // has a candidate.  Nothing below needs masking, so this is
        // preferred over a per-lane cif/ccontinue.
        if (any(inFrustum)) {
            float lightX = light_positionView_x_array[lightIndex];
            float lightY = light_positionView_y_array[lightIndex];

            float d0 = lightZ * planes_z[0] + lightX * planes_xy[0];
            float d1 = lightZ * planes_z[1] + lightX * planes_xy[1];
            float d2 = lightZ * planes_z[2] + lightY * planes_xy[2];
            float d3 = lightZ * planes_z[3] + lightY * planes_xy[3];

            inFrustum = inFrustum
                     && (d0 >= negAttenuationEnd)
                     && (d1 >= negAttenuationEnd)
                     && (d2 >= negAttenuationEnd)
                     && (d3 >= negAttenuationEnd);

#if 0
            // Pack and store intersecting lights
            cif (inFrustum) {
                lightCount += packed_store_active(&tileLightIndices[lightCount],
                                                  lightIndex);
            }
#else
            const bool active = inFrustum && lightIndex < numLights;
            if (any(active))
                lightCount += packed_store_active(active, &tileLightIndices[lightCount], lightIndex);
#endif
        }
    }

    return lightCount;
}
|
||||
|
||||
|
||||
// Computes the tile's view-space depth bounds and then culls the lights
// against the tile (see ComputeZBounds / IntersectLightsWithTileMinMax).
//
//   numLights  number of entries to consider in the light arrays; values
//              above MAX_LIGHTS are clamped to MAX_LIGHTS
//
// Returns the number of light indices stored in tileLightIndices.
#if 1
inline
#endif
static uniform int32
IntersectLightsWithTile(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    // G-buffer data
    uniform float zBuffer[],
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Light Data
    uniform int32 numLights,
    uniform float light_positionView_x_array[],
    uniform float light_positionView_y_array[],
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Output
    uniform int32 tileLightIndices[]
    )
{
    uniform float minZ, maxZ;
    ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY,
        zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, cameraNear, cameraFar,
        minZ, maxZ);

    // Honour the caller-supplied light count (previously ignored in favour
    // of a hard-coded MAX_LIGHTS), but never test more than MAX_LIGHTS.
    uniform int32 lightsToTest = min(numLights, (uniform int32)MAX_LIGHTS);

    uniform int32 tileNumLights = IntersectLightsWithTileMinMax(
        tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
        gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
        lightsToTest, light_positionView_x_array, light_positionView_y_array,
        light_positionView_z_array, light_attenuationEnd_array,
        tileLightIndices);

    return tileNumLights;
}
|
||||
|
||||
|
||||
#if 1
|
||||
inline
|
||||
#endif
|
||||
#ifndef __NVPTX__
|
||||
export
|
||||
#endif
|
||||
void
|
||||
ShadeTile(
|
||||
uniform int32 tileStartX, uniform int32 tileEndX,
|
||||
uniform int32 tileStartY, uniform int32 tileEndY,
|
||||
uniform int32 gBufferWidth, uniform int32 gBufferHeight,
|
||||
uniform InputDataArrays &inputData,
|
||||
// Camera data
|
||||
uniform float cameraProj_11, uniform float cameraProj_22,
|
||||
uniform float cameraProj_33, uniform float cameraProj_43,
|
||||
// Light list
|
||||
uniform int32 tileLightIndices[],
|
||||
uniform int32 tileNumLights,
|
||||
// UI
|
||||
uniform bool visualizeLightCount,
|
||||
// Output
|
||||
uniform unsigned int8 framebuffer_r[],
|
||||
uniform unsigned int8 framebuffer_g[],
|
||||
uniform unsigned int8 framebuffer_b[]
|
||||
)
|
||||
{
|
||||
if (tileNumLights == 0 || visualizeLightCount) {
|
||||
uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
|
||||
for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
|
||||
foreach (x = tileStartX ... tileEndX) {
|
||||
int32 framebufferIndex = (y * gBufferWidth + x);
|
||||
framebuffer_r[framebufferIndex] = c;
|
||||
framebuffer_g[framebufferIndex] = c;
|
||||
framebuffer_b[framebufferIndex] = c;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uniform float twoOverGBufferWidth = 2.0f / gBufferWidth;
|
||||
uniform float twoOverGBufferHeight = 2.0f / gBufferHeight;
|
||||
|
||||
for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
|
||||
uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
|
||||
|
||||
foreach (x = tileStartX ... tileEndX) {
|
||||
int32 gBufferOffset = y * gBufferWidth + x;
|
||||
|
||||
// Reconstruct position and (negative) view vector from G-buffer
|
||||
float surface_positionView_x, surface_positionView_y, surface_positionView_z;
|
||||
float Vneg_x, Vneg_y, Vneg_z;
|
||||
|
||||
float z = inputData.zBuffer[gBufferOffset];
|
||||
|
||||
// Compute screen/clip-space position
|
||||
// NOTE: Mind DX11 viewport transform and pixel center!
|
||||
float positionScreen_x = (0.5f + (float)(x)) *
|
||||
twoOverGBufferWidth - 1.0f;
|
||||
|
||||
// Unproject depth buffer Z value into view space
|
||||
surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
|
||||
surface_positionView_x = positionScreen_x * surface_positionView_z /
|
||||
cameraProj_11;
|
||||
surface_positionView_y = positionScreen_y * surface_positionView_z /
|
||||
cameraProj_22;
|
||||
|
||||
// We actually end up with a vector pointing *at* the
|
||||
// surface (i.e. the negative view vector)
|
||||
normalize3(surface_positionView_x, surface_positionView_y,
|
||||
surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
|
||||
|
||||
// Reconstruct normal from G-buffer
|
||||
float surface_normal_x, surface_normal_y, surface_normal_z;
|
||||
float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]);
|
||||
float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]);
|
||||
|
||||
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
|
||||
float m = sqrt(4.0f * f - 1.0f);
|
||||
|
||||
surface_normal_x = m * (4.0f * normal_x - 2.0f);
|
||||
surface_normal_y = m * (4.0f * normal_y - 2.0f);
|
||||
surface_normal_z = 3.0f - 8.0f * f;
|
||||
|
||||
// Load other G-buffer parameters
|
||||
float surface_specularAmount =
|
||||
half_to_float(inputData.specularAmount[gBufferOffset]);
|
||||
float surface_specularPower =
|
||||
half_to_float(inputData.specularPower[gBufferOffset]);
|
||||
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
|
||||
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
|
||||
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
|
||||
|
||||
float lit_x = 0.0f;
|
||||
float lit_y = 0.0f;
|
||||
float lit_z = 0.0f;
|
||||
for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights;
|
||||
++tileLightIndex) {
|
||||
uniform int32 lightIndex = tileLightIndices[tileLightIndex];
|
||||
|
||||
// Gather light data relevant to initial culling
|
||||
uniform float light_positionView_x =
|
||||
inputData.lightPositionView_x[lightIndex];
|
||||
uniform float light_positionView_y =
|
||||
inputData.lightPositionView_y[lightIndex];
|
||||
uniform float light_positionView_z =
|
||||
inputData.lightPositionView_z[lightIndex];
|
||||
uniform float light_attenuationEnd =
|
||||
inputData.lightAttenuationEnd[lightIndex];
|
||||
|
||||
// Compute light vector
|
||||
float L_x = light_positionView_x - surface_positionView_x;
|
||||
float L_y = light_positionView_y - surface_positionView_y;
|
||||
float L_z = light_positionView_z - surface_positionView_z;
|
||||
|
||||
float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
|
||||
|
||||
// Clip at end of attenuation
|
||||
float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
|
||||
|
||||
cif (distanceToLight2 < light_attenutaionEnd2) {
|
||||
float distanceToLight = sqrt(distanceToLight2);
|
||||
|
||||
// HLSL "rcp" is allowed to be fairly inaccurate
|
||||
float distanceToLightRcp = rcp(distanceToLight);
|
||||
L_x *= distanceToLightRcp;
|
||||
L_y *= distanceToLightRcp;
|
||||
L_z *= distanceToLightRcp;
|
||||
|
||||
// Start computing brdf
|
||||
float NdotL = dot3(surface_normal_x, surface_normal_y,
|
||||
surface_normal_z, L_x, L_y, L_z);
|
||||
|
||||
// Clip back facing
|
||||
cif (NdotL > 0.0f) {
|
||||
uniform float light_attenuationBegin =
|
||||
inputData.lightAttenuationBegin[lightIndex];
|
||||
|
||||
// Light distance attenuation (linstep)
|
||||
float lightRange = (light_attenuationEnd - light_attenuationBegin);
|
||||
float falloffPosition = (light_attenuationEnd - distanceToLight);
|
||||
float attenuation = min(falloffPosition / lightRange, 1.0f);
|
||||
|
||||
float H_x = (L_x - Vneg_x);
|
||||
float H_y = (L_y - Vneg_y);
|
||||
float H_z = (L_z - Vneg_z);
|
||||
normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
|
||||
|
||||
float NdotH = dot3(surface_normal_x, surface_normal_y,
|
||||
surface_normal_z, H_x, H_y, H_z);
|
||||
NdotH = max(NdotH, 0.0f);
|
||||
|
||||
float specular = pow(NdotH, surface_specularPower);
|
||||
float specularNorm = (surface_specularPower + 2.0f) *
|
||||
(1.0f / 8.0f);
|
||||
float specularContrib = surface_specularAmount *
|
||||
specularNorm * specular;
|
||||
|
||||
float k = attenuation * NdotL * (1.0f + specularContrib);
|
||||
|
||||
uniform float light_color_x = inputData.lightColor_x[lightIndex];
|
||||
uniform float light_color_y = inputData.lightColor_y[lightIndex];
|
||||
uniform float light_color_z = inputData.lightColor_z[lightIndex];
|
||||
|
||||
float lightContrib_x = surface_albedo_x * light_color_x;
|
||||
float lightContrib_y = surface_albedo_y * light_color_y;
|
||||
float lightContrib_z = surface_albedo_z * light_color_z;
|
||||
|
||||
lit_x += lightContrib_x * k;
|
||||
lit_y += lightContrib_y * k;
|
||||
lit_z += lightContrib_z * k;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Gamma correct
|
||||
// These pows are pretty slow right now, but we can do
|
||||
// something faster if really necessary to squeeze every
|
||||
// last bit of performance out of it
|
||||
float gamma = 1.0 / 2.2f;
|
||||
lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
|
||||
lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
|
||||
lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
|
||||
|
||||
framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
|
||||
framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
|
||||
framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Static decomposition
|
||||
|
||||
// Task entry point of the static decomposition: each task instance handles
// one MIN_TILE_WIDTH x MIN_TILE_HEIGHT tile, selected by taskIndex.  It first
// builds the list of lights affecting the tile and then shades the tile with
// that list.
//
//   num_groups_x / num_groups_y  tile grid dimensions (only num_groups_x is
//                                read here, to turn taskIndex into a 2D tile)
//   inputHeaderPtr, inputDataPtr first element of each is copied locally
//   visualizeLightCount          forwarded untouched to ShadeTile
//   framebuffer_r/g/b            output planes, forwarded to ShadeTile
task void
RenderTile(uniform int num_groups_x, uniform int num_groups_y,
           uniform InputHeader inputHeaderPtr[],
           uniform InputDataArrays inputDataPtr[],
           uniform int visualizeLightCount,
           // Output
           uniform unsigned int8 framebuffer_r[],
           uniform unsigned int8 framebuffer_g[],
           uniform unsigned int8 framebuffer_b[]) {

    uniform InputHeader header = *inputHeaderPtr;
    uniform InputDataArrays arrays = *inputDataPtr;

    // taskIndex walks the tile grid row by row.
    uniform int32 tileRow = taskIndex / num_groups_x;
    uniform int32 tileCol = taskIndex % num_groups_x;

    // Pixel rectangle [x0, x1) x [y0, y1) covered by this tile.
    uniform int32 x0 = tileCol * MIN_TILE_WIDTH;
    uniform int32 y0 = tileRow * MIN_TILE_HEIGHT;
    uniform int32 x1 = x0 + MIN_TILE_WIDTH;
    uniform int32 y1 = y0 + MIN_TILE_HEIGHT;

    uniform int fbWidth = header.framebufferWidth;
    uniform int fbHeight = header.framebufferHeight;

    // Projection matrix entries needed by the culling and shading routines.
    uniform float proj00 = header.cameraProj[0][0];
    uniform float proj11 = header.cameraProj[1][1];
    uniform float proj22 = header.cameraProj[2][2];
    uniform float proj32 = header.cameraProj[3][2];

    // Storage for the per-tile light list.  The heap path is the one that is
    // compiled; MALLOC records that choice so the matching release below is
    // compiled as well.
#if 1
    uniform int * uniform tileLightIndices = uniform new uniform int [MAX_LIGHTS];
#define MALLOC
#else /* shared memory doesn't fully work... why? */
    uniform int tileLightIndices[MAX_LIGHTS];  // Light list for the tile
#endif

    // Step 1: collect the indices of the lights that touch this tile.
    uniform int lightsInTile =
        IntersectLightsWithTile(x0, x1,
                                y0, y1,
                                fbWidth, fbHeight,
                                arrays.zBuffer,
                                proj00, proj11,
                                proj22, proj32,
                                header.cameraNear, header.cameraFar,
                                MAX_LIGHTS,
                                arrays.lightPositionView_x,
                                arrays.lightPositionView_y,
                                arrays.lightPositionView_z,
                                arrays.lightAttenuationEnd,
                                tileLightIndices);

    // Step 2: shade every pixel of the tile using only those lights.
    ShadeTile(x0, x1, y0, y1,
              fbWidth, fbHeight, arrays,
              proj00, proj11, proj22, proj32,
              tileLightIndices, lightsInTile, visualizeLightCount,
              framebuffer_r, framebuffer_g, framebuffer_b);

#ifdef MALLOC
    delete tileLightIndices;
#endif
}
|
||||
|
||||
|
||||
// Exported entry point of the static decomposition.  Covers the framebuffer
// with a grid of MIN_TILE_WIDTH x MIN_TILE_HEIGHT tiles (rounding the grid up
// so partial tiles at the right/bottom edges are included) and launches one
// RenderTile task per tile.
//
//   inputHeaderPtr, inputDataPtr  forwarded to every task; only the header is
//                                 read here, to size the tile grid
//   visualizeLightCount           forwarded to every task
//   framebuffer_r/g/b             output planes, forwarded to every task
export void
RenderStatic(uniform InputHeader inputHeaderPtr[],
             uniform InputDataArrays inputDataPtr[],
             uniform int visualizeLightCount,
             // Output
             uniform unsigned int8 framebuffer_r[],
             uniform unsigned int8 framebuffer_g[],
             uniform unsigned int8 framebuffer_b[]) {

    // Only the header is needed to size the grid; the data arrays are passed
    // through by pointer, so no local copy of them is made.
    uniform InputHeader inputHeader = *inputHeaderPtr;

    uniform int num_groups_x = (inputHeader.framebufferWidth +
                                MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
    uniform int num_groups_y = (inputHeader.framebufferHeight +
                                MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
    uniform int num_groups = num_groups_x * num_groups_y;

    // Launch a task to render each tile, each of which is MIN_TILE_WIDTH
    // by MIN_TILE_HEIGHT pixels.  No explicit sync follows: per the ispc
    // User's Guide, a function that launched tasks waits for them before it
    // returns.
    launch[num_groups] RenderTile(num_groups_x, num_groups_y,
                                  inputHeaderPtr, inputDataPtr, visualizeLightCount,
                                  framebuffer_r, framebuffer_g, framebuffer_b);
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Routines for dynamic decomposition path
|
||||
|
||||
// This computes the z min/max range for a whole row worth of tiles.
|
||||
// Runs ComputeZBounds on every tile of one tile row and records the results:
// for each tileX in [0, numTilesX), minZArray[tileX] and maxZArray[tileX]
// receive the bounds reported for the tile at (tileX, tileY).
// numTilesY is accepted but not read by this routine.
export void
ComputeZBoundsRow(
    uniform int32 tileY,
    uniform int32 tileWidth, uniform int32 tileHeight,
    uniform int32 numTilesX, uniform int32 numTilesY,
    // G-buffer data
    uniform float zBuffer[],
    uniform int32 gBufferWidth,
    // Camera data
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Output
    uniform float minZArray[],
    uniform float maxZArray[]
    )
{
    // The vertical pixel range is the same for every tile in the row.
    uniform int32 rowTop = tileY * tileHeight;
    uniform int32 rowBottom = rowTop + tileHeight;

    for (uniform int32 column = 0; column < numTilesX; ++column) {
        uniform int32 colLeft = column * tileWidth;
        uniform int32 colRight = colLeft + tileWidth;

        uniform float tileMinZ, tileMaxZ;
        ComputeZBounds(
            colLeft, colRight,
            rowTop, rowBottom,
            zBuffer, gBufferWidth,
            cameraProj_33, cameraProj_43, cameraNear, cameraFar,
            tileMinZ, tileMaxZ);

        minZArray[column] = tileMinZ;
        maxZArray[column] = tileMaxZ;
    }
}
|
||||
|
||||
|
||||
// Reclassifies the lights with respect to four sub-tiles when we refine a tile.
|
||||
// numLights need not be a multiple of programCount here, but the input and output arrays
|
||||
// should be able to handle programCount-sized load/stores.
|
||||
// Distributes the lights of a tile over its four subtiles (order 00, 10, 01,
// 11).  For each entry of lightIndices a light is kept for a subtile when it
// passes that subtile's z-range test and is not rejected by the two planes
// built from tileMidX / tileMidY.  Surviving light indices are packed into
// subtileIndices, subtile k starting at k * subtileIndicesPitch, and the
// number written for subtile k is returned in subtileNumLights[k].
export void
SplitTileMinMax(
    uniform int32 tileMidX, uniform int32 tileMidY,
    // Subtile data (00, 10, 01, 11)
    uniform float subtileMinZ[],
    uniform float subtileMaxZ[],
    // G-buffer data
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    // Light Data
    uniform int32 lightIndices[],
    uniform int32 numLights,
    uniform float light_positionView_x_array[],
    uniform float light_positionView_y_array[],
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Outputs
    uniform int32 subtileIndices[],
    uniform int32 subtileIndicesPitch,
    uniform int32 subtileNumLights[]
    )
{
    uniform float halfWidth = 0.5f * (float)gBufferWidth;
    uniform float halfHeight = 0.5f * (float)gBufferHeight;

    // Plane coefficients: entry 0 is derived from tileMidX and the x scale,
    // entry 1 from tileMidY and the y scale.
    uniform_t float frustumPlanes_xy[2] = { -(cameraProj_11 * halfWidth),
                                             (cameraProj_22 * halfHeight) };
    uniform_t float frustumPlanes_z[2] = { tileMidX - halfWidth,
                                           tileMidY - halfHeight };

    // Scale both planes to unit length so the dot products below are distances.
    uniform_t float invLength[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] +
                                           frustumPlanes_z[0] * frustumPlanes_z[0]),
                                     rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] +
                                           frustumPlanes_z[1] * frustumPlanes_z[1]) };
    frustumPlanes_xy[0] *= invLength[0];
    frustumPlanes_xy[1] *= invLength[1];
    frustumPlanes_z[0] *= invLength[0];
    frustumPlanes_z[1] *= invLength[1];

    // Write cursor of each subtile inside subtileIndices.
    uniform int32 writePos[4];
    writePos[0] = 0 * subtileIndicesPitch;
    writePos[1] = 1 * subtileIndicesPitch;
    writePos[2] = 2 * subtileIndicesPitch;
    writePos[3] = 3 * subtileIndicesPitch;

    foreach (i = 0 ... numLights) {
        int32 light = lightIndices[i];

        float lightX = light_positionView_x_array[light];
        float lightY = light_positionView_y_array[light];
        float lightZ = light_positionView_z_array[light];
        float reach = light_attenuationEnd_array[light];
        float negReach = -reach;

        // Test the light against each subtile's z bounds.
        bool hits[4];
        hits[0] = (lightZ - subtileMinZ[0] >= negReach) &&
                  (subtileMaxZ[0] - lightZ >= negReach);
        hits[1] = (lightZ - subtileMinZ[1] >= negReach) &&
                  (subtileMaxZ[1] - lightZ >= negReach);
        hits[2] = (lightZ - subtileMinZ[2] >= negReach) &&
                  (subtileMaxZ[2] - lightZ >= negReach);
        hits[3] = (lightZ - subtileMinZ[3] >= negReach) &&
                  (subtileMaxZ[3] - lightZ >= negReach);

        // Signed distances of the light centre to the two planes.
        float distX = lightZ * frustumPlanes_z[0] +
                      lightX * frustumPlanes_xy[0];
        float distY = lightZ * frustumPlanes_z[1] +
                      lightY * frustumPlanes_xy[1];

        // Farther from a plane than its reach: the light lies on one side
        // only, so it is dropped from the two subtiles on the other side.
        cif (abs(distX) > reach) {
            bool positiveX = distX > 0.0f;
            bool negativeX = !positiveX;
            hits[0] = hits[0] && positiveX;   // 00 subtile
            hits[1] = hits[1] && negativeX;   // 10 subtile
            hits[2] = hits[2] && positiveX;   // 01 subtile
            hits[3] = hits[3] && negativeX;   // 11 subtile
        }
        cif (abs(distY) > reach) {
            bool positiveY = distY > 0.0f;
            bool negativeY = !positiveY;
            hits[0] = hits[0] && positiveY;   // 00 subtile
            hits[1] = hits[1] && positiveY;   // 10 subtile
            hits[2] = hits[2] && negativeY;   // 01 subtile
            hits[3] = hits[3] && negativeY;   // 11 subtile
        }

        // Pack and store intersecting lights
        // TODO: Experiment with a loop here instead
        cif (hits[0]) {
            writePos[0] +=
                packed_store_active(&subtileIndices[writePos[0]], light);
        }
        cif (hits[1]) {
            writePos[1] +=
                packed_store_active(&subtileIndices[writePos[1]], light);
        }
        cif (hits[2]) {
            writePos[2] +=
                packed_store_active(&subtileIndices[writePos[2]], light);
        }
        cif (hits[3]) {
            writePos[3] +=
                packed_store_active(&subtileIndices[writePos[3]], light);
        }
    }

    // Cursor minus start offset = number of lights stored for that subtile.
    subtileNumLights[0] = writePos[0] - 0 * subtileIndicesPitch;
    subtileNumLights[1] = writePos[1] - 1 * subtileIndicesPitch;
    subtileNumLights[2] = writePos[2] - 2 * subtileIndicesPitch;
    subtileNumLights[3] = writePos[3] - 3 * subtileIndicesPitch;
}
|
||||
107
examples/portable/deferred/main.cpp
Normal file
107
examples/portable/deferred/main.cpp
Normal file
@@ -0,0 +1,107 @@
|
||||
/*
|
||||
Copyright (c) 2011-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define ISPC_IS_WINDOWS
|
||||
#define NOMINMAX
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <cfloat>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <sys/types.h>
|
||||
#include <stdint.h>
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#include "deferred.h"
|
||||
#include "kernels_ispc.h"
|
||||
#include "timing.h"
|
||||
#include "ispc_malloc.h"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
if (argc < 2) {
|
||||
printf("usage: deferred_shading <input_file (e.g. data/pp1280x720.bin)> [tasks iterations] [serial iterations]\n");
|
||||
return 1;
|
||||
}
|
||||
static unsigned int test_iterations[] = {5, 3, 500}; //last value is for nframes, it is scale.
|
||||
if (argc == 5) {
|
||||
for (int i = 0; i < 3; i++) {
|
||||
test_iterations[i] = atoi(argv[2 + i]);
|
||||
}
|
||||
}
|
||||
|
||||
InputData *input = CreateInputDataFromFile(argv[1]);
|
||||
if (!input) {
|
||||
printf("Failed to load input file \"%s\"!\n", argv[1]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
Framebuffer framebuffer(input->header.framebufferWidth,
|
||||
input->header.framebufferHeight);
|
||||
|
||||
int nframes = test_iterations[2];
|
||||
double ispcCycles = 1e30;
|
||||
for (int i = 0; i < test_iterations[0]; ++i) {
|
||||
framebuffer.clear();
|
||||
reset_and_start_timer();
|
||||
for (int j = 0; j < nframes; ++j)
|
||||
ispc::RenderStatic(&input->header, &input->arrays,
|
||||
VISUALIZE_LIGHT_COUNT,
|
||||
framebuffer.r, framebuffer.g, framebuffer.b);
|
||||
double msec = get_elapsed_msec() / nframes;
|
||||
printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec [%.3f fps]\n", msec, 1.0e3/msec);
|
||||
ispcCycles = std::min(ispcCycles, msec);
|
||||
}
|
||||
printf("[ispc static + tasks]:\t\t[%.3f] msec to render "
|
||||
"%d x %d image\n", ispcCycles,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight);
|
||||
WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
|
||||
|
||||
DeleteInputData(input);
|
||||
|
||||
return 0;
|
||||
}
|
||||
12
examples/portable/mergeSort/Makefile_cpu
Normal file
12
examples/portable/mergeSort/Makefile_cpu
Normal file
@@ -0,0 +1,12 @@
|
||||
|
||||
EXAMPLE=mergeSort
|
||||
CPP_SRC=mergeSort.cpp
|
||||
ISPC_SRC=mergeSort.ispc
|
||||
ISPC_IA_TARGETS=avx1-i32x8
|
||||
ISPC_ARM_TARGETS=neon
|
||||
#ISPC_FLAGS=-DDEBUG -g
|
||||
CXXFLAGS=-g
|
||||
CCFLAGS=-g
|
||||
#NVCC_FLAGS=-Xptxas=-O0
|
||||
|
||||
include ../common_cpu.mk
|
||||
7
examples/portable/mergeSort/Makefile_knc
Normal file
7
examples/portable/mergeSort/Makefile_knc
Normal file
@@ -0,0 +1,7 @@
|
||||
EXAMPLE=mergeSort
|
||||
CXX_SRC=mergeSort.cpp
|
||||
ISPC_SRC=mergeSort.ispc
|
||||
ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h
|
||||
ISPC_TARGET=generic-16
|
||||
|
||||
include ../common_knc.mk
|
||||
15
examples/portable/mergeSort/Makefile_ptx
Normal file
15
examples/portable/mergeSort/Makefile_ptx
Normal file
@@ -0,0 +1,15 @@
|
||||
# Build settings for the PTX variant of the mergeSort example; the actual
# rules come from ../common_ptx.mk, which consumes the variables set here.
PROG=mergeSort
ISPC_SRC=mergeSort.ispc
CU_SRC=mergeSort.cu
# Host-side driver; it only needs to be listed once.
CXX_SRC=mergeSort.cpp
PTXCC_REGMAX=64
#PTXCC_FLAGS= -Xptxas=-O3
#NVCC_FLAGS=-Xptxas=-O0

LLVM_GPU=1
NVVM_GPU=1

include ../common_ptx.mk
|
||||
|
||||
|
||||
|
||||
3
examples/portable/mergeSort/keyType.h
Normal file
3
examples/portable/mergeSort/keyType.h
Normal file
@@ -0,0 +1,3 @@
|
||||
#pragma once
|
||||
typedef float Key_t;
|
||||
typedef int Val_t;
|
||||
171
examples/portable/mergeSort/mergeSort.cpp
Normal file
171
examples/portable/mergeSort/mergeSort.cpp
Normal file
@@ -0,0 +1,171 @@
|
||||
/*
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include <iomanip>
|
||||
#include "timing.h"
|
||||
#include "ispc_malloc.h"
|
||||
#include "mergeSort_ispc.h"
|
||||
|
||||
// Draws a one-line text progress bar on std::cout.
//
//   x      zero-based index of the step just completed, 0 <= x < n
//   n      total number of steps, n >= 1
//   width  number of cells between the brackets, must exceed 10
//
// The bar is redrawn in place (terminated by '\r') until the final step
// (x == n-1), which terminates the line with '\n'.  The percentage is
// overlaid near the middle of the bar.
static void progressBar(const int x, const int n, const int width = 50)
{
  assert(n > 0);
  assert(x >= 0 && x < n);
  assert(width > 10);

  // With a single step there is no (n-1) interval to divide by; that one
  // step is, by definition, 100% of the work.
  const float f = n > 1 ? static_cast<float>(x)/(n-1) : 1.0f;
  const int w = static_cast<int>(f * width);

  // print bar
  std::string bstr("[");
  for (int i = 0; i < width; i++)
    bstr += i < w ? '=' : ' ';
  bstr += "]";

  // print percentage (bounded write: at most 7 characters plus terminator)
  char pstr0[32];
  snprintf(pstr0, sizeof(pstr0), " %2d %c ", static_cast<int>(f*100.0),'%');
  const std::string pstr(pstr0);
  std::copy(pstr.begin(), pstr.end(), bstr.begin() + (width/2-2));

  std::cout << bstr;
  std::cout << (x == n-1 ? "\n" : "\r") << std::flush;
}
|
||||
|
||||
#include "keyType.h"
|
||||
// Key/value record used by main() to build and shuffle the input as pairs
// before it is split into separate key and value arrays.
struct Key
|
||||
{
|
||||
Key_t key; // sort key (type comes from keyType.h)
|
||||
Val_t val; // value that travels with the key (type comes from keyType.h)
|
||||
};
|
||||
|
||||
|
||||
int main (int argc, char *argv[])
|
||||
{
|
||||
int i, j, n = argc == 1 ? 1024*1024: atoi(argv[1]), m = n < 100 ? 1 : 50, l = n < 100 ? n : RAND_MAX;
|
||||
double tISPC1 = 0.0, tISPC2 = 0.0, tSerial = 0.0;
|
||||
|
||||
Key *keys = new Key[n];
|
||||
srand48(rtc()*65536);
|
||||
#pragma omp parallel for
|
||||
for (int i = 0; i < n; i++)
|
||||
{
|
||||
keys[i].key = i; //((int)(drand48() * (1<<30)));
|
||||
keys[i].val = i;
|
||||
}
|
||||
std::random_shuffle(keys, keys + n);
|
||||
|
||||
Key_t *keysSrc = new Key_t[n];
|
||||
Val_t *valsSrc = new Val_t[n];
|
||||
Key_t *keysBuf = new Key_t[n];
|
||||
Val_t *valsBuf = new Val_t[n];
|
||||
Key_t *keysDst = new Key_t[n];
|
||||
Val_t *valsDst = new Val_t[n];
|
||||
Key_t *keysGld = new Key_t[n];
|
||||
Val_t *valsGld = new Val_t[n];
|
||||
#pragma omp parallel for
|
||||
for (int i = 0; i < n; i++)
|
||||
{
|
||||
keysSrc[i] = keys[i].key;
|
||||
valsSrc[i] = keys[i].val;
|
||||
|
||||
keysGld[i] = keysSrc[i];
|
||||
valsGld[i] = valsSrc[i];
|
||||
}
|
||||
delete keys;
|
||||
|
||||
ispcSetMallocHeapLimit(1024*1024*1024);
|
||||
|
||||
ispc::openMergeSort();
|
||||
|
||||
tISPC2 = 1e30;
|
||||
for (i = 0; i < m; i ++)
|
||||
{
|
||||
ispcMemcpy(keysSrc, keysGld, n*sizeof(Key_t));
|
||||
ispcMemcpy(valsSrc, valsGld, n*sizeof(Val_t));
|
||||
|
||||
reset_and_start_timer();
|
||||
ispc::mergeSort(keysDst, valsDst, keysBuf, valsBuf, keysSrc, valsSrc, n);
|
||||
tISPC2 = std::min(tISPC2, get_elapsed_msec());
|
||||
|
||||
if (argc != 3)
|
||||
progressBar (i, m);
|
||||
}
|
||||
|
||||
ispc::closeMergeSort();
|
||||
|
||||
printf("[sort ispc + tasks]:\t[%.3f] msec [%.3f Mpair/s]\n", tISPC2, 1.0e-3*n/tISPC2);
|
||||
|
||||
#if 0
|
||||
printf("\n---\n");
|
||||
for (int i = 0; i < 128; i++)
|
||||
{
|
||||
if ((i%32) == 0) printf("\n");
|
||||
printf("%d ", (int)keysSrc[i]);
|
||||
}
|
||||
printf("\n---\n");
|
||||
for (int i = 0; i < 128; i++)
|
||||
{
|
||||
if ((i%32) == 0) printf("\n");
|
||||
printf("%d ", (int)keysBuf[i]);
|
||||
}
|
||||
printf("\n---\n");
|
||||
for (int i = 0; i < 128; i++)
|
||||
{
|
||||
if ((i%32) == 0) printf("\n");
|
||||
printf("%d ", (int)keysDst[i]);
|
||||
}
|
||||
printf("\n---\n");
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
std::sort(keysGld, keysGld + n);
|
||||
for (int i = 0; i < n; i++)
|
||||
assert(keysDst[i] == keysGld[i]);
|
||||
|
||||
delete keysSrc;
|
||||
delete valsSrc;
|
||||
delete keysDst;
|
||||
delete valsDst;
|
||||
delete keysBuf;
|
||||
delete valsBuf;
|
||||
delete keysGld;
|
||||
delete valsGld;
|
||||
|
||||
return 0;
|
||||
}
|
||||
694
examples/portable/mergeSort/mergeSort.cu
Normal file
694
examples/portable/mergeSort/mergeSort.cu
Normal file
@@ -0,0 +1,694 @@
|
||||
/*
|
||||
Copyright (c) 2014, Evghenii Gaburov
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
/*
|
||||
Based on mergeSort from CUDA SDK
|
||||
*/
|
||||
|
||||
#include "keyType.h"
|
||||
#include "cuda_helpers.cuh"
|
||||
#include <cassert>
|
||||
|
||||
#define uniform
|
||||
|
||||
#define SAMPLE_STRIDE programCount
|
||||
|
||||
#define iDivUp(a,b) (((a) + (b) - 1)/(b))
|
||||
#define getSampleCount(dividend) (iDivUp((dividend), (SAMPLE_STRIDE)))
|
||||
|
||||
#define W (/*sizeof(int)=*/4 * 8)
|
||||
|
||||
// Rounds x up to a power of two; a value that already is a power of two is
// returned unchanged.  NOTE(review): x is expected to be >= 1 - for x == 0
// the shift amount below would equal the full word width.
__device__ static inline
int nextPowerOfTwo(int x)
{
#if 0
  /* Disabled alternative: smear the highest set bit of (x-1) downwards. */
  x -= 1;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  return x + 1;
#else
  /* W minus the leading-zero count of (x-1) is the number of bits needed to
     represent x-1, i.e. the exponent of the result. */
  const int leadingZeros = __clz(x - 1);
  return 1U << (W - leadingZeros);
#endif
}
|
||||
|
||||
|
||||
// Stride-halving search over the integer array data[0..L).
// Starting at position 0, each step tentatively advances by `stride` (clamped
// to L) and keeps the advance when the element just before the new position
// is <= val; the stride is then halved.  Returns the final position, or 0 for
// an empty range.
// Presumably data is sorted ascending and stride is a power of two covering
// L, in which case the result is the number of elements <= val - confirm
// against callers.
__device__ static inline
int binarySearchInclusiveRanks(
    const int val,
    uniform int *data,
    const int L,
    int stride)
{
  int rank = 0;
  if (L != 0)
  {
    while (stride > 0)
    {
      const int probe = min(rank + stride, L);
      rank = (data[probe - 1] <= val) ? probe : rank;
      stride >>= 1;
    }
  }
  return rank;
}
|
||||
|
||||
// Stride-halving search over the integer array data[0..L), strict variant.
// Starting at position 0, each step tentatively advances by `stride` (clamped
// to L) and keeps the advance when the element just before the new position
// is < val; the stride is then halved.  Returns the final position, or 0 for
// an empty range.
// Presumably data is sorted ascending and stride is a power of two covering
// L, in which case the result is the number of elements < val - confirm
// against callers.
__device__ static inline
int binarySearchExclusiveRanks(
    const int val,
    uniform int *data,
    const int L,
    int stride)
{
  int rank = 0;
  if (L != 0)
  {
    while (stride > 0)
    {
      const int probe = min(rank + stride, L);
      rank = (data[probe - 1] < val) ? probe : rank;
      stride >>= 1;
    }
  }
  return rank;
}
|
||||
|
||||
// Stride-halving search over the key array data[0..L).
// Starting at position 0, each step tentatively advances by `stride` (clamped
// to L) and keeps the advance when the key just before the new position is
// <= val; the stride is then halved.  Returns the final position, or 0 for an
// empty range.
// Presumably data is sorted ascending and stride is a power of two covering
// L, in which case the result is the number of keys <= val - confirm against
// callers.
__device__ static inline
int binarySearchInclusive(
    const Key_t val,
    uniform Key_t *data,
    const int L,
    int stride)
{
  int rank = 0;
  if (L != 0)
  {
    while (stride > 0)
    {
      const int probe = min(rank + stride, L);
      rank = (data[probe - 1] <= val) ? probe : rank;
      stride >>= 1;
    }
  }
  return rank;
}
|
||||
|
||||
// Stride-halving search over the key array data[0..L), strict variant.
// Starting at position 0, each step tentatively advances by `stride` (clamped
// to L) and keeps the advance when the key just before the new position is
// < val; the stride is then halved.  Returns the final position, or 0 for an
// empty range.
// Presumably data is sorted ascending and stride is a power of two covering
// L, in which case the result is the number of keys < val - confirm against
// callers.
__device__ static inline
int binarySearchExclusive(
    const Key_t val,
    uniform Key_t *data,
    const int L,
    int stride)
{
  int rank = 0;
  if (L != 0)
  {
    while (stride > 0)
    {
      const int probe = min(rank + stride, L);
      rank = (data[probe - 1] < val) ? probe : rank;
      stride >>= 1;
    }
  }
  return rank;
}
|
||||
|
||||
// Same stride-halving search as binarySearchInclusive, except that the
// searched sequence is not read from memory: `data` is passed by value and
// element k is obtained with shuffle(data, k).
// NOTE(review): shuffle() is not defined in this file - presumably a
// cross-lane read provided by cuda_helpers.cuh; confirm.  It is called exactly
// once per iteration regardless of the comparison outcome.
// Returns the final position, or 0 when L == 0.
__device__ static inline
int binarySearchInclusive1(
    const Key_t val,
    Key_t data,
    const uniform int L,
    uniform int stride)
{
  int rank = 0;
  if (L != 0)
  {
    while (stride > 0)
    {
      const int probe = min(rank + stride, L);
      rank = (shuffle(data,probe - 1) <= val) ? probe : rank;
      stride >>= 1;
    }
  }
  return rank;
}
|
||||
|
||||
// Same stride-halving search as binarySearchExclusive, except that the
// searched sequence is not read from memory: `data` is passed by value and
// element k is obtained with shuffle(data, k).
// NOTE(review): shuffle() is not defined in this file - presumably a
// cross-lane read provided by cuda_helpers.cuh; confirm.  It is called exactly
// once per iteration regardless of the comparison outcome.
// Returns the final position, or 0 when L == 0.
__device__ static inline
int binarySearchExclusive1(
    const Key_t val,
    Key_t data,
    const uniform int L,
    uniform int stride)
{
  int rank = 0;
  if (L != 0)
  {
    while (stride > 0)
    {
      const int probe = min(rank + stride, L);
      rank = (shuffle(data,probe - 1) < val) ? probe : rank;
      stride >>= 1;
    }
  }
  return rank;
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Bottom-level merge sort (binary search-based)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Bottom-level sort kernel.  The input is treated as batchSize chunks of
// 2*programCount key/value pairs; every chunk is staged in shared memory,
// sorted there by repeated pairwise merging, and written to dstKey/dstVal at
// the same offset it was read from in srcKey/srcVal.
//
// Chunks are divided evenly (rounding up) among the taskCount tasks; this
// task processes the contiguous chunk range [firstChunk, endChunk).
__global__
void mergeSortGangKernel(
    uniform int batchSize,
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[])
{
  const uniform int task = taskIndex;
  const uniform int chunksPerTask = (batchSize + taskCount - 1)/taskCount;
  const uniform int firstChunk = task * chunksPerTask;
  const uniform int endChunk = min(firstChunk + chunksPerTask, batchSize);

  // Shared scratch sized for four slices of 2*programCount entries; the slice
  // is selected by warpIdx.
  // NOTE(review): this assumes warpIdx < 4 - confirm against the launch
  // configuration.
  __shared__ Key_t s_key_tmp[2*programCount*4];
  __shared__ Val_t s_val_tmp[2*programCount*4];
  Key_t *s_key = s_key_tmp + warpIdx*(2*programCount);
  Val_t *s_val = s_val_tmp + warpIdx*(2*programCount);

  for (uniform int chunk = firstChunk; chunk < endChunk; chunk++)
  {
    const uniform int offset = chunk * (programCount*2);

    // Stage the chunk: every lane copies two pairs, programCount apart.
    s_key[programIndex] = srcKey[offset + programIndex];
    s_val[programIndex] = srcVal[offset + programIndex];
    s_key[programIndex + programCount] = srcKey[offset + programIndex + programCount];
    s_val[programIndex + programCount] = srcVal[offset + programIndex + programCount];

    // Merge passes: each pass merges adjacent runs of length `stride` into
    // runs of length 2*stride.
    for (uniform int stride = 1; stride < 2*programCount; stride <<= 1)
    {
      // Position of this lane inside its run, and start of the run pair.
      const int lane = programIndex & (stride - 1);
      uniform Key_t *pairKey = s_key + 2 * (programIndex - lane);
      uniform Val_t *pairVal = s_val + 2 * (programIndex - lane);

      // One element from the first run (A) and one from the second run (B).
      Key_t keyA = pairKey[lane + 0];
      Val_t valA = pairVal[lane + 0];
      Key_t keyB = pairKey[lane + stride];
      Val_t valB = pairVal[lane + stride];

      // Destination = own position + position found in the other run.  A
      // searches B strictly and B searches A inclusively, so equal keys
      // receive distinct slots.
      int dstA = binarySearchExclusive(keyA, pairKey + stride, stride, stride) + lane;
      int dstB = binarySearchInclusive(keyB, pairKey + 0, stride, stride) + lane;

      // NOTE(review): there is no barrier between the reads above and these
      // scattered writes; this appears to rely on all lanes sharing a slice
      // executing in lockstep - confirm for the target architecture.
      pairKey[dstA] = keyA;
      pairVal[dstA] = valA;
      pairKey[dstB] = keyB;
      pairVal[dstB] = valB;
    }

    // Write the sorted chunk back out, again two pairs per lane.
    dstKey[offset + programIndex] = s_key[programIndex];
    dstVal[offset + programIndex] = s_val[programIndex];
    dstKey[offset + programIndex + programCount] = s_key[programIndex + programCount];
    dstVal[offset + programIndex + programCount] = s_val[programIndex + programCount];
  }
}
|
||||
|
||||
// Launches mergeSortGangKernel with one task per tile (batchSize tasks) and
// waits for it to complete.
__device__ static inline
void mergeSortGang(
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int batchSize)
{
  const uniform int taskTotal = batchSize;

  launch (taskTotal,1,1,mergeSortGangKernel)(batchSize, dstKey, dstVal, srcKey, srcVal);
  sync;
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Merge step 1: generate sample ranks
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Merge step 1.  Each program instance handles one sample slot `pos`.  For the
// pair of adjacent runs (A, B) the slot belongs to, it records the offset of
// sample i inside its own run and the rank of that sample in the other run,
// for both the A-sample and the B-sample with index i.
// Instances with pos >= totalProgramCount do nothing.
__global__
void generateSampleRanksKernel(
    uniform int nBlocks,
    uniform int in_ranksA[],
    uniform int in_ranksB[],
    uniform Key_t in_srcKey[],
    uniform int stride,
    uniform int N,
    uniform int totalProgramCount)
{
  // Split the nBlocks gang-sized blocks evenly (rounding up) among the tasks.
  const uniform int taskId = taskIndex;
  const uniform int blocksPerTask = (nBlocks + taskCount - 1)/taskCount;
  const uniform int firstBlock = taskId * blocksPerTask;
  const uniform int endBlock = min(firstBlock + blocksPerTask, nBlocks);

  // Sample slots per run of `stride` elements.
  const uniform int slotsPerRun = stride / SAMPLE_STRIDE;

  for (uniform int b = firstBlock; b < endBlock; b++)
  {
    const int pos = b * programCount + programIndex;
    cif (pos >= totalProgramCount)
      return;

    // Sample index within the run, and first element of the (A, B) pair.
    const int i = pos & (slotsPerRun - 1);
    const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);

    uniform Key_t * segKey = in_srcKey + segmentBase;
    uniform int * segRanksA = in_ranksA + segmentBase / SAMPLE_STRIDE;
    uniform int * segRanksB = in_ranksB + segmentBase / SAMPLE_STRIDE;

    // A is always a full run; B is cut short when the pair reaches the end
    // of the N-element array.
    const int elementsA = stride;
    const int elementsB = min(stride, N - segmentBase - stride);
    const int samplesA = getSampleCount(elementsA);
    const int samplesB = getSampleCount(elementsB);

    if (i < samplesA)
    {
      // A-sample: trivial rank in A, searched (exclusive) rank in B.
      segRanksA[i] = i * SAMPLE_STRIDE;
      segRanksB[i] = binarySearchExclusive(
          segKey[i * SAMPLE_STRIDE], segKey + stride,
          elementsB, nextPowerOfTwo(elementsB));
    }

    if (i < samplesB)
    {
      // B-sample: trivial rank in B, searched (inclusive) rank in A.
      segRanksB[slotsPerRun + i] = i * SAMPLE_STRIDE;
      segRanksA[slotsPerRun + i] = binarySearchInclusive(
          segKey[stride + i * SAMPLE_STRIDE], segKey,
          elementsA, nextPowerOfTwo(elementsA));
    }
  }
}
|
||||
|
||||
// Host-side driver of merge step 1: computes how many sample slots have to be
// processed for the given run length, launches generateSampleRanksKernel with
// one task per gang-sized block of slots, and waits for completion.
__device__ static inline
void generateSampleRanks(
    uniform int ranksA[],
    uniform int ranksB[],
    uniform Key_t srcKey[],
    uniform int stride,
    uniform int N)
{
  // Elements left over after the last complete pair of runs.
  const uniform int tail = N % (2 * stride);

  // A tail longer than one run still forms a (short) pair to be merged, so
  // the element count is rounded up to a whole pair; otherwise the tail is
  // left out.
  uniform int covered = N - tail;
  if (tail > stride)
    covered += 2 * stride;

  const uniform int threadCount = covered / (2 * SAMPLE_STRIDE);
  const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
  const uniform int taskTotal = nBlocks;

  launch (taskTotal,1,1, generateSampleRanksKernel)(nBlocks, ranksA, ranksB, srcKey, stride, N, threadCount);
  sync;
}
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Merge step 2: generate sample ranks and indices
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Merge step 2.  For every pair of adjacent runs, merges the two lists of
// sample ranks stored back to back in in_Ranks (A-samples first, then
// B-samples) into one ordered list written to in_Limits.
// Instances with pos >= totalProgramCount do nothing.
__global__
void mergeRanksAndIndicesKernel(
    uniform int nBlocks,
    uniform int in_Limits[],
    uniform int in_Ranks[],
    uniform int stride,
    uniform int N,
    uniform int totalProgramCount)
{
  // Split the nBlocks gang-sized blocks evenly (rounding up) among the tasks.
  const uniform int taskId = taskIndex;
  const uniform int blocksPerTask = (nBlocks + taskCount - 1)/taskCount;
  const uniform int firstBlock = taskId * blocksPerTask;
  const uniform int endBlock = min(firstBlock + blocksPerTask, nBlocks);

  // Sample slots per run of `stride` elements.
  const uniform int slotsPerRun = stride / SAMPLE_STRIDE;

  for (uniform int b = firstBlock; b < endBlock; b++)
  {
    int pos = b * programCount + programIndex;
    cif (pos >= totalProgramCount)
      return;

    // Sample index within the run, and the slices belonging to this pair.
    const int i = pos & (slotsPerRun - 1);
    const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
    uniform int * segRanks = in_Ranks + (pos - i) * 2;
    uniform int * segLimits = in_Limits + (pos - i) * 2;

    // A is always a full run; B is cut short at the end of the array.
    const int elementsA = stride;
    const int elementsB = min(stride, N - segmentBase - stride);
    const int samplesA = getSampleCount(elementsA);
    const int samplesB = getSampleCount(elementsB);

    if (i < samplesA)
    {
      // Slot of A-rank i = i + number of B-ranks strictly below it.
      const int rankA = segRanks[i];
      int dstPos = binarySearchExclusiveRanks(rankA, segRanks + samplesA, samplesB, nextPowerOfTwo(samplesB)) + i;
      segLimits[dstPos] = rankA;
    }

    if (i < samplesB)
    {
      // Slot of B-rank i = i + number of A-ranks less than or equal to it.
      const int rankB = segRanks[samplesA + i];
      int dstPos = binarySearchInclusiveRanks(rankB, segRanks, samplesA, nextPowerOfTwo(samplesA)) + i;
      segLimits[dstPos] = rankB;
    }
  }
}
|
||||
// Driver of merge step 2: runs mergeRanksAndIndicesKernel once for the A
// ranks (ranksA -> limitsA) and once for the B ranks (ranksB -> limitsB),
// then waits for both launches.
__device__ static inline
void mergeRanksAndIndices(
    uniform int limitsA[],
    uniform int limitsB[],
    uniform int ranksA[],
    uniform int ranksB[],
    uniform int stride,
    uniform int N)
{
  // Elements left over after the last complete pair of runs.
  const uniform int tail = N % (2 * stride);

  // A tail longer than one run still forms a (short) pair, so round up to a
  // whole pair; otherwise the tail is left out.
  uniform int covered = N - tail;
  if (tail > stride)
    covered += 2 * stride;

  const uniform int threadCount = covered / (2 * SAMPLE_STRIDE);
  const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
  const uniform int taskTotal = nBlocks;

  launch (taskTotal,1,1,mergeRanksAndIndicesKernel)(
      nBlocks, limitsA, ranksA, stride, N, threadCount);
  launch (taskTotal,1,1, mergeRanksAndIndicesKernel)(
      nBlocks, limitsB, ranksB, stride, N, threadCount);
  sync;
}
|
||||
|
||||
|
||||
// Merge step 3.  Every "merge pair" (index blk) is an elementary interval: a
// slice of run A and a slice of run B delimited by consecutive entries of
// limitsA / limitsB.  The gang loads both slices into registers (one element
// per program instance), ranks every element against the other slice with
// the shuffle-based binary searches, and scatters keys and values to their
// merged positions in dstKey/dstVal.
__global__
void mergeElementaryIntervalsKernel(
    uniform int mergePairs,
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int limitsA[],
    uniform int limitsB[],
    uniform int stride,
    uniform int N)
{
  // Split the merge pairs evenly (rounding up) among the launched tasks.
  const uniform int taskId = taskIndex;
  const uniform int pairsPerTask = (mergePairs + taskCount - 1)/taskCount;
  const uniform int firstPair = taskId * pairsPerTask;
  const uniform int endPair = min(firstPair + pairsPerTask, mergePairs);

  for (uniform int blk = firstPair; blk < endPair; blk++)
  {
    // Index of the interval inside its pair of runs, and the first element
    // of that pair.
    const uniform int intervalI = blk & ((2 * stride) / SAMPLE_STRIDE - 1);
    const uniform int segmentBase = (blk - intervalI) * SAMPLE_STRIDE;

    // Gang-wide parameters.  A is always a full run; B is cut short when the
    // pair reaches the end of the N-element array.
    const uniform int segmentElementsA = stride;
    const uniform int segmentElementsB = min(stride, N - segmentBase - stride);
    const uniform int segmentSamplesA = getSampleCount(segmentElementsA);
    const uniform int segmentSamplesB = getSampleCount(segmentElementsB);
    const uniform int segmentSamples = segmentSamplesA + segmentSamplesB;

    // The interval ends at the next limit, or at the end of the run when
    // this is the last interval of the pair.
    const uniform bool hasNext = intervalI + 1 < segmentSamples;
    const uniform int startSrcA = limitsA[blk];
    const uniform int startSrcB = limitsB[blk];
    const uniform int endSrcA = hasNext ? limitsA[blk + 1] : segmentElementsA;
    const uniform int endSrcB = hasNext ? limitsB[blk + 1] : segmentElementsB;
    const uniform int lenSrcA = endSrcA - startSrcA;
    const uniform int lenSrcB = endSrcB - startSrcB;
    const uniform int startDstA = startSrcA + startSrcB;
    const uniform int startDstB = startDstA + lenSrcA;

    // Load the two slices, one element per program instance.
    const bool ownsA = programIndex < lenSrcA;
    const bool ownsB = programIndex < lenSrcB;

    Key_t keyA, keyB;
    Val_t valA, valB;
    if (ownsA)
    {
      keyA = srcKey[segmentBase + startSrcA + programIndex];
      valA = srcVal[segmentBase + startSrcA + programIndex];
    }

    if (ownsB)
    {
      keyB = srcKey[segmentBase + stride + startSrcB + programIndex];
      valB = srcVal[segmentBase + stride + startSrcB + programIndex];
    }

    // Compute destination addresses for the merged data.  dstPosA/dstPosB
    // start at 0 because they are adjusted unconditionally below even when a
    // slice is empty and its search is skipped.
    int dstPosA = 0, dstPosB = 0;
    int dstA = -1, dstB = -1;

    // The searches are entered by the whole gang as soon as any instance
    // owns an element.
    if (any(programIndex < lenSrcA))
      dstPosA = binarySearchExclusive1(keyA, keyB, lenSrcB, SAMPLE_STRIDE) + programIndex;
    if (any(programIndex < lenSrcB))
      dstPosB = binarySearchInclusive1(keyB, keyA, lenSrcA, SAMPLE_STRIDE) + programIndex;

    if (ownsA && dstPosA < lenSrcA)
      dstA = segmentBase + startDstA + dstPosA;
    dstPosA -= lenSrcA;
    if (ownsA && dstPosA < lenSrcB)
      dstA = segmentBase + startDstB + dstPosA;

    if (ownsB && dstPosB < lenSrcA)
      dstB = segmentBase + startDstA + dstPosB;
    dstPosB -= lenSrcA;
    if (ownsB && dstPosB < lenSrcB)
      dstB = segmentBase + startDstB + dstPosB;

    // Store the merged data; -1 marks "nothing to write".
    if (dstA >= 0)
    {
      dstKey[dstA] = keyA;
      dstVal[dstA] = valA;
    }
    if (dstB >= 0)
    {
      dstKey[dstB] = keyB;
      dstVal[dstB] = valB;
    }
  }
}
|
||||
|
||||
|
||||
// Driver of merge step 3: launches mergeElementaryIntervalsKernel over all
// elementary intervals ("merge pairs") of the current stage and waits for it.
//
// The incoming value of nTasks is ignored; the task count is derived from the
// number of merge pairs.  It is rounded UP: a truncating division yields zero
// tasks whenever mergePairs < programCount (small N), in which case nothing
// would be merged.  The kernel distributes merge pairs over however many
// tasks it is given.
__device__ static inline
void mergeElementaryIntervals(
    uniform int nTasks,
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int limitsA[],
    uniform int limitsB[],
    uniform int stride,
    uniform int N)
{
  // Elements left over after the last complete pair of runs.  A tail longer
  // than one run is merged too; a shorter tail is excluded here.
  const uniform int lastSegmentElements = N % (2 * stride);
  const uniform int mergePairs = (lastSegmentElements > stride) ?
      getSampleCount(N) :
      (N - lastSegmentElements) / SAMPLE_STRIDE;

  nTasks = iDivUp(mergePairs, programCount);

  launch (nTasks,1,1, mergeElementaryIntervalsKernel)(
      mergePairs,
      dstKey,
      dstVal,
      srcKey,
      srcVal,
      limitsA,
      limitsB,
      stride,
      N);
  sync;
}
|
||||
|
||||
// Device-side state set up by openMergeSort___export and used by
// mergeSort___export.

// Capacity, in ints, of each of the four arrays below (0 until opened).
__device__ static uniform int MAX_SAMPLE_COUNT = 0;

// Task count handed to mergeElementaryIntervals.
__device__ static uniform int nTasks;

// Single allocation backing the four arrays; NULL while the sorter is closed.
__device__ static uniform int * uniform memPool = NULL;

// Consecutive MAX_SAMPLE_COUNT-sized slices of memPool.
__device__ static uniform int * uniform ranksA;
__device__ static uniform int * uniform ranksB;
__device__ static uniform int * uniform limitsA;
__device__ static uniform int * uniform limitsB;
|
||||
|
||||
// Initialises the device-side sorter state: fixes the task count and sample
// capacity, then allocates one pool of 4*MAX_SAMPLE_COUNT ints and carves it
// into ranksA, ranksB, limitsA and limitsB (in that order).
// Asserts that the sorter is not already open.
__global__
void openMergeSort___export()
{
  nTasks = 13*32*13;
  MAX_SAMPLE_COUNT = 8*32 * 131072 / programCount;

  assert(memPool == NULL);

  const uniform int poolInts = MAX_SAMPLE_COUNT * 4;
  memPool = uniform new uniform int[poolInts];

  // Four back-to-back slices of MAX_SAMPLE_COUNT ints each.
  ranksA  = memPool;
  ranksB  = memPool + MAX_SAMPLE_COUNT;
  limitsA = memPool + 2 * MAX_SAMPLE_COUNT;
  limitsB = memPool + 3 * MAX_SAMPLE_COUNT;
}
|
||||
// C entry point: runs openMergeSort___export on the device with a single
// thread and then executes `sync`.
extern "C"
void openMergeSort()
{
  openMergeSort___export<<<1, 1>>>();
  sync;
}
|
||||
|
||||
// Releases the pool allocated by openMergeSort___export and marks the sorter
// as closed.  Asserts that the sorter is currently open.
__global__
void closeMergeSort___export()
{
  assert(memPool != NULL);

  // The pool was obtained with an array new-expression, so it has to be
  // released with the array form of delete.
  delete[] memPool;
  memPool = NULL;
}
|
||||
// C entry point: runs closeMergeSort___export on the device with a single
// thread and then executes `sync`.
extern "C"
void closeMergeSort()
{
  closeMergeSort___export<<<1, 1>>>();
  sync;
}
|
||||
|
||||
// Top-level key/value merge sort of N elements.
//
// The data is first sorted in tiles of 2*programCount elements, then runs are
// merged pairwise with the run length doubling each stage.  Stages ping-pong
// between the dst and buf arrays; the starting side is chosen from the parity
// of the stage count so that the final result ends up in dstKey/dstVal.
// Asserts that N fits the sample buffers and is a multiple of 2*programCount.
__global__
void mergeSort___export(
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t bufKey[],
    uniform Val_t bufVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int N)
{
  // Number of merge stages needed to grow runs from one tile up to N.
  uniform int stageCount = 0;
  {
    uniform int span = 2*programCount;
    while (span < N)
    {
      span <<= 1;
      stageCount++;
    }
  }

  // Odd number of stages: start in buf so that the last stage writes to dst.
  const uniform bool startInBuf = (stageCount & 1) != 0;
  uniform Key_t * uniform iKey = startInBuf ? bufKey : dstKey;
  uniform Val_t * uniform iVal = startInBuf ? bufVal : dstVal;
  uniform Key_t * uniform oKey = startInBuf ? dstKey : bufKey;
  uniform Val_t * uniform oVal = startInBuf ? dstVal : bufVal;

  assert(N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT);
  assert(N % (programCount*2) == 0);

  // Author's throughput notes (k20m): whole sort 140 M/s, tile sort 2367 M/s.
  mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount));

  for (uniform int stride = 2*programCount; stride < N; stride <<= 1)
  {
    // Elements left over after the last complete pair of runs.
    const uniform int lastSegmentElements = N % (2 * stride);

    // Author's throughput notes (k20m): stage 271 M/s; ranks + limits
    // 944 M/s (sample ranks 1396 M/s, rank merge 2379 M/s); interval merge
    // 371 M/s.

    // Find sample ranks and prepare for the merge of the limiters.
    generateSampleRanks(ranksA, ranksB, iKey, stride, N);

    // Merge ranks and indices.
    mergeRanksAndIndices(limitsA, limitsB, ranksA, ranksB, stride, N);

    // Merge elementary intervals.
    mergeElementaryIntervals(nTasks, oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N);

    // A tail no longer than one run has no partner to merge with: carry it
    // over unchanged so the output array is complete.
    if (lastSegmentElements <= stride)
    {
      const uniform int tailBase = N - lastSegmentElements;
      for (int i = programIndex; i < lastSegmentElements; i += programCount)
      {
        oKey[tailBase + i] = iKey[tailBase + i];
        oVal[tailBase + i] = iVal[tailBase + i];
      }
    }

    // The output of this stage is the input of the next one.
    uniform Key_t * uniform swapKey = iKey;
    iKey = oKey;
    oKey = swapKey;

    uniform Val_t * uniform swapVal = iVal;
    iVal = oVal;
    oVal = swapVal;
  }
}
|
||||
// C entry point: runs mergeSort___export on the device with one block of 32
// threads, forwarding all arguments unchanged, and then executes `sync`.
extern "C"
void mergeSort(
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t bufKey[],
    uniform Val_t bufVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int N)
{
  mergeSort___export<<<1, 32>>>(
      dstKey, dstVal,
      bufKey, bufVal,
      srcKey, srcVal,
      N);
  sync;
}
|
||||
658
examples/portable/mergeSort/mergeSort.ispc
Normal file
658
examples/portable/mergeSort/mergeSort.ispc
Normal file
@@ -0,0 +1,658 @@
|
||||
/*
|
||||
Copyright (c) 2014, Evghenii Gaburov
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
/*
|
||||
Based on mergeSort from CUDA SDK
|
||||
*/
|
||||
|
||||
#include "keyType.h"
|
||||
|
||||
// Distance, in elements, between two consecutive samples of a sorted run;
// tied to the gang width.
#define SAMPLE_STRIDE programCount

// Integer division of a by b, rounded up (for non-negative a and positive b).
#define iDivUp(a,b) (((a) + (b) - 1)/(b))

// Number of samples needed to cover `dividend` elements.
#define getSampleCount(dividend) (iDivUp((dividend), (SAMPLE_STRIDE)))

// Width of an int in bits.
#define W (/*sizeof(int)=*/4 * 8)
|
||||
|
||||
// Rounds x up to a power of two (x itself when it already is one), computed
// from the number of significant bits of x - 1.
// NOTE(review): intended for x >= 1; for x <= 0 the shift amount reaches W.
static inline
int nextPowerOfTwo(int x)
{
  // Number of bits needed to represent x - 1.
  const int significantBits = W - count_leading_zeros(x - 1);

  return 1U << significantBits;
}
|
||||
|
||||
// Per-instance binary search in an int array: returns how many of the first
// L entries of `data` compare <= val (ascending order assumed).
// `stride` is the initial step and is halved every iteration; callers in this
// file pass nextPowerOfTwo(L).
static inline
int binarySearchInclusiveRanks(
    const int val,
    uniform int *data,
    const int L,
    int stride)
{
  int rank = 0;

  cif (L != 0)
  {
    cwhile (stride > 0)
    {
      // Candidate rank, clamped so the probe stays within the first L entries.
      const int probe = min(rank + stride, L);
      const int probed = data[probe - 1];

      cif (probed <= val)
        rank = probe;

      stride >>= 1;
    }
  }

  return rank;
}
|
||||
|
||||
// Per-instance binary search in an int array: returns how many of the first
// L entries of `data` compare < val (ascending order assumed).
// `stride` is the initial step and is halved every iteration; callers in this
// file pass nextPowerOfTwo(L).
static inline
int binarySearchExclusiveRanks(
    const int val,
    uniform int *data,
    const int L,
    int stride)
{
  int rank = 0;

  cif (L != 0)
  {
    cwhile (stride > 0)
    {
      // Candidate rank, clamped so the probe stays within the first L entries.
      const int probe = min(rank + stride, L);
      const int probed = data[probe - 1];

      if (probed < val)
        rank = probe;

      stride >>= 1;
    }
  }

  return rank;
}
|
||||
|
||||
// Per-instance binary search in a key array: returns how many of the first L
// keys of `data` compare <= val (ascending order assumed).
// `stride` is the initial step and is halved every iteration.
// NOTE(review): callers pass a power of two >= L -- presumably required.
static inline
int binarySearchInclusive(
    const Key_t val,
    uniform Key_t *data,
    const int L,
    int stride)
{
  int rank = 0;

  cif (L != 0)
  {
    cwhile (stride > 0)
    {
      // Candidate rank, clamped so the probe stays within the first L keys.
      const int probe = min(rank + stride, L);
      const Key_t probed = data[probe - 1];

      if (probed <= val)
        rank = probe;

      stride >>= 1;
    }
  }

  return rank;
}
|
||||
|
||||
// Per-instance binary search in a key array: returns how many of the first L
// keys of `data` compare < val (ascending order assumed).
// `stride` is the initial step and is halved every iteration.
// NOTE(review): callers pass a power of two >= L -- presumably required.
static inline
int binarySearchExclusive(
    const Key_t val,
    uniform Key_t *data,
    const int L,
    int stride)
{
  int rank = 0;

  cif (L != 0)
  {
    cwhile (stride > 0)
    {
      // Candidate rank, clamped so the probe stays within the first L keys.
      const int probe = min(rank + stride, L);
      const Key_t probed = data[probe - 1];

      if (probed < val)
        rank = probe;

      stride >>= 1;
    }
  }

  return rank;
}
|
||||
|
||||
// Binary search over keys held in registers across the gang: element k of the
// searched sequence is program instance k's value of `data`, fetched with
// shuffle().  Returns how many of the first L elements compare <= val
// (ascending order assumed).
// `stride` is the initial step and is halved every iteration; the caller in
// this file passes SAMPLE_STRIDE.
static inline
int binarySearchInclusive1(
    const Key_t val,
    Key_t data,
    const uniform int L,
    uniform int stride)
{
  int rank = 0;

  if (L != 0)
  {
    while (stride > 0)
    {
      // Candidate rank, clamped so the probe never goes beyond element L-1.
      const int probe = min(rank + stride, L);

      if (shuffle(data, probe - 1) <= val)
        rank = probe;

      stride >>= 1;
    }
  }

  return rank;
}
|
||||
|
||||
// Binary search over keys held in registers across the gang: element k of the
// searched sequence is program instance k's value of `data`, fetched with
// shuffle().  Returns how many of the first L elements compare < val
// (ascending order assumed).
// `stride` is the initial step and is halved every iteration; the caller in
// this file passes SAMPLE_STRIDE.
static inline
int binarySearchExclusive1(
    const Key_t val,
    Key_t data,
    const uniform int L,
    uniform int stride)
{
  int rank = 0;

  if (L != 0)
  {
    while (stride > 0)
    {
      // Candidate rank, clamped so the probe never goes beyond element L-1.
      const int probe = min(rank + stride, L);

      if (shuffle(data, probe - 1) < val)
        rank = probe;

      stride >>= 1;
    }
  }

  return rank;
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Bottom-level merge sort (binary search-based)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Bottom-level sort.  The input is treated as batchSize independent tiles of
// 2*programCount key/value pairs; each task takes a contiguous range of tiles,
// stages one tile at a time in a local buffer, sorts it with successive
// pairwise merges (run length 1, 2, 4, ... while < arrayLength) and writes
// the result to dstKey/dstVal.  The caller in this file passes
// arrayLength = 2*programCount.
task
void mergeSortGangKernel(
    uniform int batchSize,
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int arrayLength)
{
  // Split the tiles evenly (rounding up) among the launched tasks.
  const uniform int tilesPerTask = (batchSize + taskCount - 1)/taskCount;
  const uniform int firstTile = taskIndex * tilesPerTask;
  const uniform int endTile = min(firstTile + tilesPerTask, batchSize);

  // Staging buffers for one tile.
  uniform Key_t s_key[2*programCount];
  uniform Val_t s_val[2*programCount];

  for (uniform int tile = firstTile; tile < endTile; tile++)
  {
    const uniform int base = tile * (programCount*2);

    // Each program instance stages two elements: one from each half of the tile.
    s_key[programIndex] = srcKey[base + programIndex];
    s_val[programIndex] = srcVal[base + programIndex];
    s_key[programIndex + programCount] = srcKey[base + programIndex + programCount];
    s_val[programIndex + programCount] = srcVal[base + programIndex + programCount];

    for (uniform int stride = 1; stride < arrayLength; stride <<= 1)
    {
      // Position of this instance inside its run, and the start of the pair
      // of adjacent runs (A followed by B) it helps to merge.
      const int lPos = programIndex & (stride - 1);
      const int offset = 2 * (programIndex - lPos);
      uniform Key_t *baseKey = s_key + offset;
      uniform Val_t *baseVal = s_val + offset;

      // Read both elements before anything is written back: the merge is done
      // in place in the staging buffers.
      Key_t keyA = baseKey[lPos];
      Val_t valA = baseVal[lPos];
      Key_t keyB = baseKey[lPos + stride];
      Val_t valB = baseVal[lPos + stride];

      // Final position = own index in its run + rank in the opposite run.
      // Exclusive for A and inclusive for B, so equal keys get distinct slots.
      int posA = binarySearchExclusive(keyA, baseKey + stride, stride, stride) + lPos;
      int posB = binarySearchInclusive(keyB, baseKey, stride, stride) + lPos;

      baseKey[posA] = keyA;
      baseVal[posA] = valA;
      baseKey[posB] = keyB;
      baseVal[posB] = valB;
    }

    dstKey[base + programIndex] = s_key[programIndex];
    dstVal[base + programIndex] = s_val[programIndex];
    dstKey[base + programIndex + programCount] = s_key[programIndex + programCount];
    dstVal[base + programIndex + programCount] = s_val[programIndex + programCount];
  }
}
|
||||
|
||||
// Launches mergeSortGangKernel over batchSize tiles of 2*programCount
// elements and waits for it.  The task count is four per core, except on
// NVPTX where one task per tile is used.
static inline
void mergeSortGang(
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int batchSize)
{
#ifdef __NVPTX__
  const uniform int taskTotal = iDivUp(batchSize,1);
#else
  const uniform int taskTotal = num_cores()*4;
#endif

  launch [taskTotal] mergeSortGangKernel(batchSize, dstKey, dstVal, srcKey, srcVal, 2*programCount);
  sync;
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Merge step 1: generate sample ranks
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Merge step 1.  Each program instance handles one sample slot `pos`.  For the
// pair of adjacent runs (A, B) the slot belongs to, it records the offset of
// sample i inside its own run and the rank of that sample in the other run,
// for both the A-sample and the B-sample with index i.
// Instances with pos >= totalProgramCount do nothing.
task
void generateSampleRanksKernel(
    uniform int nBlocks,
    uniform int in_ranksA[],
    uniform int in_ranksB[],
    uniform Key_t in_srcKey[],
    uniform int stride,
    uniform int N,
    uniform int totalProgramCount)
{
  // Split the nBlocks gang-sized blocks evenly (rounding up) among the tasks.
  const uniform int blocksPerTask = (nBlocks + taskCount - 1)/taskCount;
  const uniform int firstBlock = taskIndex * blocksPerTask;
  const uniform int endBlock = min(firstBlock + blocksPerTask, nBlocks);

  // Sample slots per run of `stride` elements.
  const uniform int slotsPerRun = stride / SAMPLE_STRIDE;

  for (uniform int b = firstBlock; b < endBlock; b++)
  {
    const int pos = b * programCount + programIndex;
    cif (pos >= totalProgramCount)
      return;

    // Sample index within the run, and first element of the (A, B) pair.
    const int i = pos & (slotsPerRun - 1);
    const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);

    uniform Key_t * segKey = in_srcKey + segmentBase;
    uniform int * segRanksA = in_ranksA + segmentBase / SAMPLE_STRIDE;
    uniform int * segRanksB = in_ranksB + segmentBase / SAMPLE_STRIDE;

    // A is always a full run; B is cut short when the pair reaches the end
    // of the N-element array.
    const int elementsA = stride;
    const int elementsB = min(stride, N - segmentBase - stride);
    const int samplesA = getSampleCount(elementsA);
    const int samplesB = getSampleCount(elementsB);

    if (i < samplesA)
    {
      // A-sample: trivial rank in A, searched (exclusive) rank in B.
      segRanksA[i] = i * SAMPLE_STRIDE;
      segRanksB[i] = binarySearchExclusive(
          segKey[i * SAMPLE_STRIDE], segKey + stride,
          elementsB, nextPowerOfTwo(elementsB));
    }

    if (i < samplesB)
    {
      // B-sample: trivial rank in B, searched (inclusive) rank in A.
      segRanksB[slotsPerRun + i] = i * SAMPLE_STRIDE;
      segRanksA[slotsPerRun + i] = binarySearchInclusive(
          segKey[stride + i * SAMPLE_STRIDE], segKey,
          elementsA, nextPowerOfTwo(elementsA));
    }
  }
}
|
||||
|
||||
// Driver of merge step 1: computes how many sample slots have to be processed
// for the given run length, launches generateSampleRanksKernel and waits for
// it.  The task count is four per core, except on NVPTX where one task per
// gang-sized block is used.
static inline
void generateSampleRanks(
    uniform int ranksA[],
    uniform int ranksB[],
    uniform Key_t srcKey[],
    uniform int stride,
    uniform int N)
{
  // Elements left over after the last complete pair of runs.
  const uniform int tail = N % (2 * stride);

  // A tail longer than one run still forms a (short) pair to be merged, so
  // the element count is rounded up to a whole pair; otherwise the tail is
  // left out.
  uniform int covered = N - tail;
  if (tail > stride)
    covered += 2 * stride;

  const uniform int threadCount = covered / (2 * SAMPLE_STRIDE);
  const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);

#ifdef __NVPTX__
  const uniform int taskTotal = iDivUp(nBlocks,1);
#else
  const uniform int taskTotal = num_cores()*4;
#endif

  launch [taskTotal] generateSampleRanksKernel(nBlocks, ranksA, ranksB, srcKey, stride, N, threadCount);
  sync;
}
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Merge step 2: generate sample ranks and indices
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Merge step 2.  For every pair of adjacent runs, merges the two lists of
// sample ranks stored back to back in in_Ranks (A-samples first, then
// B-samples) into one ordered list written to in_Limits.
// Instances with pos >= totalProgramCount do nothing.
task
void mergeRanksAndIndicesKernel(
    uniform int nBlocks,
    uniform int in_Limits[],
    uniform int in_Ranks[],
    uniform int stride,
    uniform int N,
    uniform int totalProgramCount)
{
  // Split the nBlocks gang-sized blocks evenly (rounding up) among the tasks.
  const uniform int blocksPerTask = (nBlocks + taskCount - 1)/taskCount;
  const uniform int firstBlock = taskIndex * blocksPerTask;
  const uniform int endBlock = min(firstBlock + blocksPerTask, nBlocks);

  // Sample slots per run of `stride` elements.
  const uniform int slotsPerRun = stride / SAMPLE_STRIDE;

  for (uniform int b = firstBlock; b < endBlock; b++)
  {
    int pos = b * programCount + programIndex;
    cif (pos >= totalProgramCount)
      return;

    // Sample index within the run, and the slices belonging to this pair.
    const int i = pos & (slotsPerRun - 1);
    const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
    uniform int * segRanks = in_Ranks + (pos - i) * 2;
    uniform int * segLimits = in_Limits + (pos - i) * 2;

    // A is always a full run; B is cut short at the end of the array.
    const int elementsA = stride;
    const int elementsB = min(stride, N - segmentBase - stride);
    const int samplesA = getSampleCount(elementsA);
    const int samplesB = getSampleCount(elementsB);

    if (i < samplesA)
    {
      // Slot of A-rank i = i + number of B-ranks strictly below it.
      const int rankA = segRanks[i];
      int dstPos = binarySearchExclusiveRanks(rankA, segRanks + samplesA, samplesB, nextPowerOfTwo(samplesB)) + i;
      segLimits[dstPos] = rankA;
    }

    if (i < samplesB)
    {
      // Slot of B-rank i = i + number of A-ranks less than or equal to it.
      const int rankB = segRanks[samplesA + i];
      int dstPos = binarySearchInclusiveRanks(rankB, segRanks, samplesA, nextPowerOfTwo(samplesA)) + i;
      segLimits[dstPos] = rankB;
    }
  }
}
|
||||
// Driver of merge step 2: runs mergeRanksAndIndicesKernel once for the A
// ranks (ranksA -> limitsA) and once for the B ranks (ranksB -> limitsB),
// then waits for both launches.  The task count is four per core, except on
// NVPTX where one task per gang-sized block is used.
static inline
void mergeRanksAndIndices(
    uniform int limitsA[],
    uniform int limitsB[],
    uniform int ranksA[],
    uniform int ranksB[],
    uniform int stride,
    uniform int N)
{
  // Elements left over after the last complete pair of runs.
  const uniform int tail = N % (2 * stride);

  // A tail longer than one run still forms a (short) pair, so round up to a
  // whole pair; otherwise the tail is left out.
  uniform int covered = N - tail;
  if (tail > stride)
    covered += 2 * stride;

  const uniform int threadCount = covered / (2 * SAMPLE_STRIDE);
  const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);

#ifdef __NVPTX__
  const uniform int taskTotal = iDivUp(nBlocks,1);
#else
  const uniform int taskTotal = num_cores()*4;
#endif

  launch [taskTotal] mergeRanksAndIndicesKernel(
      nBlocks, limitsA, ranksA, stride, N, threadCount);
  launch [taskTotal] mergeRanksAndIndicesKernel(
      nBlocks, limitsB, ranksB, stride, N, threadCount);
  sync;
}
|
||||
|
||||
|
||||
// Merge step 3.  Every "merge pair" (index block) is an elementary interval:
// a slice of run A and a slice of run B delimited by consecutive entries of
// limitsA / limitsB.  The gang loads both slices into registers (one element
// per program instance), ranks every element against the other slice with
// the shuffle-based binary searches, and scatters keys and values to their
// merged positions in dstKey/dstVal.
task
void mergeElementaryIntervalsKernel(
    uniform int mergePairs,
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int limitsA[],
    uniform int limitsB[],
    uniform int stride,
    uniform int N)
{
  // Split the merge pairs evenly (rounding up) among the launched tasks.
  const uniform int pairsPerTask = (mergePairs + taskCount - 1)/taskCount;
  const uniform int firstPair = taskIndex * pairsPerTask;
  const uniform int endPair = min(firstPair + pairsPerTask, mergePairs);

  for (uniform int block = firstPair; block < endPair; block++)
  {
    // Index of the interval inside its pair of runs, and the first element
    // of that pair.
    const uniform int intervalI = block & ((2 * stride) / SAMPLE_STRIDE - 1);
    const uniform int segmentBase = (block - intervalI) * SAMPLE_STRIDE;

    // Gang-wide parameters.  A is always a full run; B is cut short when the
    // pair reaches the end of the N-element array.
    const uniform int segmentElementsA = stride;
    const uniform int segmentElementsB = min(stride, N - segmentBase - stride);
    const uniform int segmentSamplesA = getSampleCount(segmentElementsA);
    const uniform int segmentSamplesB = getSampleCount(segmentElementsB);
    const uniform int segmentSamples = segmentSamplesA + segmentSamplesB;

    // The interval ends at the next limit, or at the end of the run when
    // this is the last interval of the pair.
    const uniform bool hasNext = intervalI + 1 < segmentSamples;
    const uniform int startSrcA = limitsA[block];
    const uniform int startSrcB = limitsB[block];
    const uniform int endSrcA = hasNext ? limitsA[block + 1] : segmentElementsA;
    const uniform int endSrcB = hasNext ? limitsB[block + 1] : segmentElementsB;
    const uniform int lenSrcA = endSrcA - startSrcA;
    const uniform int lenSrcB = endSrcB - startSrcB;
    const uniform int startDstA = startSrcA + startSrcB;
    const uniform int startDstB = startDstA + lenSrcA;

    // Load the two slices, one element per program instance.
    const bool ownsA = programIndex < lenSrcA;
    const bool ownsB = programIndex < lenSrcB;

    Key_t keyA, keyB;
    Val_t valA, valB;
    if (ownsA)
    {
      keyA = srcKey[segmentBase + startSrcA + programIndex];
      valA = srcVal[segmentBase + startSrcA + programIndex];
    }

    if (ownsB)
    {
      keyB = srcKey[segmentBase + stride + startSrcB + programIndex];
      valB = srcVal[segmentBase + stride + startSrcB + programIndex];
    }

    // Compute destination addresses for the merged data.  dstPosA/dstPosB
    // start at 0 because they are adjusted below also by instances that own
    // no element and therefore skip the search.
    int dstPosA = 0, dstPosB = 0;
    int dstA = -1, dstB = -1;

    if (ownsA)
      dstPosA = binarySearchExclusive1(keyA, keyB, lenSrcB, SAMPLE_STRIDE) + programIndex;
    if (ownsB)
      dstPosB = binarySearchInclusive1(keyB, keyA, lenSrcA, SAMPLE_STRIDE) + programIndex;

    if (ownsA && dstPosA < lenSrcA)
      dstA = segmentBase + startDstA + dstPosA;
    dstPosA -= lenSrcA;
    if (ownsA && dstPosA < lenSrcB)
      dstA = segmentBase + startDstB + dstPosA;

    if (ownsB && dstPosB < lenSrcA)
      dstB = segmentBase + startDstA + dstPosB;
    dstPosB -= lenSrcA;
    if (ownsB && dstPosB < lenSrcB)
      dstB = segmentBase + startDstB + dstPosB;

    // Store the merged data; -1 marks "nothing to write".
    if (dstA >= 0)
    {
      dstKey[dstA] = keyA;
      dstVal[dstA] = valA;
    }
    if (dstB >= 0)
    {
      dstKey[dstB] = keyB;
      dstVal[dstB] = valB;
    }
  }
}
|
||||
|
||||
/* Driver for the elementary-interval merge of one stage.
 *
 * Launches mergeElementaryIntervalsKernel over every merge pair and, when
 * the trailing N % (2*stride) elements are no longer than one stride (so
 * they have no partner run to merge with), copies them from src to dst
 * unchanged.  All launched tasks are waited for before returning.
 */
static inline
void mergeElementaryIntervals(
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int limitsA[],
    uniform int limitsB[],
    uniform int stride,
    uniform int N)
{
  // Elements left after the last complete pair of stride-long runs, and
  // the index at which that tail starts.
  const uniform int tailCount = N % (2 * stride);
  const uniform int tailBase  = N - tailCount;

  uniform int mergePairs;
  if (tailCount > stride)
    mergePairs = getSampleCount(N);
  else
    mergePairs = tailBase / SAMPLE_STRIDE;

  uniform int nTasks = num_cores()*4;
#ifdef __NVPTX__
  nTasks = iDivUp(mergePairs, 1*programCount);
#endif

  launch [nTasks] mergeElementaryIntervalsKernel(
      mergePairs,
      dstKey, dstVal,
      srcKey, srcVal,
      limitsA, limitsB,
      stride, N);

  // Unpaired tail: pass it through untouched.
  if (tailCount <= stride)
  {
    foreach (i = 0 ... tailCount)
    {
      dstKey[tailBase + i] = srcKey[tailBase + i];
      dstVal[tailBase + i] = srcVal[tailBase + i];
    }
  }

  sync;
}
|
||||
|
||||
/* Scratch storage shared by all merge stages.  memPool owns one allocation;
 * the four pointers below are views into consecutive quarters of it, each
 * MAX_SAMPLE_COUNT ints long. */
static uniform int * uniform memPool = NULL;
static uniform int * uniform ranksA;
static uniform int * uniform ranksB;
static uniform int * uniform limitsA;
static uniform int * uniform limitsB;
static uniform int MAX_SAMPLE_COUNT = 0;

/* Allocates the scratch pool.  Asserts that the pool is not already open. */
export
void openMergeSort()
{
  MAX_SAMPLE_COUNT = 8*32 * 131072 / programCount;
  assert(memPool == NULL);

  // One allocation, four equally sized slices.
  const uniform int poolInts = MAX_SAMPLE_COUNT * 4;
  memPool = uniform new uniform int[poolInts];

  ranksA  = memPool;
  ranksB  = memPool + MAX_SAMPLE_COUNT;
  limitsA = memPool + 2 * MAX_SAMPLE_COUNT;
  limitsB = memPool + 3 * MAX_SAMPLE_COUNT;
}
|
||||
|
||||
/* Releases the scratch pool created by openMergeSort().
 * Asserts that the pool is currently open. */
export
void closeMergeSort()
{
  assert(memPool != NULL);

  // The pool was obtained with an array new, so release it with the
  // array form of delete.
  delete[] memPool;
  memPool = NULL;

  // The slice pointers aliased the pool; clear them so a use after close
  // fails on a NULL pointer instead of touching freed memory.
  ranksA  = NULL;
  ranksB  = NULL;
  limitsA = NULL;
  limitsB = NULL;
}
|
||||
|
||||
/* Key/value merge sort of N elements.
 *
 * srcKey/srcVal are sorted gang-wise into one of the two buffer pairs
 * (dst or buf), then merged in stages of doubling stride, ping-ponging
 * between the pairs.  The starting pair is chosen from the parity of the
 * stage count so that the last stage writes into dstKey/dstVal.
 *
 * Asserts: N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT and N is a multiple of
 * 2*programCount.  Uses the scratch arrays set up by openMergeSort().
 */
export
void mergeSort(
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t bufKey[],
    uniform Val_t bufVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int N)
{
  // Number of merge stages that the loop below will execute.
  uniform int stageCount = 0;
  for (uniform int s = 2*programCount; s < N; s <<= 1)
    stageCount++;

  uniform Key_t * uniform iKey;
  uniform Key_t * uniform oKey;
  uniform Val_t * uniform iVal;
  uniform Val_t * uniform oVal;

  // Even stage count: start in dst.  Odd: start in buf.  Either way the
  // data ends up in dst after the final swap.
  if ((stageCount & 1) == 0)
  {
    iKey = dstKey;  iVal = dstVal;
    oKey = bufKey;  oVal = bufVal;
  }
  else
  {
    iKey = bufKey;  iVal = bufVal;
    oKey = dstKey;  oVal = dstVal;
  }

  assert(N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT);
  assert(N % (programCount*2) == 0);

  // Throughput figures in the comments below are the original author's
  // annotations.
  // whole sort -- cpu: 28 gpu: 74 M/s
  // gang-local sort -- cpu: 356 gpu: 534 M/s
  mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount));

  for (uniform int stride = 2*programCount; stride < N; stride <<= 1)
  {
    // one full stage -- cpu: 30 gpu: 112 M/s
    // rank generation + limiter merge -- cpu: 121 gpu: 460 M/s

    // Find sample ranks and prepare for limiters merge
    // cpu: 190 gpu: 600 M/s
    generateSampleRanks(ranksA, ranksB, iKey, stride, N);

    // Merge ranks and indices
    // cpu: 120 gpu: 457 M/s
    mergeRanksAndIndices(limitsA, limitsB, ranksA, ranksB, stride, N);

    // Merge elementary intervals
    // cpu: 287 gpu: 194 M/s
    mergeElementaryIntervals(oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N);

    // This stage's output is the next stage's input.
    uniform Key_t * uniform prevKeyIn = iKey;
    iKey = oKey;
    oKey = prevKeyIn;

    uniform Val_t * uniform prevValIn = iVal;
    iVal = oVal;
    oVal = prevValIn;
  }
}
|
||||
8
examples/portable/nbody_hermite4/Makefile_cpu
Normal file
8
examples/portable/nbody_hermite4/Makefile_cpu
Normal file
@@ -0,0 +1,8 @@
|
||||
|
||||
EXAMPLE=hermite4
|
||||
CPP_SRC=hermite4.cpp
|
||||
ISPC_SRC=hermite4.ispc
|
||||
ISPC_IA_TARGETS=avx1-i32x8
|
||||
ISPC_ARM_TARGETS=neon
|
||||
|
||||
include ../common_cpu.mk
|
||||
7
examples/portable/nbody_hermite4/Makefile_knc
Normal file
7
examples/portable/nbody_hermite4/Makefile_knc
Normal file
@@ -0,0 +1,7 @@
|
||||
EXAMPLE=hermite4
|
||||
CXX_SRC=hermite4.cpp
|
||||
ISPC_SRC=hermite4.ispc
|
||||
ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h
|
||||
ISPC_TARGET=generic-16
|
||||
|
||||
include ../common_knc.mk
|
||||
14
examples/portable/nbody_hermite4/Makefile_ptx
Normal file
14
examples/portable/nbody_hermite4/Makefile_ptx
Normal file
@@ -0,0 +1,14 @@
|
||||
PROG=hermite4
|
||||
ISPC_SRC=hermite4.ispc
|
||||
#CU_SRC=hermite4.cu
|
||||
CXX_SRC=hermite4.cpp
|
||||
PTXCC_REGMAX=64
|
||||
#ISPC_FLAGS= --opt=disable-uniform-control-flow
|
||||
|
||||
#LLVM_GPU=1
|
||||
NVVM_GPU=1
|
||||
|
||||
include ../common_ptx.mk
|
||||
|
||||
|
||||
|
||||
361
examples/portable/nbody_hermite4/hermite4.cpp
Normal file
361
examples/portable/nbody_hermite4/hermite4.cpp
Normal file
@@ -0,0 +1,361 @@
|
||||
/*
|
||||
Copyright (c) 2014, Evghenii Gaburov
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/* Hermite4 N-body integrator */
|
||||
/* Makino and Aarseth, 1992 */
|
||||
/* http://adsabs.harvard.edu/abs/1992PASJ...44..141M and references therein */
|
||||
|
||||
#include <cstdlib>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <cassert>
|
||||
|
||||
#include "timing.h"
|
||||
#include "ispc_malloc.h"
|
||||
|
||||
#include "typeReal.h"
|
||||
#include "hermite4_ispc.h"
|
||||
|
||||
struct Hermite4
|
||||
{
|
||||
enum {PP_FLOP=44};
|
||||
const int n;
|
||||
const real eta;
|
||||
real eps2;
|
||||
real *g_mass, *g_gpot;
|
||||
real *g_posx, *g_posy, *g_posz;
|
||||
real *g_velx, *g_vely, *g_velz;
|
||||
real *g_accx, *g_accy, *g_accz;
|
||||
real *g_jrkx, *g_jrky, *g_jrkz;
|
||||
|
||||
std::vector<real> accx0, accy0, accz0;
|
||||
std::vector<real> jrkx0, jrky0, jrkz0;
|
||||
|
||||
Hermite4(const int _n = 8192, const real _eta = 0.1) : n(_n), eta(_eta)
|
||||
{
|
||||
eps2 = 4.0/n; /* eps = 4/n to give Ebin = 1 KT */
|
||||
eps2 *= eps2;
|
||||
g_mass = new real[n];
|
||||
g_gpot = new real[n];
|
||||
g_posx = new real[n];
|
||||
g_posy = new real[n];
|
||||
g_posz = new real[n];
|
||||
g_velx = new real[n];
|
||||
g_vely = new real[n];
|
||||
g_velz = new real[n];
|
||||
g_accx = new real[n];
|
||||
g_accy = new real[n];
|
||||
g_accz = new real[n];
|
||||
g_jrkx = new real[n];
|
||||
g_jrky = new real[n];
|
||||
g_jrkz = new real[n];
|
||||
|
||||
accx0.resize(n);
|
||||
accy0.resize(n);
|
||||
accz0.resize(n);
|
||||
jrkx0.resize(n);
|
||||
jrky0.resize(n);
|
||||
jrkz0.resize(n);
|
||||
|
||||
printf("---Intializing nbody--- \n");
|
||||
|
||||
const real R0 = 1;
|
||||
const real mp = 1.0/n;
|
||||
#pragma omp parallel for schedule(runtime)
|
||||
for (int i = 0; i < n; i++)
|
||||
{
|
||||
real xp, yp, zp, s2 = 2*R0;
|
||||
real vx, vy, vz;
|
||||
while (s2 > R0*R0) {
|
||||
xp = (1.0 - 2.0*drand48())*R0;
|
||||
yp = (1.0 - 2.0*drand48())*R0;
|
||||
zp = (1.0 - 2.0*drand48())*R0;
|
||||
s2 = xp*xp + yp*yp + zp*zp;
|
||||
vx = drand48() * 0.1;
|
||||
vy = drand48() * 0.1;
|
||||
vz = drand48() * 0.1;
|
||||
}
|
||||
g_posx[i] = xp;
|
||||
g_posy[i] = yp;
|
||||
g_posz[i] = zp;
|
||||
g_velx[i] = vx;
|
||||
g_vely[i] = vy;
|
||||
g_velz[i] = vz;
|
||||
g_mass[i] = mp;
|
||||
}
|
||||
}
|
||||
|
||||
~Hermite4()
|
||||
{
|
||||
delete g_mass;
|
||||
delete g_gpot;
|
||||
delete g_posx;
|
||||
delete g_posy;
|
||||
delete g_posz;
|
||||
delete g_velx;
|
||||
delete g_vely;
|
||||
delete g_velz;
|
||||
delete g_accx;
|
||||
delete g_accy;
|
||||
delete g_accz;
|
||||
delete g_jrkx;
|
||||
delete g_jrky;
|
||||
delete g_jrkz;
|
||||
}
|
||||
|
||||
void forces();
|
||||
|
||||
real step(const real dt)
|
||||
{
|
||||
const real dt2 = dt*real(1.0/2.0);
|
||||
const real dt3 = dt*real(1.0/3.0);
|
||||
|
||||
real dt_min = HUGE;
|
||||
|
||||
#pragma omp parallel for schedule(runtime)
|
||||
for (int i = 0; i < n; i++)
|
||||
{
|
||||
accx0[i] = g_accx[i];
|
||||
accy0[i] = g_accy[i];
|
||||
accz0[i] = g_accz[i];
|
||||
jrkx0[i] = g_jrkx[i];
|
||||
jrky0[i] = g_jrky[i];
|
||||
jrkz0[i] = g_jrkz[i];
|
||||
|
||||
g_posx[i] += dt*(g_velx[i] + dt2*(g_accx[i] + dt3*g_jrkx[i]));
|
||||
g_posy[i] += dt*(g_vely[i] + dt2*(g_accy[i] + dt3*g_jrky[i]));
|
||||
g_posz[i] += dt*(g_velz[i] + dt2*(g_accz[i] + dt3*g_jrkz[i]));
|
||||
|
||||
g_velx[i] += dt*(g_accx[i] + dt2*g_jrkx[i]);
|
||||
g_vely[i] += dt*(g_accy[i] + dt2*g_jrky[i]);
|
||||
g_velz[i] += dt*(g_accz[i] + dt2*g_jrkz[i]);
|
||||
}
|
||||
|
||||
forces();
|
||||
|
||||
if (dt > 0.0)
|
||||
{
|
||||
const real h = dt*real(0.5);
|
||||
const real hinv = real(1.0)/h;
|
||||
const real f1 = real(0.5)*hinv*hinv;
|
||||
const real f2 = real(3.0)*hinv*f1;
|
||||
|
||||
const real dt2 = dt *dt * real(1.0/2.0);
|
||||
const real dt3 = dt2*dt * real(1.0/3.0);
|
||||
const real dt4 = dt3*dt * real(1.0/4.0);
|
||||
const real dt5 = dt4*dt * real(1.0/5.0);
|
||||
|
||||
#pragma omp parallel for schedule(runtime) reduction(min:dt_min)
|
||||
for (int i = 0; i < n; i++)
|
||||
{
|
||||
/* compute snp & crk */
|
||||
|
||||
const real Amx = g_accx[i] - accx0[i];
|
||||
const real Amy = g_accy[i] - accy0[i];
|
||||
const real Amz = g_accz[i] - accz0[i];
|
||||
|
||||
const real Jmx = h*(g_jrkx[i] - jrkx0[i]);
|
||||
const real Jmy = h*(g_jrky[i] - jrky0[i]);
|
||||
const real Jmz = h*(g_jrkz[i] - jrkz0[i]);
|
||||
|
||||
const real Jpx = h*(g_jrkx[i] + jrkx0[i]);
|
||||
const real Jpy = h*(g_jrky[i] + jrky0[i]);
|
||||
const real Jpz = h*(g_jrkz[i] + jrkz0[i]);
|
||||
|
||||
|
||||
real snpx = f1*Jmx;
|
||||
real snpy = f1*Jmy;
|
||||
real snpz = f1*Jmz;
|
||||
|
||||
real crkx = f2*(Jpx - Amx);
|
||||
real crky = f2*(Jpy - Amy);
|
||||
real crkz = f2*(Jpz - Amz);
|
||||
|
||||
snpx -= h*crkx;
|
||||
snpy -= h*crky;
|
||||
snpz -= h*crkz;
|
||||
|
||||
/* correct */
|
||||
|
||||
g_posx[i] += dt4*snpx + dt5*crkx;
|
||||
g_posy[i] += dt4*snpy + dt5*crky;
|
||||
g_posz[i] += dt4*snpz + dt5*crkz;
|
||||
|
||||
g_velx[i] += dt3*snpx + dt4*crkx;
|
||||
g_vely[i] += dt3*snpy + dt4*crky;
|
||||
g_velz[i] += dt3*snpz + dt4*crkz;
|
||||
|
||||
/* compute new timestep */
|
||||
|
||||
const real s0 = g_accx[i]*g_accx[i] + g_accy[i]*g_accy[i] + g_accz[i]*g_accz[i];
|
||||
const real s1 = g_jrkx[i]*g_jrkx[i] + g_jrky[i]*g_jrky[i] + g_jrkz[i]*g_jrkz[i];
|
||||
const real s2 = snpx*snpx + snpy*snpy + snpz*snpz;
|
||||
const real s3 = crkx*crkx + crky*crky + crkz*crkz;
|
||||
|
||||
const double u = std::sqrt(s0*s2) + s1;
|
||||
const double l = std::sqrt(s1*s3) + s2;
|
||||
assert(l > 0.0f);
|
||||
const real dt_loc = eta *std::sqrt(u/l);
|
||||
dt_min = std::min(dt_min, dt_loc);
|
||||
}
|
||||
}
|
||||
|
||||
if (dt_min == HUGE)
|
||||
return dt;
|
||||
else
|
||||
return dt_min;
|
||||
}
|
||||
|
||||
void energy(real &Ekin, real &Epot)
|
||||
{
|
||||
real ekin = 0, epot = 0;
|
||||
|
||||
#pragma omp parallel for reduction(+:ekin,epot)
|
||||
for (int i = 0; i < n; i++)
|
||||
{
|
||||
ekin += g_mass[i] * (g_velx[i]*g_velx[i] + g_vely[i]*g_vely[i] + g_velz[i]*g_velz[i]) * real(0.5f);
|
||||
epot += real(0.5f)*g_mass[i] * g_gpot[i];
|
||||
}
|
||||
Ekin = ekin;
|
||||
Epot = epot;
|
||||
}
|
||||
|
||||
void integrate(const int niter, const real t_end = HUGE)
|
||||
{
|
||||
const double tin = rtc();
|
||||
forces();
|
||||
const double fn = n;
|
||||
printf(" mean flop rate in %g sec [%g GFLOP/s]\n", rtc() - tin,
|
||||
fn*fn*PP_FLOP/(rtc() - tin)/1e9);
|
||||
|
||||
real Epot0, Ekin0;
|
||||
energy(Ekin0, Epot0);
|
||||
const real Etot0 = Epot0 + Ekin0;
|
||||
printf(" E: %g %g %g \n", Epot0, Ekin0, Etot0);
|
||||
|
||||
/////////
|
||||
|
||||
real t_global = 0;
|
||||
double t0 = 0;
|
||||
int iter = 0;
|
||||
int ntime = 10;
|
||||
real dt = 1.0/131072;
|
||||
real Epot, Ekin, Etot = Etot0;
|
||||
while (t_global < t_end) {
|
||||
if (iter % ntime == 0)
|
||||
t0 = rtc();
|
||||
|
||||
if (iter >= niter) return;
|
||||
|
||||
dt = step(dt);
|
||||
iter++;
|
||||
t_global += dt;
|
||||
|
||||
const real Etot_pre = Etot;
|
||||
energy(Ekin, Epot);
|
||||
Etot = Ekin + Epot;
|
||||
|
||||
if (iter % 1 == 0) {
|
||||
const real Etot = Ekin + Epot;
|
||||
printf("iter= %d: t= %g dt= %g Ekin= %g Epot= %g Etot= %g , dE = %g d(dE)= %g \n",
|
||||
iter, t_global, dt, Ekin, Epot, Etot, (Etot - Etot0)/std::abs(Etot0),
|
||||
(Etot - Etot_pre)/std::abs(Etot_pre) );
|
||||
}
|
||||
|
||||
if (iter % ntime == 0) {
|
||||
printf(" mean flop rate in %g sec [%g GFLOP/s]\n", rtc() - t0,
|
||||
fn*fn*PP_FLOP/(rtc() - t0)/1e9*ntime);
|
||||
}
|
||||
|
||||
fflush(stdout);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
void Hermite4::forces()
|
||||
{
|
||||
ispc::compute_forces(
|
||||
n,
|
||||
g_mass,
|
||||
g_posx,
|
||||
g_posy,
|
||||
g_posz,
|
||||
g_velx,
|
||||
g_vely,
|
||||
g_velz,
|
||||
g_accx,
|
||||
g_accy,
|
||||
g_accz,
|
||||
g_jrkx,
|
||||
g_jrky,
|
||||
g_jrkz,
|
||||
g_gpot,
|
||||
eps2);
|
||||
}
|
||||
|
||||
void run(const int nbodies, const real eta, const int nstep)
|
||||
{
|
||||
Hermite4 h4(nbodies, eta);
|
||||
h4.integrate(nstep);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
printf(" Usage: %s [nbodies=8192] [nsteps=40] [eta=0.1] \n", argv[0]);
|
||||
|
||||
int nbodies = 8192;
|
||||
if (argc > 1) nbodies = atoi(argv[1]);
|
||||
|
||||
int nstep = 40;
|
||||
if (argc > 2) nstep = atoi(argv[2]);
|
||||
|
||||
float eta = 0.1;
|
||||
if (argc > 3) eta = atof(argv[3]);
|
||||
|
||||
|
||||
|
||||
printf("nbodies= %d\n", nbodies);
|
||||
printf("nstep= %d\n", nstep);
|
||||
printf(" eta= %g \n", eta);
|
||||
|
||||
run(nbodies, eta, nstep);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
197
examples/portable/nbody_hermite4/hermite4.ispc
Normal file
197
examples/portable/nbody_hermite4/hermite4.ispc
Normal file
@@ -0,0 +1,197 @@
|
||||
/*
|
||||
Copyright (c) 2014, Evghenii Gaburov
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "typeReal.h"
|
||||
|
||||
typedef real<3> vec3;
|
||||
/* Per-body accumulator filled in by body_body_force(). */
struct Force
{
  vec3 acc;   // accumulated acceleration
  vec3 jrk;   // accumulated jerk
  real pot;   // accumulated potential
  real null;  // not referenced in this file -- presumably padding (confirm)
};
|
||||
|
||||
/* Position and velocity of one body as consumed by the force kernel. */
struct Predictor
{
  vec3 pos;
  vec3 vel;
};
|
||||
|
||||
/* Adds body j's softened contribution to the acceleration, jerk and
 * potential accumulated in fi for body i.
 *
 *   fi    accumulator, updated in place
 *   pi    position/velocity of the body being acted on
 *   pj    position/velocity of the source body
 *   mj    mass of the source body
 *   eps2  squared softening added to the squared separation
 */
static inline
void body_body_force(
    Force &fi,
    const Predictor &pi,
    const Predictor &pj,
    const real mj,
    const real eps2)
{
  // separation and softened squared distance
  const real rx = pj.pos.x - pi.pos.x;
  const real ry = pj.pos.y - pi.pos.y;
  const real rz = pj.pos.z - pi.pos.z;

  const real r2 = rx*rx + ry*ry + rz*rz + eps2;

  // Toggle kept from the original: the active branch evaluates the
  // reciprocal square root on a float-cast argument.
#if 1
  const real rinv = rsqrt((float)r2);
#else
  const real rinv = rsqrt(r2);
#endif
  const real rinv2   = rinv*rinv;
  const real m_rinv  = rinv * mj;
  const real m_rinv3 = rinv2 * m_rinv;

  fi.acc.x += m_rinv3 * rx;
  fi.acc.y += m_rinv3 * ry;
  fi.acc.z += m_rinv3 * rz;
  fi.pot   -= m_rinv;

  // relative velocity and its projection on the separation
  const real vx = pj.vel.x - pi.vel.x;
  const real vy = pj.vel.y - pi.vel.y;
  const real vz = pj.vel.z - pi.vel.z;
  const real rdotv = rx*vx + ry*vy + rz*vz;

  const real jfac = (real)(-3.0) * (rdotv * rinv2 * m_rinv3);

  fi.jrk.x += m_rinv3*vx + jfac*rx;
  fi.jrk.y += m_rinv3*vy + jfac*ry;
  fi.jrk.z += m_rinv3*vz + jfac*rz;
}
|
||||
|
||||
/* One task of the all-pairs force evaluation.
 *
 * Handles bodies [taskIndex*nPerTask, min(n, taskIndex*nPerTask+nPerTask)).
 * For each of them it walks over all n source bodies in tiles of
 * programCount, staging each tile in a small uniform array and feeding the
 * entries one by one to body_body_force().  Results are written to
 * acc*, jrk* and gpot.  Asserts that n is a multiple of programCount.
 */
task void compute_forces_task(
    uniform const int n,
    uniform const int nPerTask,
    uniform const real mass[],
    uniform const real posx[],
    uniform const real posy[],
    uniform const real posz[],
    uniform const real velx[],
    uniform const real vely[],
    uniform const real velz[],
    uniform real accx[],
    uniform real accy[],
    uniform real accz[],
    uniform real jrkx[],
    uniform real jrky[],
    uniform real jrkz[],
    uniform real gpot[],
    const uniform real eps2)
{
  const uniform int iFirst = taskIndex * nPerTask;
  const uniform int iLast  = min(n, iFirst + nPerTask);

  // Nothing assigned to this task.
  if (iFirst >= n)
    return;

  // Tile of source bodies; rows: 0..2 position, 3 mass, 4..6 velocity.
  uniform real tile[7][programCount];

  assert((n%programCount) == 0);

  foreach (i = iFirst ... iLast)
  {
    Force fi;
    fi.acc = (real)0.0;
    fi.jrk = (real)0.0;
    fi.pot = (real)0.0;

    Predictor pi;
    pi.pos.x = posx[i];
    pi.pos.y = posy[i];
    pi.pos.z = posz[i];
    pi.vel.x = velx[i];
    pi.vel.y = vely[i];
    pi.vel.z = velz[i];

    for (uniform int tileBase = 0; tileBase < n; tileBase += programCount)
    {
      // each program instance stages one source body of the tile
      const int src = tileBase + programIndex;
      tile[0][programIndex] = posx[src];
      tile[1][programIndex] = posy[src];
      tile[2][programIndex] = posz[src];
      tile[3][programIndex] = mass[src];
      tile[4][programIndex] = velx[src];
      tile[5][programIndex] = vely[src];
      tile[6][programIndex] = velz[src];

      for (uniform int k = 0; k < programCount; k++)
      {
        Predictor pj;
        pj.pos.x = tile[0][k];
        pj.pos.y = tile[1][k];
        pj.pos.z = tile[2][k];
        pj.vel.x = tile[4][k];
        pj.vel.y = tile[5][k];
        pj.vel.z = tile[6][k];
        const real mj = tile[3][k];
        body_body_force(fi, pi, pj, mj, eps2);
      }
    }

    accx[i] = fi.acc.x;
    accy[i] = fi.acc.y;
    accz[i] = fi.acc.z;
    jrkx[i] = fi.jrk.x;
    jrky[i] = fi.jrk.y;
    jrkz[i] = fi.jrk.z;
    gpot[i] = fi.pot;
  }
}
|
||||
|
||||
/* Exported entry point: splits the n bodies into chunks of
 * min(128, 8*programCount) and launches one compute_forces_task per chunk. */
export void compute_forces(
    uniform const int n,
    uniform const real mass[],
    uniform const real posx[],
    uniform const real posy[],
    uniform const real posz[],
    uniform const real velx[],
    uniform const real vely[],
    uniform const real velz[],
    uniform real accx[],
    uniform real accy[],
    uniform real accz[],
    uniform real jrkx[],
    uniform real jrky[],
    uniform real jrkz[],
    uniform real gpot[],
    const uniform real eps2)
{
  const uniform int bodiesPerTask = min(128, programCount*8);
  // round up so a final partial chunk still gets a task
  const uniform int taskTotal = (n + bodiesPerTask - 1) / bodiesPerTask;

  launch [taskTotal] compute_forces_task(
      n, bodiesPerTask,
      mass,
      posx, posy, posz,
      velx, vely, velz,
      accx, accy, accz,
      jrkx, jrky, jrkz,
      gpot, eps2);
}
|
||||
2
examples/portable/nbody_hermite4/typeReal.h
Normal file
2
examples/portable/nbody_hermite4/typeReal.h
Normal file
@@ -0,0 +1,2 @@
|
||||
#pragma once
|
||||
typedef double real;
|
||||
409
examples/portable/omp_tasksys.cpp
Normal file
409
examples/portable/omp_tasksys.cpp
Normal file
@@ -0,0 +1,409 @@
|
||||
/*
|
||||
Copyright (c) 2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
#define DBG(x)
|
||||
#include <omp.h>
|
||||
#include <malloc.h>
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
#include <algorithm>
|
||||
|
||||
// Signature of ispc-generated 'task' functions
typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount,
                             int taskIndex, int taskCount,
                             int taskIndex0, int taskIndex1, int taskIndex2,
                             int taskCount0, int taskCount1, int taskCount2);

// Small structure used to hold the data for each task: the function to
// run, its argument block, this task's linear index and the 3D launch
// extents.  The accessors decompose the linear index with dimension 0
// varying fastest.
#ifdef _MSC_VER
__declspec(align(16))
#endif
struct TaskInfo {
    TaskFuncType func;
    void *data;
    int taskIndex;
    int taskCount3d[3];
#if defined(ISPC_IS_WINDOWS)
    event taskEvent;
#endif
    // Total number of tasks in the launch.
    int taskCount() const { return taskCount3d[0]*taskCount3d[1]*taskCount3d[2]; }
    int taskIndex0() const
    {
        return taskIndex % taskCount3d[0];
    }
    int taskIndex1() const
    {
        return ( taskIndex / taskCount3d[0] ) % taskCount3d[1];
    }
    int taskIndex2() const
    {
        return taskIndex / ( taskCount3d[0]*taskCount3d[1] );
    }
    int taskCount0() const { return taskCount3d[0]; }
    int taskCount1() const { return taskCount3d[1]; }
    int taskCount2() const { return taskCount3d[2]; }
    // NOTE(review): this check assumes a 32-byte multiple, while the MSVC
    // branch above only requests 16-byte alignment -- confirm on Windows.
    TaskInfo() { assert(sizeof(TaskInfo) % 32 == 0); }
}
#ifndef _MSC_VER
__attribute__((aligned(32)))
#endif
;   // single terminator shared by both compiler branches
|
||||
|
||||
// ispc expects these functions to have C linkage / not be mangled
|
||||
extern "C" {
|
||||
void ISPCLaunch(void **handlePtr, void *f, void *data, int countx, int county, int countz);
|
||||
void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
|
||||
void ISPCSync(void *handle);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// TaskGroupBase
|
||||
|
||||
#define LOG_TASK_QUEUE_CHUNK_SIZE 14
|
||||
#define MAX_TASK_QUEUE_CHUNKS 8
|
||||
#define TASK_QUEUE_CHUNK_SIZE (1<<LOG_TASK_QUEUE_CHUNK_SIZE)
|
||||
|
||||
#define MAX_LAUNCHED_TASKS (MAX_TASK_QUEUE_CHUNKS * TASK_QUEUE_CHUNK_SIZE)
|
||||
|
||||
#define NUM_MEM_BUFFERS 16
|
||||
|
||||
class TaskGroup;
|
||||
|
||||
/** The TaskGroupBase structure provides common functionality for "task
|
||||
groups"; a task group is the set of tasks launched from within a single
|
||||
ispc function. When the function is ready to return, it waits for all
|
||||
of the tasks in its task group to finish before it actually returns.
|
||||
*/
|
||||
class TaskGroupBase {
|
||||
public:
|
||||
void Reset();
|
||||
|
||||
int AllocTaskInfo(int count);
|
||||
TaskInfo *GetTaskInfo(int index);
|
||||
|
||||
void *AllocMemory(int64_t size, int32_t alignment);
|
||||
|
||||
protected:
|
||||
TaskGroupBase();
|
||||
~TaskGroupBase();
|
||||
|
||||
int nextTaskInfoIndex;
|
||||
|
||||
private:
|
||||
/* We allocate blocks of TASK_QUEUE_CHUNK_SIZE TaskInfo structures as
|
||||
needed by the calling function. We hold up to MAX_TASK_QUEUE_CHUNKS
|
||||
of these (and then exit at runtime if more than this many tasks are
|
||||
launched.)
|
||||
*/
|
||||
TaskInfo *taskInfo[MAX_TASK_QUEUE_CHUNKS];
|
||||
|
||||
/* We also allocate chunks of memory to service ISPCAlloc() calls. The
|
||||
memBuffers[] array holds pointers to this memory. The first element
|
||||
of this array is initialized to point to mem and then any subsequent
|
||||
elements required are initialized with dynamic allocation.
|
||||
*/
|
||||
int curMemBuffer, curMemBufferOffset;
|
||||
int memBufferSize[NUM_MEM_BUFFERS];
|
||||
char *memBuffers[NUM_MEM_BUFFERS];
|
||||
char mem[256];
|
||||
};
|
||||
|
||||
|
||||
/* Starts with no tasks, no task chunks and only the in-object buffer
   available for AllocMemory(). */
inline TaskGroupBase::TaskGroupBase()
    : nextTaskInfoIndex(0), curMemBuffer(0), curMemBufferOffset(0) {
    for (int chunk = 0; chunk < MAX_TASK_QUEUE_CHUNKS; ++chunk)
        taskInfo[chunk] = NULL;

    // Mark every spill buffer as absent, then install the built-in one.
    for (int buf = 0; buf < NUM_MEM_BUFFERS; ++buf) {
        memBuffers[buf] = NULL;
        memBufferSize[buf] = 0;
    }
    memBuffers[0] = mem;
    memBufferSize[0] = sizeof(mem) / sizeof(mem[0]);
}
|
||||
|
||||
|
||||
/* Releases everything the group allocated on the heap. */
inline TaskGroupBase::~TaskGroupBase() {
    // Note: don't delete memBuffers[0], since it points to the start of
    // the "mem" member!
    for (int i = 1; i < NUM_MEM_BUFFERS; ++i)
        delete[](memBuffers[i]);

    // The TaskInfo chunks are created with new[] in GetTaskInfo(); entries
    // that were never used are still NULL, which delete[] accepts.
    for (int i = 0; i < MAX_TASK_QUEUE_CHUNKS; ++i)
        delete[](taskInfo[i]);
}
|
||||
|
||||
|
||||
inline void
|
||||
TaskGroupBase::Reset() {
|
||||
nextTaskInfoIndex = 0;
|
||||
curMemBuffer = 0;
|
||||
curMemBufferOffset = 0;
|
||||
}
|
||||
|
||||
|
||||
inline int
|
||||
TaskGroupBase::AllocTaskInfo(int count) {
|
||||
int ret = nextTaskInfoIndex;
|
||||
nextTaskInfoIndex += count;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/* Returns the TaskInfo slot for task `index`, allocating the chunk that
   holds it on first use.  Prints a message and exits the process when
   `index` lies beyond the MAX_LAUNCHED_TASKS capacity. */
inline TaskInfo *
TaskGroupBase::GetTaskInfo(int index) {
    // A negative index would address memory in front of taskInfo[].
    assert(index >= 0);

    int chunk = (index >> LOG_TASK_QUEUE_CHUNK_SIZE);
    int offset = index & (TASK_QUEUE_CHUNK_SIZE-1);

    // ">=" rather than "==": every chunk number past the table is out of
    // range, not just the first one.
    if (chunk >= MAX_TASK_QUEUE_CHUNKS) {
        fprintf(stderr, "A total of %d tasks have been launched from the "
                "current function--the simple built-in task system can handle "
                "no more. You can increase the values of TASK_QUEUE_CHUNK_SIZE "
                "and LOG_TASK_QUEUE_CHUNK_SIZE to work around this limitation. "
                "Sorry! Exiting.\n", index);
        exit(1);
    }

    if (taskInfo[chunk] == NULL)
        taskInfo[chunk] = new TaskInfo[TASK_QUEUE_CHUNK_SIZE];
    return &taskInfo[chunk][offset];
}
|
||||
|
||||
|
||||
inline void *
|
||||
TaskGroupBase::AllocMemory(int64_t size, int32_t alignment) {
|
||||
char *basePtr = memBuffers[curMemBuffer];
|
||||
intptr_t iptr = (intptr_t)(basePtr + curMemBufferOffset);
|
||||
iptr = (iptr + (alignment-1)) & ~(alignment-1);
|
||||
|
||||
int newOffset = int(iptr - (intptr_t)basePtr + size);
|
||||
if (newOffset < memBufferSize[curMemBuffer]) {
|
||||
curMemBufferOffset = newOffset;
|
||||
return (char *)iptr;
|
||||
}
|
||||
|
||||
++curMemBuffer;
|
||||
curMemBufferOffset = 0;
|
||||
assert(curMemBuffer < NUM_MEM_BUFFERS);
|
||||
|
||||
int allocSize = 1 << (12 + curMemBuffer);
|
||||
allocSize = std::max(int(size+alignment), allocSize);
|
||||
char *newBuf = new char[allocSize];
|
||||
memBufferSize[curMemBuffer] = allocSize;
|
||||
memBuffers[curMemBuffer] = newBuf;
|
||||
return AllocMemory(size, alignment);
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Atomics and the like
|
||||
|
||||
/* Issues a full memory barrier, except where it is compiled out:
   - Windows: the atomic functions used there already contain the fence
   - KNC: doesn't need the memory barrier */
static inline void
lMemFence() {
#if !defined(ISPC_IS_KNC) && !defined(ISPC_IS_WINDOWS)
    __sync_synchronize();
#endif
}
|
||||
|
||||
static void *
|
||||
lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
return InterlockedCompareExchangePointer(v, newValue, oldValue);
|
||||
#else
|
||||
void *result = __sync_val_compare_and_swap(v, oldValue, newValue);
|
||||
lMemFence();
|
||||
return result;
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
}
|
||||
|
||||
static int32_t
|
||||
lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
return InterlockedCompareExchange((volatile LONG *)v, newValue, oldValue);
|
||||
#else
|
||||
int32_t result = __sync_val_compare_and_swap(v, oldValue, newValue);
|
||||
lMemFence();
|
||||
return result;
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
}
|
||||
|
||||
// Atomically adds delta to *v and returns the UPDATED value.
//
// The Windows branch has always returned the new value
// (InterlockedExchangeAdd yields the old value, so delta is added back).
// The GCC branch previously used __sync_fetch_and_add, which returns the
// old value, making the result platform-dependent.  __sync_add_and_fetch
// returns the new value and matches the Windows behaviour.
static inline int32_t
lAtomicAdd(volatile int32_t *v, int32_t delta) {
#ifdef ISPC_IS_WINDOWS
    return InterlockedExchangeAdd((volatile LONG *)v, delta)+delta;
#else
    return __sync_add_and_fetch(v, delta);
#endif
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Task-system-specific task group: adds the operations that actually run
// the queued tasks on top of the bookkeeping inherited from TaskGroupBase.
class TaskGroup : public TaskGroupBase {
public:
    // Waits for previously launched tasks to finish.
    void Sync();

    // Runs `count` tasks described by the TaskInfo stored at `baseIndex`.
    void Launch(int baseIndex, int count);
};
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// OpenMP
|
||||
|
||||
// One-time task system setup hook; this variant needs no initialization,
// so the body is intentionally empty.
static void
InitTaskSystem() {
}
|
||||
|
||||
inline void
|
||||
TaskGroup::Launch(int baseIndex, int count) {
|
||||
#pragma omp parallel
|
||||
{
|
||||
const int threadIndex = omp_get_thread_num();
|
||||
const int threadCount = omp_get_num_threads();
|
||||
|
||||
TaskInfo ti = *GetTaskInfo(baseIndex);
|
||||
#pragma omp for schedule(runtime)
|
||||
for(int i = 0; i < count; i++)
|
||||
{
|
||||
ti.taskIndex = i;
|
||||
|
||||
// Actually run the task.
|
||||
ti.func(ti.data, threadIndex, threadCount, ti.taskIndex, ti.taskCount(),
|
||||
ti.taskIndex0(), ti.taskIndex1(), ti.taskIndex2(),
|
||||
ti.taskCount0(), ti.taskCount1(), ti.taskCount2());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void
|
||||
TaskGroup::Sync() {
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define MAX_FREE_TASK_GROUPS 64
|
||||
static TaskGroup *freeTaskGroups[MAX_FREE_TASK_GROUPS];
|
||||
|
||||
// Returns a TaskGroup for the caller to use: a recycled one claimed from
// the freeTaskGroups cache when possible, otherwise a newly allocated one.
static inline TaskGroup *
AllocTaskGroup()
{
    for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) {
        TaskGroup *tg = freeTaskGroups[i];
        if (tg == NULL)
            continue;

        // Try to claim the slot by swapping tg out for NULL.  The CAS
        // returns the slot's previous contents, so we own tg only when that
        // value is tg itself.  Testing for "!= NULL" is not enough: if
        // another thread emptied and refilled the slot in the meantime, the
        // CAS fails but still reports a non-NULL pointer that remains in the
        // free list, and handing it out would give one group two owners.
        void *previous = lAtomicCompareAndSwapPointer((void **)(&freeTaskGroups[i]), NULL, tg);
        if (previous == tg)
            return tg;
    }

    return new TaskGroup;
}
|
||||
|
||||
|
||||
static inline void
|
||||
FreeTaskGroup(TaskGroup *tg)
|
||||
{
|
||||
tg->Reset();
|
||||
|
||||
for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) {
|
||||
if (freeTaskGroups[i] == NULL) {
|
||||
void *ptr = lAtomicCompareAndSwapPointer((void **)&freeTaskGroups[i], tg, NULL);
|
||||
if (ptr == NULL)
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
delete tg;
|
||||
}
|
||||
|
||||
void
|
||||
ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int count1, int count2)
|
||||
{
|
||||
const int count = count0*count1*count2;
|
||||
TaskGroup *taskGroup;
|
||||
if (*taskGroupPtr == NULL) {
|
||||
InitTaskSystem();
|
||||
taskGroup = AllocTaskGroup();
|
||||
*taskGroupPtr = taskGroup;
|
||||
}
|
||||
else
|
||||
taskGroup = (TaskGroup *)(*taskGroupPtr);
|
||||
|
||||
int baseIndex = taskGroup->AllocTaskInfo(count);
|
||||
for (int i = 0; i < 1; ++i) {
|
||||
TaskInfo *ti = taskGroup->GetTaskInfo(baseIndex+i);
|
||||
ti->func = (TaskFuncType)func;
|
||||
ti->data = data;
|
||||
ti->taskIndex = i;
|
||||
ti->taskCount3d[0] = count0;
|
||||
ti->taskCount3d[1] = count1;
|
||||
ti->taskCount3d[2] = count2;
|
||||
}
|
||||
taskGroup->Launch(baseIndex, count);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
ISPCSync(void *h)
|
||||
{
|
||||
TaskGroup *taskGroup = (TaskGroup *)h;
|
||||
if (taskGroup != NULL) {
|
||||
taskGroup->Sync();
|
||||
FreeTaskGroup(taskGroup);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void *
|
||||
ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment)
|
||||
{
|
||||
TaskGroup *taskGroup;
|
||||
if (*taskGroupPtr == NULL) {
|
||||
InitTaskSystem();
|
||||
taskGroup = AllocTaskGroup();
|
||||
*taskGroupPtr = taskGroup;
|
||||
}
|
||||
else
|
||||
taskGroup = (TaskGroup *)(*taskGroupPtr);
|
||||
|
||||
return taskGroup->AllocMemory(size, alignment);
|
||||
}
|
||||
|
||||
1
examples/portable/options/.gitignore
vendored
Normal file
1
examples/portable/options/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
options
|
||||
8
examples/portable/options/Makefile_cpu
Normal file
8
examples/portable/options/Makefile_cpu
Normal file
@@ -0,0 +1,8 @@
|
||||
|
||||
EXAMPLE=options
|
||||
CPP_SRC=options.cpp
|
||||
ISPC_SRC=options.ispc
|
||||
ISPC_IA_TARGETS=avx1-i32x16
|
||||
ISPC_ARM_TARGETS=neon
|
||||
|
||||
include ../common_cpu.mk
|
||||
7
examples/portable/options/Makefile_knc
Normal file
7
examples/portable/options/Makefile_knc
Normal file
@@ -0,0 +1,7 @@
|
||||
EXAMPLE=options
|
||||
CXX_SRC=options.cpp
|
||||
ISPC_SRC=options.ispc
|
||||
ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h
|
||||
ISPC_TARGET=generic-16
|
||||
|
||||
include ../common_knc.mk
|
||||
14
examples/portable/options/Makefile_ptx
Normal file
14
examples/portable/options/Makefile_ptx
Normal file
@@ -0,0 +1,14 @@
|
||||
PROG=options
|
||||
ISPC_SRC=options.ispc
|
||||
CU_SRC=options.cu
|
||||
CXX_SRC=options.cpp
|
||||
PTXCC_REGMAX=128
|
||||
|
||||
|
||||
#LLVM_GPU=1
|
||||
NVVM_GPU=1
|
||||
|
||||
include ../common_ptx.mk
|
||||
|
||||
|
||||
|
||||
120
examples/portable/options/options.cpp
Normal file
120
examples/portable/options/options.cpp
Normal file
@@ -0,0 +1,120 @@
|
||||
/*
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#define NOMINMAX
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
using std::max;
|
||||
|
||||
#include "options_defs.h"
|
||||
#include "timing.h"
|
||||
#include "ispc_malloc.h"
|
||||
|
||||
#include "options_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
// Prints the one-line command-line synopsis to stdout.
static void usage() {
    // puts() appends the trailing newline itself.
    puts("usage: options [--count=<num options>]");
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int nOptions = 128*1024;
|
||||
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
if (strncmp(argv[i], "--count=", 8) == 0) {
|
||||
nOptions = atoi(argv[i] + 8);
|
||||
if (nOptions <= 0) {
|
||||
usage();
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float *S = new float[nOptions];
|
||||
float *X = new float[nOptions];
|
||||
float *T = new float[nOptions];
|
||||
float *r = new float[nOptions];
|
||||
float *v = new float[nOptions];
|
||||
float *result = new float[nOptions];
|
||||
|
||||
for (int i = 0; i < nOptions; ++i) {
|
||||
S[i] = 100; // stock price
|
||||
X[i] = 98; // option strike price
|
||||
T[i] = 2; // time (years)
|
||||
r[i] = .02; // risk-free interest rate
|
||||
v[i] = 5; // volatility
|
||||
}
|
||||
|
||||
double sum;
|
||||
|
||||
//
|
||||
// Binomial options pricing model, ispc implementation, tasks
|
||||
//
|
||||
double binomial_tasks = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
binomial_put_ispc_tasks(S, X, T, r, v, result, nOptions);
|
||||
double dt = get_elapsed_msec();
|
||||
binomial_tasks = std::min(binomial_tasks, dt);
|
||||
}
|
||||
sum = 0.;
|
||||
for (int i = 0; i < nOptions; ++i)
|
||||
sum += result[i];
|
||||
printf("[binomial ispc, tasks]:\t\t[%.3f] msec (avg %f)\n",
|
||||
binomial_tasks, sum / nOptions);
|
||||
|
||||
//
|
||||
// Black-Scholes options pricing model, ispc implementation, tasks
|
||||
//
|
||||
double bs_ispc_tasks = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
black_scholes_ispc_tasks(S, X, T, r, v, result, nOptions);
|
||||
double dt = get_elapsed_msec();
|
||||
sum = 0.;
|
||||
for (int i = 0; i < nOptions; ++i)
|
||||
sum += result[i];
|
||||
bs_ispc_tasks = std::min(bs_ispc_tasks, dt);
|
||||
}
|
||||
printf("[black-scholes ispc, tasks]:\t[%.3f] msec (avg %f)\n",
|
||||
bs_ispc_tasks, sum / nOptions);
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
334
examples/portable/options/options.cu
Normal file
334
examples/portable/options/options.cu
Normal file
@@ -0,0 +1,334 @@
|
||||
// -*- mode: c++ -*-
|
||||
/*
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "options_defs.h"
|
||||
#include "cuda_helpers.cuh"
|
||||
|
||||
// Splits a float into the pieces needed for a range-reduced logarithm:
//   *reduced  - the input's sign/mantissa bits with the exponent field
//               forced to 126 (i.e. an unbiased exponent of -1)
//   *exponent - the input's exponent field, plus one, minus the bias 127
__device__ static inline void __range_reduce_log(float input, float * reduced,
                                                 int * exponent) {
    const int bits = __float_as_int(input); //intbits(input);

    // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
    // exponent mask    = 0111 1111 1000 0000 0000 0000 0000 0000  (0x7F800000)
    // non-exponent     = 1000 0000 0111 1111 1111 1111 1111 1111  (0x807FFFFF)
    const int nonexponent_mask = 0x807FFFFF;

    // We want the reduced version to have an exponent of -1, which is
    // -1 + 127 = 126 after biasing.
    const int exponent_neg1 = (126l << 23);

    // NOTE(boulos): We don't need to mask anything out since we know
    // the sign bit has to be 0. If it's 1, we need to return infinity/nan
    // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN).
    const int biased_exponent = bits >> 23; // [0, 255], meaning [-127, 128]

    // Treat the number as if it were 2^{e+1} * (1.m)/2, then remove the bias.
    *exponent = (biased_exponent + 1) - 127;

    // Blend the new exponent with the original input (done in int for now,
    // until it is decided whether float can have & and &not).
    *reduced = __int_as_float((bits & nonexponent_mask) | (exponent_neg1)); //floatbits(blended);
}
|
||||
|
||||
|
||||
// Natural logarithm used by the pricing kernels.  The active branch simply
// forwards to the CUDA __logf intrinsic; the disabled branch keeps a
// polynomial implementation built on __range_reduce_log for reference.
__device__ static inline float __Logf(const float x_full)
{
#if 1
    return __logf(x_full);
#else
    const float one = 1.0f;
    const float ln2 = 0.693147182464599609375f;

    // log(x < 0) -> NaN, log(0) -> -inf; those inputs are replaced by 1
    // for the computation and patched up on return.
    const int NaN_bits = 0x7fc00000;
    const int Neg_Inf_bits = 0xFF800000;
    const float NaN = __int_as_float(NaN_bits); //floatbits(NaN_bits);
    const float neg_inf = __int_as_float(Neg_Inf_bits); //floatbits(Neg_Inf_bits);
    const bool use_nan = x_full < 0.f;
    const bool use_inf = x_full == 0.f;
    const bool exceptional = use_nan || use_inf;

    float reduced;
    int exponent;
    __range_reduce_log(exceptional ? one : x_full, &reduced, &exponent);

    const float c1 = 0.50000095367431640625f;
    const float c2 = 0.33326041698455810546875f;
    const float c3 = 0.2519190013408660888671875f;
    const float c4 = 0.17541764676570892333984375f;
    const float c5 = 0.3424419462680816650390625f;
    const float c6 = -0.599632322788238525390625f;
    const float c7 = +1.98442304134368896484375f;
    const float c8 = -2.4899270534515380859375f;
    const float c9 = +1.7491014003753662109375f;

    // Horner evaluation in x1 = 1 - reduced, highest coefficient first.
    const float x1 = one - reduced;
    float poly = x1 * c9 + c8;
    poly = x1 * poly + c7;
    poly = x1 * poly + c6;
    poly = x1 * poly + c5;
    poly = x1 * poly + c4;
    poly = x1 * poly + c3;
    poly = x1 * poly + c2;
    poly = x1 * poly + c1;
    poly = x1 * poly + one;

    // Equation was for -(ln(red)/(1-red))
    poly *= -x1;
    poly += (float)(exponent) * ln2;

    if (!exceptional)
        return poly;
    return use_nan ? NaN : neg_inf;
#endif
}
|
||||
|
||||
// Exponential used by the pricing kernels.  The active branch forwards to
// the CUDA __expf intrinsic; the disabled branch keeps a range-reduced
// polynomial implementation for reference.
__device__ static inline float __Expf(const float x_full)
{
#if 1
    return __expf(x_full);
#else
    // ln(2) is split in two parts for the range reduction below.
    const float ln2_part1 = 0.6931457519f;
    const float ln2_part2 = 1.4286067653e-6f;
    const float one_over_ln2 = 1.44269502162933349609375f;

    const float k_real = floor(x_full * one_over_ln2);
    const int k = (int)k_real;

    // Reduced range version of x
    float x = x_full - k_real * ln2_part1;
    x -= k_real * ln2_part2;

    // These coefficients are for e^x in [0, ln(2)]
    const float one = 1.f;
    const float c2 = 0.4999999105930328369140625f;
    const float c3 = 0.166668415069580078125f;
    const float c4 = 4.16539050638675689697265625e-2f;
    const float c5 = 8.378830738365650177001953125e-3f;
    const float c6 = 1.304379315115511417388916015625e-3f;
    const float c7 = 2.7555381529964506626129150390625e-4f;

    // Horner evaluation, highest coefficient first.
    float poly = x * c7 + c6;
    poly = x * poly + c5;
    poly = x * poly + c4;
    poly = x * poly + c3;
    poly = x * poly + c2;
    poly = x * poly + one;
    poly = x * poly + one;

    // Compute 2^k (should differ for float and double, but only floats
    // are handled for now).
    const int fpbias = 127;
    const int InfBits = 0x7f800000;
    int biased_n = k + fpbias;
    const bool overflow = k > fpbias;
    // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0)
    // we've got underflow. -127 * ln(2) -> -88.02. So the most
    // negative float input that doesn't result in zero is like -88.
    const bool underflow = (biased_n <= 0);
    biased_n <<= 23;

    // Reinterpret the shifted exponent as a float and scale by it.
    const float two_to_the_n = __int_as_float(biased_n); //floatbits(biased_n);
    poly *= two_to_the_n;

    if (overflow)
        poly = __int_as_float(InfBits);
    if (underflow)
        poly = 0.0f;
    return poly;
#endif
}
|
||||
|
||||
// Cumulative normal distribution function
|
||||
//
|
||||
// Cumulative normal distribution function, evaluated as a fifth-degree
// polynomial in k = 1 / (1 + 0.2316419 |X|) scaled by the Gaussian density.
__device__
static inline float
CND(float X) {
    const float invSqrt2Pi = 0.39894228040f;
    const float L = fabsf(X);

    const float k = 1.0f / (1.0f + 0.2316419f * L);
    const float k2 = k*k;
    const float k3 = k2*k;
    const float k4 = k2*k2;
    const float k5 = k3*k2;

    float w = (0.31938153f * k - 0.356563782f * k2 + 1.781477937f * k3 +
               -1.821255978f * k4 + 1.330274429f * k5);
    w *= invSqrt2Pi * __Expf(-L * L * .5f);

    // w was computed from |X|; for positive X the complement is returned.
    return (X > 0.f) ? 1.0f - w : w;
}
|
||||
|
||||
// Black-Scholes task kernel: each task prices one contiguous slice of the
// `count` options described by Sa/Xa/Ta/ra/va and writes into result[].
//
// The slice size is count/taskCount rounded UP.  The previous floor
// division left the last count % taskCount options unpriced, and priced
// nothing at all when count < taskCount.  For counts that are a multiple
// of taskCount the slices are unchanged.
__global__
void bs_task( float Sa[], float Xa[], float Ta[],
              float ra[], float va[],
              float result[], int count) {
    if (taskIndex >= taskCount) return;

    const int nTasks = (int)taskCount;
    const int chunk  = (count + nTasks - 1) / nTasks;
    const int first  = min(count, (int)taskIndex * chunk);
    const int last   = min(count, first + chunk);

    for (int i = programIndex + first; i < last; i += programCount) {
        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];

        const float sqrtT = sqrtf(T);
        float d1 = (__Logf(S/X) + (r + v * v * .5f) * T) / (v * sqrtT);
        float d2 = d1 - v * sqrtT;

        result[i] = S * CND(d1) - X * __Expf(-r * T) * CND(d2);
    }
}
|
||||
|
||||
// Device-side entry point for the Black-Scholes benchmark: launches a fixed
// number of bs_task tasks over the inputs and waits for them to finish.
extern "C"
__global__ void
black_scholes_ispc_tasks___export( float Sa[], float Xa[], float Ta[],
                                   float ra[], float va[],
                                   float result[], int count) {
    // Fixed task count; alternatives tried: count/16384, max(64, count/16384)
    int nTasks = 2048;

    launch(nTasks, 1, 1, bs_task)
        (Sa, Xa, Ta, ra, va, result, count);

    cudaDeviceSynchronize();
}
|
||||
// Host-side wrapper: starts the exported kernel with one block of 32
// threads and blocks until the device has finished.
extern "C"
__host__ void
black_scholes_ispc_tasks( float Sa[], float Xa[], float Ta[],
                          float ra[], float va[],
                          float result[], int count) {
    black_scholes_ispc_tasks___export<<<1, 32>>>(Sa, Xa, Ta, ra, va, result, count);
    cudaDeviceSynchronize();
}
|
||||
|
||||
/********/
|
||||
|
||||
|
||||
// Compile-time unrolled loop over j = NBEG, NBEG+STEP, ... (stopping when
// j reaches NEND, handled by the partial specialization).  Each
// instantiation performs the work for one value of j and then chains to
// the next instantiation.
template<int NBEG, int NEND, int STEP>
struct loop
{
    // Leaf payoff for node NBEG: V[j] = max(0, X - S * u^(2j - BINOMIAL_NUM)).
    __device__ static void op1(float V[], const float u, const float X, const float S)
    {
        const float upow = powf(u, (float)(2*NBEG-BINOMIAL_NUM));
        V[NBEG] = max(0.0f, X - S * upow);
        loop<NBEG+STEP,NEND,STEP>::op1(V,u,X,S);
    }

    // One backward-induction step over the first NBEG entries of V.
    __device__ static void op2(float V[], const float Pu, const float disc)
    {
#pragma unroll
        for ( int k = 0; k < NBEG; ++k)
            V[k] = ((1.0f - Pu) * V[k] + Pu * V[k+ 1]) / disc;
        loop<NBEG+STEP,NEND,STEP>::op2(V, Pu,disc);
    }
};
|
||||
|
||||
// Terminating case of the unrolled loop: once the running index equals
// NEND there is nothing left to do, so both operations are empty.
template<int NEND, int STEP>
struct loop<NEND,NEND,STEP>
{
    __device__ static void op1(float V[], const float u, const float X, const float S)
    {
    }

    __device__ static void op2(float V[], const float Pu, const float disc)
    {
    }
};
|
||||
|
||||
// Prices a put option with a BINOMIAL_NUM-step binomial tree: fills V with
// the leaf payoffs, then folds the tree back step by step; the option
// value ends up in V[0].
__device__
static inline float
binomial_put(float S, float X, float T, float r, float v)
{
    float V[BINOMIAL_NUM];

    const float dt = T / BINOMIAL_NUM;
    const float u = exp(v * sqrt(dt));
    const float d = 1.f / u;
    const float disc = exp(r * dt);
    const float Pu = (disc - d) / (u - d);

#if 0 /* slow */
    for ( int j = 0; j < BINOMIAL_NUM; ++j) {
        float upow = powf(u, (float)(2*j-BINOMIAL_NUM));
        V[j] = max(0.0f, X - S * upow);
    }
    for ( int j = BINOMIAL_NUM-1; j >= 0; --j)
        for ( int k = 0; k < j; ++k)
            V[k] = ((1.0f - Pu) * V[k] + Pu * V[k+ 1]) / disc;
#else /* with loop unrolling, stores results in registers */
    // Leaves: j = 0 .. BINOMIAL_NUM-1.
    loop<0,BINOMIAL_NUM,1>::op1(V,u,X,S);
    // Backward induction: j = BINOMIAL_NUM-1 down to 0.
    loop<BINOMIAL_NUM-1, -1, -1>::op2(V, Pu, disc);
#endif

    return V[0];
}
|
||||
|
||||
|
||||
|
||||
// Binomial-model task kernel: each task prices one contiguous slice of the
// `count` options and writes the results into result[].
//
// The slice size is count/taskCount rounded UP.  The previous floor
// division left the last count % taskCount options unpriced, and priced
// nothing at all when count < taskCount.  For counts that are a multiple
// of taskCount the slices are unchanged.
__global__ void
binomial_task( float Sa[], float Xa[],
               float Ta[], float ra[],
               float va[], float result[],
               int count)
{
    // Same guard as bs_task: surplus tasks have no slice to work on.
    if (taskIndex >= taskCount) return;

    const int nTasks = (int)taskCount;
    const int chunk  = (count + nTasks - 1) / nTasks;
    const int first  = min(count, (int)taskIndex * chunk);
    const int last   = min(count, first + chunk);

    for (int i = programIndex + first; i < last; i += programCount) {
        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
        result[i] = binomial_put(S, X, T, r, v);
    }
}
|
||||
|
||||
|
||||
// Device-side entry point for the binomial benchmark: launches a fixed
// number of binomial_task tasks over the inputs and waits for them.
extern "C" __global__ void
binomial_put_ispc_tasks___export( float Sa[], float Xa[],
                                  float Ta[], float ra[],
                                  float va[], float result[],
                                  int count) {
    // Fixed task count; alternatives tried: count/16384, max(64, count/16384)
    int nTasks = 2048;

    launch(nTasks, 1, 1, binomial_task)
        (Sa, Xa, Ta, ra, va, result, count);

    cudaDeviceSynchronize();
}
|
||||
// Host-side wrapper: requests the L1-preferring cache configuration,
// starts the exported kernel with one block of 32 threads and blocks until
// the device has finished.
extern "C"
__host__ void
binomial_put_ispc_tasks( float Sa[], float Xa[], float Ta[],
                         float ra[], float va[],
                         float result[], int count) {
    cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
    binomial_put_ispc_tasks___export<<<1, 32>>>(Sa, Xa, Ta, ra, va, result, count);
    cudaDeviceSynchronize();
}
|
||||
211
examples/portable/options/options.ispc
Normal file
211
examples/portable/options/options.ispc
Normal file
@@ -0,0 +1,211 @@
|
||||
// -*- mode: c++ -*-
|
||||
/*
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "options_defs.h"
|
||||
|
||||
// Cumulative normal distribution function
|
||||
// Cumulative normal distribution function, evaluated as a fifth-degree
// polynomial in k = 1 / (1 + 0.2316419 |X|) scaled by the Gaussian density.
static inline float
CND(float X) {
    const float invSqrt2Pi = 0.39894228040f;
    float L = abs(X);

    float k = 1.0 / (1.0 + 0.2316419 * L);
    float k2 = k*k;
    float k3 = k2*k;
    float k4 = k2*k2;
    float k5 = k3*k2;

    float w = (0.31938153f * k - 0.356563782f * k2 + 1.781477937f * k3 +
               -1.821255978f * k4 + 1.330274429f * k5);
    w *= invSqrt2Pi * exp(-L * L * .5f);

    // w was computed from |X|; for positive X the complement is returned.
    return (X > 0.f) ? 1.0 - w : w;
}
|
||||
|
||||
// Black-Scholes task: each task prices one contiguous slice of the `count`
// options described by Sa/Xa/Ta/ra/va and writes into result[].
//
// The slice size is count/taskCount rounded UP.  The previous floor
// division left the last count % taskCount options unpriced, and priced
// nothing at all when count < taskCount.  For counts that are a multiple
// of taskCount the slices are unchanged.
task void
bs_task(uniform float Sa[], uniform float Xa[], uniform float Ta[],
        uniform float ra[], uniform float va[],
        uniform float result[], uniform int count) {
    uniform int chunk = (count + taskCount - 1) / taskCount;
    uniform int first = min(count, taskIndex * chunk);
    uniform int last  = min(count, first + chunk);

    foreach (i = first ... last) {
        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];

        float sqrtT = sqrt(T);
        float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrtT);
        float d2 = d1 - v * sqrtT;

        result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2);
    }
}
|
||||
|
||||
// Exported entry point: prices all options with the Black-Scholes model by
// launching a fixed number of bs_task tasks.
export void
black_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float Ta[],
                         uniform float ra[], uniform float va[],
                         uniform float result[], uniform int count) {
    // Fixed task count; alternatives tried: count/16384, max(64, count/16384)
    uniform int nTasks = 2048;

    launch[nTasks] bs_task(Sa, Xa, Ta, ra, va, result, count);
}
|
||||
|
||||
/********/
|
||||
|
||||
|
||||
// Exported entry point: prices all `count` options with the Black-Scholes
// model in a single foreach loop, without launching tasks.
export void
black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
                   uniform float ra[], uniform float va[],
                   uniform float result[], uniform int count) {
    foreach (i = 0 ... count) {
        float S = Sa[i];
        float X = Xa[i];
        float T = Ta[i];
        float r = ra[i];
        float v = va[i];

        float sqrtT = sqrt(T);
        float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrtT);
        float d2 = d1 - v * sqrtT;

        result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2);
    }
}
|
||||
|
||||
|
||||
// Prices a put option with a BINOMIAL_NUM-step binomial tree: fills V with
// the leaf payoffs, then folds the tree back step by step; the option
// value ends up in V[0].  CPU targets use plain loops; the NVPTX target
// uses a hand-unrolled variant that is hard-wired to BINOMIAL_NUM == 64.
static inline float
binomial_put(float S, float X, float T, float r, float v) {
    float V[BINOMIAL_NUM];

    float dt = T / BINOMIAL_NUM;
    float u = exp(v * sqrt(dt));
    float d = 1. / u;
    float disc = exp(r * dt);
    float Pu = (disc - d) / (u - d);

#ifndef __NVPTX__

    // Leaf payoffs.
    for (uniform int j = 0; j < BINOMIAL_NUM; ++j) {
        float upow = pow(u, (float)(2*j-BINOMIAL_NUM));
        V[j] = max(0., X - S * upow);
    }
    // Backward induction.
    for (uniform int j = BINOMIAL_NUM-1; j >= 0; --j)
        for (uniform int k = 0; k < j; ++k)
            V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc;

#else

    /* loop unrolling helps NVVM to place V -> registers therefore boosting performance */
    /* takes a long time to compile... */
#if BINOMIAL_NUM != 64
#error "Cannot unroll. Please use generic version above"
#endif

    /* first loop: leaf payoffs for j = 0 .. 63 */

#define BP_LEAF(j) { \
    float upow = pow(u, (float)(2*(j)-BINOMIAL_NUM)); \
    V[j] = max(0., X - S * upow); }
#define BP_LEAF10(base) \
    BP_LEAF(base+0); BP_LEAF(base+1); BP_LEAF(base+2); BP_LEAF(base+3); BP_LEAF(base+4); \
    BP_LEAF(base+5); BP_LEAF(base+6); BP_LEAF(base+7); BP_LEAF(base+8); BP_LEAF(base+9);
    BP_LEAF10(0)
    BP_LEAF10(10)
    BP_LEAF10(20)
    BP_LEAF10(30)
    BP_LEAF10(40)
    BP_LEAF10(50)
    BP_LEAF(60)
    BP_LEAF(61)
    BP_LEAF(62)
    BP_LEAF(63)
#undef BP_LEAF10
#undef BP_LEAF

    /* second loop: backward induction for j = 63 down to 0 */

#define BP_FOLD(j) {\
    for (uniform int k = 0; k < (j); ++k) \
        V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc; }
#define BP_FOLD10(base) \
    BP_FOLD(base+9); BP_FOLD(base+8); BP_FOLD(base+7); BP_FOLD(base+6); BP_FOLD(base+5); \
    BP_FOLD(base+4); BP_FOLD(base+3); BP_FOLD(base+2); BP_FOLD(base+1); BP_FOLD(base+0);
    BP_FOLD(63)
    BP_FOLD(62)
    BP_FOLD(61)
    BP_FOLD(60)
    BP_FOLD10(50)
    BP_FOLD10(40)
    BP_FOLD10(30)
    BP_FOLD10(20)
    BP_FOLD10(10)
    BP_FOLD10(0)
#undef BP_FOLD10
#undef BP_FOLD

#endif
    return V[0];
}
|
||||
|
||||
|
||||
// Exported entry point: prices all `count` options with the binomial model
// in a single foreach loop, without launching tasks.
export void
binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
                  uniform float ra[], uniform float va[],
                  uniform float result[], uniform int count) {
    foreach (i = 0 ... count) {
        float S = Sa[i];
        float X = Xa[i];
        float T = Ta[i];
        float r = ra[i];
        float v = va[i];
        result[i] = binomial_put(S, X, T, r, v);
    }
}
|
||||
|
||||
|
||||
// Binomial-model task: each task prices one contiguous slice of the
// `count` options and writes the results into result[].
//
// The slice size is count/taskCount rounded UP.  The previous floor
// division left the last count % taskCount options unpriced, and priced
// nothing at all when count < taskCount.  For counts that are a multiple
// of taskCount the slices are unchanged.
task void
binomial_task(uniform float Sa[], uniform float Xa[],
              uniform float Ta[], uniform float ra[],
              uniform float va[], uniform float result[],
              uniform int count) {
    uniform int chunk = (count + taskCount - 1) / taskCount;
    uniform int first = min(count, taskIndex * chunk);
    uniform int last  = min(count, first + chunk);

    foreach (i = first ... last) {
        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
        result[i] = binomial_put(S, X, T, r, v);
    }
}
|
||||
|
||||
|
||||
// Exported entry point: prices all options with the binomial model by
// launching a fixed number of binomial_task tasks.
export void
binomial_put_ispc_tasks(uniform float Sa[], uniform float Xa[],
                        uniform float Ta[], uniform float ra[],
                        uniform float va[], uniform float result[],
                        uniform int count) {
    // Fixed task count; alternatives tried: count/16384, max(64, count/16384)
    uniform int nTasks = 2048;

    launch[nTasks] binomial_task(Sa, Xa, Ta, ra, va, result, count);
}
|
||||
40
examples/portable/options/options_defs.h
Normal file
40
examples/portable/options/options_defs.h
Normal file
@@ -0,0 +1,40 @@
|
||||
/*
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef OPTIONS_DEFS_H
|
||||
#define OPTIONS_DEFS_H 1
|
||||
|
||||
#define BINOMIAL_NUM 64
|
||||
|
||||
|
||||
#endif // OPTIONS_DEFS_H
|
||||
9
examples/portable/radixSort/Makefile_cpu
Normal file
9
examples/portable/radixSort/Makefile_cpu
Normal file
@@ -0,0 +1,9 @@
|
||||
|
||||
EXAMPLE=radixSort
|
||||
CPP_SRC=radixSort.cpp
|
||||
ISPC_SRC=radixSort.ispc
|
||||
ISPC_IA_TARGETS=avx1-i32x8
|
||||
ISPC_ARM_TARGETS=neon
|
||||
#ISPC_FLAGS=-DDEBUG -g
|
||||
|
||||
include ../common_cpu.mk
|
||||
7
examples/portable/radixSort/Makefile_knc
Normal file
7
examples/portable/radixSort/Makefile_knc
Normal file
@@ -0,0 +1,7 @@
|
||||
EXAMPLE=radixSort
|
||||
CXX_SRC=radixSort.cpp
|
||||
ISPC_SRC=radixSort.ispc
|
||||
ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h
|
||||
ISPC_TARGET=generic-16
|
||||
|
||||
include ../common_knc.mk
|
||||
15
examples/portable/radixSort/Makefile_ptx
Normal file
15
examples/portable/radixSort/Makefile_ptx
Normal file
@@ -0,0 +1,15 @@
|
||||
PROG=radixSort
|
||||
ISPC_SRC=radixSort.ispc
|
||||
|
||||
CU_SRC=radixSort.cu
|
||||
# NVCC_FLAGS=-Xptxas=-O1
|
||||
# host-side C++ source of the example
CXX_SRC=radixSort.cpp
|
||||
PTXCC_REGMAX=64
|
||||
|
||||
LLVM_GPU=1
|
||||
NVVM_GPU=1
|
||||
|
||||
include ../common_ptx.mk
|
||||
|
||||
|
||||
|
||||
154
examples/portable/radixSort/radixSort.cpp
Normal file
154
examples/portable/radixSort/radixSort.cpp
Normal file
@@ -0,0 +1,154 @@
|
||||
/*
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include <iomanip>
|
||||
#include "timing.h"
|
||||
#include "ispc_malloc.h"
|
||||
#include "radixSort_ispc.h"
|
||||
|
||||
// Prints a one-line text progress bar for step `x` out of `n` steps.
// The bar is `width` cells wide inside square brackets, with the percentage
// written over its middle.  The line is terminated with '\r' so that the next
// call overwrites it; only the last step (x == n-1) terminates with '\n'.
static void progressBar(const int x, const int n, const int width = 50)
{
  assert(n > 1);
  assert(x >= 0 && x < n);
  assert(width > 10);

  const float fraction = static_cast<float>(x)/(n-1);
  const int   nFilled  = static_cast<int>(fraction * width);

  // frame, then mark the completed cells
  std::string bar(width + 2, ' ');
  bar[0]         = '[';
  bar[width + 1] = ']';
  for (int cell = 0; cell < width; cell++)
    if (cell < nFilled)
      bar[cell + 1] = '=';

  // overlay the percentage label close to the centre of the bar
  char labelBuf[32];
  sprintf(labelBuf, " %2d %c ", static_cast<int>(fraction*100.0),'%');
  const std::string label(labelBuf);
  bar.replace(width/2-2, label.size(), label);

  const bool lastStep = (x == n-1);
  std::cout << bar << (lastStep ? "\n" : "\r") << std::flush;
}
|
||||
|
||||
// Key/value record that gets sorted.  It consists of two 32-bit fields; main()
// passes an array of these to ispc::radixSort as an array of int64_t.
struct Key
{
  int32_t key;
  int32_t val;
};
|
||||
|
||||
int main (int argc, char *argv[])
|
||||
{
|
||||
int i, j, n = argc == 1 ? 1000000 : atoi(argv[1]), m = n < 100 ? 1 : 50, l = n < 100 ? n : RAND_MAX;
|
||||
double tISPC1 = 0.0, tISPC2 = 0.0, tSerial = 0.0;
|
||||
Key *keys = new Key [n];
|
||||
Key *keys_orig = new Key [n];
|
||||
unsigned int *keys_gold = new unsigned int [n];
|
||||
|
||||
srand48(rtc()*65536);
|
||||
|
||||
int sortBits = 32;
|
||||
assert(sortBits <= 32);
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int i = 0; i < n; i++)
|
||||
{
|
||||
keys[i].key = ((int)(drand48() * (1<<30))) & ((1ULL << sortBits) - 1);
|
||||
keys[i].val = i;
|
||||
}
|
||||
|
||||
std::random_shuffle(keys, keys + n);
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int i = 0; i < n; i++)
|
||||
{
|
||||
keys_gold[i] = keys[i].key;
|
||||
keys_orig[i] = keys[i];
|
||||
}
|
||||
|
||||
ispcSetMallocHeapLimit(1024*1024*1024);
|
||||
|
||||
ispc::radixSort_alloc(n);
|
||||
|
||||
tISPC2 = 1e30;
|
||||
for (i = 0; i < m; i ++)
|
||||
{
|
||||
ispcMemcpy(keys, keys_orig, n*sizeof(Key));
|
||||
reset_and_start_timer();
|
||||
ispc::radixSort(n, (int64_t*)keys, sortBits);
|
||||
tISPC2 = std::min(tISPC2, get_elapsed_msec());
|
||||
if (argc != 3)
|
||||
progressBar (i, m);
|
||||
}
|
||||
|
||||
ispc::radixSort_free();
|
||||
|
||||
printf("[sort ispc + tasks]:\t[%.3f] msec [%.3f Mpair/s]\n", tISPC2, 1.0e-3*n/tISPC2);
|
||||
|
||||
std::sort(keys_gold, keys_gold + n);
|
||||
for (int i = 0; i < n; i++)
|
||||
assert(keys[i].key == keys_gold[i]);
|
||||
|
||||
|
||||
#if 0
|
||||
for (i = 0; i < m; i ++)
|
||||
{
|
||||
ispcMemcpy(code, code_orig, n*sizeof(unsigned int));
|
||||
|
||||
reset_and_start_timer();
|
||||
|
||||
sort_serial (n, code, order);
|
||||
|
||||
tSerial += get_elapsed_msec();
|
||||
|
||||
if (argc != 3)
|
||||
progressBar (i, m);
|
||||
}
|
||||
|
||||
printf("[sort serial]:\t\t[%.3f] msec [%.3f Mpair/s]\n", tSerial, 1.0e-3*n*m/tSerial);
|
||||
|
||||
#ifndef _CUDA_
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", tSerial/tISPC1, tSerial/tISPC2);
|
||||
#else
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", tSerial/tISPC2);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
delete keys;
|
||||
delete keys_orig;
|
||||
delete keys_gold;
|
||||
return 0;
|
||||
}
|
||||
401
examples/portable/radixSort/radixSort.cu
Normal file
401
examples/portable/radixSort/radixSort.cu
Normal file
@@ -0,0 +1,401 @@
|
||||
/*
|
||||
Copyright (c) 2014, Evghenii Gaburov
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
/*
|
||||
Based on radixSort from http://www.moderngpu.com
|
||||
*/
|
||||
|
||||
#include "cuda_helpers.cuh"
|
||||
#include <cassert>
|
||||
|
||||
#define NUMBITS 8
|
||||
#define NUMDIGITS (1<<NUMBITS)
|
||||
|
||||
typedef long long Key;
|
||||
|
||||
/* CUDA counterpart of the ISPC atomic_add_global(): forwards to atomicAdd()
   and hands back whatever atomicAdd() returns. */
__forceinline__ __device__ int atomic_add_global(int* ptr, int value)
{
  const int fetched = atomicAdd(ptr, value);
  return fetched;
}
|
||||
|
||||
/*
  One step of a warp-wide add-scan, written as inline PTX.
    partial   : this lane's running sum
    up_offset : how many lanes below to fetch the other running sum from
  shfl.up places the fetched value in r0 and sets predicate p; when p is set
  this lane's own partial is added to r0.  r0 is returned.
  NOTE(review): per the PTX ISA, p is false when the source lane is out of
  range and r0 then holds the lane's own input, i.e. such lanes return
  partial unchanged - confirm against the shfl documentation.
*/
static __device__ __forceinline__ int shfl_scan_add_step(int partial, int up_offset)
|
||||
{
|
||||
int result;
|
||||
asm(
|
||||
"{.reg .u32 r0;"
|
||||
".reg .pred p;"
|
||||
"shfl.up.b32 r0|p, %1, %2, 0;"
|
||||
"@p add.u32 r0, r0, %3;"
|
||||
"mov.u32 %0, r0;}"
|
||||
/* operands: %0 = result, %1 = partial, %2 = up_offset, %3 = partial */
: "=r"(result) : "r"(partial), "r"(up_offset), "r"(partial));
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Exclusive add-scan of `value`: five shfl_scan_add_step() rounds with lane
   offsets 1, 2, 4, 8 and 16 accumulate a sum that includes the lane's own
   value, which is then subtracted again. */
__forceinline__ __device__ int exclusive_scan_add(int value)
{
  int inclusive = value;
#pragma unroll
  for (int offset = 1; offset < 32; offset <<= 1)
    inclusive = shfl_scan_add_step(inclusive, offset);
  return inclusive - value;
}
|
||||
|
||||
/*
  Histogram pass.  Each task works on a contiguous slice of
  ceil(numElements/taskCount) keys: it copies the slice into sortedAll, counts
  the NUMBITS-wide digit found at bit position `bit` of every key into the
  task's own row of countsAll, and finally adds that row to countsGlobal.
*/
__global__
void countPass(
    const Key keysAll[],
    Key sortedAll[],
    const int bit,
    const int numElements,
    int countsAll[],
    int countsGlobal[])
{
  const int task   = taskIndex;
  const int nTasks = taskCount;
  const int chunk  = (numElements + nTasks - 1) / nTasks;
  const int first  = task*chunk;
  const int nloc   = min(numElements - first, chunk);

  const int digitMask = (1 << NUMBITS) - 1;

  const Key *src  = keysAll   + first;
  Key       *dst  = sortedAll + first;
  int       *hist = countsAll + task*NUMDIGITS;

  /* clear this task's histogram row */
#pragma unroll 8
  for (int d = programIndex; d < NUMDIGITS; d += programCount)
    hist[d] = 0;

  /* copy the keys and count their digits */
  for (int i = programIndex; i < nloc; i += programCount)
  {
    const Key k = src[i];
    dst[i] = k;
    const int d = digitMask & ((unsigned int)k >> bit);
    atomic_add_global(&hist[d], 1);
  }

  /* fold the row into the global histogram */
#pragma unroll 8
  for (int d = programIndex; d < NUMDIGITS; d += programCount)
    atomic_add_global(&countsGlobal[d], hist[d]);
}
|
||||
|
||||
/*
  Scatter pass.  Each task takes a contiguous slice of
  ceil(numElements/taskCount) keys from keysAll and writes every key to
  sorted[] at the next free offset of the key's digit (the NUMBITS-wide field
  at bit position `bit`).  The starting offsets of the task are read from its
  row of digitOffsetsAll.
*/
__global__
|
||||
void sortPass(
|
||||
Key keysAll[],
|
||||
Key sorted[],
|
||||
int bit,
|
||||
int numElements,
|
||||
int digitOffsetsAll[])
|
||||
{
|
||||
const int blkIdx = taskIndex;
|
||||
const int numBlocks = taskCount;
|
||||
|
||||
const int blkDim = (numElements + numBlocks - 1) / numBlocks;
|
||||
|
||||
|
||||
const int keyIndex = blkIdx * blkDim;
|
||||
Key * keys = keysAll + keyIndex;
|
||||
|
||||
|
||||
const int nloc = min(numElements - keyIndex, blkDim);
|
||||
|
||||
const int mask = (1 << NUMBITS) - 1;
|
||||
|
||||
/* copy this task's digit offsets from global memory into a NUMDIGITS-wide
   slice of shared memory selected by warpIdx (the buffer holds four slices) */
|
||||
#if 1
|
||||
__shared__ int digitOffsets_sh[NUMDIGITS*4];
|
||||
volatile int *digitOffsets = digitOffsets_sh + warpIdx*NUMDIGITS;
|
||||
for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
|
||||
digitOffsets[digit] = digitOffsetsAll[blkIdx*NUMDIGITS + digit];
|
||||
#else
|
||||
int *digitOffsets = &digitOffsetsAll[blkIdx*NUMDIGITS];
|
||||
#endif
|
||||
|
||||
|
||||
for (int i = programIndex; i < nloc; i += programCount)
|
||||
if (i < nloc)
|
||||
{
|
||||
const int key = mask & ((unsigned int)keys[i] >> bit);
|
||||
int scatter;
|
||||
/* not a vector friendly loop */
|
||||
/* lanes take turns (one per iteration of iv) to fetch-and-increment the
   offset of their digit, so lanes holding the same digit get distinct slots */
#pragma unroll 1 /* needed, otherwise compiler unroll and optimizes the result :S */
|
||||
for (int iv = 0; iv < programCount; iv++)
|
||||
if (programIndex == iv)
|
||||
scatter = digitOffsets[key]++;
|
||||
sorted [scatter] = keys[i];
|
||||
}
|
||||
}
|
||||
|
||||
/*
  First phase of the per-block offset scan.  The three arrays are treated as
  tables with NUMDIGITS columns.  Each task handles a contiguous group of
  ceil(numBlocks/taskCount) rows: for every digit it writes the running sum of
  the countsAll rows of the group into excScanAll (each row receives the sum
  of the rows before it) and stores the group's total in row taskIndex of
  partialSumAll.  Only the group starting at row 0 seeds its running sum with
  the value already present in row 0 of excScanAll.
*/
__global__
void partialScanLocal(
    int numBlocks,
    int excScanAll[],
    int countsAll[],
    int partialSumAll[])
{
  typedef int DigitRow[NUMDIGITS];
  DigitRow *rowCounts  = (DigitRow *)countsAll;
  DigitRow *rowExcScan = (DigitRow *)excScanAll;
  DigitRow *rowPartial = (DigitRow *)partialSumAll;

  const int task        = taskIndex;
  const int rowsPerTask = (numBlocks+taskCount-1)/taskCount;
  const int rowBegin    = task * rowsPerTask;
  const int rowEnd      = min(rowBegin + rowsPerTask, numBlocks);

#pragma unroll 8
  for (int d = programIndex; d < NUMDIGITS; d += programCount)
  {
    int running = 0;
    if (rowBegin == 0)
      running = rowExcScan[0][d];

    for (int row = rowBegin; row < rowEnd; row++)
    {
      const int c = rowCounts[row][d];
      rowExcScan[row][d] = running;
      running += c;
    }

    /* total of the group = last exclusive value + last count */
    rowPartial[task][d] = rowExcScan[rowEnd-1][d] + rowCounts[rowEnd-1][d];
  }
}
|
||||
|
||||
/*
  Second phase of the offset scan: one task per digit (digit = taskIndex).
  The lanes walk the digit's column of partialSumAll (numBlocks rows,
  NUMDIGITS columns) in batches of programCount rows and write the exclusive
  prefix sum of the column to the same column of prefixSumAll.
*/
__global__
void partialScanGlobal(
    const int numBlocks,
    int partialSumAll[],
    int prefixSumAll[])
{
  typedef int DigitRow[NUMDIGITS];
  DigitRow *rowPartial = (DigitRow *)partialSumAll;
  DigitRow *rowPrefix  = (DigitRow *)prefixSumAll;

  const int d = taskIndex;
  int running = 0;  /* sum of all previous batches */
  for (int row = programIndex; row < numBlocks; row += programCount)
  {
    const int v      = rowPartial[row][d];
    const int before = exclusive_scan_add(v);
    rowPrefix[row][d] = before + running;
    /* the last lane holds the total of the batch */
    running += __shfl(before+v, programCount-1);
  }
}
|
||||
|
||||
/*
  Last phase of the offset scan.  Each task owns the same contiguous group of
  ceil(numBlocks/taskCount) rows as in partialScanLocal and adds, per digit,
  the value found in row taskIndex of carryValueAll to every excScanAll row of
  its group.
*/
__global__
void completeScanGlobal(
    int numBlocks,
    int excScanAll[],
    int carryValueAll[])
{
  typedef int DigitRow[NUMDIGITS];
  DigitRow *rowExcScan = (DigitRow *)excScanAll;
  DigitRow *rowCarry   = (DigitRow *)carryValueAll;

  const int task        = taskIndex;
  const int rowsPerTask = (numBlocks+taskCount-1)/taskCount;
  const int rowBegin    = task * rowsPerTask;
  const int rowEnd      = min(rowBegin + rowsPerTask, numBlocks);

#pragma unroll 8
  for (int d = programIndex; d < NUMDIGITS; d += programCount)
  {
    const int add = rowCarry[task][d];
    for (int row = rowBegin; row < rowEnd; row++)
      rowExcScan[row][d] += add;
  }
}
|
||||
|
||||
/*
  Runs the three scan phases one after another, waiting for each launch to
  finish: partialScanLocal and completeScanGlobal use numBlocks/8 tasks,
  partialScanGlobal uses one task per digit.
*/
__device__ static
inline void radixExclusiveScan(
    const int numBlocks,
    int excScanPtr[],
    int countsPtr[],
    int partialSum[],
    int prefixSum[])
{
  const int scale   = 8;
  const int nGroups = numBlocks/scale;

  launch (nGroups, 1,1, partialScanLocal)(numBlocks, excScanPtr, countsPtr, partialSum);
  sync;

  launch (NUMDIGITS,1,1,partialScanGlobal) (nGroups, partialSum, prefixSum);
  sync;

  launch (nGroups,1,1, completeScanGlobal) (numBlocks, excScanPtr, prefixSum);
  sync;
}
|
||||
|
||||
/* Device-side state shared by the radixSort_* kernels below. */
/* single int allocation made by radixSort_alloc___export; the six pointers
   further down point into it */
__device__ static int * memoryPool = NULL;
|
||||
/* number of tasks launched for the count and sort passes */
__device__ static int numBlocks;
|
||||
/* element counts of the six sub-arrays of memoryPool */
__device__ static int nSharedCounts;
|
||||
__device__ static int nCountsGlobal;
|
||||
__device__ static int nExcScan;
|
||||
__device__ static int nCountsBlock;
|
||||
__device__ static int nPartialSum;
|
||||
__device__ static int nPrefixSum;
|
||||
|
||||
/* sub-arrays of memoryPool, laid out in this order */
__device__ static int * sharedCounts;
|
||||
__device__ static int * countsGlobal;
|
||||
__device__ static int * excScan;
|
||||
__device__ static int * counts;
|
||||
__device__ static int * partialSum;
|
||||
__device__ static int * prefixSum;
|
||||
|
||||
/* number of keys bufKeys was allocated for; 0 means bufKeys is not allocated */
__device__ static int numElementsBuf = 0;
|
||||
/* second key buffer: countPass copies the keys into it, sortPass reads from it */
__device__ static Key * bufKeys;
|
||||
|
||||
/*
  Device-side set-up: fixes numBlocks, computes the sizes of the six work
  arrays, allocates them as one block (memoryPool, by lane 0) and sets the six
  array pointers to consecutive ranges of that block.  The argument n is not
  used.
*/
__global__
void radixSort_alloc___export(const int n)
{
  assert(memoryPool == NULL);

  numBlocks = 13*32*4;

  const int perBlockTable = NUMDIGITS*numBlocks;
  nSharedCounts = perBlockTable;
  nCountsGlobal = NUMDIGITS;
  nExcScan      = perBlockTable;
  nCountsBlock  = perBlockTable;
  nPartialSum   = perBlockTable;
  nPrefixSum    = perBlockTable;

  const int nalloc = nSharedCounts + nCountsGlobal + nExcScan +
                     nCountsBlock + nPartialSum + nPrefixSum;

  if (programIndex == 0)
    memoryPool = new int[nalloc];

  /* hand out consecutive ranges of the pool */
  int *cursor = memoryPool;
  sharedCounts = cursor;  cursor += nSharedCounts;
  countsGlobal = cursor;  cursor += nCountsGlobal;
  excScan      = cursor;  cursor += nExcScan;
  counts       = cursor;  cursor += nCountsBlock;
  partialSum   = cursor;  cursor += nPartialSum;
  prefixSum    = cursor;
}
|
||||
|
||||
extern "C"
|
||||
void radixSort_alloc(const int n)
|
||||
{
|
||||
radixSort_alloc___export<<<1,32>>>(n);
|
||||
sync;
|
||||
}
|
||||
|
||||
|
||||
/*
  Releases the scratch key buffer, if one is allocated (numElementsBuf > 0),
  and marks it as absent.  Only lane 0 performs the deallocation.
*/
__device__ static
void radixSort_freeBufKeys()
{
  if (numElementsBuf > 0)
  {
    /* bufKeys is created with new Key[...], so it needs the array form of delete */
    if (programIndex == 0)
      delete[] bufKeys;
    numElementsBuf = 0;
  }
}
|
||||
|
||||
/*
  Device-side tear-down: frees the work-array pool (lane 0 only), resets
  memoryPool to NULL and releases the scratch key buffer.
*/
__global__ void radixSort_free___export()
{
  assert(memoryPool != NULL);
  /* memoryPool is created with new int[...], so it needs the array form of delete */
  if (programIndex == 0)
    delete[] memoryPool;
  memoryPool = NULL;

  radixSort_freeBufKeys();
}
|
||||
extern "C"
|
||||
void radixSort_free()
|
||||
{
|
||||
radixSort_free___export<<<1,32>>>();
|
||||
sync;
|
||||
}
|
||||
|
||||
/*
  Device-side sort driver.  Sorts keys[0..numElements) by their lowest nBits
  bits, NUMBITS bits per round.  Every round: clear the global histogram,
  run countPass, scan the global histogram into the first NUMDIGITS entries of
  excScan, run radixExclusiveScan to obtain the per-task offsets, and run
  sortPass to scatter the keys from bufKeys back into keys.
*/
__global__ void radixSort___export(
    const int numElements,
    Key keys[],
    const int nBits)
{
#ifdef __NVPTX__
  assert((numBlocks & 3) == 0);  /* task granularity on Kepler is 4 */
#endif

  /* make sure the scratch key buffer exists and is large enough */
  if (numElementsBuf < numElements)
    radixSort_freeBufKeys();
  if (numElementsBuf == 0)
  {
    numElementsBuf = numElements;
    if (programIndex == 0)
      bufKeys = new Key[numElementsBuf];
  }

  for (int shift = 0; shift < nBits; shift += NUMBITS)
  {
    /* initialize histogram for each digit */
    for (int d = programIndex; d < NUMDIGITS; d += programCount)
      countsGlobal[d] = 0;

    /* compute histogram for each digit */
    launch (numBlocks,1,1, countPass)(keys, bufKeys, shift, numElements, counts, countsGlobal);
    sync;

    /* exclusive scan on global histogram */
    int running = 0;
    excScan[0] = 0;
#pragma unroll 8
    for (int d = programIndex; d < NUMDIGITS; d += programCount)
    {
      const int cnt    = countsGlobal[d];
      const int before = exclusive_scan_add(cnt);
      excScan[d] = before + running;
      running += __shfl(before+cnt, programCount-1);
    }

    /* computing offsets for each digit */
    radixExclusiveScan(numBlocks, excScan, counts, partialSum, prefixSum);

    /* sorting */
    launch (numBlocks,1,1, sortPass)(bufKeys, keys, shift, numElements, excScan);
    sync;
  }
}
|
||||
|
||||
extern "C"
|
||||
void radixSort(
|
||||
const int numElements,
|
||||
Key keys[],
|
||||
const int nBits)
|
||||
{
|
||||
cudaDeviceSetCacheConfig ( cudaFuncCachePreferEqual );
|
||||
radixSort___export<<<1,32>>>(numElements, keys, nBits);
|
||||
sync;
|
||||
}
|
||||
337
examples/portable/radixSort/radixSort.ispc
Normal file
337
examples/portable/radixSort/radixSort.ispc
Normal file
@@ -0,0 +1,337 @@
|
||||
/*
|
||||
Copyright (c) 2014, Evghenii Gaburov
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
/*
|
||||
Based on radixSort from http://www.moderngpu.com
|
||||
*/
|
||||
|
||||
#define NUMBITS 8
|
||||
#define NUMDIGITS (1<<NUMBITS)
|
||||
|
||||
typedef int64 Key;
|
||||
|
||||
/*
  Histogram pass.  Each task works on a contiguous slice of
  ceil(numElements/taskCount) keys: it copies the slice into sortedAll, counts
  the NUMBITS-wide digit found at bit position `bit` of every key into the
  task's own row of countsAll, and finally adds that row to countsGlobal.
*/
task
void countPass(
    const uniform Key keysAll[],
    uniform Key sortedAll[],
    const uniform int bit,
    const uniform int numElements,
    uniform int countsAll[],
    uniform int countsGlobal[])
{
  const uniform int task   = taskIndex;
  const uniform int nTasks = taskCount;
  const uniform int chunk  = (numElements + nTasks - 1) / nTasks;
  const uniform int first  = task*chunk;
  const uniform int nloc   = min(numElements - first, chunk);

  const uniform int digitMask = (1 << NUMBITS) - 1;

  const uniform Key * uniform src  = keysAll   + first;
  uniform Key * uniform       dst  = sortedAll + first;
  uniform int * uniform       hist = countsAll + task*NUMDIGITS;

  /* clear this task's histogram row */
  foreach (d = 0 ... NUMDIGITS)
    hist[d] = 0;

  /* copy the keys and count their digits */
  foreach (i = 0 ... nloc)
  {
    const Key k = src[i];
    dst[i] = k;
    const int d = digitMask & ((unsigned int)k >> bit);
#ifdef __NVPTX__
    atomic_add_global(&hist[d], 1);
#else
    atomic_add_local(&hist[d], 1);
#endif
  }

  /* fold the row into the global histogram */
  foreach (d = 0 ... NUMDIGITS)
    atomic_add_global(&countsGlobal[d], hist[d]);
}
|
||||
|
||||
/*
  Scatter pass.  Each task takes a contiguous slice of
  ceil(numElements/taskCount) keys from keysAll and writes every key to
  sorted[] at the next free offset of the key's digit (the NUMBITS-wide field
  at bit position `bit`).  The starting offsets of the task come from its row
  of digitOffsetsAll.
*/
task
void sortPass(
    uniform Key keysAll[],
    uniform Key sorted[],
    uniform int bit,
    uniform int numElements,
    uniform int digitOffsetsAll[])
{
  const uniform int task   = taskIndex;
  const uniform int nTasks = taskCount;
  const uniform int chunk  = (numElements + nTasks - 1) / nTasks;
  const uniform int first  = task * chunk;
  const uniform int nloc   = min(numElements - first, chunk);

  const uniform int digitMask = (1 << NUMBITS) - 1;

  uniform Key * uniform src = keysAll + first;

  /* work on a task-local copy of the digit offsets */
#if 1
  uniform int digitOffsets[NUMDIGITS];
  foreach (d = 0 ... NUMDIGITS)
    digitOffsets[d] = digitOffsetsAll[task*NUMDIGITS + d];
#else
  uniform int * uniform digitOffsets = &digitOffsetsAll[task*NUMDIGITS];
#endif

  foreach (i = 0 ... nloc)
  {
    const Key k = src[i];
    const int d = digitMask & ((unsigned int)k >> bit);
    int dest;
    /* one program instance at a time, so equal digits get distinct slots */
    foreach_active(lane)
      dest = digitOffsets[d]++;
    sorted[dest] = k;
  }
}
|
||||
|
||||
/*
  First phase of the per-block offset scan.  The three arrays are treated as
  tables with NUMDIGITS columns.  Each task handles a contiguous group of
  ceil(numBlocks/taskCount) rows: for every digit it writes the running sum of
  the countsAll rows of the group into excScanAll (each row receives the sum
  of the rows before it) and stores the group's total in row taskIndex of
  partialSumAll.  Only the group starting at row 0 seeds its running sum with
  the value already present in row 0 of excScanAll.
*/
task
void partialScanLocal(
    uniform int numBlocks,
    uniform int excScanAll[],
    uniform int countsAll[],
    uniform int partialSumAll[])
{
  uniform int (* uniform rowCounts)[NUMDIGITS]  = (uniform int (*)[NUMDIGITS])countsAll;
  uniform int (* uniform rowExcScan)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll;
  uniform int (* uniform rowPartial)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])partialSumAll;

  const uniform int task        = taskIndex;
  const uniform int rowsPerTask = (numBlocks+taskCount-1)/taskCount;
  const uniform int rowBegin    = task * rowsPerTask;
  const uniform int rowEnd      = min(rowBegin + rowsPerTask, numBlocks);

  foreach (d = 0 ... NUMDIGITS)
  {
    int running = 0;
    if (rowBegin == 0)
      running = rowExcScan[0][d];

    for (uniform int row = rowBegin; row < rowEnd; row++)
    {
      const int c = rowCounts[row][d];
      rowExcScan[row][d] = running;
      running += c;
    }

    /* total of the group = last exclusive value + last count */
    rowPartial[task][d] = rowExcScan[rowEnd-1][d] + rowCounts[rowEnd-1][d];
  }
}
|
||||
|
||||
/*
  Second phase of the offset scan: one task per digit (digit = taskIndex).
  The program instances walk the digit's column of partialSumAll (numBlocks
  rows, NUMDIGITS columns) and write the exclusive prefix sum of the column to
  the same column of prefixSumAll.
*/
task
void partialScanGlobal(
    const uniform int numBlocks,
    uniform int partialSumAll[],
    uniform int prefixSumAll[])
{
  uniform int (* uniform rowPartial)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])partialSumAll;
  uniform int (* uniform rowPrefix)[NUMDIGITS]  = (uniform int (*)[NUMDIGITS]) prefixSumAll;

  const uniform int d = taskIndex;
  int running = 0;  /* sum of all previous batches */
  foreach (row = 0 ... numBlocks)
  {
    const int v      = rowPartial[row][d];
    const int before = exclusive_scan_add(v);
    rowPrefix[row][d] = before + running;
    /* the last program instance holds the total of the batch */
    running += broadcast(before+v, programCount-1);
  }
}
|
||||
|
||||
/*
  Last phase of the offset scan.  Each task owns the same contiguous group of
  ceil(numBlocks/taskCount) rows as in partialScanLocal and adds, per digit,
  the value found in row taskIndex of carryValueAll to every excScanAll row of
  its group.
*/
task
void completeScanGlobal(
    uniform int numBlocks,
    uniform int excScanAll[],
    uniform int carryValueAll[])
{
  uniform int (* uniform rowExcScan)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll;
  uniform int (* uniform rowCarry)[NUMDIGITS]   = (uniform int (*)[NUMDIGITS])carryValueAll;

  const uniform int task        = taskIndex;
  const uniform int rowsPerTask = (numBlocks+taskCount-1)/taskCount;
  const uniform int rowBegin    = task * rowsPerTask;
  const uniform int rowEnd      = min(rowBegin + rowsPerTask, numBlocks);

  foreach (d = 0 ... NUMDIGITS)
  {
    const int add = rowCarry[task][d];
    for (uniform int row = rowBegin; row < rowEnd; row++)
      rowExcScan[row][d] += add;
  }
}
|
||||
|
||||
/*
  Runs the three scan phases one after another, waiting for each launch to
  finish: partialScanLocal and completeScanGlobal use numBlocks/8 tasks,
  partialScanGlobal uses one task per digit.
*/
static
inline void radixExclusiveScan(
    const uniform int numBlocks,
    uniform int excScanPtr[],
    uniform int countsPtr[],
    uniform int partialSum[],
    uniform int prefixSum[])
{
  const uniform int scale   = 8;
  const uniform int nGroups = numBlocks/scale;

  launch [nGroups] partialScanLocal(numBlocks, excScanPtr, countsPtr, partialSum);
  sync;

  launch [NUMDIGITS] partialScanGlobal(nGroups, partialSum, prefixSum);
  sync;

  launch [nGroups] completeScanGlobal(numBlocks, excScanPtr, prefixSum);
  sync;
}
|
||||
|
||||
/* File-scope state shared by the radixSort_* functions below. */
/* single int allocation made by radixSort_alloc; the six pointers further
   down point into it */
static uniform int * uniform memoryPool = NULL;
|
||||
/* number of tasks launched for the count and sort passes */
static uniform int numBlocks;
|
||||
/* element counts of the six sub-arrays of memoryPool */
static uniform int nSharedCounts;
|
||||
static uniform int nCountsGlobal;
|
||||
static uniform int nExcScan;
|
||||
static uniform int nCountsBlock;
|
||||
static uniform int nPartialSum;
|
||||
static uniform int nPrefixSum;
|
||||
|
||||
/* sub-arrays of memoryPool, laid out in this order */
static uniform int * uniform sharedCounts;
|
||||
static uniform int * uniform countsGlobal;
|
||||
static uniform int * uniform excScan;
|
||||
static uniform int * uniform counts;
|
||||
static uniform int * uniform partialSum;
|
||||
static uniform int * uniform prefixSum;
|
||||
|
||||
/* number of keys bufKeys was allocated for; 0 means bufKeys is not allocated */
static uniform int numElementsBuf = 0;
|
||||
/* second key buffer: countPass copies the keys into it, sortPass reads from it */
static uniform Key * uniform bufKeys;
|
||||
|
||||
/*
  Set-up: chooses numBlocks (num_cores()*4, or the fixed value 13*32*4 when
  __NVPTX__ is defined), computes the sizes of the six work arrays, allocates
  them as one block (memoryPool) and sets the six array pointers to
  consecutive ranges of that block.  The argument n is not used.
*/
export void radixSort_alloc(const uniform int n)
{
  assert(memoryPool == NULL);

  numBlocks = num_cores()*4;
#ifdef __NVPTX__
  numBlocks = 13*32*4;  //num_cores()*4;
#endif

  const uniform int perBlockTable = NUMDIGITS*numBlocks;
  nSharedCounts = perBlockTable;
  nCountsGlobal = NUMDIGITS;
  nExcScan      = perBlockTable;
  nCountsBlock  = perBlockTable;
  nPartialSum   = perBlockTable;
  nPrefixSum    = perBlockTable;

  const uniform int nalloc = nSharedCounts + nCountsGlobal + nExcScan +
                             nCountsBlock + nPartialSum + nPrefixSum;

  memoryPool = uniform new uniform int[nalloc];

  /* hand out consecutive ranges of the pool */
  uniform int * uniform cursor = memoryPool;
  sharedCounts = cursor;  cursor += nSharedCounts;
  countsGlobal = cursor;  cursor += nCountsGlobal;
  excScan      = cursor;  cursor += nExcScan;
  counts       = cursor;  cursor += nCountsBlock;
  partialSum   = cursor;  cursor += nPartialSum;
  prefixSum    = cursor;
}
|
||||
|
||||
/* Releases the scratch key buffer and marks it as absent; does nothing when
   no buffer is allocated (numElementsBuf is not positive). */
static
void radixSort_freeBufKeys()
{
  if (numElementsBuf <= 0)
    return;

  delete bufKeys;
  numElementsBuf = 0;
}
|
||||
|
||||
/* Tear-down: frees the work-array pool, resets memoryPool to NULL and
   releases the scratch key buffer.  Expects radixSort_alloc to have run. */
export void radixSort_free()
{
  assert(memoryPool != NULL);

  /* work arrays */
  delete memoryPool;
  memoryPool = NULL;

  /* scratch keys */
  radixSort_freeBufKeys();
}
|
||||
|
||||
/*
  Sorts keys[0..numElements) by their lowest nBits bits, NUMBITS bits per
  round.  Every round: clear the global histogram, run countPass, scan the
  global histogram into the first NUMDIGITS entries of excScan, run
  radixExclusiveScan to obtain the per-task offsets, and run sortPass to
  scatter the keys from bufKeys back into keys.
*/
export void radixSort(
    const uniform int numElements,
    uniform Key keys[],
    const uniform int nBits)
{
#ifdef __NVPTX__
  assert((numBlocks & 3) == 0);  /* task granularity on Kepler is 4 */
#endif

  /* make sure the scratch key buffer exists and is large enough */
  if (numElementsBuf < numElements)
    radixSort_freeBufKeys();
  if (numElementsBuf == 0)
  {
    numElementsBuf = numElements;
    bufKeys = uniform new uniform Key[numElementsBuf];
  }

  for (uniform int shift = 0; shift < nBits; shift += NUMBITS)
  {
    /* initialize histogram for each digit */
    foreach (d = 0 ... NUMDIGITS)
      countsGlobal[d] = 0;

    /* compute histogram for each digit */
    launch [numBlocks] countPass(keys, bufKeys, shift, numElements, counts, countsGlobal);
    sync;

    /* exclusive scan on global histogram */
    int running = 0;
    excScan[0] = 0;
    foreach (d = 0 ... NUMDIGITS)
    {
      const int cnt    = countsGlobal[d];
      const int before = exclusive_scan_add(cnt);
      excScan[d] = before + running;
      running += broadcast(before+cnt, programCount-1);
    }

    /* computing offsets for each digit */
    radixExclusiveScan(numBlocks, excScan, counts, partialSum, prefixSum);

    /* sorting */
    launch [numBlocks] sortPass(bufKeys, keys, shift, numElements, excScan);
    sync;
  }
}
|
||||
2
examples/portable/rt/.gitignore
vendored
Normal file
2
examples/portable/rt/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
rt
|
||||
*.ppm
|
||||
8
examples/portable/rt/Makefile_cpu
Normal file
8
examples/portable/rt/Makefile_cpu
Normal file
@@ -0,0 +1,8 @@
|
||||
|
||||
EXAMPLE=rt
|
||||
CPP_SRC=rt.cpp
|
||||
ISPC_SRC=rt.ispc
|
||||
ISPC_IA_TARGETS=avx1-i32x8
|
||||
ISPC_ARM_TARGETS=neon
|
||||
|
||||
include ../common_cpu.mk
|
||||
7
examples/portable/rt/Makefile_knc
Normal file
7
examples/portable/rt/Makefile_knc
Normal file
@@ -0,0 +1,7 @@
|
||||
EXAMPLE=rt
|
||||
CXX_SRC=rt.cpp
|
||||
ISPC_SRC=rt.ispc
|
||||
ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h
|
||||
ISPC_TARGET=generic-16
|
||||
|
||||
include ../common_knc.mk
|
||||
13
examples/portable/rt/Makefile_ptx
Normal file
13
examples/portable/rt/Makefile_ptx
Normal file
@@ -0,0 +1,13 @@
|
||||
PROG=rt
|
||||
ISPC_SRC=rt.ispc
|
||||
CU_SRC=rt.cu
|
||||
CXX_SRC=rt.cpp
|
||||
PTXCC_REGMAX=32
|
||||
|
||||
#LLVM_GPU=1
|
||||
NVVM_GPU=1
|
||||
|
||||
include ../common_ptx.mk
|
||||
|
||||
|
||||
|
||||
1
examples/portable/rt/cornell.bvh
Symbolic link
1
examples/portable/rt/cornell.bvh
Symbolic link
@@ -0,0 +1 @@
|
||||
../../rt/cornell.bvh
|
||||
1
examples/portable/rt/cornell.camera
Symbolic link
1
examples/portable/rt/cornell.camera
Symbolic link
@@ -0,0 +1 @@
|
||||
../../rt/cornell.camera
|
||||
229
examples/portable/rt/rt.cpp
Normal file
229
examples/portable/rt/rt.cpp
Normal file
@@ -0,0 +1,229 @@
|
||||
/*
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define NOMINMAX
|
||||
#pragma warning (disable: 4244)
|
||||
#pragma warning (disable: 4305)
|
||||
#endif
|
||||
|
||||
#include <cstdio>
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
#include <sys/types.h>
|
||||
#include "timing.h"
|
||||
#include "rt_ispc.h"
|
||||
#include "ispc_malloc.h"
|
||||
|
||||
using namespace ispc;
|
||||
|
||||
typedef unsigned int uint;
|
||||
|
||||
// Write the hit-id buffer as a binary PPM ("P6") file.
//
// idImage    - width*height object ids, row-major; each id is turned into a
//              pseudo-random colour by de-interleaving its low 24 bits.
// depthImage - accepted for interface symmetry; not read by this function.
// filename   - output path; on open failure the error is reported via
//              perror() and the process exits with status 1.
static void writeImage(int *idImage, float *depthImage, int width, int height,
                       const char *filename) {
    FILE *out = fopen(filename, "wb");
    if (out == NULL) {
        perror(filename);
        exit(1);
    }

    fprintf(out, "P6\n%d %d\n255\n", width, height);

    for (int row = 0; row < height; ++row) {
        for (int col = 0; col < width; ++col) {
            const int objectId = idImage[row * width + col];

            // rgb[0] = red, rgb[1] = green, rgb[2] = blue.  Bit k of the id
            // feeds channel k % 3; the (k / 3)-th bit of a channel is stored
            // starting from the most significant output bit.
            unsigned char rgb[3] = {0, 0, 0};
            for (int k = 0; k < 24; ++k) {
                const int bitValue = (objectId >> k) & 1;
                rgb[k % 3] |= bitValue << (7 - k / 3);
            }

            for (int c = 0; c < 3; ++c)
                fputc(rgb[c], out);
        }
    }

    fclose(out);
    printf("Wrote image file %s\n", filename);
}
|
||||
|
||||
|
||||
// Print the command-line synopsis to stderr and terminate with status 1.
static void usage() {
    fputs("rt <scene name base> [--scale=<factor>] [ispc iterations] [tasks iterations] [serial iterations]\n",
          stderr);
    exit(1);
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
static unsigned int test_iterations[] = {3, 7, 1};
|
||||
float scale = 1.f;
|
||||
const char *filename = NULL;
|
||||
if (argc < 2) usage();
|
||||
filename = argv[1];
|
||||
if (argc > 2) {
|
||||
if (strncmp(argv[2], "--scale=", 8) == 0) {
|
||||
scale = atof(argv[2] + 8);
|
||||
}
|
||||
}
|
||||
if ((argc == 6) || (argc == 5)) {
|
||||
for (int i = 0; i < 3; i++) {
|
||||
test_iterations[i] = atoi(argv[argc - 3 + i]);
|
||||
}
|
||||
}
|
||||
|
||||
#define READ(var, n) \
|
||||
if (fread(&(var), sizeof(var), n, f) != (unsigned int)n) { \
|
||||
fprintf(stderr, "Unexpected EOF reading scene file\n"); \
|
||||
return 1; \
|
||||
} else /* eat ; */
|
||||
|
||||
//
|
||||
// Read the camera specification information from the camera file
|
||||
//
|
||||
char fnbuf[1024];
|
||||
sprintf(fnbuf, "%s.camera", filename);
|
||||
FILE *f = fopen(fnbuf, "rb");
|
||||
if (!f) {
|
||||
perror(fnbuf);
|
||||
return 1;
|
||||
}
|
||||
|
||||
//
|
||||
// Nothing fancy, and trouble if we run on a big-endian system, just
|
||||
// fread in the bits
|
||||
//
|
||||
int baseWidth, baseHeight;
|
||||
// float camera2world[4][4], raster2camera[4][4];
|
||||
float *camera2world_ispc = new float[4*4];
|
||||
float *raster2camera_ispc = new float[4*4];
|
||||
float (*camera2world )[4] = (float (*)[4])camera2world_ispc;
|
||||
float (*raster2camera)[4] = (float (*)[4])raster2camera_ispc;
|
||||
READ(baseWidth, 1);
|
||||
READ(baseHeight, 1);
|
||||
READ(camera2world[0][0], 16);
|
||||
READ(raster2camera[0][0], 16);
|
||||
|
||||
//
|
||||
// Read in the serialized BVH
|
||||
//
|
||||
sprintf(fnbuf, "%s.bvh", filename);
|
||||
f = fopen(fnbuf, "rb");
|
||||
if (!f) {
|
||||
perror(fnbuf);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// The BVH file starts with an int that gives the total number of BVH
|
||||
// nodes
|
||||
uint nNodes;
|
||||
READ(nNodes, 1);
|
||||
|
||||
LinearBVHNode *nodes = new LinearBVHNode[nNodes];
|
||||
for (unsigned int i = 0; i < nNodes; ++i) {
|
||||
// Each node is 6x floats for a boox, then an integer for an offset
|
||||
// to the second child node, then an integer that encodes the type
|
||||
// of node, the total number of int it if a leaf node, etc.
|
||||
float b[6];
|
||||
READ(b[0], 6);
|
||||
nodes[i].bounds[0][0] = b[0];
|
||||
nodes[i].bounds[0][1] = b[1];
|
||||
nodes[i].bounds[0][2] = b[2];
|
||||
nodes[i].bounds[1][0] = b[3];
|
||||
nodes[i].bounds[1][1] = b[4];
|
||||
nodes[i].bounds[1][2] = b[5];
|
||||
READ(nodes[i].offset, 1);
|
||||
READ(nodes[i].nPrimitives, 1);
|
||||
READ(nodes[i].splitAxis, 1);
|
||||
READ(nodes[i].pad, 1);
|
||||
}
|
||||
|
||||
// And then read the triangles
|
||||
uint nTris;
|
||||
READ(nTris, 1);
|
||||
Triangle *triangles = new Triangle[nTris];
|
||||
for (uint i = 0; i < nTris; ++i) {
|
||||
// 9x floats for the 3 vertices
|
||||
float v[9];
|
||||
READ(v[0], 9);
|
||||
float *vp = v;
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
triangles[i].p[j][0] = *vp++;
|
||||
triangles[i].p[j][1] = *vp++;
|
||||
triangles[i].p[j][2] = *vp++;
|
||||
}
|
||||
// And create an object id
|
||||
triangles[i].id = i+1;
|
||||
}
|
||||
fclose(f);
|
||||
|
||||
int height = int(baseHeight * scale);
|
||||
int width = int(baseWidth * scale);
|
||||
|
||||
// allocate images; one to hold hit object ids, one to hold depth to
|
||||
// the first interseciton
|
||||
int *id = new int[width*height];
|
||||
float *image = new float[width*height];
|
||||
|
||||
ispc_memset(id, 0, width*height*sizeof(int));
|
||||
ispc_memset(image, 0, width*height*sizeof(float));
|
||||
|
||||
//
|
||||
// Run 3 iterations with ispc + 1 core, record the minimum time
|
||||
//
|
||||
double minTimeISPCtasks = 1e30;
|
||||
for (int i = 0; i < test_iterations[1]; ++i) {
|
||||
reset_and_start_timer();
|
||||
raytrace_ispc_tasks(width, height, baseWidth, baseHeight, raster2camera,
|
||||
camera2world, image, id, nodes, triangles);
|
||||
double dt = get_elapsed_msec();
|
||||
printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", dt);
|
||||
minTimeISPCtasks = std::min(dt, minTimeISPCtasks);
|
||||
}
|
||||
printf("[rt ispc + tasks]:\t\t[%.3f] msec for %d x %d image\n",
|
||||
minTimeISPCtasks, width, height);
|
||||
|
||||
writeImage(id, image, width, height, "rt-ispc-tasks.ppm");
|
||||
|
||||
return 0;
|
||||
}
|
||||
373
examples/portable/rt/rt.cu
Normal file
373
examples/portable/rt/rt.cu
Normal file
@@ -0,0 +1,373 @@
|
||||
#include "cuda_helpers.cuh"
|
||||
|
||||
// Every later use of the name `float3` in this file refers to Float3.
#define float3 Float3

// Plain 3-component float vector with the component-wise operators the ray
// tracer needs.  It stays an aggregate (no constructors) because callers
// brace-initialise it: `float3 v = { x, y, z };`.
struct Float3
{
    float x;
    float y;
    float z;

    // Component-wise sum.
    __device__ friend Float3 operator+(const Float3 a, const Float3 b)
    {
        const Float3 r = { a.x + b.x, a.y + b.y, a.z + b.z };
        return r;
    }

    // Component-wise difference.
    __device__ friend Float3 operator-(const Float3 a, const Float3 b)
    {
        const Float3 r = { a.x - b.x, a.y - b.y, a.z - b.z };
        return r;
    }

    // Component-wise quotient.
    __device__ friend Float3 operator/(const Float3 a, const Float3 b)
    {
        const Float3 r = { a.x / b.x, a.y / b.y, a.z / b.z };
        return r;
    }

    // Scalar divided by each component (used for reciprocal directions).
    __device__ friend Float3 operator/(const float a, const Float3 b)
    {
        const Float3 r = { a / b.x, a / b.y, a / b.z };
        return r;
    }

    // Component-wise product.
    __device__ friend Float3 operator*(const Float3 a, const Float3 b)
    {
        const Float3 r = { a.x * b.x, a.y * b.y, a.z * b.z };
        return r;
    }

    // Each component scaled by b.
    __device__ friend Float3 operator*(const Float3 a, const float b)
    {
        const Float3 r = { a.x * b, a.y * b, a.z * b };
        return r;
    }
};
|
||||
|
||||
// Let the ispc-style spellings `unsigned int8` / `unsigned int16` used in
// LinearBVHNode below compile as `unsigned char` / `unsigned short`.
#define int8 char
#define int16 short

// One ray plus the closest hit found so far.
struct Ray {
    float3 origin;
    float3 dir;
    float3 invDir;          // 1 / dir, component-wise
    unsigned int dirIsNeg0; // set by generateRay from the sign of invDir.x
    unsigned int dirIsNeg1; // ... invDir.y
    unsigned int dirIsNeg2; // ... invDir.z
    float mint;
    float maxt;             // shrinks to the nearest accepted hit distance
    int hitId;              // id of the nearest triangle hit, 0 when none
};

// Triangle record; only p[v][0..2] of each vertex is read by TriIntersect.
struct Triangle {
    float p[3][4];
    int id;
    int pad[3];
};

// Flattened BVH node.
struct LinearBVHNode {
    float bounds[2][3];
    unsigned int offset;     // num primitives for leaf, second child for interior
    unsigned int8 nPrimitives;
    unsigned int8 splitAxis;
    unsigned int16 pad;
};
|
||||
|
||||
// Cross product v1 x v2.
__device__
static inline float3 Cross(const float3 v1, const float3 v2) {
    float3 result;
    result.x = (v1.y * v2.z) - (v1.z * v2.y);
    result.y = (v1.z * v2.x) - (v1.x * v2.z);
    result.z = (v1.x * v2.y) - (v1.y * v2.x);
    return result;
}
|
||||
|
||||
// Dot product of a and b.
__device__
static inline float Dot(const float3 a, const float3 b) {
    // Kept as one expression, summed left to right as in the original.
    return (a.x * b.x + a.y * b.y) + a.z * b.z;
}
|
||||
|
||||
// Build the camera ray through raster position (x, y).
//
// raster2camera / camera2world - 4x4 matrices, indexed [row][column].
// ray - output: mint/maxt/hitId are reset, then origin, dir, invDir and the
//       three dirIsNeg flags are filled in.
__device__
inline
static void generateRay( const float raster2camera[4][4],
                         const float camera2world[4][4],
                         float x, float y, Ray &ray) {
    ray.mint  = 0.f;
    ray.maxt  = 1e30f;
    ray.hitId = 0;

    // transform raster coordinate (x, y, 0) to camera space
    const float rasterW = raster2camera[3][3];
    float cx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3];
    float cy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3];
    float cz = raster2camera[2][3];
    cx /= rasterW;
    cy /= rasterW;
    cz /= rasterW;

    // The origin is the translation column of camera2world, de-homogenised.
    const float worldW = camera2world[3][3];
    ray.origin.x = camera2world[0][3] / worldW;
    ray.origin.y = camera2world[1][3] / worldW;
    ray.origin.z = camera2world[2][3] / worldW;

    // Rotate the camera-space direction into world space.
    ray.dir.x = camera2world[0][0] * cx + camera2world[0][1] * cy +
                camera2world[0][2] * cz;
    ray.dir.y = camera2world[1][0] * cx + camera2world[1][1] * cy +
                camera2world[1][2] * cz;
    ray.dir.z = camera2world[2][0] * cx + camera2world[2][1] * cy +
                camera2world[2][2] * cz;

    ray.invDir = 1.f / ray.dir;

    // any() is provided by cuda_helpers.cuh.
    // NOTE(review): presumably a warp-wide vote, mirroring ispc's any() --
    // confirm against the helper header.
    ray.dirIsNeg0 = any(ray.invDir.x < 0) ? 1 : 0;
    ray.dirIsNeg1 = any(ray.invDir.y < 0) ? 1 : 0;
    ray.dirIsNeg2 = any(ray.invDir.z < 0) ? 1 : 0;
}
|
||||
|
||||
// Slab test: does `ray` overlap the axis-aligned box `bounds` somewhere in
// [ray.mint, ray.maxt]?  bounds[0] is one corner, bounds[1] the other.
__device__
inline
static bool BBoxIntersect(const float bounds[2][3],
                          const Ray &ray) {
    float3 lo = { bounds[0][0], bounds[0][1], bounds[0][2] };
    float3 hi = { bounds[1][0], bounds[1][1], bounds[1][2] };

    // Parametric distances to the two planes of every slab.
    float3 tNear = (lo - ray.origin) * ray.invDir;
    float3 tFar  = (hi - ray.origin) * ray.invDir;

    float tEnter = ray.mint;
    float tExit  = ray.maxt;

    // Check all three axis-aligned slabs.  Don't try to early out; it's
    // not worth the trouble

    // x slab
    if (tNear.x > tFar.x) {
        float swapped = tNear.x;
        tNear.x = tFar.x;
        tFar.x = swapped;
    }
    tEnter = max(tNear.x, tEnter);
    tExit  = min(tFar.x, tExit);

    // y slab
    if (tNear.y > tFar.y) {
        float swapped = tNear.y;
        tNear.y = tFar.y;
        tFar.y = swapped;
    }
    tEnter = max(tNear.y, tEnter);
    tExit  = min(tFar.y, tExit);

    // z slab
    if (tNear.z > tFar.z) {
        float swapped = tNear.z;
        tNear.z = tFar.z;
        tFar.z = swapped;
    }
    tEnter = max(tNear.z, tEnter);
    tExit  = min(tFar.z, tExit);

    return (tEnter <= tExit);
}
|
||||
|
||||
|
||||
// Ray/triangle test using barycentric coordinates.
//
// Every rejection test is evaluated (no early return); on a hit inside
// [ray.mint, ray.maxt] the ray's maxt and hitId are updated.
// Returns true when the triangle was accepted as the new nearest hit.
__device__
inline
static bool TriIntersect(const Triangle &tri, Ray &ray) {
    float3 v0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
    float3 v1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
    float3 v2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
    float3 edge1 = v1 - v0;
    float3 edge2 = v2 - v0;

    float3 s1 = Cross(ray.dir, edge2);
    float divisor = Dot(s1, edge1);

    // A zero divisor rejects the triangle; the reciprocal is still computed
    // and the remaining tests still run, exactly as before.
    bool hit = !(divisor == 0.);
    float invDivisor = 1.f / divisor;

    // Compute first barycentric coordinate
    float3 d = ray.origin - v0;
    float b1 = Dot(d, s1) * invDivisor;
    if (b1 < 0. || b1 > 1.) {
        hit = false;
    }

    // Compute second barycentric coordinate
    float3 s2 = Cross(d, edge1);
    float b2 = Dot(ray.dir, s2) * invDivisor;
    if (b2 < 0. || b1 + b2 > 1.) {
        hit = false;
    }

    // Compute _t_ to intersection point
    float t = Dot(edge2, s2) * invDivisor;
    if (t < ray.mint || t > ray.maxt) {
        hit = false;
    }

    if (!hit) {
        return false;
    }

    ray.maxt = t;
    ray.hitId = tri.id;
    return true;
}
|
||||
|
||||
// Traverse the flattened BVH with an explicit stack and intersect `r`
// against the triangles of every leaf whose box is hit.
//
// nodes - flattened BVH; node 0 is the root, the first child of an interior
//         node is the next array element, the second child is at node.offset.
// tris  - triangle array indexed by a leaf's node.offset.
// r     - in: ray to trace; out: maxt and hitId of the nearest hit.
// todo  - caller-provided scratch stack of node indices.  No overflow check
//         is done here; the caller in this file passes 64 entries.
// Returns true when at least one triangle was hit.
__device__
inline
bool BVHIntersect(const LinearBVHNode nodes[],
                  const Triangle tris[], Ray &r,
                  int todo[]) {
    Ray ray = r;
    bool hit = false;
    // Follow ray through BVH nodes to find primitive intersections
    int todoOffset = 0, nodeNum = 0;

    while (true) {
        // Check ray against BVH node
        LinearBVHNode node = nodes[nodeNum];
        if (any(BBoxIntersect(node.bounds, ray))) {
            unsigned int nPrimitives = node.nPrimitives;
            if (nPrimitives > 0) {
                // Intersect ray with primitives in leaf BVH node
                unsigned int primitivesOffset = node.offset;
                for ( unsigned int i = 0; i < nPrimitives; ++i) {
                    if (TriIntersect(tris[primitivesOffset+i], ray))
                        hit = true;
                }
                if (todoOffset == 0)
                    break;
                nodeNum = todo[--todoOffset];
            }
            else {
                // Put far BVH node on _todo_ stack, advance to near node.
                // Start from 0 so an unexpected splitAxis value (not 0, 1 or
                // 2) can no longer cause a read of an uninitialised variable.
                int dirIsNeg = 0;
                if (node.splitAxis == 0) dirIsNeg = r.dirIsNeg0;
                if (node.splitAxis == 1) dirIsNeg = r.dirIsNeg1;
                if (node.splitAxis == 2) dirIsNeg = r.dirIsNeg2;
                if (dirIsNeg) {
                    todo[todoOffset++] = nodeNum + 1;
                    nodeNum = node.offset;
                }
                else {
                    todo[todoOffset++] = node.offset;
                    nodeNum = nodeNum + 1;
                }
            }
        }
        else {
            if (todoOffset == 0)
                break;
            nodeNum = todo[--todoOffset];
        }
    }

    // Only the hit information is copied back to the caller's ray.
    r.maxt = ray.maxt;
    r.hitId = ray.hitId;

    return hit;
}
|
||||
|
||||
// Trace one ray per pixel of the tile [x0, x1) x [y0, y1) and store the hit
// distance in image[] and the hit id in id[] (both row-major, `width` wide).
//
// programIndex / programCount come from cuda_helpers.cuh; each thread handles
// every programCount-th column starting at x0 + programIndex.
// baseWidth/baseHeight are the resolution the camera matrices were built for;
// pixel coordinates are rescaled to that resolution before ray generation.
__device__
inline
static void raytrace_tile( int x0, int x1,
                           int y0, int y1,
                           int width, int height,
                           int baseWidth, int baseHeight,
                           const float raster2camera[4][4],
                           const float camera2world[4][4],
                           float image[], int id[],
                           const LinearBVHNode nodes[],
                           const Triangle triangles[]) {
    float widthScale = (float)(baseWidth) / (float)(width);
    float heightScale = (float)(baseHeight) / (float)(height);

    // Traversal stack for BVHIntersect.  The heap variant is kept behind the
    // same compile-time switch as before.
#if 0
    int * todo = new int[64];
#define ALLOC
#else
    int todo[64];
#endif

    for (int y = y0; y < y1; y++) {
        // The loop condition already guarantees x < x1, so the separate
        // `if (x < x1)` guard that used to wrap the body was redundant.
        for (int x = x0 + programIndex; x < x1; x += programCount) {
            Ray ray;
            generateRay(raster2camera, camera2world, x*widthScale,
                        y*heightScale, ray);
            BVHIntersect(nodes, triangles, ray, todo);

            int offset = y * width + x;
            image[offset] = ray.maxt;
            id[offset] = ray.hitId;
        }
    }

#ifdef ALLOC
    // Memory obtained with new[] must be released with delete[].
    delete[] todo;
#endif
}
|
||||
|
||||
|
||||
|
||||
// Kernel body for one tile: maps taskIndex (from cuda_helpers.cuh) onto a
// 64x8 pixel tile of the image and ray traces it.
__global__
void raytrace_tile_task( int width, int height,
                         int baseWidth, int baseHeight,
                         const float raster2camera[4][4],
                         const float camera2world[4][4],
                         float image[], int id[],
                         const LinearBVHNode nodes[],
                         const Triangle triangles[]) {
    // Tile size; must match dx, dy in raytrace_ispc_tasks___export.
    int dx = 64;
    int dy = 8;

    // Tiles are numbered row by row, xBuckets tiles per row.
    int xBuckets = (width + (dx-1)) / dx;
    int tileCol = taskIndex % xBuckets;
    int tileRow = taskIndex / xBuckets;

    // Clamp the tile to the image so edge tiles may be partial.
    int x0 = tileCol * dx;
    int y0 = tileRow * dy;
    int x1 = min(x0 + dx, width);
    int y1 = min(y0 + dy, height);

    raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
                  raster2camera, camera2world, image,
                  id, nodes, triangles);
}
|
||||
|
||||
|
||||
// Device-side entry point: splits the image into 64x8 tiles, launches one
// raytrace_tile_task per tile through the launch() helper from
// cuda_helpers.cuh, then waits with cudaDeviceSynchronize().
extern "C" __global__ void raytrace_ispc_tasks___export( int width, int height,
                                                          int baseWidth, int baseHeight,
                                                          const float raster2camera[4][4],
                                                          const float camera2world[4][4],
                                                          float image[], int id[],
                                                          const LinearBVHNode nodes[],
                                                          const Triangle triangles[]) {
    // Tile size; must match dx, dy in raytrace_tile_task.
    int dx = 64;
    int dy = 8;

    // Round up so partially covered edge tiles get a task as well.
    int xBuckets = (width + (dx-1)) / dx;
    int yBuckets = (height + (dy-1)) / dy;
    int nTasks = xBuckets * yBuckets;

    launch(nTasks,1,1,raytrace_tile_task)
        (width, height, baseWidth, baseHeight,
         raster2camera, camera2world,
         image, id, nodes, triangles);
    cudaDeviceSynchronize();
}
|
||||
|
||||
|
||||
|
||||
// Host-side entry point with the same signature as the ispc export: starts
// raytrace_ispc_tasks___export with one block of 32 threads and blocks until
// the device has finished.
extern "C" __host__ void raytrace_ispc_tasks( int width, int height,
                                               int baseWidth, int baseHeight,
                                               const float raster2camera[4][4],
                                               const float camera2world[4][4],
                                               float image[], int id[],
                                               const LinearBVHNode nodes[],
                                               const Triangle triangles[]) {
    const int numBlocks = 1;
    const int threadsPerBlock = 32;

    raytrace_ispc_tasks___export<<<numBlocks, threadsPerBlock>>>(
        width, height,
        baseWidth, baseHeight,
        raster2camera, camera2world,
        image, id,
        nodes, triangles);
    cudaDeviceSynchronize();
}
|
||||
351
examples/portable/rt/rt.ispc
Normal file
351
examples/portable/rt/rt.ispc
Normal file
@@ -0,0 +1,351 @@
|
||||
/*
|
||||
Copyright (c) 2010-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
// Result type of the intersection predicates.  The int flavour is the one
// that is compiled; the bool alternative is kept behind the switch.
#if 1
typedef int bool_t;
#else
typedef bool bool_t;
#endif

typedef float<3> float3;

// Qualifier for per-triangle data: varying when compiling for NVPTX, uniform
// on every other target.
#ifdef __NVPTX__
#define uniform_t varying
#else
#define uniform_t uniform
#endif

struct int3
{
    int x;
    int y;
    int z;
};

// One ray plus the closest hit found so far.
struct Ray {
    float3 origin;
    float3 dir;
    float3 invDir;                    // 1 / dir, component-wise
    uniform unsigned int dirIsNeg[3]; // set by generateRay from the sign of invDir
    float mint;
    float maxt;                       // shrinks to the nearest accepted hit distance
    int hitId;                        // id of the nearest triangle hit, 0 when none
};

// Triangle record; only p[v][0..2] of each vertex is read by TriIntersect.
struct Triangle {
    float p[3][4];
    int id;
    int pad[3];
};

// Flattened BVH node.
struct LinearBVHNode {
    float bounds[2][3];
    unsigned int offset;     // num primitives for leaf, second child for interior
    unsigned int8 nPrimitives;
    unsigned int8 splitAxis;
    unsigned int16 pad;
};
|
||||
|
||||
// Cross product v1 x v2.
static inline float3 Cross(const float3 v1, const float3 v2) {
    float3 result;
    result.x = (v1.y * v2.z) - (v1.z * v2.y);
    result.y = (v1.z * v2.x) - (v1.x * v2.z);
    result.z = (v1.x * v2.y) - (v1.y * v2.x);
    return result;
}
|
||||
|
||||
// Dot product of a and b.
static inline float Dot(const float3 a, const float3 b) {
    // Kept as one expression, summed left to right as in the original.
    return (a.x * b.x + a.y * b.y) + a.z * b.z;
}
|
||||
|
||||
|
||||
// Build the camera ray through raster position (x, y).
//
// raster2camera / camera2world - uniform 4x4 matrices, indexed [row][column].
// ray - output: mint/maxt/hitId are reset, then origin, dir, invDir and the
//       dirIsNeg[] flags are filled in.  Each flag is 1 when any active
//       program instance has a negative reciprocal direction on that axis.
inline
static void generateRay(uniform const float raster2camera[4][4],
                        uniform const float camera2world[4][4],
                        float x, float y, Ray &ray) {
    ray.mint  = 0.f;
    ray.maxt  = 1e30f;
    ray.hitId = 0;

    // transform raster coordinate (x, y, 0) to camera space
    const uniform float rasterW = raster2camera[3][3];
    float cx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3];
    float cy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3];
    float cz = raster2camera[2][3];
    cx /= rasterW;
    cy /= rasterW;
    cz /= rasterW;

    // The origin is the translation column of camera2world, de-homogenised.
    const uniform float worldW = camera2world[3][3];
    ray.origin.x = camera2world[0][3] / worldW;
    ray.origin.y = camera2world[1][3] / worldW;
    ray.origin.z = camera2world[2][3] / worldW;

    // Rotate the camera-space direction into world space.
    ray.dir.x = camera2world[0][0] * cx + camera2world[0][1] * cy +
                camera2world[0][2] * cz;
    ray.dir.y = camera2world[1][0] * cx + camera2world[1][1] * cy +
                camera2world[1][2] * cz;
    ray.dir.z = camera2world[2][0] * cx + camera2world[2][1] * cy +
                camera2world[2][2] * cz;

    ray.invDir = 1.f / ray.dir;

    ray.dirIsNeg[0] = any(ray.invDir.x < 0) ? 1 : 0;
    ray.dirIsNeg[1] = any(ray.invDir.y < 0) ? 1 : 0;
    ray.dirIsNeg[2] = any(ray.invDir.z < 0) ? 1 : 0;
}
|
||||
|
||||
|
||||
// Slab test: does `ray` overlap the axis-aligned box `bounds` somewhere in
// [ray.mint, ray.maxt]?  bounds[0] is one corner, bounds[1] the other.
inline
static bool_t BBoxIntersect(const uniform float bounds[2][3],
                            const Ray &ray) {
    const uniform float3 lo = { bounds[0][0], bounds[0][1], bounds[0][2] };
    const uniform float3 hi = { bounds[1][0], bounds[1][1], bounds[1][2] };

    // Parametric distances to the two planes of every slab.
    float3 tNear = (lo - ray.origin) * ray.invDir;
    float3 tFar  = (hi - ray.origin) * ray.invDir;

    float tEnter = ray.mint;
    float tExit  = ray.maxt;

    // Check all three axis-aligned slabs.  Don't try to early out; it's
    // not worth the trouble

    // x slab
    if (tNear.x > tFar.x) {
        float swapped = tNear.x;
        tNear.x = tFar.x;
        tFar.x = swapped;
    }
    tEnter = max(tNear.x, tEnter);
    tExit  = min(tFar.x, tExit);

    // y slab
    if (tNear.y > tFar.y) {
        float swapped = tNear.y;
        tNear.y = tFar.y;
        tFar.y = swapped;
    }
    tEnter = max(tNear.y, tEnter);
    tExit  = min(tFar.y, tExit);

    // z slab
    if (tNear.z > tFar.z) {
        float swapped = tNear.z;
        tNear.z = tFar.z;
        tFar.z = swapped;
    }
    tEnter = max(tNear.z, tEnter);
    tExit  = min(tFar.z, tExit);

    return (tEnter <= tExit);
}
|
||||
|
||||
|
||||
|
||||
// Ray/triangle test using barycentric coordinates.
//
// Every rejection test is evaluated (no early return); on a hit inside
// [ray.mint, ray.maxt] the ray's maxt and hitId are updated.
// Returns non-zero when the triangle was accepted as the new nearest hit.
inline
static bool_t TriIntersect(const uniform_t Triangle tri, Ray &ray) {
    const uniform_t float3 v0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
    const uniform_t float3 v1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
    const uniform_t float3 v2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
    const uniform_t float3 edge1 = v1 - v0;
    const uniform_t float3 edge2 = v2 - v0;

    float3 s1 = Cross(ray.dir, edge2);
    float divisor = Dot(s1, edge1);
    bool_t hit = true;

    // A zero divisor rejects the triangle; the reciprocal is still computed
    // and the remaining tests still run.
    if (divisor == 0.) {
        hit = false;
    }
    float invDivisor = 1.f / divisor;

    // Compute first barycentric coordinate
    float3 d = ray.origin - v0;
    float b1 = Dot(d, s1) * invDivisor;
    if (b1 < 0. || b1 > 1.) {
        hit = false;
    }

    // Compute second barycentric coordinate
    float3 s2 = Cross(d, edge1);
    float b2 = Dot(ray.dir, s2) * invDivisor;
    if (b2 < 0. || b1 + b2 > 1.) {
        hit = false;
    }

    // Compute _t_ to intersection point
    float t = Dot(edge2, s2) * invDivisor;
    if (t < ray.mint || t > ray.maxt) {
        hit = false;
    }

    // Record the new nearest hit.
    if (hit) {
        ray.maxt = t;
        ray.hitId = tri.id;
    }
    return hit;
}
|
||||
|
||||
|
||||
// Traverse the flattened BVH with an explicit stack and intersect `r`
// against the triangles of every leaf whose box is hit by any active
// program instance.
//
// nodes - flattened BVH; node 0 is the root, the first child of an interior
//         node is the next array element, the second child is at node.offset.
// tris  - triangle array indexed by a leaf's node.offset.
// r     - in: ray to trace; out: maxt and hitId of the nearest hit.
// Returns non-zero when at least one triangle was hit.
//
// The traversal stack holds 64 node indices and is not checked for overflow.
inline
bool_t
BVHIntersect(const uniform LinearBVHNode nodes[],
             const uniform Triangle tris[], Ray &r) {
    Ray ray = r;
    bool_t hit = false;
    // Follow ray through BVH nodes to find primitive intersections
    uniform int todoOffset = 0, nodeNum = 0;
    uniform int todo[64];

    while (true) {
        // Check ray against BVH node
        const uniform LinearBVHNode node = nodes[nodeNum];
        if (any(BBoxIntersect(node.bounds, ray))) {
            const uniform unsigned int nPrimitives = node.nPrimitives;
            if (nPrimitives > 0) {
                // Intersect ray with primitives in leaf BVH node
                const uniform unsigned int primitivesOffset = node.offset;
                for (uniform_t unsigned int i = 0; i < nPrimitives; ++i) {
                    if (TriIntersect(tris[primitivesOffset+i], ray))
                        hit = true;
                }
                if (todoOffset == 0)
                    break;
                nodeNum = todo[--todoOffset];
            }
            else {
                // Put far BVH node on _todo_ stack, advance to near node
#if 0 /* fails */
                int dirIsNeg = r.dirIsNeg[node.splitAxis];
#else
                // Start from 0 so an unexpected splitAxis value (not 0, 1 or
                // 2) can no longer cause a read of an uninitialised variable.
                int dirIsNeg = 0;
                if (node.splitAxis == 0) dirIsNeg = r.dirIsNeg[0];
                if (node.splitAxis == 1) dirIsNeg = r.dirIsNeg[1];
                if (node.splitAxis == 2) dirIsNeg = r.dirIsNeg[2];
#endif
                if (dirIsNeg) {
                    todo[todoOffset++] = nodeNum + 1;
                    nodeNum = node.offset;
                }
                else {
                    todo[todoOffset++] = node.offset;
                    nodeNum = nodeNum + 1;
                }
            }
        }
        else {
            if (todoOffset == 0)
                break;
            nodeNum = todo[--todoOffset];
        }
    }

    // Only the hit information is copied back to the caller's ray.
    r.maxt = ray.maxt;
    r.hitId = ray.hitId;

    return hit;
}
|
||||
|
||||
|
||||
// Trace one ray per pixel of the tile [x0, x1) x [y0, y1) and store the hit
// distance in image[] and the hit id in id[] (both row-major, `width` wide).
//
// baseWidth/baseHeight are the resolution the camera matrices were built for;
// pixel coordinates are rescaled to that resolution before ray generation.
inline
static void raytrace_tile(uniform int x0, uniform int x1,
                          uniform int y0, uniform int y1,
                          uniform int width, uniform int height,
                          uniform int baseWidth, uniform int baseHeight,
                          const uniform float raster2camera[4][4],
                          const uniform float camera2world[4][4],
                          uniform float image[], uniform int id[],
                          const uniform LinearBVHNode nodes[],
                          const uniform Triangle triangles[]) {
    const uniform float xScale = (float)(baseWidth) / (float)(width);
    const uniform float yScale = (float)(baseHeight) / (float)(height);

    foreach_tiled (y = y0 ... y1, x = x0 ... x1) {
        Ray pixelRay;
        generateRay(raster2camera, camera2world, x*xScale,
                    y*yScale, pixelRay);
        BVHIntersect(nodes, triangles, pixelRay);

        // Row-major position of this pixel in both output buffers.
        int pixel = y * width + x;
        image[pixel] = pixelRay.maxt;
        id[pixel] = pixelRay.hitId;
    }
}
|
||||
|
||||
|
||||
// Exported single-task entry point: ray traces the whole image as one tile
// covering [0, width) x [0, height).
export void raytrace_ispc(uniform int width, uniform int height,
                          uniform int baseWidth, uniform int baseHeight,
                          const uniform float raster2camera[4][4],
                          const uniform float camera2world[4][4],
                          uniform float image[], uniform int id[],
                          const uniform LinearBVHNode nodes[],
                          const uniform Triangle triangles[]) {
    raytrace_tile(0, width,
                  0, height,
                  width, height,
                  baseWidth, baseHeight,
                  raster2camera, camera2world,
                  image, id,
                  nodes, triangles);
}
|
||||
|
||||
|
||||
// Task body for one tile: maps taskIndex onto a 64x8 pixel tile of the image
// and ray traces it.
task void raytrace_tile_task(uniform int width, uniform int height,
                             uniform int baseWidth, uniform int baseHeight,
                             const uniform float raster2camera[4][4],
                             const uniform float camera2world[4][4],
                             uniform float image[], uniform int id[],
                             const uniform LinearBVHNode nodes[],
                             const uniform Triangle triangles[]) {
    // Tile size; must match dx, dy in raytrace_ispc_tasks.
    const uniform int dx = 64;
    const uniform int dy = 8;

    // Tiles are numbered row by row, xBuckets tiles per row.
    const uniform int xBuckets = (width + (dx-1)) / dx;
    const uniform int tileCol = taskIndex % xBuckets;
    const uniform int tileRow = taskIndex / xBuckets;

    // Clamp the tile to the image so edge tiles may be partial.
    const uniform int x0 = tileCol * dx;
    const uniform int y0 = tileRow * dy;
    const uniform int x1 = min(x0 + dx, width);
    const uniform int y1 = min(y0 + dy, height);

    raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
                  raster2camera, camera2world, image,
                  id, nodes, triangles);
}
|
||||
|
||||
|
||||
// Exported multi-task entry point: splits the image into 64x8 tiles and
// launches one raytrace_tile_task per tile.
export void raytrace_ispc_tasks(uniform int width, uniform int height,
                                uniform int baseWidth, uniform int baseHeight,
                                const uniform float raster2camera[4][4],
                                const uniform float camera2world[4][4],
                                uniform float image[], uniform int id[],
                                const uniform LinearBVHNode nodes[],
                                const uniform Triangle triangles[]) {
    // Tile size; must match dx, dy in raytrace_tile_task.
    const uniform int dx = 64;
    const uniform int dy = 8;

    // Round up so partially covered edge tiles get a task as well.
    const uniform int tilesAcross = (width + (dx-1)) / dx;
    const uniform int tilesDown = (height + (dy-1)) / dy;
    const uniform int nTasks = tilesAcross * tilesDown;

    launch[nTasks] raytrace_tile_task(width, height, baseWidth, baseHeight,
                                      raster2camera, camera2world,
                                      image, id, nodes, triangles);
}
|
||||
|
||||
1
examples/portable/rt/sponza.bvh
Symbolic link
1
examples/portable/rt/sponza.bvh
Symbolic link
@@ -0,0 +1 @@
|
||||
../../rt/sponza.bvh
|
||||
1
examples/portable/rt/sponza.camera
Symbolic link
1
examples/portable/rt/sponza.camera
Symbolic link
@@ -0,0 +1 @@
|
||||
../../rt/sponza.camera
|
||||
1
examples/portable/rt/teapot.bvh
Symbolic link
1
examples/portable/rt/teapot.bvh
Symbolic link
@@ -0,0 +1 @@
|
||||
../../rt/teapot.bvh
|
||||
1
examples/portable/rt/teapot.camera
Symbolic link
1
examples/portable/rt/teapot.camera
Symbolic link
@@ -0,0 +1 @@
|
||||
../../rt/teapot.camera
|
||||
2
examples/portable/volume_rendering/.gitignore
vendored
Normal file
2
examples/portable/volume_rendering/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
volume
|
||||
*.ppm
|
||||
8
examples/portable/volume_rendering/Makefile_cpu
Normal file
8
examples/portable/volume_rendering/Makefile_cpu
Normal file
@@ -0,0 +1,8 @@
|
||||
|
||||
EXAMPLE=volume
|
||||
CPP_SRC=volume.cpp
|
||||
ISPC_SRC=volume.ispc
|
||||
ISPC_IA_TARGETS=avx1-i32x8
|
||||
ISPC_ARM_TARGETS=neon
|
||||
|
||||
include ../common_cpu.mk
|
||||
7
examples/portable/volume_rendering/Makefile_knc
Normal file
7
examples/portable/volume_rendering/Makefile_knc
Normal file
@@ -0,0 +1,7 @@
|
||||
EXAMPLE=volume
|
||||
CXX_SRC=volume.cpp
|
||||
ISPC_SRC=volume.ispc
|
||||
ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h
|
||||
ISPC_TARGET=generic-16
|
||||
|
||||
include ../common_knc.mk
|
||||
13
examples/portable/volume_rendering/Makefile_ptx
Normal file
13
examples/portable/volume_rendering/Makefile_ptx
Normal file
@@ -0,0 +1,13 @@
|
||||
PROG=volume
|
||||
ISPC_SRC=volume.ispc
|
||||
CU_SRC=volume.cu
|
||||
CXX_SRC=volume.cpp
|
||||
PTXCC_REGMAX=64
|
||||
|
||||
#LLVM_GPU=1
|
||||
NVVM_GPU=1
|
||||
|
||||
include ../common_ptx.mk
|
||||
|
||||
|
||||
|
||||
11
examples/portable/volume_rendering/camera.dat
Normal file
11
examples/portable/volume_rendering/camera.dat
Normal file
@@ -0,0 +1,11 @@
|
||||
896 1184
|
||||
|
||||
0.000155 0.000000 0.000000 -0.069927
|
||||
0.000000 -0.000155 0.000000 0.093236
|
||||
0.000000 0.000000 0.000000 1.000000
|
||||
0.000000 0.000000 -99.999001 100.000000
|
||||
|
||||
1.000000 0.000000 0.000000 1.000000
|
||||
0.000000 0.980129 -0.198360 2.900000
|
||||
0.000000 0.198360 0.980129 -10.500000
|
||||
0.000000 0.000000 0.000000 1.000000
|
||||
1
examples/portable/volume_rendering/density_highres.vol
Symbolic link
1
examples/portable/volume_rendering/density_highres.vol
Symbolic link
@@ -0,0 +1 @@
|
||||
../../volume_rendering/density_highres.vol
|
||||
1
examples/portable/volume_rendering/density_lowres.vol
Symbolic link
1
examples/portable/volume_rendering/density_lowres.vol
Symbolic link
@@ -0,0 +1 @@
|
||||
../../volume_rendering/density_lowres.vol
|
||||
183
examples/portable/volume_rendering/volume.cpp
Normal file
183
examples/portable/volume_rendering/volume.cpp
Normal file
@@ -0,0 +1,183 @@
|
||||
/*
|
||||
Copyright (c) 2011-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define NOMINMAX
|
||||
#pragma warning (disable: 4244)
|
||||
#pragma warning (disable: 4305)
|
||||
#endif
|
||||
|
||||
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include "timing.h"
#include "ispc_malloc.h"
#include "volume_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
/* Write the single-channel float image `buf` (width*height values) to the
   file `fn` as a binary PPM ("P6").  Each value is scaled by 255, clamped to
   [0, 255] and written three times (R, G and B), giving a grayscale image.
   If the file cannot be opened the error is reported with perror() and
   nothing is written. */
static void
writePPM(float *buf, int width, int height, const char *fn) {
    FILE *fp = fopen(fn, "wb");
    if (!fp) {
        // Without this check a failed fopen() would hand a NULL stream
        // to fprintf()/fputc() below.
        perror(fn);
        return;
    }
    fprintf(fp, "P6\n");
    fprintf(fp, "%d %d\n", width, height);
    fprintf(fp, "255\n");
    for (int i = 0; i < width*height; ++i) {
        float v = buf[i] * 255.f;
        // Written as !(v >= 0) so that NaN is also mapped to 0 instead of
        // reaching the (undefined) float -> unsigned char conversion.
        if (!(v >= 0.f)) v = 0.f;
        else if (v > 255.f) v = 255.f;
        unsigned char c = (unsigned char)v;
        for (int j = 0; j < 3; ++j)
            fputc(c, fp);
    }
    fclose(fp);
    printf("Wrote image file %s\n", fn);
}
|
||||
|
||||
|
||||
/* Read 16 whitespace-separated floats from `f` into the 4x4 matrix `m`,
   row by row.  Prints a diagnostic and exits if a value cannot be read. */
static void
readMatrix4x4(FILE *f, float m[4][4]) {
    for (int i = 0; i < 4; ++i) {
        for (int j = 0; j < 4; ++j) {
            if (fscanf(f, "%f", &m[i][j]) != 1) {
                fprintf(stderr, "Unexpected end of file in camera file\n");
                exit(1);
            }
        }
    }
}

/* Load image and viewing parameters from a camera data file.
   The file holds the image width and height followed by the raster2camera
   and camera2world matrices (16 floats each).  Any I/O or parse failure, or
   a non-positive image size, prints a diagnostic and terminates the program.
   FIXME: we should add support to be able to specify viewing parameters
   in the program here directly. */
static void
loadCamera(const char *fn, int *width, int *height, float raster2camera[4][4],
           float camera2world[4][4]) {
    FILE *f = fopen(fn, "r");
    if (!f) {
        perror(fn);
        exit(1);
    }
    if (fscanf(f, "%d %d", width, height) != 2) {
        fprintf(stderr, "Unexpected end of file in camera file\n");
        exit(1);
    }
    // The caller sizes the image buffer as width*height, so reject
    // resolutions that would make that size zero or negative.
    if (*width <= 0 || *height <= 0) {
        fprintf(stderr, "Invalid image resolution %d x %d in camera file\n",
                *width, *height);
        exit(1);
    }

    readMatrix4x4(f, raster2camera);
    readMatrix4x4(f, camera2world);
    fclose(f);
}
|
||||
|
||||
|
||||
/* Load a volume density file.  Expects the number of x, y, and z samples
   as the first three values (as integer strings), then x*y*z
   floating-point values (also as strings) to give the densities.

   The resolution is stored in n[0..2].  Returns a new[]-allocated array of
   n[0]*n[1]*n[2] densities owned by the caller.  Any I/O or parse failure,
   or a non-positive resolution, prints a diagnostic and terminates the
   program. */
static float *
loadVolume(const char *fn, int n[3]) {
    FILE *f = fopen(fn, "r");
    if (!f) {
        perror(fn);
        exit(1);
    }

    if (fscanf(f, "%d %d %d", &n[0], &n[1], &n[2]) != 3) {
        fprintf(stderr, "Couldn't find resolution at start of density file\n");
        exit(1);
    }
    // A zero or negative extent would produce a bogus allocation size below.
    if (n[0] <= 0 || n[1] <= 0 || n[2] <= 0) {
        fprintf(stderr, "Invalid resolution %d x %d x %d in density file\n",
                n[0], n[1], n[2]);
        exit(1);
    }

    int count = n[0] * n[1] * n[2];
    float *v = new float[count];
    for (int i = 0; i < count; ++i) {
        if (fscanf(f, "%f", &v[i]) != 1) {
            fprintf(stderr, "Unexpected end of file at %d'th density value\n", i);
            exit(1);
        }
    }

    // The stream was previously left open on the success path.
    fclose(f);
    return v;
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
static unsigned int test_iterations[] = {3, 7, 1};
|
||||
if (argc < 3) {
|
||||
fprintf(stderr, "usage: volume <camera.dat> <volume_density.vol> [ispc iterations] [tasks iterations] [serial iterations]\n");
|
||||
return 1;
|
||||
}
|
||||
if (argc == 6) {
|
||||
for (int i = 0; i < 3; i++) {
|
||||
test_iterations[i] = atoi(argv[3 + i]);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Load viewing data and the volume density data
|
||||
//
|
||||
int width, height;
|
||||
|
||||
float *camera2world_ispc = new float[4*4];
|
||||
float *raster2camera_ispc = new float[4*4];
|
||||
float (*camera2world )[4] = (float (*)[4])camera2world_ispc;
|
||||
float (*raster2camera)[4] = (float (*)[4])raster2camera_ispc;
|
||||
|
||||
loadCamera(argv[1], &width, &height, raster2camera, camera2world);
|
||||
float *image = new float[width*height];
|
||||
|
||||
int *n = new int[3];
|
||||
float *density = loadVolume(argv[2], n);
|
||||
|
||||
// Clear out the buffer
|
||||
for (int i = 0; i < width * height; ++i)
|
||||
image[i] = 0.;
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation that also uses
|
||||
// tasks; report the minimum time of three runs.
|
||||
//
|
||||
double minISPCtasks = 1e30;
|
||||
for (int i = 0; i < test_iterations[1]; ++i) {
|
||||
reset_and_start_timer();
|
||||
volume_ispc_tasks(density, n, raster2camera, camera2world,
|
||||
width, height, image);
|
||||
double dt = get_elapsed_msec();
|
||||
printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", dt);
|
||||
minISPCtasks = std::min(minISPCtasks, dt);
|
||||
}
|
||||
|
||||
printf("[volume ispc + tasks]:\t\t[%.3f] msec\n", minISPCtasks);
|
||||
writePPM(image, width, height, "volume-ispc-tasks.ppm");
|
||||
|
||||
return 0;
|
||||
}
|
||||
454
examples/portable/volume_rendering/volume.cu
Normal file
454
examples/portable/volume_rendering/volume.cu
Normal file
@@ -0,0 +1,454 @@
|
||||
/*
|
||||
Copyright (c) 2011-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "cuda_helpers.cuh"
|
||||
// Restrict v to the range [low, high]: first raise it to at least `low`,
// then cap the result at `high`.
__device__ static inline float clamp(float v, float low, float high)
{
    const float raised = max(v, low);
    return min(raised, high);
}
|
||||
|
||||
|
||||
// Every later use of `float3` in this file refers to the Float3 struct below
// (presumably so the ISPC-style code can use component-wise operators, which
// CUDA's own float3 does not provide -- TODO confirm).
#define float3 Float3

// Three-component float vector with component-wise arithmetic.
struct Float3
{
    float x, y, z;

    // Component-wise sum.
    __device__ friend Float3 operator+(const Float3 a, const Float3 b)
    {
        const Float3 r = { a.x + b.x, a.y + b.y, a.z + b.z };
        return r;
    }

    // Component-wise difference.
    __device__ friend Float3 operator-(const Float3 a, const Float3 b)
    {
        const Float3 r = { a.x - b.x, a.y - b.y, a.z - b.z };
        return r;
    }

    // Component-wise quotient.
    __device__ friend Float3 operator/(const Float3 a, const Float3 b)
    {
        const Float3 r = { a.x / b.x, a.y / b.y, a.z / b.z };
        return r;
    }

    // Component-wise product.
    __device__ friend Float3 operator*(const Float3 a, const Float3 b)
    {
        const Float3 r = { a.x * b.x, a.y * b.y, a.z * b.z };
        return r;
    }

    // Scale every component by the scalar b.
    __device__ friend Float3 operator*(const Float3 a, const float b)
    {
        const Float3 r = { a.x * b, a.y * b, a.z * b };
        return r;
    }
};
|
||||
|
||||
// A ray described by its starting point and its (not necessarily unit
// length) direction.
struct Ray {
    float3 origin;
    float3 dir;
};
|
||||
|
||||
|
||||
// Build the world-space ray through raster position (x, y).
// The raster point (x, y, 0) is taken to camera space with raster2camera
// (including the divide by w), the resulting vector is rotated into world
// space with the upper-left 3x3 of camera2world to give ray.dir, and
// ray.origin is the translation column of camera2world divided by its w.
__device__ static void
generateRay(const float raster2camera[4][4],
            const float camera2world[4][4],
            float x, float y, Ray &ray) {
    // Raster -> camera space, followed by the homogeneous divide
    const float w = raster2camera[3][3];
    const float cx = (raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3]) / w;
    const float cy = (raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3]) / w;
    const float cz = raster2camera[2][3] / w;

    // Direction: camera -> world, rotation part only
    ray.dir.x = camera2world[0][0] * cx + camera2world[0][1] * cy + camera2world[0][2] * cz;
    ray.dir.y = camera2world[1][0] * cx + camera2world[1][1] * cy + camera2world[1][2] * cz;
    ray.dir.z = camera2world[2][0] * cx + camera2world[2][1] * cy + camera2world[2][2] * cz;

    // Origin: camera position in world space
    const float ow = camera2world[3][3];
    ray.origin.x = camera2world[0][3] / ow;
    ray.origin.y = camera2world[1][3] / ow;
    ray.origin.z = camera2world[2][3] / ow;
}
|
||||
|
||||
|
||||
// True when p lies inside the axis-aligned box [pMin, pMax], boundaries
// included.
__device__ static inline bool
Inside(float3 p, float3 pMin, float3 pMax) {
    const bool withinX = p.x >= pMin.x && p.x <= pMax.x;
    const bool withinY = p.y >= pMin.y && p.y <= pMax.y;
    const bool withinZ = p.z >= pMin.z && p.z <= pMax.z;
    return withinX && withinY && withinZ;
}
|
||||
|
||||
|
||||
// Slab test of `ray` against the axis-aligned box [pMin, pMax].
// On a hit, stores the entry and exit ray parameters in hit0 / hit1 and
// returns true; otherwise returns false and leaves hit0 / hit1 untouched.
__device__ static bool
IntersectP(Ray ray, float3 pMin, float3 pMax, float &hit0, float &hit1) {
    // Ray parameters at which the ray crosses the two planes of each slab
    const float3 atMin = (pMin - ray.origin) / ray.dir;
    const float3 atMax = (pMax - ray.origin) / ray.dir;
    const float planeA[3] = { atMin.x, atMin.y, atMin.z };
    const float planeB[3] = { atMax.x, atMax.y, atMax.z };

    float tEnter = -1e30f;
    float tExit = 1e30f;

#pragma unroll
    for (int axis = 0; axis < 3; ++axis) {
        float tNear = planeA[axis];
        float tFar = planeB[axis];
        // Order the pair so that tNear is the closer crossing
        if (tNear > tFar) {
            const float held = tNear;
            tNear = tFar;
            tFar = held;
        }
        // Shrink the running [tEnter, tExit] interval to this slab
        tEnter = max(tNear, tEnter);
        tExit = min(tFar, tExit);
    }

    // An empty interval means the ray misses the box
    if (!(tEnter <= tExit))
        return false;

    hit0 = tEnter;
    hit1 = tExit;
    return true;
}
|
||||
|
||||
|
||||
// Linear interpolation: yields a at t == 0 and b at t == 1.
__device__ static inline float Lerp(float t, float a, float b) {
    const float weightA = 1.f - t;
    return weightA * a + t * b;
}
|
||||
|
||||
|
||||
// Fetch the density sample at integer voxel coordinates (x, y, z).
// Each coordinate is clamped to the valid range [0, nVoxels[i]-1], so
// lookups just outside the grid return the nearest edge sample.
// density[] is indexed with x varying fastest, then y, then z.
__device__ static inline float D(int x, int y, int z, int nVoxels[3],
                                 float density[]) {
    // Clamp with the integer min/max overloads.  The float clamp() helper
    // used before converted every index int -> float -> int.
    x = min(max(x, 0), nVoxels[0]-1);
    y = min(max(y, 0), nVoxels[1]-1);
    z = min(max(z, 0), nVoxels[2]-1);

    return density[z*nVoxels[0]*nVoxels[1] + y*nVoxels[0] + x];
}
|
||||
|
||||
|
||||
// Position of p relative to the box [pMin, pMax], per component:
// 0 at pMin and 1 at pMax.
__device__ static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
    const float3 fromMin = p - pMin;
    const float3 extent = pMax - pMin;
    return fromMin / extent;
}
|
||||
|
||||
|
||||
// Density at point Pobj, trilinearly interpolated from the voxel grid that
// fills the box [pMin, pMax].  Points outside the box have density 0.
__device__ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
                                       float density[], int nVoxels[3]) {
    if (!Inside(Pobj, pMin, pMax))
        return 0.f;

    // Continuous voxel coordinates; the -.5 shift places samples at voxel centers
    const float3 rel = Offset(Pobj, pMin, pMax);
    const float gx = rel.x * nVoxels[0] - .5f;
    const float gy = rel.y * nVoxels[1] - .5f;
    const float gz = rel.z * nVoxels[2] - .5f;

    // Integer cell (truncated toward zero) and fractional position within it
    const int ix = static_cast<int>(gx);
    const int iy = static_cast<int>(gy);
    const int iz = static_cast<int>(gz);
    const float fx = gx - ix;
    const float fy = gy - iy;
    const float fz = gz - iz;

    // Interpolate along x on the four cell edges parallel to the x axis ...
    const float edge00 = Lerp(fx, D(ix, iy,   iz,   nVoxels, density),
                                  D(ix+1, iy,   iz,   nVoxels, density));
    const float edge10 = Lerp(fx, D(ix, iy+1, iz,   nVoxels, density),
                                  D(ix+1, iy+1, iz,   nVoxels, density));
    const float edge01 = Lerp(fx, D(ix, iy,   iz+1, nVoxels, density),
                                  D(ix+1, iy,   iz+1, nVoxels, density));
    const float edge11 = Lerp(fx, D(ix, iy+1, iz+1, nVoxels, density),
                                  D(ix+1, iy+1, iz+1, nVoxels, density));

    // ... then along y on the two z faces, and finally along z
    const float faceNear = Lerp(fy, edge00, edge10);
    const float faceFar = Lerp(fy, edge01, edge11);
    return Lerp(fz, faceNear, faceFar);
}
|
||||
|
||||
|
||||
/* Transmittance between the points p0 and p1 through the volume occupying
   the box (pMin, pMax).  sigma_t is the attenuation coefficient and the
   medium is described by density[] with nVoxels[3] samples per dimension.
   The optical depth is accumulated by marching from p1 towards p0 in fixed
   world-space steps; the result is exp(-optical depth), or 1 if the segment's
   supporting ray misses the box. */
__device__ static inline float
transmittance(float3 p0, float3 p1, float3 pMin,
              float3 pMax, float sigma_t,
              float density[], int nVoxels[3]) {
    Ray ray;
    ray.origin = p1;
    ray.dir = p0 - p1;

    // Parametric range of the ray that lies inside the volume
    float tEnter, tExit;
    if (!IntersectP(ray, pMin, pMax, tEnter, tExit))
        return 1.f;
    tEnter = max(tEnter, 0.f);

    // Convert the world-space step length into a step in ray parameter t
    const float stepDist = 0.2f;
    const float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
                                 ray.dir.z * ray.dir.z);
    const float stepT = stepDist / rayLength;
    const float3 dirStep = ray.dir * stepT;

    // Accumulate optical depth in tau
    float tau = 0.0f;
    float3 pos = ray.origin + ray.dir * tEnter;
    for (float t = tEnter; t < tExit; t += stepT) {
        tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels);
        pos = pos + dirStep;
    }

    return exp(-tau);
}
|
||||
|
||||
|
||||
// Squared Euclidean distance between the points a and b.
__device__ static inline float
distanceSquared(float3 a, float3 b) {
    const float3 delta = a - b;
    const float xx = delta.x * delta.x;
    const float yy = delta.y * delta.y;
    const float zz = delta.z * delta.z;
    return xx + yy + zz;
}
|
||||
|
||||
|
||||
// March `ray` through the density volume and return the gamma-corrected
// radiance reaching the ray origin.  The volume bounds, light position and
// scattering parameters are fixed constants of this example.  Returns 0 when
// the ray misses the volume.
__device__ static inline float
raymarch(float density[], int nVoxels[3], Ray ray) {
    // Volume bounds and point-light position
    const float3 boxMin = {.3f, -.2f, .3f};
    const float3 boxMax = {1.8f, 2.3f, 1.8f};
    const float3 lightPos = { -1.f, 4.f, 1.5f };

    float tEnter, tExit;
    if (!IntersectP(ray, boxMin, boxMax, tEnter, tExit))
        return 0.f;
    tEnter = max(tEnter, 0.f);

    // Parameters that define the volume scattering characteristics and
    // sampling rate for raymarching
    const float Le = .25f;               // Emission coefficient
    const float sigma_a = 10.f;          // Absorption coefficient
    const float sigma_s = 10.f;          // Scattering coefficient
    const float stepDist = 0.025f;       // Ray step amount
    const float lightIntensity = 40.0f;  // Light source intensity

    // World-space step length expressed in ray parameter units
    const float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
                                 ray.dir.z * ray.dir.z);
    const float stepT = stepDist / rayLength;
    const float3 dirStep = ray.dir * stepT;

    float tau = 0.f;  // accumulated optical depth along the ray
    float L = 0.f;    // radiance along the ray
    float3 pos = ray.origin + ray.dir * tEnter;
    for (float t = tEnter; t < tExit; t += stepT) {
        const float d = Density(pos, boxMin, boxMax, density, nVoxels);

        // Stop once what lies behind is attenuated to almost nothing
        const float atten = exp(-tau);
        if (atten < .005f)
            break;

        // Direct lighting from the point light, attenuated by the volume
        const float Li = lightIntensity / distanceSquared(lightPos, pos) *
            transmittance(lightPos, pos, boxMin, boxMax, sigma_a + sigma_s,
                          density, nVoxels);
        L += stepDist * atten * d * sigma_s * (Li + Le);

        // Update the optical depth and advance to the next sample
        tau += stepDist * (sigma_a + sigma_s) * d;
        pos = pos + dirStep;
    }

    // Gamma correction
    return pow(L, 1.f / 2.2f);
}
|
||||
|
||||
|
||||
/* Utility routine used by the task entry point.
   Renders the tile of the image covering [x0,x1) * [y0,y1), storing the
   result into the image[] array (row-major, `width` pixels per row).

   The tile is walked in 8x8-pixel blocks.  The 64 pixels of a block are
   handled programCount at a time: in each pass of the innermost loop this
   program instance takes the block-local pixel index ob + programIndex.
   Pixels that fall outside the tile are not written.
*/
__device__ static void
volume_tile(int x0, int y0, int x1,
            int y1, float density[], int nVoxels[3],
            const float raster2camera[4][4],
            const float camera2world[4][4],
            int width, int height, float image[]) {
    // A block-local index in [0,63] is split into a 4x4 quadrant of the
    // 8x8 block (index / 16) and a pixel inside that quadrant (index & 15).
    const int quadX[4] = {0, 4, 0, 4};
    const int quadY[4] = {0, 0, 4, 4};
    const int cellX[16] = { 0, 1, 0, 1, 2, 3, 2, 3,
                            0, 1, 0, 1, 2, 3, 2, 3 };
    const int cellY[16] = { 0, 0, 1, 1, 0, 0, 1, 1,
                            2, 2, 3, 3, 2, 2, 3, 3 };

    for (int blockY = y0; blockY < y1; blockY += 8) {
        for (int blockX = x0; blockX < x1; blockX += 8) {
            for (int ob = 0; ob < 64; ob += programCount) {
                // Pixel handled by this program instance in this pass
                const int local = ob + programIndex;
                const int quad = local / 16;
                const int cell = local & 15;
                const int px = blockX + quadX[quad] + cellX[cell];
                const int py = blockY + quadY[quad] + cellY[cell];

                // Use viewing parameters to compute the corresponding ray
                // for the pixel
                Ray ray;
                generateRay(raster2camera, camera2world, px, py, ray);

                // Raymarch through the volume, but only for pixels that
                // actually belong to the tile
                if (px < x1 && py < y1)
                    image[py * width + px] = raymarch(density, nVoxels, ray);
            }
        }
    }
}
|
||||
|
||||
|
||||
/* Task kernel: renders the (dx x dy)-pixel image tile selected by this
   task's index.  Tiles are numbered row by row across the image; tiles on
   the right and bottom edges are clipped to the image size.

   The previous version carried a disabled (#if 0) block that copied the
   arguments into locals, and aliased the parameters with #define macros that
   were never #undef'd and therefore leaked into the rest of the file.  The
   parameters are now simply named directly. */
__global__ void
volume_task(float density[], int nVoxels[3],
            const float raster2camera[4][4],
            const float camera2world[4][4],
            int width, int height, float image[]) {
    // taskIndex0 / taskCount0 / taskIndex come from cuda_helpers.cuh;
    // instances beyond the launched task count have nothing to do.
    if (taskIndex0 >= taskCount0) return;

    const int dx = 8, dy = 8; // must match value in volume_ispc_tasks___export
    const int xbuckets = (width + (dx-1)) / dx;

    // Top-left corner of this task's tile
    const int x0 = (taskIndex % xbuckets) * dx;
    const int y0 = (taskIndex / xbuckets) * dy;

    // Bottom-right corner (exclusive), clipped to the image
    const int x1 = min(x0 + dx, width);
    const int y1 = min(y0 + dy, height);

    volume_tile(x0, y0, x1, y1, density, nVoxels, raster2camera,
                camera2world, width, height, image);
}
|
||||
|
||||
|
||||
// Device-side entry point: launches one volume_task per (dx,dy)-sized tile of
// the image through the launch() helper, then waits for the launched work
// with cudaDeviceSynchronize().  dx and dy must match the values used in
// volume_task.
extern "C"
__global__ void
volume_ispc_tasks___export( float density[], int nVoxels[3],
                            const float raster2camera[4][4],
                            const float camera2world[4][4],
                            int width, int height, float image[]) {
    const int dx = 8;
    const int dy = 8;

    // Round up so that partial tiles at the image edges are covered too
    const int tilesAcross = (width + (dx-1)) / dx;
    const int tilesDown = (height + (dy-1)) / dy;
    const int nTasks = tilesAcross * tilesDown;

    launch(nTasks,1,1,volume_task)
        (density, nVoxels, raster2camera, camera2world,
         width, height, image);
    cudaDeviceSynchronize();
}
|
||||
|
||||
// Host-side entry point with the same signature as the ISPC export: starts
// the device-side driver kernel with one block of 32 threads and blocks
// until the device has finished.
extern "C"
__host__ void
volume_ispc_tasks( float density[], int nVoxels[3],
                   const float raster2camera[4][4],
                   const float camera2world[4][4],
                   int width, int height, float image[]) {
    volume_ispc_tasks___export<<<1,32>>>(density, nVoxels,
                                         raster2camera, camera2world,
                                         width, height, image);
    cudaDeviceSynchronize();
}
|
||||
413
examples/portable/volume_rendering/volume.ispc
Normal file
413
examples/portable/volume_rendering/volume.ispc
Normal file
@@ -0,0 +1,413 @@
|
||||
/*
|
||||
Copyright (c) 2011-2014, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
// Three-component short vector of floats.
typedef float<3> float3;

// A ray described by its starting point and its (not necessarily unit
// length) direction.
struct Ray {
    float3 origin;
    float3 dir;
};
|
||||
|
||||
|
||||
// Build the world-space ray through raster position (x, y).
// The raster point (x, y, 0) is taken to camera space with raster2camera
// (including the divide by w), the resulting vector is rotated into world
// space with the upper-left 3x3 of camera2world to give ray.dir, and
// ray.origin is the translation column of camera2world divided by its w.
static inline void
generateRay(const uniform float raster2camera[4][4],
            const uniform float camera2world[4][4],
            float x, float y, Ray &ray) {
    // Raster -> camera space, followed by the homogeneous divide
    const float w = raster2camera[3][3];
    const float cx = (raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3]) / w;
    const float cy = (raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3]) / w;
    const float czRaw = raster2camera[2][3];
    const float cz = czRaw / w;

    // Direction: camera -> world, rotation part only
    ray.dir.x = camera2world[0][0] * cx + camera2world[0][1] * cy + camera2world[0][2] * cz;
    ray.dir.y = camera2world[1][0] * cx + camera2world[1][1] * cy + camera2world[1][2] * cz;
    ray.dir.z = camera2world[2][0] * cx + camera2world[2][1] * cy + camera2world[2][2] * cz;

    // Origin: camera position in world space
    const uniform float ow = camera2world[3][3];
    ray.origin.x = camera2world[0][3] / ow;
    ray.origin.y = camera2world[1][3] / ow;
    ray.origin.z = camera2world[2][3] / ow;
}
|
||||
|
||||
|
||||
// True when p lies inside the axis-aligned box [pMin, pMax], boundaries
// included.
static inline bool
Inside(float3 p, float3 pMin, float3 pMax) {
    const bool withinX = p.x >= pMin.x && p.x <= pMax.x;
    const bool withinY = p.y >= pMin.y && p.y <= pMax.y;
    const bool withinZ = p.z >= pMin.z && p.z <= pMax.z;
    return withinX && withinY && withinZ;
}
|
||||
|
||||
|
||||
// Slab test of `ray` against the axis-aligned box [pMin, pMax].
// On a hit, stores the entry and exit ray parameters in hit0 / hit1 and
// returns true; otherwise returns false and leaves hit0 / hit1 untouched.
static inline bool
IntersectP(Ray ray, float3 pMin, float3 pMax, float &hit0, float &hit1) {
    // Ray parameters at which the ray crosses the two planes of each slab;
    // each pair is ordered below so that lo holds the closer crossing.
    float3 lo = (pMin - ray.origin) / ray.dir;
    float3 hi = (pMax - ray.origin) / ray.dir;

    float tEnter = -1e30;
    float tExit = 1e30;

    // X slab
    if (lo.x > hi.x) {
        const float held = lo.x;
        lo.x = hi.x;
        hi.x = held;
    }
    tEnter = max(lo.x, tEnter);
    tExit = min(hi.x, tExit);

    // Y slab
    if (lo.y > hi.y) {
        const float held = lo.y;
        lo.y = hi.y;
        hi.y = held;
    }
    tEnter = max(lo.y, tEnter);
    tExit = min(hi.y, tExit);

    // Z slab
    if (lo.z > hi.z) {
        const float held = lo.z;
        lo.z = hi.z;
        hi.z = held;
    }
    tEnter = max(lo.z, tEnter);
    tExit = min(hi.z, tExit);

    // An empty interval means the ray misses the box
    if (!(tEnter <= tExit))
        return false;

    hit0 = tEnter;
    hit1 = tExit;
    return true;
}
|
||||
|
||||
|
||||
// Linear interpolation: yields a at t == 0 and b at t == 1.
static inline float Lerp(float t, float a, float b) {
    const float weightA = 1.f - t;
    return weightA * a + t * b;
}
|
||||
|
||||
|
||||
// Fetch the density sample at integer voxel coordinates (x, y, z).
// Each coordinate is clamped to the valid range [0, nVoxels[i]-1], so
// lookups just outside the grid return the nearest edge sample.
// density[] is indexed with x varying fastest, then y, then z.
static inline float D(int x, int y, int z, uniform int nVoxels[3],
                      uniform float density[]) {
    const int cx = clamp(x, 0, nVoxels[0]-1);
    const int cy = clamp(y, 0, nVoxels[1]-1);
    const int cz = clamp(z, 0, nVoxels[2]-1);

    const int index = cz*nVoxels[0]*nVoxels[1] + cy*nVoxels[0] + cx;
    return density[index];
}
|
||||
|
||||
|
||||
// Position of p relative to the box [pMin, pMax], per component:
// 0 at pMin and 1 at pMax.
static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
    const float3 fromMin = p - pMin;
    const float3 extent = pMax - pMin;
    return fromMin / extent;
}
|
||||
|
||||
|
||||
// Density at point Pobj, trilinearly interpolated from the voxel grid that
// fills the box [pMin, pMax].  Points outside the box have density 0.
static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
                            uniform float density[], uniform int nVoxels[3]) {
    if (!Inside(Pobj, pMin, pMax))
        return 0;

    // Continuous voxel coordinates; the -.5 shift places samples at voxel centers
    const float3 rel = Offset(Pobj, pMin, pMax);
    const float gx = rel.x * nVoxels[0] - .5f;
    const float gy = rel.y * nVoxels[1] - .5f;
    const float gz = rel.z * nVoxels[2] - .5f;

    // Integer cell (truncated toward zero) and fractional position within it
    const int ix = (int)(gx);
    const int iy = (int)(gy);
    const int iz = (int)(gz);
    const float fx = gx - ix;
    const float fy = gy - iy;
    const float fz = gz - iz;

    // Interpolate along x on the four cell edges parallel to the x axis ...
    const float edge00 = Lerp(fx, D(ix, iy,   iz,   nVoxels, density),
                                  D(ix+1, iy,   iz,   nVoxels, density));
    const float edge10 = Lerp(fx, D(ix, iy+1, iz,   nVoxels, density),
                                  D(ix+1, iy+1, iz,   nVoxels, density));
    const float edge01 = Lerp(fx, D(ix, iy,   iz+1, nVoxels, density),
                                  D(ix+1, iy,   iz+1, nVoxels, density));
    const float edge11 = Lerp(fx, D(ix, iy+1, iz+1, nVoxels, density),
                                  D(ix+1, iy+1, iz+1, nVoxels, density));

    // ... then along y on the two z faces, and finally along z
    const float faceNear = Lerp(fy, edge00, edge10);
    const float faceFar = Lerp(fy, edge01, edge11);
    return Lerp(fz, faceNear, faceFar);
}
|
||||
|
||||
|
||||
/* Transmittance between the points p0 and p1 through the volume occupying
   the box (pMin, pMax).  sigma_t is the attenuation coefficient and the
   medium is described by density[] with nVoxels[3] samples per dimension.
   The optical depth is accumulated by marching from p1 towards p0 in fixed
   world-space steps; the result is exp(-optical depth), or 1 if the segment's
   supporting ray misses the box. */
static inline float
transmittance(uniform float3 p0, float3 p1, uniform float3 pMin,
              uniform float3 pMax, uniform float sigma_t,
              uniform float density[], uniform int nVoxels[3]) {
    Ray ray;
    ray.origin = p1;
    ray.dir = p0 - p1;

    // Parametric range of the ray that lies inside the volume
    float tEnter, tExit;
    if (!IntersectP(ray, pMin, pMax, tEnter, tExit))
        return 1.;
    tEnter = max(tEnter, 0.f);

    // Convert the world-space step length into a step in ray parameter t
    const uniform float stepDist = 0.2;
    const float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
                                 ray.dir.z * ray.dir.z);
    const float stepT = stepDist / rayLength;
    const float3 dirStep = ray.dir * stepT;

    // Accumulate optical depth in tau
    float tau = 0;
    float3 pos = ray.origin + ray.dir * tEnter;
    for (float t = tEnter; t < tExit; t += stepT) {
        tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels);
        pos = pos + dirStep;
    }

    return exp(-tau);
}
|
||||
|
||||
|
||||
/* Squared Euclidean distance between the points a and b. */
static inline float
distanceSquared(float3 a, float3 b) {
    const float3 delta = a - b;
    return delta.x * delta.x + delta.y * delta.y + delta.z * delta.z;
}
|
||||
|
||||
|
||||
/* Marches `ray` through the density grid and returns the gamma-corrected
   radiance for it.  The volume bounds, light position and scattering
   parameters are fixed constants below.  Returns 0 when the ray misses
   the volume. */
static inline float
raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
    // Scene description: volume bounding box and point light position.
    const uniform float3 pMin = {.3, -.2, .3}, pMax = {1.8, 2.3, 1.8};
    const uniform float3 lightPos = { -1, 4, 1.5 };

    float tEnter, tExit;
    if (!IntersectP(ray, pMin, pMax, tEnter, tExit))
        return 0.;
    tEnter = max(tEnter, 0.f);

    // Scattering characteristics of the medium and the sampling rate.
    const uniform float Le = .25;             // Emission coefficient
    const uniform float sigma_a = 10;         // Absorption coefficient
    const uniform float sigma_s = 10;         // Scattering coefficient
    const uniform float stepDist = 0.025;     // Ray step amount
    const uniform float lightIntensity = 40;  // Light source intensity

    // Convert the world-space step into a step in ray parameter t.
    float dirLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
                           ray.dir.z * ray.dir.z);
    float tStep = stepDist / dirLength;

    float3 samplePos = ray.origin + ray.dir * tEnter;
    float3 posStep = ray.dir * tStep;

    float tau = 0.f;      // optical thickness accumulated so far
    float radiance = 0;   // radiance gathered along the ray

    for (float t = tEnter; t < tExit; t += tStep) {
        float localDensity = Density(samplePos, pMin, pMax, density, nVoxels);

        // Stop once almost nothing further along the ray can reach the eye.
        float atten = exp(-tau);
        if (atten < .005)
            break;

        // Direct lighting: inverse-square falloff times the transmittance
        // between the light and this sample.
        float Li = lightIntensity / distanceSquared(lightPos, samplePos) *
            transmittance(lightPos, samplePos, pMin, pMax, sigma_a + sigma_s,
                          density, nVoxels);
        radiance += stepDist * atten * localDensity * sigma_s * (Li + Le);

        // Extinction contributed by this step.
        tau += stepDist * (sigma_a + sigma_s) * localDensity;

        samplePos = samplePos + posStep;
    }

    // Gamma correction
    return pow(radiance, 1.f / 2.2f);
}
|
||||
|
||||
|
||||
/* Utility routine used by both the task-based and the single-core entrypoints.
   Renders the pixels covering [x0,x1) * [y0,y1), storing the result into
   the image[] array.  Pixel (x, y) is written to image[y * width + x].
   `height` is accepted for symmetry with the callers but is not used here.
 */
static inline void
volume_tile(uniform int x0, uniform int y0, uniform int x1,
            uniform int y1, uniform float density[], uniform int nVoxels[3],
            const uniform float raster2camera[4][4],
            const uniform float camera2world[4][4],
            uniform int width, uniform int height, uniform float image[]) {
    // foreach_tiled iterates over exactly the requested ranges, so the tile
    // extents (x1-x0) and (y1-y0) do not have to be a multiple of any block
    // size.  (An older, disabled variant walked hand-built 4x4 pixel blocks
    // and carried that restriction; it has been removed.)
    foreach_tiled (y = y0 ... y1, x = x0 ... x1) {
        // Use the viewing parameters to compute the ray for this pixel.
        Ray ray;
        generateRay(raster2camera, camera2world, x, y, ray);

        // Raymarch through the volume to compute the pixel's value.
        int offset = y * width + x;
        image[offset] = raymarch(density, nVoxels, ray);
    }
}
|
||||
|
||||
|
||||
/* Task entry point: renders the 8x8 pixel tile selected by taskIndex.
   Tiles are numbered row by row across the image; tiles on the right and
   bottom edges are clipped to the image size. */
task void
volume_task(uniform float density[], uniform int _nVoxels[3],
            const uniform float _raster2camera[4][4],
            const uniform float _camera2world[4][4],
            uniform int width, uniform int height, uniform float image[])
{
    if (taskIndex >= taskCount) return;

#if 1 /* cannot pass shared memory pointers to functions, need to find a way to solve this one :S */
    // Work on task-local copies of the small parameter arrays.
    uniform int nVoxels[3];
    for (uniform int axis = 0; axis < 3; axis++)
        nVoxels[axis] = _nVoxels[axis];

    uniform float raster2camera[4][4];
    for (uniform int row = 0; row < 4; row++)
        for (uniform int col = 0; col < 4; col++)
            raster2camera[row][col] = _raster2camera[row][col];

    uniform float camera2world[4][4];
    for (uniform int row = 0; row < 4; row++)
        for (uniform int col = 0; col < 4; col++)
            camera2world[row][col] = _camera2world[row][col];
#else
#define nVoxels _nVoxels
#define raster2camera _raster2camera
#define camera2world _camera2world
#endif

    // Tile size; must match the value in volume_ispc_tasks.
    const uniform int dx = 8, dy = 8;
    const uniform int tilesPerRow = (width + (dx-1)) / dx;

    // Map the linear task index onto a tile column and row.
    const uniform int tileCol = taskIndex % tilesPerRow;
    const uniform int tileRow = taskIndex / tilesPerRow;

    const uniform int x0 = tileCol * dx;
    const uniform int y0 = tileRow * dy;
    const uniform int x1 = min(x0 + dx, width);
    const uniform int y1 = min(y0 + dy, height);

    volume_tile(x0, y0, x1, y1, density, nVoxels, raster2camera,
                camera2world, width, height, image);
}
|
||||
|
||||
|
||||
/* Single-core entry point: renders the whole width x height image as one
   tile, without launching any tasks. */
export void
volume_ispc(uniform float density[], uniform int nVoxels[3],
            const uniform float raster2camera[4][4],
            const uniform float camera2world[4][4],
            uniform int width, uniform int height, uniform float image[]) {
    // The "tile" is the full image: [0,width) * [0,height).
    volume_tile(0, 0, width, height,
                density, nVoxels,
                raster2camera, camera2world,
                width, height, image);
}
|
||||
|
||||
|
||||
/* Task-parallel entry point: splits the image into (dx,dy)-sized tiles,
   launches one volume_task per tile and waits for all of them. */
export void
volume_ispc_tasks(uniform float density[], uniform int nVoxels[3],
                  const uniform float raster2camera[4][4],
                  const uniform float camera2world[4][4],
                  uniform int width, uniform int height, uniform float image[]) {
    // Tile size; volume_task hard-codes the same values.
    const uniform int dx = 8, dy = 8;

    // Round up so partially covered tiles on the edges get a task too.
    const uniform int tilesAcross = (width + (dx-1)) / dx;
    const uniform int tilesDown = (height + (dy-1)) / dy;
    const uniform int nTasks = tilesAcross * tilesDown;

    launch[nTasks] volume_task(density, nVoxels, raster2camera, camera2world,
                               width, height, image);
    sync;
}
|
||||
@@ -37,6 +37,7 @@
|
||||
#include <stdlib.h>
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include <iomanip>
|
||||
#include "../timing.h"
|
||||
#include "sort_ispc.h"
|
||||
@@ -45,26 +46,28 @@ using namespace ispc;
|
||||
|
||||
extern void sort_serial (int n, unsigned int code[], int order[]);
|
||||
|
||||
/* progress bar by Ross Hemsley;
|
||||
* http://www.rosshemsley.co.uk/2011/02/creating-a-progress-bar-in-c-or-any-other-console-app/ */
|
||||
static inline void progressbar (unsigned int x, unsigned int n, unsigned int w = 50)
|
||||
static void progressBar(const int x, const int n, const int width = 50)
|
||||
{
|
||||
if (n < 100)
|
||||
{
|
||||
x *= 100/n;
|
||||
n = 100;
|
||||
}
|
||||
assert(n > 1);
|
||||
assert(x >= 0 && x < n);
|
||||
assert(width > 10);
|
||||
const float f = static_cast<float>(x)/(n-1);
|
||||
const int w = static_cast<int>(f * width);
|
||||
|
||||
if ((x != n) && (x % (n/100) != 0)) return;
|
||||
// print bar
|
||||
std::string bstr("[");
|
||||
for (int i = 0; i < width; i++)
|
||||
bstr += i < w ? '=' : ' ';
|
||||
bstr += "]";
|
||||
|
||||
using namespace std;
|
||||
float ratio = x/(float)n;
|
||||
int c = ratio * w;
|
||||
// print percentage
|
||||
char pstr0[32];
|
||||
sprintf(pstr0, " %2d %c ", static_cast<int>(f*100.0),'%');
|
||||
const std::string pstr(pstr0);
|
||||
std::copy(pstr.begin(), pstr.end(), bstr.begin() + (width/2-2));
|
||||
|
||||
cout << setw(3) << (int)(ratio*100) << "% [";
|
||||
for (int x=0; x<c; x++) cout << "=";
|
||||
for (int x=c; x<w; x++) cout << " ";
|
||||
cout << "]\r" << flush;
|
||||
std::cout << bstr;
|
||||
std::cout << (x == n-1 ? "\n" : "\r") << std::flush;
|
||||
}
|
||||
|
||||
int main (int argc, char *argv[])
|
||||
@@ -87,7 +90,7 @@ int main (int argc, char *argv[])
|
||||
tISPC1 += get_elapsed_mcycles();
|
||||
|
||||
if (argc != 3)
|
||||
progressbar (i, m);
|
||||
progressBar (i, m);
|
||||
}
|
||||
|
||||
printf("[sort ispc]:\t[%.3f] million cycles\n", tISPC1);
|
||||
@@ -105,7 +108,7 @@ int main (int argc, char *argv[])
|
||||
tISPC2 += get_elapsed_mcycles();
|
||||
|
||||
if (argc != 3)
|
||||
progressbar (i, m);
|
||||
progressBar (i, m);
|
||||
}
|
||||
|
||||
printf("[sort ispc + tasks]:\t[%.3f] million cycles\n", tISPC2);
|
||||
@@ -123,7 +126,7 @@ int main (int argc, char *argv[])
|
||||
tSerial += get_elapsed_mcycles();
|
||||
|
||||
if (argc != 3)
|
||||
progressbar (i, m);
|
||||
progressBar (i, m);
|
||||
}
|
||||
|
||||
printf("[sort serial]:\t\t[%.3f] million cycles\n", tSerial);
|
||||
|
||||
@@ -960,17 +960,22 @@ InitTaskSystem() {
|
||||
|
||||
inline void
|
||||
TaskGroup::Launch(int baseIndex, int count) {
|
||||
#pragma omp parallel for
|
||||
for(int i = 0; i < count; i++) {
|
||||
#pragma omp parallel
|
||||
{
|
||||
const int threadIndex = omp_get_thread_num();
|
||||
const int threadCount = omp_get_num_threads();
|
||||
|
||||
#pragma omp for schedule(runtime)
|
||||
for(int i = 0; i < count; i++)
|
||||
{
|
||||
TaskInfo *ti = GetTaskInfo(baseIndex + i);
|
||||
|
||||
// Actually run the task.
|
||||
int threadIndex = omp_get_thread_num();
|
||||
int threadCount = omp_get_num_threads();
|
||||
ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(),
|
||||
ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(),
|
||||
ti->taskCount0(), ti->taskCount1(), ti->taskCount2());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void
|
||||
|
||||
@@ -58,6 +58,7 @@ __inline__ uint64_t rdtsc() {
|
||||
|
||||
#ifdef WIN32
|
||||
#include <windows.h>
|
||||
double rtc();
|
||||
#define rdtsc __rdtsc
|
||||
#else // WIN32
|
||||
__inline__ uint64_t rdtsc() {
|
||||
@@ -72,14 +73,30 @@ __inline__ uint64_t rdtsc() {
|
||||
__asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high));
|
||||
return (uint64_t)high << 32 | low;
|
||||
}
|
||||
|
||||
#include <sys/time.h>
|
||||
/* Returns the current wall-clock time in seconds, with microsecond
   resolution, as reported by gettimeofday(). */
static inline double rtc(void)
{
    struct timeval now;

    /* POSIX leaves the behaviour unspecified when the timezone argument is
       non-null, so pass a null pointer instead of a dummy struct timezone. */
    gettimeofday(&now, 0);

    return (double) now.tv_sec + 1.e-6 * ((double) now.tv_usec);
}
|
||||
|
||||
#endif // !WIN32
|
||||
#endif // !__arm__
|
||||
|
||||
static uint64_t start, end;
|
||||
static uint64_t start, end;
|
||||
static double tstart, tend;
|
||||
|
||||
/* Starts (or restarts) both timers: stores the current rdtsc() cycle count
   in `start` and the current rtc() time in `tstart`. */
static inline void reset_and_start_timer()
|
||||
{
|
||||
start = rdtsc();
|
||||
tstart = rtc();
|
||||
}
|
||||
|
||||
/* Returns the number of millions of elapsed processor cycles since the
|
||||
@@ -89,3 +106,9 @@ static inline double get_elapsed_mcycles()
|
||||
end = rdtsc();
|
||||
return (end-start) / (1024. * 1024.);
|
||||
}
|
||||
|
||||
/* Returns the time elapsed since `tstart` was last recorded (see
   reset_and_start_timer), scaled by 1e3; with rtc() reporting seconds this
   is milliseconds.  Also stores the current rtc() time in `tend`. */
static inline double get_elapsed_msec()
|
||||
{
|
||||
tend = rtc();
|
||||
return (tend - tstart)*1e3;
|
||||
}
|
||||
|
||||
58
examples/util/cuda_helpers.cuh
Normal file
58
examples/util/cuda_helpers.cuh
Normal file
@@ -0,0 +1,58 @@
|
||||
/*
|
||||
Copyright (c) 2014, Evghenii Gaburov
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
// Names below mirror identifiers used by the ISPC examples (programIndex,
// taskIndex, launch, sync, ...) so that hand-written CUDA can use the same
// spelling.  NOTE(review): presumably intended for CUDA ports of the ISPC
// examples -- confirm against the files that include this header.

// Width of one group of program instances: 32 threads.
#define programCount 32
|
||||
// Index of this thread within its group of 32 (low five bits of threadIdx.x).
#define programIndex (threadIdx.x & 31)
|
||||
// Each block holds four groups of 32 threads along x, so the task index
// along dimension 0 is four per block plus the group number in the block.
#define taskIndex0 (blockIdx.x*4 + (threadIdx.x >> 5))
|
||||
#define taskCount0 (gridDim.x*4)
|
||||
// Dimensions 1 and 2 map directly onto the grid's y and z block indices.
#define taskIndex1 (blockIdx.y)
|
||||
#define taskCount1 (gridDim.y)
|
||||
#define taskIndex2 (blockIdx.z)
|
||||
#define taskCount2 (gridDim.z)
|
||||
// Linearised task index / total task count over all three dimensions.
#define taskIndex (taskIndex0 + taskCount0*(taskIndex1 + taskCount1*taskIndex2))
|
||||
#define taskCount (taskCount0*taskCount1*taskCount2)
|
||||
// Number of this thread's group of 32 within the block.
#define warpIdx (threadIdx.x >> 5)
|
||||
// Launches `func` with 128 threads per block and ceil(ntx/4) blocks along x,
// matching the four-groups-per-block layout assumed by taskIndex0.  Only the
// thread with programIndex 0 issues the launch.  The macro expands to an
// unbraced `if`, so do not follow the launch statement with an `else`.
#define launch(ntx,nty,ntz,func) if (programIndex==0) func<<<dim3(((ntx)+4-1)/4,nty,ntz),128>>>
|
||||
// Waits for outstanding device work via cudaDeviceSynchronize().
#define sync cudaDeviceSynchronize()
|
||||
// `cif` is simply an alias for a plain `if`.
#define cif if
|
||||
/* 64-bit overload of __shfl: returns the double held in `x` by the thread
   whose lane index is `lane`.  The value is split into two 32-bit halves
   that are shuffled separately with the built-in integer __shfl and then
   reassembled. */
__device__ __forceinline__ static double __shfl(double x, int lane)
{
  /* Use the indexed shuffle, not __shfl_xor: the XOR variant exchanges with
     lane (laneId ^ lane), which would make this overload behave differently
     from the built-in 32-bit __shfl that shuffle()/broadcast() expand to for
     every other type. */
  const int hi = __shfl(__double2hiint(x), lane);
  const int lo = __shfl(__double2loint(x), lane);
  return __hiloint2double(hi, lo);
}

/* shuffle(x,y) and broadcast(x,y) both read the value of x from lane y. */
#define shuffle(x,y) __shfl(x,y)
#define broadcast(x,y) __shfl(x,y)
|
||||
87
examples/util/ispc_malloc.cpp
Normal file
87
examples/util/ispc_malloc.cpp
Normal file
@@ -0,0 +1,87 @@
|
||||
/*
|
||||
Copyright (c) 2014, Evghenii Gaburov
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <cstring>
|
||||
#include "ispc_malloc.h"
|
||||
|
||||
#ifdef _CUDA_
|
||||
|
||||
void * operator new(size_t size) throw(std::bad_alloc)
|
||||
{
|
||||
void *ptr;
|
||||
ispc_malloc(&ptr, size);
|
||||
return ptr;
|
||||
}
|
||||
void operator delete(void *ptr) throw()
|
||||
{
|
||||
ispc_free(ptr);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Host-only fallbacks used when the examples are built without _CUDA_:
   plain C heap allocation, and device limits that do nothing. */

/* Stores a freshly malloc()'d buffer of `size` bytes in *ptr. */
void ispc_malloc(void **ptr, const size_t size)
{
  void *const block = std::malloc(size);
  *ptr = block;
}

/* Releases a buffer obtained from ispc_malloc(). */
void ispc_free(void *ptr)
{
  std::free(ptr);
}

/* Fills `size` bytes at `ptr` with the byte `value`. */
void ispc_memset(void *ptr, int value, size_t size)
{
  std::memset(ptr, value, size);
}

/* No heap limit to configure on the host; the request is ignored. */
void ispcSetMallocHeapLimit(size_t /*value*/)
{
}

/* No stack limit to configure on the host; the request is ignored. */
void ispcSetStackLimit(size_t /*value*/)
{
}

/* Reports "no limit": -1 converted to unsigned long long (all bits set). */
unsigned long long ispcGetMallocHeapLimit()
{
  return static_cast<unsigned long long>(-1);
}

/* Reports "no limit": -1 converted to unsigned long long (all bits set). */
unsigned long long ispcGetStackLimit()
{
  return static_cast<unsigned long long>(-1);
}

/* Copies `num` bytes from `src` to `dest` and returns `dest`. */
void * ispcMemcpy(void *dest, void *src, size_t num)
{
  return std::memcpy(dest, src, num);
}
|
||||
|
||||
#endif
|
||||
43
examples/util/ispc_malloc.h
Normal file
43
examples/util/ispc_malloc.h
Normal file
@@ -0,0 +1,43 @@
|
||||
/*
|
||||
Copyright (c) 2014, Evghenii Gaburov
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <stddef.h>  /* size_t -- keeps this header usable on its own */

/* Memory helpers shared by the example host programs.  Two implementations
   exist: a plain host one (malloc/free/memset/memcpy) and a CUDA one. */

/* Allocates `size` bytes and stores the address in *ptr. */
extern void ispc_malloc(void **ptr, const size_t size);

/* Releases memory obtained from ispc_malloc(). */
extern void ispc_free(void *ptr);

/* Fills `size` bytes at `ptr` with the byte `value`. */
extern void ispc_memset(void *ptr, int value, size_t size);

/* Sets the device malloc-heap / stack limits; no-ops in the host build. */
extern void ispcSetMallocHeapLimit(size_t value);
extern void ispcSetStackLimit(size_t value);

/* Queries the limits above; the host build returns -1 converted to
   unsigned long long. */
extern unsigned long long ispcGetMallocHeapLimit();
extern unsigned long long ispcGetStackLimit();

/* Copies `num` bytes from `src` to `dest` and returns `dest`. */
extern void * ispcMemcpy(void *dest, void *src, size_t num);
|
||||
76
examples/util/nvcc_helpers.cu
Normal file
76
examples/util/nvcc_helpers.cu
Normal file
@@ -0,0 +1,76 @@
|
||||
/*
|
||||
Copyright (c) 2014, Evghenii Gaburov
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef _CUDA_
|
||||
#error "Something went wrong..."
|
||||
#endif
|
||||
|
||||
/* CUDA implementations of the ispc_malloc.h helpers.  CUDA error codes are
   not propagated (the interface has no way to report them); where a call
   produces an output value, a well-defined fallback is returned on failure
   instead of uninitialised data. */

/* Allocates `size` bytes with cudaMallocManaged and stores the address in
   *ptr; *ptr is set to NULL when the allocation fails. */
void ispc_malloc(void **ptr, const size_t size)
{
  if (cudaMallocManaged(ptr, size) != cudaSuccess)
    *ptr = NULL;
}

/* Releases memory obtained from ispc_malloc(). */
void ispc_free(void *ptr)
{
  cudaFree(ptr);
}

/* Fills `size` bytes at `ptr` with the byte `value`. */
void ispc_memset(void *ptr, int value, size_t size)
{
  cudaMemset(ptr, value, size);
}

/* Sets the size of the heap used by device-side malloc. */
void ispcSetMallocHeapLimit(size_t value)
{
  cudaDeviceSetLimit(cudaLimitMallocHeapSize,value);
}

/* Sets the per-thread device stack size. */
void ispcSetStackLimit(size_t value)
{
  cudaDeviceSetLimit(cudaLimitStackSize,value);
}

/* Returns the device malloc heap size, or 0 if the query fails. */
unsigned long long ispcGetMallocHeapLimit()
{
  size_t value = 0;
  cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize);
  return value;
}

/* Returns the per-thread device stack size, or 0 if the query fails. */
unsigned long long ispcGetStackLimit()
{
  size_t value = 0;
  cudaDeviceGetLimit(&value, cudaLimitStackSize);
  return value;
}

/* Copies `num` bytes from `src` to `dest`, letting the runtime infer the
   copy direction (cudaMemcpyDefault), and returns `dest`. */
void * ispcMemcpy(void *dest, void *src, size_t num)
{
  cudaMemcpy(dest, src, num, cudaMemcpyDefault);
  return dest;
}
|
||||
|
||||
|
||||
15
expr.cpp
15
expr.cpp
@@ -7872,6 +7872,14 @@ SizeOfExpr::TypeCheck() {
|
||||
"struct type \"%s\".", type->GetString().c_str());
|
||||
return NULL;
|
||||
}
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
if (type != NULL)
|
||||
if (g->target->getISA() == Target::NVPTX && type->IsVaryingType())
|
||||
{
|
||||
Error(pos, "\"sizeof\" with varying data types is not yet supported with \"nvptx\" target.");
|
||||
return NULL;
|
||||
}
|
||||
#endif /* ISPC_NVPTX_ENABLED */
|
||||
|
||||
return this;
|
||||
}
|
||||
@@ -8704,6 +8712,13 @@ NewExpr::TypeCheck() {
|
||||
AssertPos(pos, m->errorCount > 0);
|
||||
return NULL;
|
||||
}
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
if (g->target->getISA() == Target::NVPTX && allocType->IsVaryingType())
|
||||
{
|
||||
Error(pos, "\"new\" with varying data types is not yet supported with \"nvptx\" target.");
|
||||
return NULL;
|
||||
}
|
||||
#endif /* ISPC_NVPTX_ENABLED */
|
||||
if (CastType<UndefinedStructType>(allocType) != NULL) {
|
||||
Error(pos, "Can't dynamically allocate storage for declared "
|
||||
"but not defined type \"%s\".", allocType->GetString().c_str());
|
||||
|
||||
57
func.cpp
57
func.cpp
@@ -47,6 +47,9 @@
|
||||
#include <stdio.h>
|
||||
|
||||
#if defined(LLVM_3_2)
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
#include <llvm/Metadata.h>
|
||||
#endif /* ISPC_NVPTX_ENABLED */
|
||||
#include <llvm/LLVMContext.h>
|
||||
#include <llvm/Module.h>
|
||||
#include <llvm/Type.h>
|
||||
@@ -54,6 +57,9 @@
|
||||
#include <llvm/Intrinsics.h>
|
||||
#include <llvm/DerivedTypes.h>
|
||||
#else
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
#include <llvm/IR/Metadata.h>
|
||||
#endif /* ISPC_NVPTX_ENABLED */
|
||||
#include <llvm/IR/LLVMContext.h>
|
||||
#include <llvm/IR/Module.h>
|
||||
#include <llvm/IR/Type.h>
|
||||
@@ -129,7 +135,11 @@ Function::Function(Symbol *s, Stmt *c) {
|
||||
sym->parentFunction = this;
|
||||
}
|
||||
|
||||
if (type->isTask) {
|
||||
if (type->isTask
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
&& (g->target->getISA() != Target::NVPTX)
|
||||
#endif
|
||||
){
|
||||
threadIndexSym = m->symbolTable->LookupVariable("threadIndex");
|
||||
Assert(threadIndexSym);
|
||||
threadCountSym = m->symbolTable->LookupVariable("threadCount");
|
||||
@@ -240,7 +250,11 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
|
||||
#endif
|
||||
const FunctionType *type = CastType<FunctionType>(sym->type);
|
||||
Assert(type != NULL);
|
||||
if (type->isTask == true) {
|
||||
if (type->isTask == true
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
&& (g->target->getISA() != Target::NVPTX)
|
||||
#endif
|
||||
){
|
||||
// For tasks, there should always be three parameters: the
|
||||
// pointer to the structure that holds all of the arguments, the
|
||||
// thread index, and the thread count variables.
|
||||
@@ -338,6 +352,18 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
|
||||
ctx->SetFunctionMask(argIter);
|
||||
Assert(++argIter == function->arg_end());
|
||||
}
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
if (type->isTask == true && g->target->getISA() == Target::NVPTX)
|
||||
{
|
||||
llvm::NamedMDNode* annotations =
|
||||
m->module->getOrInsertNamedMetadata("nvvm.annotations");
|
||||
llvm::SmallVector<llvm::Value*, 3> av;
|
||||
av.push_back(function);
|
||||
av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
|
||||
av.push_back(LLVMInt32(1));
|
||||
annotations->addOperand(llvm::MDNode::get(*g->ctx, av));
|
||||
}
|
||||
#endif /* ISPC_NVPTX_ENABLED */
|
||||
}
|
||||
|
||||
// Finally, we can generate code for the function
|
||||
@@ -499,6 +525,21 @@ Function::GenerateIR() {
|
||||
std::string functionName = sym->name;
|
||||
if (g->mangleFunctionsWithTarget)
|
||||
functionName += std::string("_") + g->target->GetISAString();
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
if (g->target->getISA() == Target::NVPTX)
|
||||
{
|
||||
functionName += std::string("___export"); /* add ___export to the end, for ptxcc to recognize it is exported */
|
||||
#if 0
|
||||
llvm::NamedMDNode* annotations =
|
||||
m->module->getOrInsertNamedMetadata("nvvm.annotations");
|
||||
llvm::SmallVector<llvm::Value*, 3> av;
|
||||
av.push_back(function);
|
||||
av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
|
||||
av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1));
|
||||
annotations->addOperand(llvm::MDNode::get(*g->ctx, av));
|
||||
#endif
|
||||
}
|
||||
#endif /* ISPC_NVPTX_ENABLED */
|
||||
llvm::Function *appFunction =
|
||||
llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module);
|
||||
appFunction->setDoesNotThrow();
|
||||
@@ -536,6 +577,18 @@ Function::GenerateIR() {
|
||||
FATAL("Function verificication failed");
|
||||
}
|
||||
}
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
if (g->target->getISA() == Target::NVPTX)
|
||||
{
|
||||
llvm::NamedMDNode* annotations =
|
||||
m->module->getOrInsertNamedMetadata("nvvm.annotations");
|
||||
llvm::SmallVector<llvm::Value*, 3> av;
|
||||
av.push_back(appFunction);
|
||||
av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
|
||||
av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1));
|
||||
annotations->addOperand(llvm::MDNode::get(*g->ctx, av));
|
||||
}
|
||||
#endif /* ISPC_NVPTX_ENABLED */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
44
ispc.cpp
44
ispc.cpp
@@ -243,6 +243,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
||||
arch = "arm";
|
||||
else
|
||||
#endif
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
if(!strncmp(isa, "nvptx", 5))
|
||||
arch = "nvptx64";
|
||||
else
|
||||
#endif /* ISPC_NVPTX_ENABLED */
|
||||
arch = "x86-64";
|
||||
}
|
||||
|
||||
@@ -582,6 +587,23 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
||||
this->m_maskBitCount = 32;
|
||||
}
|
||||
#endif
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
else if (!strcasecmp(isa, "nvptx"))
|
||||
{
|
||||
this->m_isa = Target::NVPTX;
|
||||
this->m_cpu = "sm_35";
|
||||
this->m_nativeVectorWidth = 32;
|
||||
this->m_nativeVectorAlignment = 32;
|
||||
this->m_vectorWidth = 1;
|
||||
this->m_hasHalf = true;
|
||||
this->m_maskingIsFree = true;
|
||||
this->m_maskBitCount = 1;
|
||||
this->m_hasTranscendentals = true;
|
||||
this->m_hasTrigonometry = true;
|
||||
this->m_hasGather = this->m_hasScatter = false;
|
||||
cpuFromIsa = "sm_35";
|
||||
}
|
||||
#endif /* ISPC_NVPTX_ENABLED */
|
||||
else {
|
||||
Error(SourcePos(), "Target \"%s\" is unknown. Choices are: %s.",
|
||||
isa, SupportedTargets());
|
||||
@@ -679,6 +701,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
||||
"i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-"
|
||||
"f80:128:128-n8:16:32:64-S128-v16:16:16-v32:32:32-v4:128:128";
|
||||
}
|
||||
else if (m_isa == Target::NVPTX)
|
||||
{
|
||||
dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64";
|
||||
}
|
||||
|
||||
// 3. Finally set member data
|
||||
m_dataLayout = new llvm::DataLayout(dl_string);
|
||||
@@ -695,6 +721,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
||||
// Initialize target-specific "target-feature" attribute.
|
||||
if (!m_attributes.empty()) {
|
||||
llvm::AttrBuilder attrBuilder;
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
if (m_isa != Target::NVPTX)
|
||||
#endif
|
||||
attrBuilder.addAttribute("target-cpu", this->m_cpu);
|
||||
attrBuilder.addAttribute("target-features", this->m_attributes);
|
||||
this->m_tf_attributes = new llvm::AttributeSet(
|
||||
@@ -742,6 +771,9 @@ Target::SupportedTargets() {
|
||||
return
|
||||
#ifdef ISPC_ARM_ENABLED
|
||||
"neon-i8x16, neon-i16x8, neon-i32x4, "
|
||||
#endif
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
"nvptx, "
|
||||
#endif
|
||||
"sse2-i32x4, sse2-i32x8, "
|
||||
"sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, "
|
||||
@@ -777,6 +809,10 @@ Target::GetTripleString() const {
|
||||
triple.setArchName("i386");
|
||||
else if (m_arch == "x86-64")
|
||||
triple.setArchName("x86_64");
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
else if (m_arch == "nvptx64")
|
||||
triple = llvm::Triple("nvptx64", "nvidia", "cuda");
|
||||
#endif /* ISPC_NVPTX_ENABLED */
|
||||
else
|
||||
triple.setArchName(m_arch);
|
||||
}
|
||||
@@ -809,6 +845,10 @@ Target::ISAToString(ISA isa) {
|
||||
return "avx2";
|
||||
case Target::GENERIC:
|
||||
return "generic";
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
case Target::NVPTX:
|
||||
return "nvptx";
|
||||
#endif /* ISPC_NVPTX_ENABLED */
|
||||
default:
|
||||
FATAL("Unhandled target in ISAToString()");
|
||||
}
|
||||
@@ -847,6 +887,10 @@ Target::ISAToTargetString(ISA isa) {
|
||||
return "avx2-i32x8";
|
||||
case Target::GENERIC:
|
||||
return "generic-4";
|
||||
#ifdef ISPC_NVPTX_ENABLED
|
||||
case Target::NVPTX:
|
||||
return "nvptx";
|
||||
#endif /* ISPC_NVPTX_ENABLED */
|
||||
default:
|
||||
FATAL("Unhandled target in ISAToTargetString()");
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user