Merge pull request #749 from egaburov/nvptx_clean

Experimental support for PTX with examples
This commit is contained in:
Dmitry Babokin
2014-10-16 15:56:02 +04:00
158 changed files with 21326 additions and 204 deletions

View File

@@ -141,3 +141,46 @@ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
---------------------------------------------------------------------------
The ptxtools use parts of the PTX parser code from GPU Ocelot project
(https://code.google.com/p/gpuocelot/), which is covered by the following
license:
Copyright 2011
GEORGIA TECH RESEARCH CORPORATION
ALL RIGHTS RESERVED
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimers.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimers in the
documentation and/or other materials provided with the
distribution.
* Neither the name of GEORGIA TECH RESEARCH CORPORATION nor the
names of its contributors may be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY GEORGIA TECH RESEARCH CORPORATION ''AS IS''
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GEORGIA TECH RESEARCH
CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
You agree that the Software will not be shipped, transferred, exported,
or re-exported directly into any country prohibited by the United States
Export Administration Act and the regulations thereunder nor will be
used for any purpose prohibited by the Act.

View File

@@ -73,6 +73,10 @@ endif
# To enable: make ARM_ENABLED=1
ARM_ENABLED=0
# Disable NVPTX by request
# To disable: make NVPTX_ENABLED=0
NVPTX_ENABLED=1
# Add llvm bin to the path so any scripts run will go to the right llvm-config
LLVM_BIN= $(shell $(LLVM_CONFIG) --bindir)
export PATH:=$(LLVM_BIN):$(PATH)
@@ -89,7 +93,7 @@ LLVM_CXXFLAGS=$(shell $(LLVM_CONFIG) --cppflags)
LLVM_VERSION=LLVM_$(shell $(LLVM_CONFIG) --version | sed -e 's/svn//' -e 's/\./_/' -e 's/\..*//')
LLVM_VERSION_DEF=-D$(LLVM_VERSION)
LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker
LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker
# Component "option" was introduced in 3.3 and starting with 3.4 it is required for the link step.
# We check if it's available before adding it (to not break 3.2 and earlier).
ifeq ($(shell $(LLVM_CONFIG) --components |grep -c option), 1)
@@ -98,6 +102,9 @@ endif
ifneq ($(ARM_ENABLED), 0)
LLVM_COMPONENTS+=arm
endif
ifneq ($(NVPTX_ENABLED), 0)
LLVM_COMPONENTS+=nvptx
endif
LLVM_LIBS=$(shell $(LLVM_CONFIG) --libs $(LLVM_COMPONENTS))
CLANG=clang
@@ -160,6 +167,9 @@ endif
ifneq ($(ARM_ENABLED), 0)
CXXFLAGS+=-DISPC_ARM_ENABLED
endif
ifneq ($(NVPTX_ENABLED), 0)
CXXFLAGS+=-DISPC_NVPTX_ENABLED
endif
LDFLAGS=
ifeq ($(ARCH_OS),Linux)
@@ -184,6 +194,9 @@ TARGETS=avx2-i64x4 avx11-i64x4 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-
ifneq ($(ARM_ENABLED), 0)
TARGETS+=neon-32 neon-16 neon-8
endif
ifneq ($(NVPTX_ENABLED), 0)
TARGETS+=nvptx
endif
# These files need to be compiled in two versions - 32 and 64 bits.
BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS)))
# These are files to be compiled in single version.
@@ -289,15 +302,15 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
@echo Compiling $<
@$(CXX) $(CXXFLAGS) -o $@ -c $<
objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
@echo Creating C++ source from builtins definition file $<
@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX $< | python bitcode2cpp.py $< > $@
objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
@echo Creating C++ source from builtins definition file $< \(32 bit version\)
@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=32 $< | python bitcode2cpp.py $< 32bit > $@
objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
@echo Creating C++ source from builtins definition file $< \(64 bit version\)
@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=64 $< | python bitcode2cpp.py $< 64bit > $@

View File

@@ -342,11 +342,17 @@ lSetInternalFunctions(llvm::Module *module) {
"__all",
"__any",
"__aos_to_soa3_float",
//#ifdef ISPC_NVPTX_ENABLED
"__aos_to_soa3_float1",
//#endif /* ISPC_NVPTX_ENABLED */
"__aos_to_soa3_float16",
"__aos_to_soa3_float4",
"__aos_to_soa3_float8",
"__aos_to_soa3_int32",
"__aos_to_soa4_float",
//#ifdef ISPC_NVPTX_ENABLED
"__aos_to_soa4_float1",
//#endif /* ISPC_NVPTX_ENABLED */
"__aos_to_soa4_float16",
"__aos_to_soa4_float4",
"__aos_to_soa4_float8",
@@ -395,6 +401,38 @@ lSetInternalFunctions(llvm::Module *module) {
"__atomic_xor_int64_global",
"__atomic_xor_uniform_int32_global",
"__atomic_xor_uniform_int64_global",
//#ifdef ISPC_NVPTX_ENABLED
"__atomic_add_varying_int32_global",
"__atomic_add_varying_int64_global",
"__atomic_and_varying_int32_global",
"__atomic_and_varying_int64_global",
"__atomic_compare_exchange_varying_double_global",
"__atomic_compare_exchange_varying_float_global",
"__atomic_compare_exchange_varying_int32_global",
"__atomic_compare_exchange_varying_int64_global",
"__atomic_max_varying_int32_global",
"__atomic_max_varying_int64_global",
"__atomic_min_varying_int32_global",
"__atomic_min_varying_int64_global",
"__atomic_or_varying_int32_global",
"__atomic_or_varying_int64_global",
"__atomic_sub_varying_int32_global",
"__atomic_sub_varying_int64_global",
"__atomic_swap_varying_double_global",
"__atomic_swap_varying_float_global",
"__atomic_swap_varying_int32_global",
"__atomic_swap_varying_int64_global",
"__atomic_umax_varying_uint32_global",
"__atomic_umax_varying_uint64_global",
"__atomic_umin_varying_uint32_global",
"__atomic_umin_varying_uint64_global",
"__atomic_xor_uniform_int32_global",
"__atomic_xor_uniform_int64_global",
"__atomic_xor_varying_int32_global",
"__atomic_xor_varying_int64_global",
"__atomic_xor_varying_int32_global",
"__atomic_xor_varying_int64_global",
//#endif /* ISPC_NVPTX_ENABLED */
"__broadcast_double",
"__broadcast_float",
"__broadcast_i16",
@@ -417,6 +455,9 @@ lSetInternalFunctions(llvm::Module *module) {
"__do_assert_uniform",
"__do_assert_varying",
"__do_print",
//#ifdef ISPC_NVPTX_ENABLED
"__do_print_nvptx",
//#endif /* ISPC_NVPTX_ENABLED */
"__doublebits_uniform_int64",
"__doublebits_varying_int64",
"__exclusive_scan_add_double",
@@ -431,6 +472,10 @@ lSetInternalFunctions(llvm::Module *module) {
"__extract_int32",
"__extract_int64",
"__extract_int8",
//#ifdef ISPC_NVPTX_ENABLED
"__extract_float",
"__extract_double",
//#endif /* ISPC_NVPTX_ENABLED */
"__fastmath",
"__float_to_half_uniform",
"__float_to_half_varying",
@@ -447,6 +492,10 @@ lSetInternalFunctions(llvm::Module *module) {
"__insert_int32",
"__insert_int64",
"__insert_int8",
//#ifdef ISPC_NVPTX_ENABLED
"__insert_float",
"__insert_double",
//#endif /* ISPC_NVPTX_ENABLED */
"__intbits_uniform_double",
"__intbits_uniform_float",
"__intbits_varying_double",
@@ -483,6 +532,9 @@ lSetInternalFunctions(llvm::Module *module) {
"__min_varying_uint32",
"__min_varying_uint64",
"__movmsk",
//#ifdef ISPC_NVPTX_ENABLED
"__movmsk_ptx",
//#endif /* ISPC_NVPTX_ENABLED */
"__new_uniform_32rt",
"__new_uniform_64rt",
"__new_varying32_32rt",
@@ -581,6 +633,10 @@ lSetInternalFunctions(llvm::Module *module) {
"__soa_to_aos3_float8",
"__soa_to_aos3_int32",
"__soa_to_aos4_float",
//#ifdef ISPC_NVPTX_ENABLED
"__soa_to_aos3_float1",
"__soa_to_aos4_float1",
//#endif /* ISPC_NVPTX_ENABLED */
"__soa_to_aos4_float16",
"__soa_to_aos4_float4",
"__soa_to_aos4_float8",
@@ -681,6 +737,26 @@ lSetInternalFunctions(llvm::Module *module) {
"__vec4_add_float",
"__vec4_add_int32",
"__vselect_float",
//#ifdef ISPC_NVPTX_ENABLED
"__program_index",
"__program_count",
"__warp_index",
"__task_index0",
"__task_index1",
"__task_index2",
"__task_index",
"__task_count0",
"__task_count1",
"__task_count2",
"__task_count",
"__cvt_loc2gen",
"__cvt_loc2gen_var",
"__cvt_const2gen",
"__puts_nvptx",
"ISPCAlloc",
"ISPCLaunch",
"ISPCSync",
//#endif /* ISPC_NVPTX_ENABLED */
"__vselect_i32"
};
@@ -759,6 +835,9 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
g->target->getISA() != Target::NEON16 &&
g->target->getISA() != Target::NEON8)
#endif // !__arm__
#ifdef ISPC_NVPTX_ENABLED
if (g->target->getISA() != Target::NVPTX)
#endif /* ISPC_NVPTX_ENABLED */
{
Assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
mTriple.getArch() == bcTriple.getArch());
@@ -954,6 +1033,19 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
// Next, add the target's custom implementations of the various needed
// builtin functions (e.g. __masked_store_32(), etc).
switch (g->target->getISA()) {
#ifdef ISPC_NVPTX_ENABLED
case Target::NVPTX:
{
if (runtime32) {
fprintf(stderr, "Unfortunatly 32bit targets are not supported at the moment .. \n");
assert(0);
}
else {
EXPORT_MODULE(builtins_bitcode_nvptx_64bit);
}
break;
};
#endif /* ISPC_NVPTX_ENABLED */
#ifdef ISPC_ARM_ENABLED
case Target::NEON8: {
@@ -1224,7 +1316,18 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
}
// define the 'programCount' builtin variable
lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable);
#ifdef ISPC_NVPTX_ENABLED
if (g->target->getISA() == Target::NVPTX)
{
lDefineConstantInt("programCount", 32, module, symbolTable);
}
else
{
#endif /* ISPC_NVPTX_ENABLED */
lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable);
#ifdef ISPC_NVPTX_ENABLED
}
#endif /* ISPC_NVPTX_ENABLED */
// define the 'programIndex' builtin
lDefineProgramIndex(module, symbolTable);
@@ -1256,6 +1359,13 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
lDefineConstantInt("__have_native_rcpd", g->target->hasRcpd(),
module, symbolTable);
#ifdef ISPC_NVPTX_ENABLED
lDefineConstantInt("__is_nvptx_target", (int)(g->target->getISA() == Target::NVPTX),
module, symbolTable);
#else
lDefineConstantInt("__is_nvptx_target", (int)0, module, symbolTable);
#endif /* ISPC_NVPTX_ENABLED */
if (g->forceAlignment != -1) {
llvm::GlobalVariable *alignment = module->getGlobalVariable("memory_alignment", true);
alignment->setInitializer(LLVMInt32(g->forceAlignment));

View File

@@ -0,0 +1,130 @@
#include <cstdio>
#define PRINT_BUF_SIZE 4096
#define uint64_t unsigned long long
static __device__ size_t d_strlen(const char *str)
{
    // Device-side strlen: walk to the terminating NUL and return the
    // distance covered.
    const char *end = str;
    while (*end != '\0')
        ++end;
    return (size_t)(end - str);
}
static __device__ char* d_strncat(char *dest, const char *src, size_t n)
{
    // Device-side strncat: append at most n characters of src (stopping
    // early at src's NUL) onto the end of dest, then NUL-terminate.
    // Returns dest, matching the C library contract.
    char *tail = dest + d_strlen(dest);
    size_t copied = 0;
    while (copied < n && src[copied] != '\0') {
        tail[copied] = src[copied];
        ++copied;
    }
    tail[copied] = '\0';
    return dest;
}
// APPEND(str): copy 'str' onto the end of the on-stack print buffer,
// advancing 'bufp'.  NUL-terminates at the current position first so
// d_strncat() sees a proper C string.  On (potential) overflow it jumps
// to the 'done' label, so it may only be used inside a function that
// declares that label and has printString/bufp in scope.
#define APPEND(str) \
do { \
int offset = bufp - &printString[0]; \
*bufp = '\0'; \
d_strncat(bufp, str, PRINT_BUF_SIZE-offset); \
bufp += d_strlen(str); \
if (bufp >= &printString[PRINT_BUF_SIZE]) \
goto done; \
} while (0) /* eat semicolon */
// PRINT_SCALAR(fmt, type): format the scalar argument at 'ptr' with the
// given printf-style format and append it.  Ends with 'break' so it can
// serve directly as a switch-case body.
#define PRINT_SCALAR(fmt, type) \
sprintf(tmpBuf, fmt, *((type *)ptr)); \
APPEND(tmpBuf); \
break
// PRINT_VECTOR(fmt, type): append a bracketed, comma-separated view of a
// varying value, printing "(( * )) " for lanes that are off in 'mask'.
// NOTE(review): every iteration reads *ptr rather than ((type*)ptr)[i]
// (unlike the CPU runtime's version) — presumably each PTX thread only
// holds its own lane's value; confirm against the nvptx codegen before
// reusing this macro elsewhere.
#define PRINT_VECTOR(fmt, type) \
*bufp++ = '['; \
if (bufp == &printString[PRINT_BUF_SIZE]) break; \
for (int i = 0; i < width; ++i) { \
/* only print the value if the current lane is executing */ \
type val0 = *((type*)ptr); \
type val = val0; \
if (mask & (1ull<<i)) \
sprintf(tmpBuf, fmt, val); \
else \
sprintf(tmpBuf, "(( * )) "); \
APPEND(tmpBuf); \
*bufp++ = (i != width-1 ? ',' : ']'); \
} \
break
// Device-side implementation of ispc's print() for the NVPTX target.
// 'format' holds literal text in which each argument is marked by a bare
// '%'; 'types' encodes each argument's type as one letter (lowercase =
// uniform/scalar, uppercase = varying with 'width' lanes); 'mask' is the
// execution mask; 'args' holds one pointer per argument.  Output is
// accumulated into an on-stack buffer and newline+NUL terminated.
// NOTE(review): the finished buffer is never emitted inside this function
// — presumably the caller (or generated code) prints it; confirm.
extern "C"
__device__ void __do_print_nvptx(const char *format, const char *types, int width, uint64_t mask,
void **args) {
char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
char *bufp = &printString[0];
char tmpBuf[256];
const char trueBuf[] = "true";
const char falseBuf[] = "false";
int argCount = 0;
// Walk the format string until it is exhausted or the buffer fills up.
while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
// Format strings are just single percent signs.
if (*format != '%') {
*bufp++ = *format;
}
else {
if (*types) {
void *ptr = args[argCount++];
// Based on the encoding in the types string, cast the
// value appropriately and print it with a reasonable
// printf() formatting string.
switch (*types) {
case 'b': {
const char *tmpBuf1 = *((bool *)ptr) ? trueBuf : falseBuf;
APPEND(tmpBuf1);
break;
}
// Varying bool: bracketed per-lane list with a placeholder
// for lanes that are off in 'mask'.
// NOTE(review): *ptr is read for every lane (no [i] index),
// matching PRINT_VECTOR above — verify this is intended for
// the per-thread PTX execution model.
case 'B': {
*bufp++ = '[';
if (bufp == &printString[PRINT_BUF_SIZE])
break;
for (int i = 0; i < width; ++i) {
bool val0 = *((bool*)ptr);
bool val = val0; \
if (mask & (1ull << i)) {
const char *tmpBuf1 = val ? trueBuf : falseBuf;
APPEND(tmpBuf1);
}
else
APPEND("_________");
*bufp++ = (i != width-1) ? ',' : ']';
}
break;
}
case 'i': PRINT_SCALAR("%d", int);
case 'I': PRINT_VECTOR("%d", int);
case 'u': PRINT_SCALAR("%u", unsigned int);
case 'U': PRINT_VECTOR("%u", unsigned int);
case 'f': PRINT_SCALAR("%f", float);
case 'F': PRINT_VECTOR("%f", float);
case 'l': PRINT_SCALAR("%lld", long long);
case 'L': PRINT_VECTOR("%lld", long long);
case 'v': PRINT_SCALAR("%llu", unsigned long long);
case 'V': PRINT_VECTOR("%llu", unsigned long long);
case 'd': PRINT_SCALAR("%f", double);
case 'D': PRINT_VECTOR("%f", double);
case 'p': PRINT_SCALAR("%p", void *);
case 'P': PRINT_VECTOR("%p", void *);
default:
// Unknown type code: note it and echo the offending letter.
APPEND("UNKNOWN TYPE ");
*bufp++ = *types;
}
++types;
}
}
++format;
}
// APPEND() jumps here when the buffer is (about to be) full.
done:
*bufp = '\n'; bufp++;
*bufp = '\0';
}

View File

@@ -185,6 +185,81 @@ void __do_print(const char *format, const char *types, int width, uint64_t mask,
fflush(stdout);
}
/* this is print for PTX target only */
int __puts_nvptx(const char *);

/** Host-side counterpart of the NVPTX print routine, with the same
    contract as __do_print(): 'format' contains literal text with bare '%'
    argument markers, 'types' one letter per argument (lowercase = uniform,
    uppercase = varying of 'width' lanes), 'mask' the execution mask and
    'args' one pointer per argument.

    The full formatting path below is currently disabled (#if 0); until it
    is wired up this stub just reports that nvptx printing is unavailable
    via __puts_nvptx(). */
void __do_print_nvptx(const char *format, const char *types, int width, uint64_t mask,
                      void **args) {
#if 0
    char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
    char *bufp = &printString[0];
    char tmpBuf[256];
    int argCount = 0;
    while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
        // Format strings are just single percent signs.
        if (*format != '%') {
            *bufp++ = *format;
        }
        else {
            if (*types) {
                void *ptr = args[argCount++];
                // Based on the encoding in the types string, cast the
                // value appropriately and print it with a reasonable
                // printf() formatting string.
                switch (*types) {
                case 'b': {
                    sprintf(tmpBuf, "%s", *((Bool *)ptr) ? "true" : "false");
                    APPEND(tmpBuf);
                    break;
                }
                case 'B': {
                    *bufp++ = '[';
                    if (bufp == &printString[PRINT_BUF_SIZE])
                        break;
                    for (int i = 0; i < width; ++i) {
                        if (mask & (1ull << i)) {
                            sprintf(tmpBuf, "%s", ((Bool *)ptr)[i] ? "true" : "false");
                            APPEND(tmpBuf);
                        }
                        else
                            APPEND("_________");
                        *bufp++ = (i != width-1) ? ',' : ']';
                    }
                    break;
                }
                case 'i': PRINT_SCALAR("%d", int);
                case 'I': PRINT_VECTOR("%d", int);
                case 'u': PRINT_SCALAR("%u", unsigned int);
                case 'U': PRINT_VECTOR("%u", unsigned int);
                case 'f': PRINT_SCALAR("%f", float);
                case 'F': PRINT_VECTOR("%f", float);
                case 'l': PRINT_SCALAR("%lld", long long);
                case 'L': PRINT_VECTOR("%lld", long long);
                case 'v': PRINT_SCALAR("%llu", unsigned long long);
                case 'V': PRINT_VECTOR("%llu", unsigned long long);
                case 'd': PRINT_SCALAR("%f", double);
                case 'D': PRINT_VECTOR("%f", double);
                case 'p': PRINT_SCALAR("%p", void *);
                case 'P': PRINT_VECTOR("%p", void *);
                default:
                    APPEND("UNKNOWN TYPE ");
                    *bufp++ = *types;
                }
                ++types;
            }
        }
        ++format;
    }
 done:
    *bufp = '\n'; bufp++;
    *bufp = '\0';
    __puts_nvptx(printString);
#else
    // Fix: user-visible message read "is not support"; corrected grammar.
    __puts_nvptx("---nvptx printing is not supported---\n");
#endif
}
int __num_cores() {
#if defined(_MSC_VER) || defined(__MINGW32__)

View File

@@ -289,4 +289,5 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
;; int8/int16 builtins
define_avgs()
declare_nvptx()

View File

@@ -42,6 +42,7 @@ packed_load_and_store()
scans()
int64minmax()
aossoa()
declare_nvptx()
saturation_arithmetic_novec()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

View File

@@ -382,6 +382,7 @@ declare void @__prefetch_read_varying_nt_native(i8 * %base, i32 %scale, <WIDTH x
;; int8/int16 builtins
define_avgs()
declare_nvptx()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reciprocals in double precision, if supported

View File

@@ -344,3 +344,4 @@ packed_load_and_store(4)
;; prefetch
define_prefetches()
declare_nvptx()

2340
builtins/target-nvptx.ll Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -274,3 +274,4 @@ define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
define_avgs()
declare_nvptx()

View File

@@ -278,3 +278,5 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
%call = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %call
}
declare_nvptx()

3492
builtins/util-nvptx.m4 Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -4964,6 +4964,62 @@ declare double @__rcp_uniform_double(double)
declare <WIDTH x double> @__rcp_varying_double(<WIDTH x double>)
')
define(`declare_nvptx',
`
declare i32 @__program_index() nounwind readnone alwaysinline
declare i32 @__program_count() nounwind readnone alwaysinline
declare i32 @__warp_index() nounwind readnone alwaysinline
declare i32 @__task_index0() nounwind readnone alwaysinline
declare i32 @__task_index1() nounwind readnone alwaysinline
declare i32 @__task_index2() nounwind readnone alwaysinline
declare i32 @__task_index() nounwind readnone alwaysinline
declare i32 @__task_count0() nounwind readnone alwaysinline
declare i32 @__task_count1() nounwind readnone alwaysinline
declare i32 @__task_count2() nounwind readnone alwaysinline
declare i32 @__task_count() nounwind readnone alwaysinline
declare i64* @__cvt_loc2gen(i64 addrspace(3)*) nounwind readnone alwaysinline
declare i64* @__cvt_const2gen(i64 addrspace(4)*) nounwind readnone alwaysinline
declare i64* @__cvt_loc2gen_var(i64 addrspace(3)*) nounwind readnone alwaysinline
declare i64 @__movmsk_ptx(<WIDTH x i1>) nounwind readnone alwaysinline;
')
define(`global_atomic_varying',`
declare <$1 x $3> @__atomic_$2_varying_$4_global(<$1 x i64> %ptr, <$1 x $3> %val, <$1 x MASK> %maskv) nounwind alwaysinline
')
define(`global_atomic_cas_varying',`
declare <$1 x $3> @__atomic_$2_varying_$4_global(<$1 x i64> %ptr, <$1 x $3> %cmp, <$1 x $3> %val, <$1 x MASK> %maskv) nounwind alwaysinline
')
global_atomic_cas_varying(WIDTH, compare_exchange, i32, int32)
global_atomic_cas_varying(WIDTH, compare_exchange, i64, int64)
global_atomic_cas_varying(WIDTH, compare_exchange, float, float)
global_atomic_cas_varying(WIDTH, compare_exchange, double, double)
global_atomic_varying(WIDTH, swap, i32, int32)
global_atomic_varying(WIDTH, swap, i64, int64)
global_atomic_varying(WIDTH, swap, float, float)
global_atomic_varying(WIDTH, swap, double, double)
global_atomic_varying(WIDTH, add, i32, int32)
global_atomic_varying(WIDTH, sub, i32, int32)
global_atomic_varying(WIDTH, and, i32, int32)
global_atomic_varying(WIDTH, or, i32, int32)
global_atomic_varying(WIDTH, xor, i32, int32)
global_atomic_varying(WIDTH, min, i32, int32)
global_atomic_varying(WIDTH, max, i32, int32)
global_atomic_varying(WIDTH, umin, i32, uint32)
global_atomic_varying(WIDTH, umax, i32, uint32)
global_atomic_varying(WIDTH, add, i64, int64)
global_atomic_varying(WIDTH, sub, i64, int64)
global_atomic_varying(WIDTH, and, i64, int64)
global_atomic_varying(WIDTH, or, i64, int64)
global_atomic_varying(WIDTH, xor, i64, int64)
global_atomic_varying(WIDTH, min, i64, int64)
global_atomic_varying(WIDTH, max, i64, int64)
global_atomic_varying(WIDTH, umin, i64, uint64)
global_atomic_varying(WIDTH, umax, i64, uint64)
define(`transcendetals_decl',`
declare float @__log_uniform_float(float) nounwind readnone

233
ctx.cpp
View File

@@ -57,6 +57,10 @@
#include <llvm/IR/Instructions.h>
#include <llvm/IR/DerivedTypes.h>
#endif
#ifdef ISPC_NVPTX_ENABLED
#include <llvm/Support/raw_ostream.h>
#include <llvm/Support/FormattedStream.h>
#endif /* ISPC_NVPTX_ENABLED */
/** This is a small utility structure that records information related to one
level of nested control flow. It's mostly used in correctly restoring
@@ -1383,10 +1387,17 @@ FunctionEmitContext::None(llvm::Value *mask) {
llvm::Value *
FunctionEmitContext::LaneMask(llvm::Value *v) {
#ifdef ISPC_NVPTX_ENABLED
/* this makes mandelbrot example slower with "nvptx" target.
* Needs further investigation. */
const char *__movmsk = g->target->getISA() == Target::NVPTX ? "__movmsk_ptx" : "__movmsk";
#else
const char *__movmsk = "__movmsk";
#endif
// Call the target-dependent movmsk function to turn the vector mask
// into an i64 value
std::vector<Symbol *> mm;
m->symbolTable->LookupFunction("__movmsk", &mm);
m->symbolTable->LookupFunction(__movmsk, &mm);
if (g->target->getMaskBitCount() == 1)
AssertPos(currentPos, mm.size() == 1);
else
@@ -1398,13 +1409,78 @@ FunctionEmitContext::LaneMask(llvm::Value *v) {
return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk"));
}
#ifdef ISPC_NVPTX_ENABLED
bool lAppendInsertExtractName(llvm::Value *vector, std::string &funcName)
{
    // Append the element-type suffix used by the __insert_* / __extract_*
    // builtins to funcName.  Returns false (leaving funcName untouched)
    // when the vector's type has no corresponding builtin.
    llvm::Type *vecType = vector->getType();
    const char *suffix = NULL;
    if (vecType == LLVMTypes::Int8VectorType)
        suffix = "_int8";
    else if (vecType == LLVMTypes::Int16VectorType)
        suffix = "_int16";
    else if (vecType == LLVMTypes::Int32VectorType)
        suffix = "_int32";
    else if (vecType == LLVMTypes::Int64VectorType)
        suffix = "_int64";
    else if (vecType == LLVMTypes::FloatVectorType)
        suffix = "_float";
    else if (vecType == LLVMTypes::DoubleVectorType)
        suffix = "_double";
    if (suffix == NULL)
        return false;
    funcName += suffix;
    return true;
}
llvm::Value*
FunctionEmitContext::Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar)
{
    // Emit a call to the __insert_<type> builtin, which returns 'vector'
    // with 'scalar' written into position 'lane'.
    std::string funcName = "__insert";
    // Bug fix: the suffix lookup has the side effect of building funcName,
    // so it must not live inside assert() — under NDEBUG the call would be
    // compiled away, leaving funcName as "__insert" and func as NULL.
    bool validType = lAppendInsertExtractName(vector, funcName);
    assert(validType);
    (void)validType;
    assert(lane->getType() == LLVMTypes::Int32Type);
    llvm::Function *func = m->module->getFunction(funcName.c_str());
    assert(func != NULL);
    std::vector<llvm::Value *> args;
    args.push_back(vector);
    args.push_back(lane);
    args.push_back(scalar);
    llvm::Value *ret = llvm::CallInst::Create(func, args, LLVMGetName(vector, funcName.c_str()),
                                              GetCurrentBasicBlock());
    return ret;
}
llvm::Value*
FunctionEmitContext::Extract(llvm::Value *vector, llvm::Value *lane)
{
    // Emit a call to the __extract_<type> builtin, which returns the
    // element of 'vector' at position 'lane'.
    std::string funcName = "__extract";
    // Bug fix: the suffix lookup has the side effect of building funcName,
    // so it must not live inside assert() — under NDEBUG the call would be
    // compiled away, leaving funcName as "__extract" and func as NULL.
    bool validType = lAppendInsertExtractName(vector, funcName);
    assert(validType);
    (void)validType;
    assert(lane->getType() == LLVMTypes::Int32Type);
    llvm::Function *func = m->module->getFunction(funcName.c_str());
    assert(func != NULL);
    std::vector<llvm::Value *> args;
    args.push_back(vector);
    args.push_back(lane);
    llvm::Value *ret = llvm::CallInst::Create(func, args, LLVMGetName(vector, funcName.c_str()),
                                              GetCurrentBasicBlock());
    return ret;
}
#endif /* ISPC_NVPTX_ENABLED */
llvm::Value *
FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
#ifdef ISPC_NVPTX_ENABLED
if (g->target->getISA() == Target::NVPTX)
{
// Compare the two masks to get a vector of i1s
llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
v1, v2, "v1==v2");
return ExtractInst(cmp, 0); /* this works without calling All(..) in PTX. Why ?!? */
}
#endif /* ISPC_NVPTX_ENABLED */
#if 0
// Compare the two masks to get a vector of i1s
llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
v1, v2, "v1==v2");
v1, v2, "v1==v2");
// Turn that into a bool vector type (often i32s)
cmp = I1VecToBoolVec(cmp);
// And see if it's all on
@@ -1413,7 +1489,7 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
llvm::Value *mm1 = LaneMask(v1);
llvm::Value *mm2 = LaneMask(v2);
return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mm1, mm2,
LLVMGetName("equal", v1, v2));
LLVMGetName("equal", v1, v2));
#endif
}
@@ -1421,8 +1497,8 @@ llvm::Value *
FunctionEmitContext::ProgramIndexVector(bool is32bits) {
llvm::SmallVector<llvm::Constant*, 16> array;
for (int i = 0; i < g->target->getVectorWidth() ; ++i) {
llvm::Constant *C = is32bits ? LLVMInt32(i) : LLVMInt64(i);
array.push_back(C);
llvm::Constant *C = is32bits ? LLVMInt32(i) : LLVMInt64(i);
array.push_back(C);
}
llvm::Constant* index = llvm::ConstantVector::get(array);
@@ -1430,6 +1506,20 @@ FunctionEmitContext::ProgramIndexVector(bool is32bits) {
return index;
}
#ifdef ISPC_NVPTX_ENABLED
llvm::Value *
FunctionEmitContext::ProgramIndexVectorPTX(bool is32bits) {
    // NVPTX analogue of ProgramIndexVector(): instead of a constant
    // <0,1,2,...> vector, call the __program_index builtin and insert the
    // result into element 0 of an i32 vector.
    // NOTE(review): 'is32bits' is currently unused because the widening
    // below is disabled — confirm callers only rely on the 32-bit form.
    llvm::Function *func_program_index = m->module->getFunction("__program_index");
    llvm::Value *programIndex = CallInst(func_program_index, NULL, std::vector<llvm::Value*>(),
                                         "foreach__program_indexS");
    llvm::Value *index = InsertInst(llvm::UndefValue::get(LLVMTypes::Int32VectorType),
                                    programIndex, 0, "foreach__program_indexV");
#if 0
    // Fix latent typo so this compiles if ever re-enabled:
    // Int64VectandType -> Int64VectorType.
    if (!is32bits)
        index = ZExtInst(index, LLVMTypes::Int64VectorType);
#endif
    return index;
}
#endif /* ISPC_NVPTX_ENABLED */
llvm::Value *
FunctionEmitContext::GetStringPtr(const std::string &str) {
@@ -3555,31 +3645,117 @@ llvm::Value *
FunctionEmitContext::LaunchInst(llvm::Value *callee,
std::vector<llvm::Value *> &argVals,
llvm::Value *launchCount[3]){
if (callee == NULL) {
#ifdef ISPC_NVPTX_ENABLED
if (g->target->getISA() == Target::NVPTX)
{
if (callee == NULL) {
AssertPos(currentPos, m->errorCount > 0);
return NULL;
}
launchedTasks = true;
AssertPos(currentPos, llvm::isa<llvm::Function>(callee));
std::vector<llvm::Type*> argTypes;
llvm::Function *F = llvm::dyn_cast<llvm::Function>(callee);
const unsigned int nArgs = F->arg_size();
llvm::Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
for (; I != E; ++I)
argTypes.push_back(I->getType());
llvm::Type *st = llvm::StructType::get(*g->ctx, argTypes);
llvm::StructType *argStructType = static_cast<llvm::StructType *>(st);
llvm::Value *structSize = g->target->SizeOf(argStructType, bblock);
if (structSize->getType() != LLVMTypes::Int64Type)
structSize = ZExtInst(structSize, LLVMTypes::Int64Type,
"struct_size_to_64");
const int align = 8;
llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
AssertPos(currentPos, falloc != NULL);
std::vector<llvm::Value *> allocArgs;
allocArgs.push_back(launchGroupHandlePtr);
allocArgs.push_back(structSize);
allocArgs.push_back(LLVMInt32(align));
llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr");
llvm::Value *voidi64 = PtrToIntInst(voidmem, "args_i64");
llvm::BasicBlock* if_true = CreateBasicBlock("if_true");
llvm::BasicBlock* if_false = CreateBasicBlock("if_false");
/* check if the pointer returned by ISPCAlloc is not NULL
* --------------
* this is a workaround for not checking the value of programIndex
* because ISPCAlloc will return NULL pointer for all programIndex > 0
* of course, if ISPAlloc fails to get parameter buffer, the pointer for programIndex = 0
* will also be NULL
* This check must be added, and also rewrite the code to make it less opaque
*/
llvm::Value* cmp1 = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, voidi64, LLVMInt64(0), "cmp1");
BranchInst(if_true, if_false, cmp1);
/**********************/
bblock = if_true;
// label_if_then block:
llvm::Type *pt = llvm::PointerType::getUnqual(st);
llvm::Value *argmem = BitCastInst(voidmem, pt);
for (unsigned int i = 0; i < argVals.size(); ++i)
{
llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg");
// don't need to do masked store here, I think
StoreInst(argVals[i], ptr);
}
if (nArgs == argVals.size() + 1) {
// copy in the mask
llvm::Value *mask = GetFullMask();
llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL,
"funarg_mask");
StoreInst(mask, ptr);
}
BranchInst(if_false);
/**********************/
bblock = if_false;
llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
AssertPos(currentPos, flaunch != NULL);
std::vector<llvm::Value *> args;
args.push_back(launchGroupHandlePtr);
args.push_back(fptr);
args.push_back(voidmem);
args.push_back(launchCount[0]);
args.push_back(launchCount[1]);
args.push_back(launchCount[2]);
llvm::Value *ret = CallInst(flaunch, NULL, args, "");
return ret;
}
#endif /* ISPC_NVPTX_ENABLED */
if (callee == NULL) {
AssertPos(currentPos, m->errorCount > 0);
return NULL;
}
launchedTasks = true;
AssertPos(currentPos, llvm::isa<llvm::Function>(callee));
llvm::Type *argType =
(llvm::dyn_cast<llvm::Function>(callee))->arg_begin()->getType();
(llvm::dyn_cast<llvm::Function>(callee))->arg_begin()->getType();
AssertPos(currentPos, llvm::PointerType::classof(argType));
llvm::PointerType *pt =
llvm::dyn_cast<llvm::PointerType>(argType);
llvm::dyn_cast<llvm::PointerType>(argType);
AssertPos(currentPos, llvm::StructType::classof(pt->getElementType()));
llvm::StructType *argStructType =
static_cast<llvm::StructType *>(pt->getElementType());
static_cast<llvm::StructType *>(pt->getElementType());
llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
AssertPos(currentPos, falloc != NULL);
llvm::Value *structSize = g->target->SizeOf(argStructType, bblock);
if (structSize->getType() != LLVMTypes::Int64Type)
// ISPCAlloc expects the size as an uint64_t, but on 32-bit
// targets, SizeOf returns a 32-bit value
structSize = ZExtInst(structSize, LLVMTypes::Int64Type,
"struct_size_to_64");
// ISPCAlloc expects the size as an uint64_t, but on 32-bit
// targets, SizeOf returns a 32-bit value
structSize = ZExtInst(structSize, LLVMTypes::Int64Type,
"struct_size_to_64");
int align = 4 * RoundUpPow2(g->target->getNativeVectorWidth());
std::vector<llvm::Value *> allocArgs;
@@ -3592,17 +3768,17 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
// Copy the values of the parameters into the appropriate place in
// the argument block
for (unsigned int i = 0; i < argVals.size(); ++i) {
llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg");
// don't need to do masked store here, I think
StoreInst(argVals[i], ptr);
llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg");
// don't need to do masked store here, I think
StoreInst(argVals[i], ptr);
}
if (argStructType->getNumElements() == argVals.size() + 1) {
// copy in the mask
llvm::Value *mask = GetFullMask();
llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL,
"funarg_mask");
StoreInst(mask, ptr);
// copy in the mask
llvm::Value *mask = GetFullMask();
llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL,
"funarg_mask");
StoreInst(mask, ptr);
}
// And emit the call to the user-supplied task launch function, passing
@@ -3624,6 +3800,21 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
void
FunctionEmitContext::SyncInst() {
#ifdef ISPC_NVPTX_ENABLED
if (g->target->getISA() == Target::NVPTX)
{
llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr);
llvm::Value *nullPtrValue =
llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
llvm::Function *fsync = m->module->getFunction("ISPCSync");
if (fsync == NULL)
FATAL("Couldn't find ISPCSync declaration?!");
CallInst(fsync, NULL, launchGroupHandle, "");
StoreInst(nullPtrValue, launchGroupHandlePtr);
return;
}
#endif /* ISPC_NVPTX_ENABLED */
llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr);
llvm::Value *nullPtrValue =
llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);

10
ctx.h
View File

@@ -302,9 +302,17 @@ public:
that indicates whether the two masks are equal. */
llvm::Value *MasksAllEqual(llvm::Value *mask1, llvm::Value *mask2);
/** Generate ConstantVector, which contains ProgramIndex, i.e.
/** generate constantvector, which contains programindex, i.e.
< i32 0, i32 1, i32 2, i32 3> */
llvm::Value *ProgramIndexVector(bool is32bits = true);
#ifdef ISPC_NVPTX_ENABLED
llvm::Value *ProgramIndexVectorPTX(bool is32bits = true);
/** Issues a call to __insert_int8/int16/int32/int64/float/double */
llvm::Value* Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar);
/** Issues a call to __extract_int8/int16/int32/int64/float/double */
llvm::Value* Extract(llvm::Value *vector, llvm::Value *lane);
#endif
/** Given a string, create an anonymous global variable to hold its
value and return the pointer to the string. */

View File

@@ -168,6 +168,15 @@ DeclSpecs::GetBaseType(SourcePos pos) const {
retType = lApplyTypeQualifiers(typeQualifiers, retType, pos);
if (soaWidth > 0) {
#ifdef ISPC_NVPTX_ENABLED
#if 0 /* see stmt.cpp in DeclStmt::EmitCode for work-around of SOAType Declaration */
if (g->target->getISA() == Target::NVPTX)
{
Error(pos, "\"soa\" data types are currently not supported with \"nvptx\" target.");
return NULL;
}
#endif
#endif /* ISPC_NVPTX_ENABLED */
const StructType *st = CastType<StructType>(retType);
if (st == NULL) {
@@ -402,6 +411,15 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
return;
}
#ifdef ISPC_NVPTX_ENABLED
#if 0 /* NVPTX */
if (baseType->IsUniformType())
{
fprintf(stderr, " detected uniform array of size= %d array= %s\n" ,arraySize,
baseType->IsArrayType() ? " true " : " false ");
}
#endif
#endif /* ISPC_NVPTX_ENABLED */
const Type *arrayType = new ArrayType(baseType, arraySize);
if (child != NULL) {
child->InitFromType(arrayType, ds);

View File

@@ -178,6 +178,13 @@ Contents:
+ `Data Alignment and Aliasing`_
+ `Restructuring Existing Programs to Use ISPC`_
* `Experimental support for PTX`_
+ `Overview`_
+ `Compiling For The NVIDIA Kepler GPU`_
+ `Hints`_
+ `Limitations & known issues`_
* `Disclaimer and Legal Information`_
* `Optimization Notice`_
@@ -4936,6 +4943,129 @@ program instances improves performance.
.. _ispc Performance Tuning Guide: http://ispc.github.com/perfguide.html
Experimental support for PTX
============================
``ispc`` provides experimental support for PTX code generation which currently
targets NVIDIA GPUs with compute capability >=3.5 [Kepler GPUs with support for
dynamic parallelism]. Due to its nature, the PTX backend currently imposes
several restrictions on the ``ispc`` program, which are described below.
Overview
--------
SPMD programming in ``ispc`` is similar to a warp-synchronous CUDA programming.
Namely, program instances in a gang are the equivalent of CUDA threads in a
single warp. Hence, to run efficiently on a GPU an ``ispc`` program must use the
tasking functionality via the ``launch`` keyword to ensure that multiple warps
are executed concurrently on the GPU.
``export`` functions are equipped with a CUDA C wrapper which schedules a
single warp--a thread-block with a total of 32 threads. In contrast to CPU
programming, this exported function, either directly or otherwise, should
utilize ``launch`` keyword to schedule work on a GPU.
At the PTX level, ``launch`` keyword is mapped to CUDA Dynamic Parallelism and
it schedules a grid of thread-blocks each 4 warps-wide (128 threads). As a
result, ``ispc`` has a tasking-granularity of 4 tasks with PTX target; this
restriction will be eliminated in future.
When passing pointers to an ``export`` function, it is important that they
remain valid when they are accessed from the GPU. Prior to CUDA 6.0, such a
pointer had to hold an address that is only accessible from the GPU. With the release of
CUDA 6.0, it is possible to pass a pointer to a unified memory allocated with
``cudaMallocManaged``. The examples provide rudimentary wrapper functions that
call the CUDA API for managed memory allocations, allowing programmers to avoid
explicit memory copies.
Compiling For The NVIDIA Kepler GPU
-----------------------------------
Compilation for an NVIDIA Kepler GPU is a multi-step procedure.
First, we need to generate a LLVM assembly from ``ispc`` source file (``ispc``
generates LLVM assembly instead of bitcode when ``nvptx`` target is chosen):
::
$ISPC_HOME/ispc foo.ispc --emit-llvm --target=nvptx -o foo.ll
This LLVM assembly can immediately be compiled into PTX with the help of
``ptxgen`` tool; this tool uses ``libNVVM`` which is a part of a CUDA Toolkit.
::
$ISPC_HOME/ptxtools/ptxgen --use_fast_math foo.ll -o foo.ptx
.. If ``ispc`` is compiled with LLVM >3.2, the resulting bitcode must first be
.. decompiled with the ``llvm-dis`` from LLVM 3.2 distribution; this "trick" is
.. required to generate an IR compatible with libNVVM:
.. ::
..
.. $LLVM32/bin/llvm-dis foo.bc -o foo.ll
.. $ISPC_HOME/ptxtools/ptxgen --use_fast_math foo.ll -o foo.ptx
This PTX is ready for execution on a GPU, for example via CUDA
Driver API. Alternatively, we also provide a simple ``ptxcc`` tool, which
compiles the resulting PTX code into an object file:
::
$ISPC_HOME/ptxtools/ptxcc foo.ptx -o foo_cu.o -Xnvcc="--maxrregcount=64
-Xptxas=-v"
This object file can be linked with the main program via ``nvcc``:
::
nvcc foo_cu.o foo_main.o -o foo
Hints
-----
- ``uniform`` arrays in a function scope are statically allocated in
  ``__shared__`` memory, with all ensuing consequences. For example, if more
  shared memory than is available per SMX is allocated, a link- or runtime-error will occur
- If ``uniform`` arrays of large size are desired, we recommend to use
``uniform new uniform T[size]`` for their allocation, ideally outside the
tasking function (see ``deferred/kernels.ispc`` in the deferred shading example)
Examples that produce executables for CPU, Xeon Phi and Kepler GPU display
several tuning approaches that can benefit GPU performance.
``ispc`` may also generate performance warnings that, if followed, may improve
GPU application performance.
Limitations & known issues
--------------------------
Due to its experimental form, PTX code generation is known to impose several
limitations on the ``ispc`` program, which are documented in the following list:
- Must use ``ispc`` tasking functionality to run efficiently on GPU
- Must use ``new/delete`` and/or ``ispc_malloc``/``ispc_free``/``ispc_memset``/``ispc_memcpy`` to allocate/free/set/copy memory that is visible to GPU
- ``export`` functions must have ``void`` return type.
- ``task``/``export`` functions do not accept varying data-types
- ``new``/``delete`` currently only works with ``uniform`` data-types
- ``aossoa``/``soaaos`` is not yet supported
- ``sizeof(varying)`` is not yet supported
- Function pointers do not work yet (may or may not generate compilation fail)
- ``memset``/``memcpy``/``memmove`` is not yet supported
- ``uniform`` arrays in global scope are mapped to global memory
- ``varying`` arrays in global scope are not yet supported
- ``uniform`` arrays in local scope are mapped to shared memory
- ``varying`` arrays in local scope are mapped to local memory
- ``const uniform/varying`` arrays are mapped to local memory
- ``const static uniform`` arrays are mapped to constant memory
- ``const static varying`` arrays are mapped to global memory
- ``static`` data types in local scope are not allowed; compilation will fail
- Best performance is obtained with libNVVM (LLVM PTX backend can also be used but it requires libdevice.compute_35.10.bc that comes with libNVVM)
Likely there are more... which, together with some of the above-mentioned
issues, will be fixed in due time.
Disclaimer and Legal Information
================================

2
examples/portable/aobench/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
ao
*.ppm

View File

@@ -0,0 +1,8 @@
EXAMPLE=ao
CPP_SRC=ao.cpp
ISPC_SRC=ao.ispc
ISPC_IA_TARGETS=avx1-i32x8
ISPC_ARM_TARGETS=neon
include ../common_cpu.mk

View File

@@ -0,0 +1,7 @@
EXAMPLE=ao
CXX_SRC=ao.cpp
ISPC_SRC=ao.ispc
ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h
ISPC_TARGET=generic-16
include ../common_knc.mk

View File

@@ -0,0 +1,14 @@
PROG=ao
ISPC_SRC=ao.ispc
CU_SRC=ao.cu
CXX_SRC=ao.cpp
PTXCC_REGMAX=64
#ISPC_FLAGS= --opt=disable-uniform-control-flow
#LLVM_GPU=1
NVVM_GPU=1
include ../common_ptx.mk

View File

@@ -0,0 +1,152 @@
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#define NOMINMAX
#pragma warning (disable: 4244)
#pragma warning (disable: 4305)
#endif
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cassert>
#ifdef __linux__
#include <malloc.h>
#endif
#include <math.h>
#include <map>
#include <string>
#include <algorithm>
#include <sys/types.h>
#include "ao_ispc.h"
#include "timing.h"
#include "ispc_malloc.h"
#define NSUBSAMPLES 2
static unsigned int test_iterations[] = {3, 7, 1};
static unsigned int width, height;
static unsigned char *img;
static float *fimg;
/* Quantize a float color channel (nominally in [0,1]) to an 8-bit value,
   saturating out-of-range inputs to the [0,255] range. */
static unsigned char
clamp(float f)
{
    int v = (int)(f * 255.5);
    if (v < 0)
        return 0;
    if (v > 255)
        return 255;
    return (unsigned char)v;
}
/* Quantize the float image (fimg) into the byte image (img) and write it to
   disk as a binary PPM (P6) file.  Exits the program if the file cannot be
   opened. */
static void
savePPM(const char *fname, int w, int h)
{
    const int nPixels = w * h;
    for (int i = 0; i < nPixels; i++) {
        img[3 * i + 0] = clamp(fimg[3 * i + 0]);
        img[3 * i + 1] = clamp(fimg[3 * i + 1]);
        img[3 * i + 2] = clamp(fimg[3 * i + 2]);
    }

    FILE *fp = fopen(fname, "wb");
    if (fp == NULL) {
        perror(fname);
        exit(1);
    }

    fprintf(fp, "P6\n");
    fprintf(fp, "%d %d\n", w, h);
    fprintf(fp, "255\n");
    fwrite(img, w * h * 3, 1, fp);
    fclose(fp);
    printf("Wrote image file %s\n", fname);
}
/* Entry point: parse the image size (and optional iteration counts) from the
   command line, run the ispc+tasks path test_iterations[1] times, report the
   minimum time, and save the resulting image as a PPM file. */
int main(int argc, char **argv)
{
    if (argc < 3) {
        printf ("%s\n", argv[0]);
        printf ("Usage: ao [width] [height] [ispc iterations] [tasks iterations] [serial iterations]\n");
        getchar();
        exit(-1);
    }
    else {
        // Optional trailing arguments override the default iteration counts.
        if (argc == 6) {
            for (int i = 0; i < 3; i++) {
                test_iterations[i] = atoi(argv[3 + i]);
            }
        }
        width = atoi (argv[1]);
        height = atoi (argv[2]);
    }

    // Allocate space for output images
    img = new unsigned char[width * height * 3];
    fimg = new float[width * height * 3];

    //
    // Run the ispc + tasks path, test_iterations times, and report the
    // minimum time for any of them.
    //
    double minTimeISPCTasks = 1e30;
    for (unsigned int i = 0; i < test_iterations[1]; i++) {
        ispc_memset(fimg, 0, sizeof(float) * width * height * 3);
        assert(NSUBSAMPLES == 2);
        reset_and_start_timer();
        ispc::ao_ispc_tasks(width, height, NSUBSAMPLES, fimg);
        double t = get_elapsed_msec();
        printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", t);
        minTimeISPCTasks = std::min(minTimeISPCTasks, t);
    }

    // Report results and save image
    printf("[aobench ispc + tasks]:\t\t[%.3f] msec (%d x %d image)\n",
           minTimeISPCTasks, width, height);
    savePPM("ao-ispc-tasks.ppm", width, height);

    // BUGFIX: img/fimg are allocated with new[], so they must be released
    // with delete[]; plain `delete` on an array is undefined behavior.
    delete[] img;
    delete[] fimg;

    return 0;
}

View File

@@ -0,0 +1,447 @@
// -*- mode: c++ -*-
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
*/
#include "cuda_helpers.cuh"
#define NAO_SAMPLES 8
//#define M_PI 3.1415926535f
#define vec Float3
// Minimal three-component float vector used by the CUDA port of aobench;
// mirrors the ispc `float<3>` type (aliased to `vec` above).  All operators
// are device-side and componentwise.
struct Float3
{
    float x,y,z;
    // Componentwise vector addition.
    __device__ friend Float3 operator+(const Float3 a, const Float3 b)
    {
        Float3 c;
        c.x = a.x+b.x;
        c.y = a.y+b.y;
        c.z = a.z+b.z;
        return c;
    }
    // Componentwise vector subtraction.
    __device__ friend Float3 operator-(const Float3 a, const Float3 b)
    {
        Float3 c;
        c.x = a.x-b.x;
        c.y = a.y-b.y;
        c.z = a.z-b.z;
        return c;
    }
    // Componentwise vector division.
    __device__ friend Float3 operator/(const Float3 a, const Float3 b)
    {
        Float3 c;
        c.x = a.x/b.x;
        c.y = a.y/b.y;
        c.z = a.z/b.z;
        return c;
    }
    // Scalar divided by each component.
    __device__ friend Float3 operator/(const float a, const Float3 b)
    {
        Float3 c;
        c.x = a/b.x;
        c.y = a/b.y;
        c.z = a/b.z;
        return c;
    }
    // Componentwise (Hadamard) product.
    __device__ friend Float3 operator*(const Float3 a, const Float3 b)
    {
        Float3 c;
        c.x = a.x*b.x;
        c.y = a.y*b.y;
        c.z = a.z*b.z;
        return c;
    }
    // Vector scaled by a scalar.
    __device__ friend Float3 operator*(const Float3 a, const float b)
    {
        Float3 c;
        c.x = a.x*b;
        c.y = a.y*b;
        c.z = a.z*b;
        return c;
    }
};
///////////////////////////////////////////////////////////////////////////
// RNG stuff
struct RNGState {
unsigned int z1, z2, z3, z4;
};
// Advance the four-word RNG state and return the next 32-bit pseudo-random
// value as the xor-combination of four shift-register streams (looks like a
// combined Tausworthe-style generator -- NOTE(review): confirm the constants
// match the generator used by the ispc standard library).
__device__
static inline unsigned int random(RNGState * state)
{
    unsigned int b;

    b  = ((state->z1 << 6) ^ state->z1) >> 13;
    state->z1 = ((state->z1 & 4294967294U) << 18) ^ b;
    b  = ((state->z2 << 2) ^ state->z2) >> 27;
    state->z2 = ((state->z2 & 4294967288U) << 2) ^ b;
    b  = ((state->z3 << 13) ^ state->z3) >> 21;
    state->z3 = ((state->z3 & 4294967280U) << 7) ^ b;
    b  = ((state->z4 << 3) ^ state->z4) >> 12;
    state->z4 = ((state->z4 & 4294967168U) << 13) ^ b;
    return (state->z1 ^ state->z2 ^ state->z3 ^ state->z4);
}
// Return a pseudo-random float uniformly distributed in [0, 1): the low 23
// random bits become the mantissa of a float with exponent 0 (a value in
// [1, 2)), and 1.0f is subtracted.
__device__
static inline float frandom(RNGState * state)
{
    unsigned int irand = random(state);
    irand &= (1ul<<23)-1;
    return __int_as_float(0x3F800000 | irand)-1.0f;
}
// Initialize the RNG state from a 32-bit seed.  The four stream words are
// derived by xor-ing and byte/halfword-swapping the seed so that the streams
// start out different from each other.
__device__
static inline void seed_rng(RNGState * state,
                            unsigned int seed) {
    state->z1 = seed;
    state->z2 = seed ^ 0xbeeff00d;
    state->z3 = ((seed & 0xfffful) << 16) | (seed >> 16);
    state->z4 = (((seed & 0xfful) << 24) | ((seed & 0xff00ul) << 8) |
                 ((seed & 0xff0000ul) >> 8) | (seed & 0xff000000ul) >> 24);
}
// Result of a ray/primitive intersection test.
struct Isect {
    float t;   // ray parameter of the closest hit found so far
    vec p;     // hit position
    vec n;     // surface normal at the hit point
    int hit;   // nonzero once a hit has been recorded
};

struct Sphere {
    vec center;
    float radius;
};

// Infinite plane through point p with normal n.
struct Plane {
    vec p;
    vec n;
};

struct Ray {
    vec org;   // origin
    vec dir;   // direction
};
// Dot product of two 3-vectors.
__device__
static inline float dot(vec a, vec b) {
    return a.x * b.x + a.y * b.y + a.z * b.z;
}
// Cross product of two 3-vectors.
__device__
static inline vec vcross(vec v0, vec v1) {
    vec ret;
    ret.x = v0.y * v1.z - v0.z * v1.y;
    ret.y = v0.z * v1.x - v0.x * v1.z;
    ret.z = v0.x * v1.y - v0.y * v1.x;
    return ret;
}
// Normalize v in place using rsqrt() (CUDA reciprocal square root).
__device__
static inline void vnormalize(vec &v) {
    float len2 = dot(v, v);
    float invlen = rsqrt(len2);
    v = v*invlen;
}
// Intersect `ray` with an infinite plane.  If the hit is in front of the ray
// origin and closer than the current isect.t, record it in `isect`.
// (The disabled `#if 0` alternative -- which also contained a `1.0f-17`
// typo for `1.0e-17` -- has been removed; the live branch is unchanged.)
__device__
static inline void
ray_plane_intersect(Isect &isect,const Ray &ray, const Plane &plane) {
    float d = -dot(plane.p, plane.n);
    float v = dot(ray.dir, plane.n);

    // Ray is (nearly) parallel to the plane: no usable intersection.
    if (abs(v) <= 1.0e-17)
        return;

    float t = -(dot(ray.org, plane.n) + d) / v;
    if ((t > 0.0) && (t < isect.t)) {
        isect.t = t;
        isect.hit = 1;
        isect.p = ray.org + ray.dir * t;
        isect.n = plane.n;
    }
}
// Intersect `ray` with `sphere` using the standard quadratic-discriminant
// test and record the hit in `isect` if it is closer than the current
// isect.t.  (The disabled `#if 0` alternative has been removed; the live
// branch is unchanged.)
__device__
static inline void
ray_sphere_intersect(Isect &isect,const Ray &ray, const Sphere &sphere) {
    vec rs = ray.org - sphere.center;
    float B = dot(rs, ray.dir);
    float C = dot(rs, rs) - sphere.radius * sphere.radius;
    float D = B * B - C;

    // Non-positive discriminant: the ray misses (or only grazes) the sphere.
    if (D <= 0.0f)
        return;

    float t = -B - sqrt(D);
    if ((t > 0.0) && (t < isect.t)) {
        isect.t = t;
        isect.hit = 1;
        isect.p = ray.org + ray.dir * t;
        isect.n = isect.p - sphere.center;
        vnormalize(isect.n);
    }
}
// Build an orthonormal basis {basis[0], basis[1], basis[2]} with
// basis[2] = n: pick a helper axis not too parallel to n, then complete the
// frame with two cross products.
__device__
static inline void
orthoBasis(vec basis[3], vec n) {
    basis[2] = n;
    basis[1].x = 0.0f; basis[1].y = 0.0f; basis[1].z = 0.0f;

    // Choose the coordinate axis most orthogonal to n as the helper axis.
    if ((n.x < 0.6f) && (n.x > -0.6f)) {
        basis[1].x = 1.0f;
    } else if ((n.y < 0.6f) && (n.y > -0.6f)) {
        basis[1].y = 1.0f;
    } else if ((n.z < 0.6f) && (n.z > -0.6f)) {
        basis[1].z = 1.0f;
    } else {
        basis[1].x = 1.0f;
    }

    basis[0] = vcross(basis[1], basis[2]);
    vnormalize(basis[0]);

    basis[1] = vcross(basis[2], basis[0]);
    vnormalize(basis[1]);
}
// Estimate the ambient occlusion at the hit point in `isect` by tracing
// ntheta*nphi random rays over the hemisphere (built from an orthonormal
// basis around the surface normal) and counting how many are blocked by the
// scene.  Returns the unoccluded fraction in [0, 1].
__device__
static inline float
ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3],
                  RNGState &rngstate) {
    float eps = 0.0001f;
    vec p; //, n;
    vec basis[3];
    float occlusion = 0.0f;

    // Offset the origin slightly along the normal to avoid self-intersection.
    p = isect.p + isect.n * eps;
    orthoBasis(basis, isect.n);

    const int ntheta = NAO_SAMPLES;
    const int nphi = NAO_SAMPLES;
    for ( int j = 0; j < ntheta; j++) {
        for ( int i = 0; i < nphi; i++) {
            Ray ray;
            Isect occIsect;

            // Random direction over the hemisphere, in the local basis.
            float theta = sqrt(frandom(&rngstate));
            float phi = 2.0f * M_PI * frandom(&rngstate);
            float x = cos(phi) * theta;
            float y = sin(phi) * theta;
            float z = sqrtf(1.0f - theta * theta);

            // local . global
            float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
            float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y;
            float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z;

            ray.org = p;
            ray.dir.x = rx;
            ray.dir.y = ry;
            ray.dir.z = rz;

            // BUGFIX: was `occIsect.t = 1.0f+17;`, which evaluates to 18.0f
            // and capped the occlusion ray length at 18 units; use the same
            // large sentinel as the primary rays and the ispc version.
            occIsect.t = 1.0e+17f;
            occIsect.hit = 0;

            for ( int snum = 0; snum < 3; ++snum)
                ray_sphere_intersect(occIsect, ray, spheres[snum]);
            ray_plane_intersect (occIsect, ray, plane);

            if (occIsect.hit) occlusion += 1.0f;
        }
    }

    // Fraction of sample rays that escaped the scene.
    occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi);
    return occlusion;
}
/* Compute the image for the scanlines from [y0,y1), for an overall image
   of width w and height h.
*/
// Renders the tile [x0,x1) x [y0,y1): each lane (programIndex) takes every
// programCount-th pixel of a row, shoots one primary ray per subsample, and
// writes the accumulated ambient-occlusion value (gray) into all three
// channels of image[].
__device__
static inline void ao_tiles(
    int x0, int x1,
    int y0, int y1,
    int w, int h,
    int nsubsamples,
    float image[])
{
    // Scene: one ground plane and three spheres.
    const Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
    const Sphere spheres[3] = {
        { { -2.0f, 0.0f, -3.5f }, 0.5f },
        { { -0.5f, 0.0f, -3.0f }, 0.5f },
        { { 1.0f, 0.0f, -2.2f }, 0.5f } };
    RNGState rngstate;
    // Per-lane seed so each lane draws a different random sequence.
    seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));

    float invSamples = 1.f / nsubsamples;

    for ( int y = y0; y < y1; y++)
        for ( int x = programIndex+x0; x < x1; x += programCount)
        {
            const int offset = 3 * (y * w + x);
            float res = 0.0f;
            for ( int u = 0; u < nsubsamples; u++)
                for ( int v = 0; v < nsubsamples; v++)
                {
                    float du = (float)u * invSamples, dv = (float)v * invSamples;

                    // Figure out x,y pixel in NDC
                    float px = (x + du - (w / 2.0f)) / (w / 2.0f);
                    float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);

                    float ret = 0.f;
                    Ray ray;
                    Isect isect;

                    ray.org.x = 0.0f;
                    ray.org.y = 0.0f;
                    ray.org.z = 0.0f;

                    // Poor man's perspective projection
                    ray.dir.x = px;
                    ray.dir.y = py;
                    ray.dir.z = -1.0;
                    vnormalize(ray.dir);

                    isect.t = 1.0e+17;
                    isect.hit = 0;

                    for ( int snum = 0; snum < 3; ++snum)
                        ray_sphere_intersect(isect, ray, spheres[snum]);
                    ray_plane_intersect(isect, ray, plane);

                    // Note use of 'coherent' if statement; the set of rays we
                    // trace will often all hit or all miss the scene
                    if (any(isect.hit)) {
                        ret = isect.hit*ambient_occlusion(isect, plane, spheres, rngstate);
                        ret *= invSamples * invSamples;
                        res += ret;
                    }
                }
            // NOTE(review): the loop condition already guarantees x < x1 here;
            // this guard appears redundant -- confirm against the ispc original.
            if (x < x1)
            {
                image[offset  ] = res;
                image[offset+1] = res;
                image[offset+2] = res;
            }
        }
}
// Tile dimensions processed by one task.
#define TILEX 64
#define TILEY 4

// Kernel: one launched task renders one TILEX x TILEY tile of the image;
// taskIndex0/taskIndex1 select the tile, and tasks beyond the tile grid
// return immediately.
extern "C"
__global__
void ao_task( int width, int height,
              int nsubsamples, float image[])
{
    if (taskIndex0 >= taskCount0) return;
    if (taskIndex1 >= taskCount1) return;
    const int x0 = taskIndex0 * TILEX;
    const int x1 = min(x0 + TILEX, width);
    const int y0 = taskIndex1 * TILEY;
    const int y1 = min(y0 + TILEY, height);
    ao_tiles(x0,x1,y0,y1, width, height, nsubsamples, image);
}
// Device-side equivalent of the ispc `export` function: launches a 2D grid
// of ao_task tiles covering the whole image (CUDA Dynamic Parallelism) and
// waits for them to finish.
extern "C"
__global__
void ao_ispc_tasks___export(
    int w, int h, int nsubsamples,
    float image[])
{
    const int ntilex = (w+TILEX-1)/TILEX;
    const int ntiley = (h+TILEY-1)/TILEY;
    launch(ntilex,ntiley,1,ao_task)(w,h,nsubsamples,image);
    cudaDeviceSynchronize();
}
// Host entry point matching the ispc-generated interface: runs the export
// kernel as a single 32-thread block (one "gang") and blocks until the GPU
// has finished.
extern "C"
__host__ void ao_ispc_tasks(
    int w, int h, int nsubsamples,
    float image[])
{
    ao_ispc_tasks___export<<<1,32>>>(w,h,nsubsamples,image);
    cudaDeviceSynchronize();
}

View File

@@ -0,0 +1,340 @@
// -*- mode: c++ -*-
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
*/
#define NAO_SAMPLES 8
#define M_PI 3.1415926535f
typedef float<3> vec;
#if 1
#define __inline inline
#else
#define __inline
#endif
// Result of a ray/primitive intersection test.
struct Isect {
    float t;   // ray parameter of the closest hit found so far
    vec p;     // hit position
    vec n;     // surface normal at the hit point
    int hit;   // nonzero once a hit has been recorded
};

struct Sphere {
    vec center;
    float radius;
};

// Infinite plane through point p with normal n.
struct Plane {
    vec p;
    vec n;
};

struct Ray {
    vec org;   // origin
    vec dir;   // direction
};
// Dot product of two 3-vectors.
static inline float dot(vec a, vec b) {
    return a.x * b.x + a.y * b.y + a.z * b.z;
}
// Cross product of two 3-vectors.
static inline vec vcross(vec v0, vec v1) {
    vec ret;
    ret.x = v0.y * v1.z - v0.z * v1.y;
    ret.y = v0.z * v1.x - v0.x * v1.z;
    ret.z = v0.x * v1.y - v0.y * v1.x;
    return ret;
}
// Normalize v in place using the reciprocal square root.
static inline void vnormalize(vec &v) {
    float len2 = dot(v, v);
    float invlen = rsqrt(len2);
    v *= invlen;
}
// Intersect `ray` with an infinite plane; record the hit in `isect` when it
// is in front of the ray origin and closer than the current isect.t.
// (The disabled `#if 0` alternative has been removed; the live branch is
// unchanged.)
__inline
static void
ray_plane_intersect(Isect &isect, Ray &ray, const Plane &plane) {
    float d = -dot(plane.p, plane.n);
    float v = dot(ray.dir, plane.n);

    // 'coherent' if: the rays in a gang usually agree on (near-)parallelism.
    cif (abs(v) <= 1.0e-17)
        return;

    float t = -(dot(ray.org, plane.n) + d) / v;
    cif ((t > 0.0) && (t < isect.t)) {
        isect.t = t;
        isect.hit = 1;
        isect.p = ray.org + ray.dir * t;
        isect.n = plane.n;
    }
}
// Intersect `ray` with `sphere` via the quadratic-discriminant test and
// record the hit in `isect` when it is closer than the current isect.t.
// (The disabled `#if 0` alternative has been removed; the live branch is
// unchanged.)
static inline void
ray_sphere_intersect(Isect &isect, Ray &ray, const Sphere &sphere) {
    vec rs = ray.org - sphere.center;
    float B = dot(rs, ray.dir);
    float C = dot(rs, rs) - sphere.radius * sphere.radius;
    float D = B * B - C;

    // Non-positive discriminant: the ray misses (or only grazes) the sphere.
    cif (D <= 0.0f)
        return;

    float t = -B - sqrt(D);
    cif ((t > 0.0) && (t < isect.t)) {
        isect.t = t;
        isect.hit = 1;
        isect.p = ray.org + t * ray.dir;
        isect.n = isect.p - sphere.center;
        vnormalize(isect.n);
    }
}
// Build an orthonormal basis {basis[0], basis[1], basis[2]} with
// basis[2] = n: pick a helper axis not too parallel to n, then complete the
// frame with two cross products.
__inline
static void
orthoBasis(vec basis[3], vec n) {
    basis[2] = n;
    basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;

    // Choose the coordinate axis most orthogonal to n as the helper axis.
    if ((n.x < 0.6) && (n.x > -0.6)) {
        basis[1].x = 1.0;
    } else if ((n.y < 0.6) && (n.y > -0.6)) {
        basis[1].y = 1.0;
    } else if ((n.z < 0.6) && (n.z > -0.6)) {
        basis[1].z = 1.0;
    } else {
        basis[1].x = 1.0;
    }

    basis[0] = vcross(basis[1], basis[2]);
    vnormalize(basis[0]);

    basis[1] = vcross(basis[2], basis[0]);
    vnormalize(basis[1]);
}
// Estimate the ambient occlusion at the hit point in `isect` by tracing
// ntheta*nphi random rays over the hemisphere around the surface normal and
// counting how many are blocked by the scene.  Returns the unoccluded
// fraction in [0, 1].
__inline
static float
ambient_occlusion(Isect &isect, const Plane &plane, const Sphere spheres[3],
                  RNGState &rngstate) {
    float eps = 0.0001f;
    vec p;          // was `vec p, n;` -- the local `n` was never used
    vec basis[3];
    float occlusion = 0.0;

    // Offset the origin slightly along the normal to avoid self-intersection.
    p = isect.p + eps * isect.n;
    orthoBasis(basis, isect.n);

    static const uniform int ntheta = NAO_SAMPLES;
    static const uniform int nphi = NAO_SAMPLES;
    for (uniform int j = 0; j < ntheta; j++) {
        for (uniform int i = 0; i < nphi; i++) {
            Ray ray;
            Isect occIsect;

            // Random direction over the hemisphere, in the local basis.
            float theta = sqrt(frandom(&rngstate));
            float phi = 2.0f * M_PI * frandom(&rngstate);
            float x = cos(phi) * theta;
            float y = sin(phi) * theta;
            float z = sqrt(1.0 - theta * theta);

            // local . global
            float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
            float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y;
            float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z;

            ray.org = p;
            ray.dir.x = rx;
            ray.dir.y = ry;
            ray.dir.z = rz;

            occIsect.t = 1.0e+17;
            occIsect.hit = 0;

            for (uniform int snum = 0; snum < 3; ++snum)
                ray_sphere_intersect(occIsect, ray, spheres[snum]);
            ray_plane_intersect (occIsect, ray, plane);

            if (occIsect.hit) occlusion += 1.0;
        }
    }

    // Fraction of sample rays that escaped the scene.
    occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi);
    return occlusion;
}
// Render the tile [x0,x1) x [y0,y1) of a w x h image: one primary ray per
// subsample per pixel, ambient occlusion evaluated at the first hit, and the
// gray result written to all three channels of image[].
static inline void ao_tiles(
    uniform int x0, uniform int x1,
    uniform int y0, uniform int y1,
    uniform int w, uniform int h,
    uniform int nsubsamples,
    uniform float image[])
{
    // Scene: one ground plane and three spheres.
    const Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
    const Sphere spheres[3] = {
        { { -2.0f, 0.0f, -3.5f }, 0.5f },
        { { -0.5f, 0.0f, -3.0f }, 0.5f },
        { { 1.0f, 0.0f, -2.2f }, 0.5f } };
    RNGState rngstate;
    // Per-program-instance seed so lanes draw different random sequences.
    seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));

    float invSamples = 1.f / nsubsamples;

    foreach_tiled (y = y0 ... y1, x = x0 ... x1)
    {
        const int offset = 3 * (y * w + x);
        float res = 0.0f;
        for (uniform int u = 0; u < nsubsamples; u++)
            for (uniform int v = 0; v < nsubsamples; v++)
            {
                float du = (float)u * invSamples, dv = (float)v * invSamples;

                // Figure out x,y pixel in NDC
                float px = (x + du - (w / 2.0f)) / (w / 2.0f);
                float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);

                float ret = 0.f;
                Ray ray;
                Isect isect;

                ray.org = 0.f;

                // Poor man's perspective projection
                ray.dir.x = px;
                ray.dir.y = py;
                ray.dir.z = -1.0;
                vnormalize(ray.dir);

                isect.t = 1.0e+17;
                isect.hit = 0;

                for (uniform int snum = 0; snum < 3; ++snum)
                    ray_sphere_intersect(isect, ray, spheres[snum]);
                ray_plane_intersect(isect, ray, plane);

                // Note use of 'coherent' if statement; the set of rays we
                // trace will often all hit or all miss the scene
#if 0
                cif (isect.hit) {
                    ret = ambient_occlusion(isect, plane, spheres, rngstate);
                    ret *= invSamples * invSamples;
                    res += ret;
                }
#else
                // any() + multiplying by isect.hit (0/1) keeps the whole gang
                // on the same path while zeroing out the misses.
                if(any(isect.hit))
                {
                    ret = isect.hit*ambient_occlusion(isect, plane, spheres, rngstate);
                    ret *= invSamples * invSamples;
                    res += ret;
                }
#endif
            }
        image[offset  ] = res;
        image[offset+1] = res;
        image[offset+2] = res;
    }
}
// Tile dimensions processed by one task.
#define TILEX max(64,programCount*2)
#define TILEY 4

// Serial (non-tasking) entry point: render the whole image as one tile.
export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
                    uniform float image[]) {
    const uniform int x0 = 0;
    const uniform int x1 = w;
    const uniform int y0 = 0;
    const uniform int y1 = h;
    ao_tiles(x0,x1,y0,y1, w, h, nsubsamples, image);
}
// One task renders one TILEX x TILEY tile of the image; tasks outside the
// tile grid (taskIndex >= taskCount) return immediately.
void task ao_task(uniform int width, uniform int height,
                  uniform int nsubsamples, uniform float image[])
{
    if (taskIndex0 >= taskCount0) return;
    if (taskIndex1 >= taskCount1) return;
    const uniform int x0 = taskIndex0 * TILEX;
    const uniform int x1 = min(x0 + TILEX, width);
    const uniform int y0 = taskIndex1 * TILEY;
    const uniform int y1 = min(y0 + TILEY, height);
    ao_tiles(x0,x1,y0,y1, width, height, nsubsamples, image);
}
// Tasking entry point: launch a 2D grid of ao_task tiles covering the image
// and wait for all of them to complete.
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
                          uniform float image[])
{
    const uniform int ntilex = (w+TILEX-1)/TILEX;
    const uniform int ntiley = (h+TILEY-1)/TILEY;
    launch[ntilex,ntiley] ao_task(w, h, nsubsamples, image);
    sync;
}

View File

@@ -0,0 +1,122 @@
TASK_CXX=../omp_tasksys.cpp ../../util/ispc_malloc.cpp
TASK_LIB=-lpthread
TASK_OBJ=objs/omp_tasksys.o objs/ispc_malloc.o
CXX=clang++
CXX=icc -openmp
CXXFLAGS+=-Iobjs/ -O2 -I../../ -I../../util
CXXFLAGS+=-DISPC_USE_OMP
CC=clang
CC=icc -openmp
CCFLAGS+=-Iobjs/ -O2 -I../../ -I../../util
CCFLAGS+=-DISPC_USE_OMP
LIBS=-lm $(TASK_LIB) -lstdc++
ISPC=ispc
ISPC_FLAGS+=-O2
ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h)
ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/)
ifeq ($(ARCH),x86)
ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o)
COMMA=,
ifneq (,$(findstring $(COMMA),$(ISPC_IA_TARGETS)))
#$(info multi-target detected: $(ISPC_IA_TARGETS))
ifneq (,$(findstring sse2,$(ISPC_IA_TARGETS)))
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse2.o)
endif
ifneq (,$(findstring sse4,$(ISPC_IA_TARGETS)))
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse4.o)
endif
ifneq (,$(findstring avx1-,$(ISPC_IA_TARGETS)))
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx.o)
endif
ifneq (,$(findstring avx1.1,$(ISPC_IA_TARGETS)))
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx11.o)
endif
ifneq (,$(findstring avx2,$(ISPC_IA_TARGETS)))
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx2.o)
endif
endif
ISPC_TARGETS=$(ISPC_IA_TARGETS)
ARCH_BIT:=$(shell getconf LONG_BIT)
ifeq ($(ARCH_BIT),32)
ISPC_FLAGS += --arch=x86
CXXFLAGS += -m32
CCFLAGS += -m32
else
ISPC_FLAGS += --arch=x86-64
CXXFLAGS += -m64
CCFLAGS += -m64
endif
else ifeq ($(ARCH),arm)
ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=_ispc.o))
ISPC_TARGETS=$(ISPC_ARM_TARGETS)
else
$(error Unknown architecture $(ARCH) from uname -m)
endif
CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o))
CC_OBJS=$(addprefix objs/, $(CC_SRC:.c=.o))
OBJS=$(CPP_OBJS) $(CC_OBJS) $(TASK_OBJ) $(ISPC_OBJS)
default: $(EXAMPLE)
all: $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 $(EXAMPLE)-scalar
.PHONY: dirs clean
dirs:
/bin/mkdir -p objs/
objs/%.cpp objs/%.o objs/%.h: dirs
clean:
/bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 ref test
$(EXAMPLE): $(OBJS)
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
objs/%.o: %.cpp dirs $(ISPC_HEADER)
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%.o: %.c dirs $(ISPC_HEADER)
$(CC) $< $(CCFLAGS) -c -o $@
objs/%.o: ../%.cpp dirs
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%.o: ../../%.cpp dirs
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%.o: ../../util/%.cpp dirs
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h dirs
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o objs/%_ispc_avx11.o objs/%_ispc_avx2.o: %.ispc dirs
$(ISPC) $(ISPC_FLAGS) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
objs/$(ISPC_SRC:.ispc=)_sse4.cpp: $(ISPC_SRC)
$(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h
objs/$(ISPC_SRC:.ispc=)_sse4.o: objs/$(ISPC_SRC:.ispc=)_sse4.cpp
$(CXX) -I../../intrinsics -msse4.2 $< $(CXXFLAGS) -c -o $@
$(EXAMPLE)-sse4: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_sse4.o
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
objs/$(ISPC_SRC:.ispc=)_generic16.cpp: $(ISPC_SRC)
$(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h
objs/$(ISPC_SRC:.ispc=)_generic16.o: objs/$(ISPC_SRC:.ispc=)_generic16.cpp
$(CXX) -I../../intrinsics $< $(CXXFLAGS) -c -o $@
$(EXAMPLE)-generic16: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_generic16.o
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
objs/$(ISPC_SRC:.ispc=)_scalar.o: $(ISPC_SRC)
$(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-1
$(EXAMPLE)-scalar: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_scalar.o
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)

View File

@@ -0,0 +1,52 @@
# Shared build rules for the Intel Xeon Phi (KNC) examples, built with
# icc -mmic and an OpenMP-based task system.
TASK_CXX=../omp_tasksys.cpp ../../util/ispc_malloc.cpp
TASK_OBJ=objs_knc/omp_tasksys.o objs_knc/ispc_malloc.o
TASK_LIB=-openmp
CXX=icc -openmp -mmic
CXXFLAGS+=-Iobjs_knc/ -O2 -I../../ -I../../util -I./
CXXFLAGS+= -DISPC_USE_OMP
CC=icc -openmp -mmic
CCFLAGS+= -Iobjs_knc/ -O2 -I../../ -I../../util -I./
CCFLAGS+=-DISPC_USE_OMP
LD=icc -mmic -openmp
LIBS=-lm $(TASK_LIB) -lstdc++
ISPC=ispc
ISPC_FLAGS+=-O2
ISPC_FLAGS+= --target=$(ISPC_TARGET) --c++-include-file=$(ISPC_INTRINSICS)
ISPC_HEADERS=$(ISPC_SRC:%.ispc=objs_knc/%_ispc.h)
ISPC_OBJ=$(ISPC_SRC:%.ispc=objs_knc/%_ispc.o)
CXX_OBJ=$(CXX_SRC:%.cpp=objs_knc/%.o)
CXX_OBJ+=$(TASK_OBJ)
PROG=$(EXAMPLE)_knc
all: dirs $(PROG)
dirs:
	/bin/mkdir -p objs_knc/
objs_knc/%.cpp objs_knc/%.o objs_knc/%.h: dirs
clean:
	/bin/rm -rf $(PROG) objs_knc
# Link rule: pass $(LIBS) as well -- it was defined above but never
# referenced, so -lm / $(TASK_LIB) / -lstdc++ never reached the link line.
$(PROG): $(ISPC_OBJ) $(CXX_OBJ)
	$(LD) -o $@ $^ $(LDFLAGS) $(LIBS)
objs_knc/%.o: %.cpp
	$(CXX) $(CXXFLAGS) -o $@ -c $<
objs_knc/%.o: ../%.cpp
	$(CXX) $(CXXFLAGS) -o $@ -c $<
objs_knc/%.o: ../../%.cpp
	$(CXX) $(CXXFLAGS) -o $@ -c $<
objs_knc/%.o: ../../util/%.cpp
	$(CXX) $(CXXFLAGS) -o $@ -c $<
# ispc -> C++ (zmm intrinsics) -> object, via the icc cross compiler.
objs_knc/%_ispc.o: %.ispc
	$(ISPC) $(ISPC_FLAGS) --emit-c++ -o objs_knc/$*_ispc_zmm.cpp -h objs_knc/$*_ispc.h $<
	$(CXX) $(CXXFLAGS) -o $@ objs_knc/$*_ispc_zmm.cpp -c

View File

@@ -0,0 +1,136 @@
# CUDA-side helper sources compiled with nvcc.
NVCC_SRC=../../util/nvcc_helpers.cu
NVCC_OBJS=objs_ptx/nvcc_helpers_nvcc.o
#
# Host compiler for the C++ driver code.
CXX=g++ -ffast-math
CXXFLAGS=-O3 -I$(CUDATK)/include -Iobjs_ptx/ -D_CUDA_ -I../../util -I../../
#
# Device compiler; sm_35 is required for the dynamic-parallelism runtime.
NVCC=nvcc
NVCC_FLAGS+=-O3 -arch=sm_35 -D_CUDA_ -I../../util -Xptxas=-v -Iobjs_ptx/
ifdef PTXCC_REGMAX
NVCC_FLAGS += --maxrregcount=$(PTXCC_REGMAX)
endif
NVCC_FLAGS+=--use_fast_math
#
LD=nvcc
LDFLAGS=-lcudart -lcudadevrt -arch=sm_35
#
# ptxcc wraps nvcc to turn a .ptx file into a linkable object.
PTXCC=$(ISPC_HOME)/ptxtools/ptxcc
PTXCC_FLAGS+= -Xptxas=-v
ifdef PTXCC_REGMAX
PTXCC_FLAGS += -maxrregcount=$(PTXCC_REGMAX)
endif
#
ISPC=$(ISPC_HOME)/ispc
ISPC_FLAGS+=-O3 --math-lib=fast --target=nvptx --opt=fast-math
#
#
#
# Derived file lists for the two PTX generation paths (llc vs. NVVM).
ISPC_LLVM_OBJS=$(ISPC_SRC:%.ispc=objs_ptx/%_llvm_ispc.o)
ISPC_NVVM_OBJS=$(ISPC_SRC:%.ispc=objs_ptx/%_nvvm_ispc.o)
#ISPC_BCS=$(ISPC_SRC:%.ispc=objs_ptx/%_ispc.bc)
ISPC_LLS=$(ISPC_SRC:%.ispc=objs_ptx/%_ispc.ll)
ISPC_LLVM_PTX=$(ISPC_SRC:%.ispc=objs_ptx/%_llvm_ispc.ptx)
ISPC_NVVM_PTX=$(ISPC_SRC:%.ispc=objs_ptx/%_nvvm_ispc.ptx)
ISPC_HEADERS=$(ISPC_SRC:%.ispc=objs_ptx/%_ispc.h)
CXX_OBJS=$(CXX_SRC:%.cpp=objs_ptx/%_gcc.o)
CU_OBJS=$(CU_SRC:%.cu=objs_ptx/%_cu.o)
#NVCC_OBJS=$(NVCC_SRC:%.cu=objs_ptx/%_nvcc.o)
# ispc_malloc is always compiled into the host side.
CXX_SRC+=ispc_malloc.cpp
CXX_OBJS+=objs_ptx/ispc_malloc_gcc.o
PTXGEN = $(ISPC_HOME)/ptxtools/ptxgen
PTXGEN += --use_fast_math
#LLVM32=$(HOME)/usr/local/llvm/bin-3.2
#LLVM32DIS=$(LLVM32)/bin/llvm-dis
# llc from the LLVM used to build ispc; emits PTX from LLVM IR.
LLC=$(LLVM_ROOT)/bin/llc
LLC_FLAGS=-march=nvptx64 -mcpu=sm_35
# .SUFFIXES: .bc .o .cu .ll
# Select which GPU code paths get built: LLVM_GPU generates PTX with llc
# from ISPC's LLVM IR; NVVM_GPU uses the ptxgen (libNVVM) path.
ifdef LLVM_GPU
OBJSptx_llvm=$(ISPC_LLVM_OBJS) $(CXX_OBJS) $(NVCC_OBJS)
PROGptx_llvm=$(PROG)_llvm_ptx
else
ISPC_LLVM_PTX=
endif
ifdef NVVM_GPU
# NOTE: dropped a reference to the misspelled $(ISPC_LVVM_PTX) (undefined,
# hence always empty); the .ptx files are produced as prerequisites of the
# %_ispc.o pattern rule, so they do not belong in the link object list.
OBJSptx_nvvm=$(ISPC_NVVM_OBJS) $(CXX_OBJS) $(NVCC_OBJS)
PROGptx_nvvm=$(PROG)_nvvm_ptx
else
ISPC_NVVM_PTX=
endif
ifdef CU_SRC
OBJScu=$(CU_OBJS) $(CXX_OBJS) $(NVCC_OBJS)
PROGcu=$(PROG)_cu
endif
all: dirs \
	$(PROGptx_nvvm) \
	$(PROGptx_llvm) \
	$(PROGcu) $(ISPC_BCS) $(ISPC_LLS) $(ISPC_HEADERS) $(ISPC_NVVM_PTX) $(ISPC_LLVM_PTX)
dirs:
	/bin/mkdir -p objs_ptx/
objs_ptx/%.cpp objs_ptx/%.o objs_ptx/%.h: dirs
clean:
	/bin/rm -rf $(PROGptx_nvvm) $(PROGptx_llvm) $(PROGcu) objs_ptx
# generate binaries
$(PROGptx_llvm): $(OBJSptx_llvm)
	$(LD) -o $@ $^ $(LDFLAGS)
$(PROGptx_nvvm): $(OBJSptx_nvvm)
	$(LD) -o $@ $^ $(LDFLAGS)
$(PROGcu): $(OBJScu)
	$(LD) -o $@ $^ $(LDFLAGS)
# compile C++ code
objs_ptx/%_gcc.o: %.cpp $(ISPC_HEADERS)
	$(CXX) $(CXXFLAGS) -o $@ -c $<
objs_ptx/%_gcc.o: ../../util/%.cpp
	$(CXX) $(CXXFLAGS) -o $@ -c $<
# CUDA helpers (-dc: relocatable device code for device-side linking)
objs_ptx/%_cu.o: %.cu $(ISPC_HEADERS)
	$(NVCC) $(NVCC_FLAGS) -o $@ -dc $<
# compile CUDA code
objs_ptx/%_nvcc.o: ../../util/%.cu
	$(NVCC) $(NVCC_FLAGS) -o $@ -c $<
objs_ptx/%_nvcc.o: %.cu
	$(NVCC) $(NVCC_FLAGS) -o $@ -c $<
# compile ISPC to LLVM BC
#objs_ptx/%_ispc.h objs_ptx/%_ispc.bc: %.ispc
#	$(ISPC) $(ISPC_FLAGS) --emit-llvm -h objs_ptx/$*_ispc.h -o objs_ptx/$*_ispc.bc $<
objs_ptx/%_ispc.h objs_ptx/%_ispc.ll: %.ispc
	$(ISPC) $(ISPC_FLAGS) --emit-llvm -h objs_ptx/$*_ispc.h -o objs_ptx/$*_ispc.ll $<
# generate PTX from LLVM BC
#objs_ptx/%_llvm_ispc.ptx: objs_ptx/%_ispc.bc
#	$(LLC) $(LLC_FLAGS) -o $@ $<
objs_ptx/%_llvm_ispc.ptx: objs_ptx/%_ispc.ll
	$(LLC) $(LLC_FLAGS) -o $@ $<
#objs_ptx/%_nvvm_ispc.ptx: objs_ptx/%_ispc.bc
#	$(LLVM32DIS) $< -o objs_ptx/$*_ispc-ll32.ll
#	$(PTXGEN) objs_ptx/$*_ispc-ll32.ll -o $@
objs_ptx/%_nvvm_ispc.ptx: objs_ptx/%_ispc.ll
	$(PTXGEN) $< -o $@
# generate an object file from PTX
objs_ptx/%_ispc.o: objs_ptx/%_ispc.ptx
	$(PTXCC) $< -Xnvcc="$(PTXCC_FLAGS)" -o $@

View File

@@ -0,0 +1,10 @@
# Per-example configuration for the CPU build of deferred_shading;
# shared rules live in ../common_cpu.mk.
EXAMPLE=deferred_shading
CPP_SRC=common.cpp main.cpp dynamic_c.cpp
# CPP_SRC+=dynamic_cilk.cpp
ISPC_SRC=kernels.ispc
# Target ISAs for x86 and ARM builds respectively.
ISPC_IA_TARGETS=avx1-i32x16
ISPC_ARM_TARGETS=neon
ISPC_FLAGS=--opt=fast-math
include ../common_cpu.mk

View File

@@ -0,0 +1,8 @@
# Per-example configuration for the KNC (Xeon Phi) build of deferred_shading;
# shared rules live in ../common_knc.mk.
EXAMPLE=deferred_shading
CXX_SRC=common.cpp main.cpp dynamic_c.cpp
ISPC_SRC=kernels.ispc
# generic-16 C++ backend with KNC intrinsics include file.
ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h
ISPC_TARGET=generic-16
ISPC_FLAGS=--opt=fast-math
include ../common_knc.mk

View File

@@ -0,0 +1,13 @@
# Per-example configuration for the PTX/CUDA build of deferred_shading;
# shared rules live in ../common_ptx.mk.
PROG=deferred_shading
ISPC_SRC=kernels.ispc
CU_SRC=kernels.cu
CXX_SRC=common.cpp main.cpp
# Cap register usage per thread; forwarded to ptxas.
PTXCC_REGMAX=64
# Build the NVVM (ptxgen) path; the llc path is disabled by default.
NVVM_GPU=1
#LLVM_GPU=1
include ../common_ptx.mk

View File

@@ -0,0 +1,222 @@
/*
Copyright (c) 2011-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#define ISPC_IS_WINDOWS
#elif defined(__linux__)
#define ISPC_IS_LINUX
#elif defined(__APPLE__)
#define ISPC_IS_APPLE
#endif
#include <fcntl.h>
#include <float.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <stdint.h>
#include <algorithm>
#include <assert.h>
#include <vector>
#ifdef ISPC_IS_WINDOWS
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif
#ifdef ISPC_IS_LINUX
#include <malloc.h>
#endif
#include "deferred.h"
#include "timing.h"
#include "ispc_malloc.h"
///////////////////////////////////////////////////////////////////////////
// Allocate 'size' bytes aligned to 'alignment' bytes using the appropriate
// platform facility.  In CUDA builds (_CUDA_) the request is forwarded to
// ispc_malloc() and 'alignment' is ignored.  Release with lAlignedFree().
static void *
lAlignedMalloc(size_t size, int32_t alignment) {
#ifndef _CUDA_
#ifdef ISPC_IS_WINDOWS
    return _aligned_malloc(size, alignment);
#endif
#ifdef ISPC_IS_LINUX
    return memalign(alignment, size);
#endif
#ifdef ISPC_IS_APPLE
    // No memalign on OS X: over-allocate, round up to the alignment, and
    // stash the original base pointer just before the returned address so
    // lAlignedFree() can recover it.
    void *mem = malloc(size + (alignment-1) + sizeof(void*));
    char *amem = ((char*)mem) + sizeof(void*);
    amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
                                        (alignment - 1)));
    ((void**)amem)[-1] = mem;
    return amem;
#endif
#else
    void *ptr;
    ispc_malloc(&ptr, size);
    return ptr;
#endif
}
// Release memory obtained from lAlignedMalloc(), dispatching to the
// deallocator that matches the platform-specific allocation above.
static void
lAlignedFree(void *ptr) {
#ifndef _CUDA_
#ifdef ISPC_IS_WINDOWS
    _aligned_free(ptr);
#endif
#ifdef ISPC_IS_LINUX
    free(ptr);
#endif
#ifdef ISPC_IS_APPLE
    // The original malloc() base pointer was stored just before 'ptr'.
    free(((void**)ptr)[-1]);
#endif
#else
    ispc_free(ptr);
#endif
}
// Allocate one aligned 8-bit plane per color channel (planar/SoA layout).
Framebuffer::Framebuffer(int width, int height) {
    const int pixelCount = width * height;
    nPixels = pixelCount;
    r = (uint8_t *)lAlignedMalloc(pixelCount, ALIGNMENT_BYTES);
    g = (uint8_t *)lAlignedMalloc(pixelCount, ALIGNMENT_BYTES);
    b = (uint8_t *)lAlignedMalloc(pixelCount, ALIGNMENT_BYTES);
}
// Release the three channel planes allocated in the constructor.
Framebuffer::~Framebuffer() {
    uint8_t *planes[] = { r, g, b };
    for (int i = 0; i < 3; ++i)
        lAlignedFree(planes[i]);
}
// Zero all three channel planes.
void
Framebuffer::clear() {
    uint8_t *planes[] = { r, g, b };
    for (int i = 0; i < 3; ++i)
        memset(planes[i], 0, nPixels);
}
// Load a serialized InputData (fixed-size header followed by one data
// chunk) from 'path' and fix up the SoA array pointers to point into the
// chunk.  Returns NULL on open/read failure.  On success the caller owns
// the result and must release it with DeleteInputData().
InputData *
CreateInputDataFromFile(const char *path) {
    FILE *in = fopen(path, "rb");
    if (!in) return 0;
    InputData *input = new InputData;
    input->chunk = NULL;
    // Load header
    if (fread(&input->header, sizeof(ispc::InputHeader), 1, in) != 1) {
        fprintf(stderr, "Premature EOF reading file \"%s\"\n", path);
        // Original code leaked both the stream and the InputData here.
        fclose(in);
        delete input;
        return NULL;
    }
    // Load data chunk and update pointers
    input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize,
                                             ALIGNMENT_BYTES);
    if (fread(input->chunk, input->header.inputDataChunkSize, 1, in) != 1) {
        fprintf(stderr, "Premature EOF reading file \"%s\"\n", path);
        fclose(in);
        lAlignedFree(input->chunk);
        delete input;
        return NULL;
    }
    input->arrays.zBuffer =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaZBuffer]];
    input->arrays.normalEncoded_x =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_x]];
    input->arrays.normalEncoded_y =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_y]];
    input->arrays.specularAmount =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularAmount]];
    input->arrays.specularPower =
        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularPower]];
    input->arrays.albedo_x =
        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_x]];
    input->arrays.albedo_y =
        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_y]];
    input->arrays.albedo_z =
        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_z]];
    input->arrays.lightPositionView_x =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_x]];
    input->arrays.lightPositionView_y =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_y]];
    input->arrays.lightPositionView_z =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_z]];
    input->arrays.lightAttenuationBegin =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationBegin]];
    input->arrays.lightColor_x =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_x]];
    input->arrays.lightColor_y =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_y]];
    input->arrays.lightColor_z =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_z]];
    input->arrays.lightAttenuationEnd =
        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationEnd]];
    fclose(in);
    return input;
}
// Release an InputData previously returned by CreateInputDataFromFile().
// Frees the data chunk AND the InputData object itself (the original code
// leaked the object allocated with 'new').  Safe to call with NULL.
void DeleteInputData(InputData *input) {
    if (!input)
        return;
    lAlignedFree(input->chunk);
    delete input;
}
// Deswizzle the planar framebuffer into interleaved RGB and write it out
// as a binary PPM (P6) image.  Doesn't need to be fast -- only happens once.
void WriteFrame(const char *filename, const InputData *input,
                const Framebuffer &framebuffer) {
    size_t imageBytes = 3 * input->header.framebufferWidth *
        input->header.framebufferHeight;
    uint8_t* framebufferAOS = (uint8_t *)lAlignedMalloc(imageBytes, ALIGNMENT_BYTES);
    memset(framebufferAOS, 0, imageBytes);
    for (int i = 0; i < input->header.framebufferWidth *
             input->header.framebufferHeight; ++i) {
        framebufferAOS[3 * i + 0] = framebuffer.r[i];
        framebufferAOS[3 * i + 1] = framebuffer.g[i];
        framebufferAOS[3 * i + 2] = framebuffer.b[i];
    }
    // Write out simple PPM file
    FILE *out = fopen(filename, "wb");
    if (!out) {
        // Original code dereferenced a NULL FILE* if the open failed.
        fprintf(stderr, "Failed to open \"%s\" for writing\n", filename);
        lAlignedFree(framebufferAOS);
        return;
    }
    fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth,
            input->header.framebufferHeight);
    fwrite(framebufferAOS, imageBytes, 1, out);
    fclose(out);
    lAlignedFree(framebufferAOS);
}

View File

@@ -0,0 +1 @@
../../deferred/data

View File

@@ -0,0 +1,108 @@
/*
Copyright (c) 2011-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DEFERRED_H
#define DEFERRED_H
// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
#define MIN_TILE_WIDTH 64
#define MIN_TILE_HEIGHT 16
#define MAX_LIGHTS 1024
// Indices into InputHeader::inputDataArrayOffsets identifying where each
// SoA input array begins inside the serialized data chunk (see
// CreateInputDataFromFile for the corresponding pointer types).
enum InputDataArraysEnum {
    idaZBuffer = 0,             // float, per pixel
    idaNormalEncoded_x,         // uint16 (half float), per pixel
    idaNormalEncoded_y,         // uint16 (half float), per pixel
    idaSpecularAmount,          // uint16 (half float), per pixel
    idaSpecularPower,           // uint16 (half float), per pixel
    idaAlbedo_x,                // uint8 (unorm), per pixel
    idaAlbedo_y,                // uint8 (unorm), per pixel
    idaAlbedo_z,                // uint8 (unorm), per pixel
    idaLightPositionView_x,     // float, per light
    idaLightPositionView_y,     // float, per light
    idaLightPositionView_z,     // float, per light
    idaLightAttenuationBegin,   // float, per light
    idaLightColor_x,            // float, per light
    idaLightColor_y,            // float, per light
    idaLightColor_z,            // float, per light
    idaLightAttenuationEnd,     // float, per light
    idaNum                      // count of array ids
};
#ifndef ISPC
#include <stdint.h>
#include "kernels_ispc.h"
#define ALIGNMENT_BYTES 64
#define MAX_LIGHTS 1024
#define VISUALIZE_LIGHT_COUNT 0
// Deserialized scene input: fixed-size header, a table of pointers into
// the raw data chunk, and the chunk itself (one aligned allocation).
struct InputData
{
    ispc::InputHeader header;
    ispc::InputDataArrays arrays;  // pointers into 'chunk'; set by CreateInputDataFromFile
    uint8_t *chunk;                // owned; released by DeleteInputData
};
// Planar (SoA) RGB framebuffer; each channel is a separately allocated
// 8-bit plane of nPixels entries.
struct Framebuffer {
    Framebuffer(int width, int height);
    ~Framebuffer();

    // Zero all three channel planes.
    void clear();

    uint8_t *r, *g, *b;

private:
    int nPixels;
    // Non-copyable: each instance owns its channel allocations.  The
    // original copy-assignment declaration took 'const Framebuffer *',
    // which did NOT suppress the compiler-generated copy assignment
    // operator (risking double frees on assignment); declare the proper
    // reference overload instead.
    Framebuffer(const Framebuffer &);
    Framebuffer &operator=(const Framebuffer &);
};
// Load a serialized scene from disk; returns NULL on failure.
InputData *CreateInputDataFromFile(const char *path);
// Release an InputData returned by CreateInputDataFromFile.
void DeleteInputData(InputData *input);
// Write the shaded framebuffer out as a binary PPM (P6) image.
void WriteFrame(const char *filename, const InputData *input,
                const Framebuffer &framebuffer);
// One-time setup for the dynamic tile-subdivision shading paths.
void InitDynamicC(InputData *input);
void InitDynamicCilk(InputData *input);
// Shade a full frame using dynamic tile subdivision (serial C / Cilk).
void DispatchDynamicC(InputData *input, Framebuffer *framebuffer);
void DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer);
#endif // !ISPC
#endif // DEFERRED_H

View File

@@ -0,0 +1,874 @@
/*
Copyright (c) 2011-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "deferred.h"
#include "kernels_ispc.h"
#include <algorithm>
#include <stdint.h>
#include <assert.h>
#include <math.h>
#ifdef _MSC_VER
#define ISPC_IS_WINDOWS
#elif defined(__linux__)
#define ISPC_IS_LINUX
#elif defined(__APPLE__)
#define ISPC_IS_APPLE
#endif
#ifdef ISPC_IS_LINUX
#include <malloc.h>
#endif // ISPC_IS_LINUX
// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
#ifndef MIN_TILE_WIDTH
#define MIN_TILE_WIDTH 16
#endif
#ifndef MIN_TILE_HEIGHT
#define MIN_TILE_HEIGHT 16
#endif
#define DYNAMIC_TREE_LEVELS 5
// If this is set to 1 then the result will be identical to the static version
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
// Allocate 'size' bytes aligned to 'alignment' bytes using the platform's
// aligned allocator (duplicate of the helper in common.cpp, minus the
// CUDA branch).  Release with lAlignedFree().
static void *
lAlignedMalloc(size_t size, int32_t alignment) {
#ifdef ISPC_IS_WINDOWS
    return _aligned_malloc(size, alignment);
#endif
#ifdef ISPC_IS_LINUX
    return memalign(alignment, size);
#endif
#ifdef ISPC_IS_APPLE
    // No memalign on OS X: over-allocate, round up to the alignment, and
    // stash the original base pointer just before the returned address.
    void *mem = malloc(size + (alignment-1) + sizeof(void*));
    char *amem = ((char*)mem) + sizeof(void*);
    amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
                                        (alignment - 1)));
    ((void**)amem)[-1] = mem;
    return amem;
#endif
}
// Release memory obtained from lAlignedMalloc() above.
static void
lAlignedFree(void *ptr) {
#ifdef ISPC_IS_WINDOWS
    _aligned_free(ptr);
#endif
#ifdef ISPC_IS_LINUX
    free(ptr);
#endif
#ifdef ISPC_IS_APPLE
    // Recover the original malloc() base pointer stored just before 'ptr'.
    free(((void**)ptr)[-1]);
#endif
}
// Scan one screen tile of the depth buffer and return the min/max
// view-space Z over its pixels, ignoring samples that unproject outside
// [cameraNear, cameraFar) (skybox/background or otherwise invalid).
static void
ComputeZBounds(int tileStartX, int tileEndX,
               int tileStartY, int tileEndY,
               // G-buffer data
               float zBuffer[],
               int gBufferWidth,
               // Camera data
               float cameraProj_33, float cameraProj_43,
               float cameraNear, float cameraFar,
               // Output
               float *minZ, float *maxZ)
{
    float zLo = cameraFar;   // running minimum
    float zHi = cameraNear;  // running maximum
    for (int py = tileStartY; py < tileEndY; ++py) {
        const float *row = &zBuffer[py * gBufferWidth];
        for (int px = tileStartX; px < tileEndX; ++px) {
            // Unproject the depth-buffer sample into view space.
            float viewZ = cameraProj_43 / (row[px] - cameraProj_33);
            // Only pixels inside the valid depth range contribute.
            if (viewZ >= cameraNear && viewZ < cameraFar) {
                if (viewZ < zLo) zLo = viewZ;
                if (viewZ > zHi) zHi = viewZ;
            }
        }
    }
    *minZ = zLo;
    *maxZ = zHi;
}
// Compute per-tile Z bounds for one full row of tiles, writing one
// (min, max) pair per tile into minZArray/maxZArray.
static void
ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight,
                  int numTilesX, int numTilesY,
                  // G-buffer data
                  float zBuffer[],
                  int gBufferWidth,
                  // Camera data
                  float cameraProj_33, float cameraProj_43,
                  float cameraNear, float cameraFar,
                  // Output
                  float minZArray[],
                  float maxZArray[])
{
    const int y0 = tileY * tileHeight;
    const int y1 = y0 + tileHeight;
    for (int tx = 0; tx < numTilesX; ++tx) {
        const int x0 = tx * tileWidth;
        ComputeZBounds(x0, x0 + tileWidth, y0, y1,
                       zBuffer, gBufferWidth, cameraProj_33, cameraProj_43,
                       cameraNear, cameraFar,
                       &minZArray[tx], &maxZArray[tx]);
    }
}
// Hierarchical min/max view-space-Z pyramid over screen tiles.  Level 0
// holds per-tile Z bounds at (mTileWidth x mTileHeight) pixel granularity;
// each higher level halves the tile count in both dimensions, combining
// the bounds of up to 2x2 children.  Used to cull lights against
// progressively larger screen regions.
class MinMaxZTree
{
public:
    // Currently (min) tile dimensions must divide gBuffer dimensions evenly
    // Levels must be small enough that neither dimension goes below one tile
    MinMaxZTree(
        int tileWidth, int tileHeight, int levels,
        int gBufferWidth, int gBufferHeight)
        : mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
    {
        mNumTilesX = gBufferWidth / mTileWidth;
        mNumTilesY = gBufferHeight / mTileHeight;
        // Allocate arrays
        mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
        mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
        for (int i = 0; i < mLevels; ++i) {
            int x = NumTilesX(i);
            int y = NumTilesY(i);
            assert(x > 0);
            assert(y > 0);
            // NOTE: If the following two asserts fire it probably means that
            // the base tile dimensions do not evenly divide the G-buffer dimensions
            assert(x * (mTileWidth << i) >= gBufferWidth);
            assert(y * (mTileHeight << i) >= gBufferHeight);
            mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
            mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
        }
    }

    // Recompute the whole pyramid from the current depth buffer: level 0
    // directly from pixels, then each level from the one below it.
    void Update(float *zBuffer, int gBufferPitchInElements,
                float cameraProj_33, float cameraProj_43,
                float cameraNear, float cameraFar)
    {
        for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
            ComputeZBoundsRow(tileY, mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
                              zBuffer, gBufferPitchInElements,
                              cameraProj_33, cameraProj_43, cameraNear, cameraFar,
                              mMinZArrays[0] + (tileY * mNumTilesX),
                              mMaxZArrays[0] + (tileY * mNumTilesX));
        }
        // Generate other levels
        for (int level = 1; level < mLevels; ++level) {
            int destTilesX = NumTilesX(level);
            int destTilesY = NumTilesY(level);
            int srcLevel = level - 1;
            int srcTilesX = NumTilesX(srcLevel);
            int srcTilesY = NumTilesY(srcLevel);
            for (int y = 0; y < destTilesY; ++y) {
                for (int x = 0; x < destTilesX; ++x) {
                    // Each destination tile covers the 2x2 block of source
                    // tiles starting at (srcX, srcY).
                    int srcX = x << 1;
                    int srcY = y << 1;
                    // NOTE: Ugly branches to deal with non-multiple dimensions at some levels
                    // TODO: SSE branchless min/max is probably better...
                    float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
                    float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
                    if (srcX + 1 < srcTilesX) {
                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
                                                                    (srcX + 1)]);
                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
                                                                    (srcX + 1)]);
                        if (srcY + 1 < srcTilesY) {
                            minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
                                                                        (srcX + 1)]);
                            maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
                                                                        (srcX + 1)]);
                        }
                    }
                    if (srcY + 1 < srcTilesY) {
                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
                                                                    (srcX )]);
                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
                                                                    (srcX )]);
                    }
                    mMinZArrays[level][y * destTilesX + x] = minZ;
                    mMaxZArrays[level][y * destTilesX + x] = maxZ;
                }
            }
        }
    }

    ~MinMaxZTree() {
        for (int i = 0; i < mLevels; ++i) {
            lAlignedFree(mMinZArrays[i]);
            lAlignedFree(mMaxZArrays[i]);
        }
        lAlignedFree(mMinZArrays);
        lAlignedFree(mMaxZArrays);
    }

    int Levels() const { return mLevels; }
    // These round UP, so beware that the last tile for a given level may not be completely full
    // TODO: Verify this...
    int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
    int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
    int TileWidth(int level = 0) const { return (mTileWidth << level); }
    int TileHeight(int level = 0) const { return (mTileHeight << level); }
    // Z bounds for tile (tileX, tileY) at the given pyramid level.
    float MinZ(int level, int tileX, int tileY) const {
        return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
    }
    float MaxZ(int level, int tileX, int tileY) const {
        return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
    }

private:
    int mTileWidth;      // base (level 0) tile width in pixels
    int mTileHeight;     // base (level 0) tile height in pixels
    int mLevels;
    int mNumTilesX;      // level-0 tile counts
    int mNumTilesY;
    // One array for each "level" in the tree
    float **mMinZArrays;
    float **mMaxZArrays;
};
// Singleton min/max-Z pyramid used by the dynamic C dispatch path.
static MinMaxZTree *gMinMaxZTree = 0;

// (Re)build the global MinMaxZTree for this input's framebuffer dimensions.
void InitDynamicC(InputData *input) {
    // Release any tree from a previous call (the original code leaked it).
    delete gMinMaxZTree;
    gMinMaxZTree =
        new MinMaxZTree(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
                        input->header.framebufferWidth,
                        input->header.framebufferHeight);
}
/* We're going to split a tile into 4 sub-tiles. This function
   reclassifies the tile's lights with respect to the sub-tiles. */
// For each light in lightIndices, test it against the Z bounds and the two
// splitting frustum planes of the four sub-tiles (ordered 00, 10, 01, 11),
// appending surviving light indices into the per-subtile slices of
// subtileIndices (each slice subtileIndicesPitch entries long) and writing
// the per-subtile counts into subtileNumLights.
static void
SplitTileMinMax(
    int tileMidX, int tileMidY,
    // Subtile data (00, 10, 01, 11)
    float subtileMinZ[],
    float subtileMaxZ[],
    // G-buffer data
    int gBufferWidth, int gBufferHeight,
    // Camera data
    float cameraProj_11, float cameraProj_22,
    // Light Data
    int lightIndices[],
    int numLights,
    float light_positionView_x_array[],
    float light_positionView_y_array[],
    float light_positionView_z_array[],
    float light_attenuationEnd_array[],
    // Outputs
    int subtileIndices[],
    int subtileIndicesPitch,
    int subtileNumLights[]
    )
{
    // Build the two view-space planes that pass through the tile's split
    // point (tileMidX, tileMidY): one vertical (x) and one horizontal (y).
    float gBufferScale_x = 0.5f * (float)gBufferWidth;
    float gBufferScale_y = 0.5f * (float)gBufferHeight;
    float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x),
                                  (cameraProj_22 * gBufferScale_y) };
    float frustumPlanes_z[2] = { tileMidX - gBufferScale_x,
                                 tileMidY - gBufferScale_y };
    for (int i = 0; i < 2; ++i) {
        // Normalize
        float norm = 1.f / sqrtf(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
                                 frustumPlanes_z[i] * frustumPlanes_z[i]);
        frustumPlanes_xy[i] *= norm;
        frustumPlanes_z[i] *= norm;
    }
    // Initialize write cursors: one slice of subtileIndices per subtile.
    int subtileLightOffset[4];
    subtileLightOffset[0] = 0 * subtileIndicesPitch;
    subtileLightOffset[1] = 1 * subtileIndicesPitch;
    subtileLightOffset[2] = 2 * subtileIndicesPitch;
    subtileLightOffset[3] = 3 * subtileIndicesPitch;
    for (int i = 0; i < numLights; ++i) {
        int lightIndex = lightIndices[i];
        float light_positionView_x = light_positionView_x_array[lightIndex];
        float light_positionView_y = light_positionView_y_array[lightIndex];
        float light_positionView_z = light_positionView_z_array[lightIndex];
        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
        float light_attenuationEndNeg = -light_attenuationEnd;
        // Test lights again against subtile z bounds
        bool inFrustum[4];
        inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
            (subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
        inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) &&
            (subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg);
        inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) &&
            (subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg);
        inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) &&
            (subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg);
        // Signed distances from the light center to the two split planes.
        float dx = light_positionView_z * frustumPlanes_z[0] +
            light_positionView_x * frustumPlanes_xy[0];
        float dy = light_positionView_z * frustumPlanes_z[1] +
            light_positionView_y * frustumPlanes_xy[1];
        // If the light's sphere is entirely on one side of a split plane,
        // cull it from the subtiles on the other side.
        if (fabsf(dx) > light_attenuationEnd) {
            bool positiveX = dx > 0.0f;
            inFrustum[0] = inFrustum[0] && positiveX;  // 00 subtile
            inFrustum[1] = inFrustum[1] && !positiveX; // 10 subtile
            inFrustum[2] = inFrustum[2] && positiveX;  // 01 subtile
            inFrustum[3] = inFrustum[3] && !positiveX; // 11 subtile
        }
        if (fabsf(dy) > light_attenuationEnd) {
            bool positiveY = dy > 0.0f;
            inFrustum[0] = inFrustum[0] && positiveY;  // 00 subtile
            inFrustum[1] = inFrustum[1] && positiveY;  // 10 subtile
            inFrustum[2] = inFrustum[2] && !positiveY; // 01 subtile
            inFrustum[3] = inFrustum[3] && !positiveY; // 11 subtile
        }
        // Append the light to every subtile list it survived in.
        if (inFrustum[0])
            subtileIndices[subtileLightOffset[0]++] = lightIndex;
        if (inFrustum[1])
            subtileIndices[subtileLightOffset[1]++] = lightIndex;
        if (inFrustum[2])
            subtileIndices[subtileLightOffset[2]++] = lightIndex;
        if (inFrustum[3])
            subtileIndices[subtileLightOffset[3]++] = lightIndex;
    }
    // Counts are the final cursor positions minus each slice's start.
    subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch;
    subtileNumLights[1] = subtileLightOffset[1] - 1 * subtileIndicesPitch;
    subtileNumLights[2] = subtileLightOffset[2] - 2 * subtileIndicesPitch;
    subtileNumLights[3] = subtileLightOffset[3] - 3 * subtileIndicesPitch;
}
// 3-component dot product of (x, y, z) and (a, b, c).
static inline float
dot3(float x, float y, float z, float a, float b, float c) {
    float sum = x * a;
    sum += y * b;
    sum += z * c;
    return sum;
}
// Normalize the vector (x, y, z), writing the unit vector to (ox, oy, oz).
static inline void
normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
    const float invLen = 1.f / sqrtf(x * x + y * y + z * z);
    ox = invLen * x;
    oy = invLen * y;
    oz = invLen * z;
}
static inline float
Unorm8ToFloat32(uint8_t u) {
return (float)u * (1.0f / 255.0f);
}
// Map a float in [0, 1] to an 8-bit unorm value (truncating conversion).
static inline uint8_t
Float32ToUnorm8(float f) {
    const float scaled = f * 255.0f;
    return (uint8_t)scaled;
}
// Convert an IEEE 754 half (binary16) bit pattern to a float.  "Fast"
// means no special handling of denormals, infinities, or NaNs -- their
// exponents are rebiased like those of normal values.
static inline float
half_to_float_fast(uint16_t h) {
    uint32_t hs = h & (int32_t)0x8000u;  // Pick off sign bit
    uint32_t he = h & (int32_t)0x7C00u;  // Pick off exponent bits
    uint32_t hm = h & (int32_t)0x03FFu;  // Pick off mantissa bits
    // sign
    uint32_t xs = ((uint32_t) hs) << 16;
    // Exponent: unbias the halfp, then bias the single
    int32_t xes = ((int32_t) (he >> 10)) - 15 + 127;
    // Exponent
    uint32_t xe = (uint32_t) (xes << 23);
    // Mantissa
    uint32_t xm = ((uint32_t) hm) << 13;
    uint32_t bits = (xs | xe | xm);
    // Reinterpret the assembled bit pattern as a float.  The original code
    // used reinterpret_cast<float *> on the uint32_t's address, which
    // violates strict aliasing (undefined behavior); memcpy is the
    // portable, well-defined way to type-pun and compiles to the same code.
    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;
}
static void
ShadeTileC(
int32_t tileStartX, int32_t tileEndX,
int32_t tileStartY, int32_t tileEndY,
int32_t gBufferWidth, int32_t gBufferHeight,
const ispc::InputDataArrays &inputData,
// Camera data
float cameraProj_11, float cameraProj_22,
float cameraProj_33, float cameraProj_43,
// Light list
int32_t tileLightIndices[],
int32_t tileNumLights,
// UI
bool visualizeLightCount,
// Output
uint8_t framebuffer_r[],
uint8_t framebuffer_g[],
uint8_t framebuffer_b[]
)
{
if (tileNumLights == 0 || visualizeLightCount) {
uint8_t c = (uint8_t)(std::min(tileNumLights << 2, 255));
for (int32_t y = tileStartY; y < tileEndY; ++y) {
for (int32_t x = tileStartX; x < tileEndX; ++x) {
int32_t framebufferIndex = (y * gBufferWidth + x);
framebuffer_r[framebufferIndex] = c;
framebuffer_g[framebufferIndex] = c;
framebuffer_b[framebufferIndex] = c;
}
}
} else {
float twoOverGBufferWidth = 2.0f / gBufferWidth;
float twoOverGBufferHeight = 2.0f / gBufferHeight;
for (int32_t y = tileStartY; y < tileEndY; ++y) {
float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
for (int32_t x = tileStartX; x < tileEndX; ++x) {
int32_t gBufferOffset = y * gBufferWidth + x;
// Reconstruct position and (negative) view vector from G-buffer
float surface_positionView_x, surface_positionView_y, surface_positionView_z;
float Vneg_x, Vneg_y, Vneg_z;
float z = inputData.zBuffer[gBufferOffset];
// Compute screen/clip-space position
// NOTE: Mind DX11 viewport transform and pixel center!
float positionScreen_x = (0.5f + (float)(x)) *
twoOverGBufferWidth - 1.0f;
// Unproject depth buffer Z value into view space
surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
surface_positionView_x = positionScreen_x * surface_positionView_z /
cameraProj_11;
surface_positionView_y = positionScreen_y * surface_positionView_z /
cameraProj_22;
// We actually end up with a vector pointing *at* the
// surface (i.e. the negative view vector)
normalize3(surface_positionView_x, surface_positionView_y,
surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
// Reconstruct normal from G-buffer
float surface_normal_x, surface_normal_y, surface_normal_z;
float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
float m = sqrtf(4.0f * f - 1.0f);
surface_normal_x = m * (4.0f * normal_x - 2.0f);
surface_normal_y = m * (4.0f * normal_y - 2.0f);
surface_normal_z = 3.0f - 8.0f * f;
// Load other G-buffer parameters
float surface_specularAmount =
half_to_float_fast(inputData.specularAmount[gBufferOffset]);
float surface_specularPower =
half_to_float_fast(inputData.specularPower[gBufferOffset]);
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
float lit_x = 0.0f;
float lit_y = 0.0f;
float lit_z = 0.0f;
for (int32_t tileLightIndex = 0; tileLightIndex < tileNumLights;
++tileLightIndex) {
int32_t lightIndex = tileLightIndices[tileLightIndex];
// Gather light data relevant to initial culling
float light_positionView_x =
inputData.lightPositionView_x[lightIndex];
float light_positionView_y =
inputData.lightPositionView_y[lightIndex];
float light_positionView_z =
inputData.lightPositionView_z[lightIndex];
float light_attenuationEnd =
inputData.lightAttenuationEnd[lightIndex];
// Compute light vector
float L_x = light_positionView_x - surface_positionView_x;
float L_y = light_positionView_y - surface_positionView_y;
float L_z = light_positionView_z - surface_positionView_z;
float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
// Clip at end of attenuation
float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
if (distanceToLight2 < light_attenutaionEnd2) {
float distanceToLight = sqrtf(distanceToLight2);
float distanceToLightRcp = 1.f / distanceToLight;
L_x *= distanceToLightRcp;
L_y *= distanceToLightRcp;
L_z *= distanceToLightRcp;
// Start computing brdf
float NdotL = dot3(surface_normal_x, surface_normal_y,
surface_normal_z, L_x, L_y, L_z);
// Clip back facing
if (NdotL > 0.0f) {
float light_attenuationBegin =
inputData.lightAttenuationBegin[lightIndex];
// Light distance attenuation (linstep)
float lightRange = (light_attenuationEnd - light_attenuationBegin);
float falloffPosition = (light_attenuationEnd - distanceToLight);
float attenuation = std::min(falloffPosition / lightRange, 1.0f);
float H_x = (L_x - Vneg_x);
float H_y = (L_y - Vneg_y);
float H_z = (L_z - Vneg_z);
normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
float NdotH = dot3(surface_normal_x, surface_normal_y,
surface_normal_z, H_x, H_y, H_z);
NdotH = std::max(NdotH, 0.0f);
float specular = powf(NdotH, surface_specularPower);
float specularNorm = (surface_specularPower + 2.0f) *
(1.0f / 8.0f);
float specularContrib = surface_specularAmount *
specularNorm * specular;
float k = attenuation * NdotL * (1.0f + specularContrib);
float light_color_x = inputData.lightColor_x[lightIndex];
float light_color_y = inputData.lightColor_y[lightIndex];
float light_color_z = inputData.lightColor_z[lightIndex];
float lightContrib_x = surface_albedo_x * light_color_x;
float lightContrib_y = surface_albedo_y * light_color_y;
float lightContrib_z = surface_albedo_z * light_color_z;
lit_x += lightContrib_x * k;
lit_y += lightContrib_y * k;
lit_z += lightContrib_z * k;
}
}
}
// Gamma correct
float gamma = 1.0 / 2.2f;
lit_x = powf(std::min(std::max(lit_x, 0.0f), 1.0f), gamma);
lit_y = powf(std::min(std::max(lit_y, 0.0f), 1.0f), gamma);
lit_z = powf(std::min(std::max(lit_z, 0.0f), 1.0f), gamma);
framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
}
}
}
}
// Recursively shade a screen tile.  If the light list is already small or
// we are at the finest (leaf) level, shade the whole tile directly;
// otherwise split it 2x2, re-cull the light list against each subtile's
// depth bounds, and recurse into each quadrant.
void
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
                        int *lightIndices, int numLights,
                        Framebuffer *framebuffer) {
    const MinMaxZTree *minMaxZTree = gMinMaxZTree;
    // If we have few enough lights or this is the base case (last level),
    // shade this full tile directly
    if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
        int width = minMaxZTree->TileWidth(level);
        int height = minMaxZTree->TileHeight(level);
        int startX = tileX * width;
        int startY = tileY * height;
        // Clamp the tile rectangle to the framebuffer edges
        int endX = std::min(input->header.framebufferWidth, startX + width);
        int endY = std::min(input->header.framebufferHeight, startY + height);
        // Skip entirely offscreen tiles
        if (endX > startX && endY > startY) {
            ShadeTileC(startX, endX, startY, endY,
                input->header.framebufferWidth, input->header.framebufferHeight,
                input->arrays,
                input->header.cameraProj[0][0], input->header.cameraProj[1][1],
                input->header.cameraProj[2][2], input->header.cameraProj[3][2],
                lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
                framebuffer->r, framebuffer->g, framebuffer->b);
        }
    }
    else {
        // Otherwise, subdivide and 4-way recurse using X and Y splitting planes
        // Move down a level in the tree
        --level;
        tileX <<= 1;
        tileY <<= 1;
        int width = minMaxZTree->TileWidth(level);
        int height = minMaxZTree->TileHeight(level);
        // Work out splitting coords (pixel position of the 2x2 split)
        int midX = (tileX + 1) * width;
        int midY = (tileY + 1) * height;
        // Read subtile min/max data
        // NOTE: We must be sure to handle out-of-bounds access here since
        // sometimes we'll only have 1 or 2 subtiles for non-pow-2
        // framebuffer sizes.
        bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
        bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
        // NOTE: Order is 00, 10, 01, 11
        // Set defaults up to cull all lights if the tile doesn't exist
        // (offscreen): an empty [far, near] range rejects every light
        float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
                         input->header.cameraFar, input->header.cameraFar};
        float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
                         input->header.cameraNear, input->header.cameraNear};
        minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
        maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
        if (rightTileExists) {
            minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
            maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
            if (bottomTileExists) {
                minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
                maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
            }
        }
        if (bottomTileExists) {
            minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
            maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
        }
        // Cull lights into subtile lists
        // (platform-specific syntax for an ALIGNMENT_BYTES-aligned stack array)
#ifdef ISPC_IS_WINDOWS
        __declspec(align(ALIGNMENT_BYTES))
#endif
        int subtileLightIndices[4][MAX_LIGHTS]
#ifndef ISPC_IS_WINDOWS
        __attribute__ ((aligned(ALIGNMENT_BYTES)))
#endif
            ;
        int subtileNumLights[4];
        // Partition the parent's light list into the four quadrant lists
        SplitTileMinMax(midX, midY, minZ, maxZ,
            input->header.framebufferWidth, input->header.framebufferHeight,
            input->header.cameraProj[0][0], input->header.cameraProj[1][1],
            lightIndices, numLights, input->arrays.lightPositionView_x,
            input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
            input->arrays.lightAttenuationEnd,
            subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
        // Recurse into subtiles
        ShadeDynamicTileRecurse(input, level, tileX    , tileY,
                                subtileLightIndices[0], subtileNumLights[0],
                                framebuffer);
        ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
                                subtileLightIndices[1], subtileNumLights[1],
                                framebuffer);
        ShadeDynamicTileRecurse(input, level, tileX    , tileY + 1,
                                subtileLightIndices[2], subtileNumLights[2],
                                framebuffer);
        ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
                                subtileLightIndices[3], subtileNumLights[3],
                                framebuffer);
    }
}
// Cull `numLights` point lights against a screen-space tile.  The tile is
// treated as a frustum: four side planes derived from the pixel rectangle
// [tileStartX, tileEndX) x [tileStartY, tileEndY) plus the [minZ, maxZ]
// depth slab.  A light passes if its bounding sphere (view-space center,
// radius = attenuation end) touches the frustum.  Indices of passing
// lights are packed into tileLightIndices; returns how many passed.
static int
IntersectLightsWithTileMinMax(
    int tileStartX, int tileEndX,
    int tileStartY, int tileEndY,
    // Tile data
    float minZ,
    float maxZ,
    // G-buffer data
    int gBufferWidth, int gBufferHeight,
    // Camera data
    float cameraProj_11, float cameraProj_22,
    // Light Data
    int numLights,
    float light_positionView_x_array[],
    float light_positionView_y_array[],
    float light_positionView_z_array[],
    float light_attenuationEnd_array[],
    // Output
    int tileLightIndices[]
    )
{
    float gBufferScale_x = 0.5f * (float)gBufferWidth;
    float gBufferScale_y = 0.5f * (float)gBufferHeight;
    // Side planes of the tile frustum.  Each plane passes through the
    // origin, so it is fully described by an (xy, z) normal pair; the xy
    // component depends only on the projection and framebuffer size, the z
    // component encodes the tile bounds.
    // (The original kept a second `_v` copy of these arrays and copied the
    // normalized values across; the duplicates were redundant and have
    // been removed.)
    float frustumPlanes_xy[4] = { -(cameraProj_11 * gBufferScale_x),
                                   (cameraProj_11 * gBufferScale_x),
                                   (cameraProj_22 * gBufferScale_y),
                                  -(cameraProj_22 * gBufferScale_y) };
    float frustumPlanes_z[4] = {  tileEndX - gBufferScale_x,
                                 -tileStartX + gBufferScale_x,
                                  tileEndY - gBufferScale_y,
                                 -tileStartY + gBufferScale_y };
    for (int i = 0; i < 4; ++i) {
        // Normalize so signed distances compare directly against the
        // light's radius
        float norm = 1.f / sqrtf(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
                                 frustumPlanes_z[i] * frustumPlanes_z[i]);
        frustumPlanes_xy[i] *= norm;
        frustumPlanes_z[i] *= norm;
    }
    int tileNumLights = 0;
    for (int lightIndex = 0; lightIndex < numLights; ++lightIndex) {
        float light_positionView_z = light_positionView_z_array[lightIndex];
        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
        float light_attenuationEndNeg = -light_attenuationEnd;
        // Depth slab test first: it is cheap and rejects most lights
        float d = light_positionView_z - minZ;
        bool inFrustum = (d >= light_attenuationEndNeg);
        d = maxZ - light_positionView_z;
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
        if (!inFrustum)
            continue;
        float light_positionView_x = light_positionView_x_array[lightIndex];
        float light_positionView_y = light_positionView_y_array[lightIndex];
        // Signed distance to each side plane must be >= -radius for the
        // bounding sphere to touch the frustum
        d = light_positionView_z * frustumPlanes_z[0] +
            light_positionView_x * frustumPlanes_xy[0];
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
        d = light_positionView_z * frustumPlanes_z[1] +
            light_positionView_x * frustumPlanes_xy[1];
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
        d = light_positionView_z * frustumPlanes_z[2] +
            light_positionView_y * frustumPlanes_xy[2];
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
        d = light_positionView_z * frustumPlanes_z[3] +
            light_positionView_y * frustumPlanes_xy[3];
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
        // Pack and store intersecting lights
        if (inFrustum)
            tileLightIndices[tileNumLights++] = lightIndex;
    }
    return tileNumLights;
}
// Shade one root tile: read its depth bounds from the min/max Z pyramid,
// run the full 6-plane light cull once, then hand the surviving lights to
// the recursive subdivision pass.
void
ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
                 Framebuffer *framebuffer) {
    const MinMaxZTree *zTree = gMinMaxZTree;
    // Tile rectangle in pixels, clamped to the framebuffer edges
    const int tileW = zTree->TileWidth(level);
    const int tileH = zTree->TileHeight(level);
    const int x0 = tileX * tileW;
    const int y0 = tileY * tileH;
    const int x1 = std::min(input->header.framebufferWidth, x0 + tileW);
    const int y1 = std::min(input->header.framebufferHeight, y0 + tileH);
    // Depth bounds for this tile
    const float tileMinZ = zTree->MinZ(level, tileX, tileY);
    const float tileMaxZ = zTree->MaxZ(level, tileX, tileY);
    // This is a root tile, so first do a full 6-plane cull
    // (platform-specific syntax for an ALIGNMENT_BYTES-aligned stack array)
#ifdef ISPC_IS_WINDOWS
    __declspec(align(ALIGNMENT_BYTES))
#endif
    int lightIndices[MAX_LIGHTS]
#ifndef ISPC_IS_WINDOWS
    __attribute__ ((aligned(ALIGNMENT_BYTES)))
#endif
        ;
    const int numLights = IntersectLightsWithTileMinMax(
        x0, x1, y0, y1, tileMinZ, tileMaxZ,
        input->header.framebufferWidth, input->header.framebufferHeight,
        input->header.cameraProj[0][0], input->header.cameraProj[1][1],
        MAX_LIGHTS, input->arrays.lightPositionView_x,
        input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
        input->arrays.lightAttenuationEnd, lightIndices);
    // Now kick off the recursive process for this tile
    ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
                            numLights, framebuffer);
}
// Serial top-level dispatch: rebuild the min/max Z pyramid for the frame,
// then shade every tile at the coarsest pyramid level.
void
DispatchDynamicC(InputData *input, Framebuffer *framebuffer)
{
    MinMaxZTree *zTree = gMinMaxZTree;
    // Refresh the depth pyramid from this frame's Z buffer
    zTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
                  input->header.cameraProj[2][2], input->header.cameraProj[3][2],
                  input->header.cameraNear, input->header.cameraFar);
    // Walk the coarsest level row by row and shade each root tile
    const int topLevel = zTree->Levels() - 1;
    const int tilesX = zTree->NumTilesX(topLevel);
    const int tilesY = zTree->NumTilesY(topLevel);
    for (int ty = 0; ty < tilesY; ++ty)
        for (int tx = 0; tx < tilesX; ++tx)
            ShadeDynamicTile(input, topLevel, tx, ty, framebuffer);
}

View File

@@ -0,0 +1,398 @@
/*
Copyright (c) 2011-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef __cilk
#include "deferred.h"
#include "kernels_ispc.h"
#include <algorithm>
#include <assert.h>
#ifdef _MSC_VER
#define ISPC_IS_WINDOWS
#elif defined(__linux__)
#define ISPC_IS_LINUX
#elif defined(__APPLE__)
#define ISPC_IS_APPLE
#endif
#ifdef ISPC_IS_LINUX
#include <malloc.h>
#endif // ISPC_IS_LINUX
// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
#define MIN_TILE_WIDTH 16
#define MIN_TILE_HEIGHT 16
#define DYNAMIC_TREE_LEVELS 5
// If this is set to 1 then the result will be identical to the static version
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
// Allocate `size` bytes aligned to `alignment` (a power of two), using
// the appropriate platform facility.  Pair with lAlignedFree.
static void *
lAlignedMalloc(size_t size, int32_t alignment) {
#ifdef ISPC_IS_WINDOWS
    return _aligned_malloc(size, alignment);
#endif
#ifdef ISPC_IS_LINUX
    return memalign(alignment, size);
#endif
#ifdef ISPC_IS_APPLE
    // Over-allocate and stash the raw malloc() pointer in the slot just
    // below the aligned address so lAlignedFree can recover it.
    void *mem = malloc(size + (alignment-1) + sizeof(void*));
    char *amem = ((char*)mem) + sizeof(void*);
    // Round up to the next multiple of alignment.  The final
    // "& (alignment - 1)" keeps the adjustment at 0 when amem is already
    // aligned; without it the adjustment was a full `alignment`, which
    // pushed the payload one byte past the end of the allocation.
    amem = amem + uint32_t((alignment - (reinterpret_cast<uint64_t>(amem) &
                                         (alignment - 1))) & (alignment - 1));
    ((void**)amem)[-1] = mem;
    return amem;
#endif
}
// Release memory obtained from lAlignedMalloc, dispatching to the
// platform-specific counterpart of the allocation path.
static void
lAlignedFree(void *ptr) {
#ifdef ISPC_IS_WINDOWS
    _aligned_free(ptr);
#endif
#ifdef ISPC_IS_LINUX
    free(ptr);
#endif
#ifdef ISPC_IS_APPLE
    // lAlignedMalloc stored the original malloc() pointer one slot below
    // the aligned address it returned.
    free(((void**)ptr)[-1]);
#endif
}
// Pyramid of conservative per-tile [min, max] view-space Z bounds over the
// depth buffer.  Level 0 stores one entry per base tile; each higher level
// halves the tile counts in X and Y (rounding up) and reduces over its (up
// to four) child tiles.  Level 0 is built in parallel with Cilk; higher
// levels are reduced level by level.
class MinMaxZTreeCilk
{
public:
    // Currently (min) tile dimensions must divide gBuffer dimensions evenly
    // Levels must be small enough that neither dimension goes below one tile
    MinMaxZTreeCilk(
        int tileWidth, int tileHeight, int levels,
        int gBufferWidth, int gBufferHeight)
        : mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
    {
        mNumTilesX = gBufferWidth / mTileWidth;
        mNumTilesY = gBufferHeight / mTileHeight;
        // Allocate one min and one max array per pyramid level
        mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
        mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
        for (int i = 0; i < mLevels; ++i) {
            int x = NumTilesX(i);
            int y = NumTilesY(i);
            assert(x > 0);
            assert(y > 0);
            // NOTE: If the following two asserts fire it probably means that
            // the base tile dimensions do not evenly divide the G-buffer dimensions
            assert(x * (mTileWidth << i) >= gBufferWidth);
            assert(y * (mTileHeight << i) >= gBufferHeight);
            mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
            mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
        }
    }
    // Rebuild the whole pyramid from the current depth buffer.
    // cameraProj_33/cameraProj_43 are the projection terms used to
    // unproject depth values into view space; cameraNear/cameraFar bound
    // the valid range.
    void Update(float *zBuffer, int gBufferPitchInElements,
                float cameraProj_33, float cameraProj_43,
                float cameraNear, float cameraFar)
    {
        // Compute level 0 in parallel. Outer loops is here since we use Cilk
        _Cilk_for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
            ispc::ComputeZBoundsRow(tileY,
                mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
                zBuffer, gBufferPitchInElements,
                cameraProj_33, cameraProj_43, cameraNear, cameraFar,
                mMinZArrays[0] + (tileY * mNumTilesX),
                mMaxZArrays[0] + (tileY * mNumTilesX));
        }
        // Generate other levels
        // NOTE: We currently don't use ispc here since it's sort of an
        // awkward gather-based reduction Using SSE odd pack/unpack
        // instructions might actually work here when we need to optimize
        for (int level = 1; level < mLevels; ++level) {
            int destTilesX = NumTilesX(level);
            int destTilesY = NumTilesY(level);
            int srcLevel = level - 1;
            int srcTilesX = NumTilesX(srcLevel);
            int srcTilesY = NumTilesY(srcLevel);
            // Rows of the destination level are independent, so parallelize
            // over them
            _Cilk_for (int y = 0; y < destTilesY; ++y) {
                for (int x = 0; x < destTilesX; ++x) {
                    int srcX = x << 1;
                    int srcY = y << 1;
                    // NOTE: Ugly branches to deal with non-multiple dimensions at some levels
                    // TODO: SSE branchless min/max is probably better...
                    // Start from the top-left child, which always exists...
                    float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
                    float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
                    // ...then fold in the right, diagonal, and bottom
                    // children when they exist (edge tiles may have fewer)
                    if (srcX + 1 < srcTilesX) {
                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
                                                                    (srcX + 1)]);
                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
                                                                    (srcX + 1)]);
                        if (srcY + 1 < srcTilesY) {
                            minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
                                                                        (srcX + 1)]);
                            maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
                                                                        (srcX + 1)]);
                        }
                    }
                    if (srcY + 1 < srcTilesY) {
                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
                                                                    (srcX    )]);
                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
                                                                    (srcX    )]);
                    }
                    mMinZArrays[level][y * destTilesX + x] = minZ;
                    mMaxZArrays[level][y * destTilesX + x] = maxZ;
                }
            }
        }
    }
    ~MinMaxZTreeCilk() {
        for (int i = 0; i < mLevels; ++i) {
            lAlignedFree(mMinZArrays[i]);
            lAlignedFree(mMaxZArrays[i]);
        }
        lAlignedFree(mMinZArrays);
        lAlignedFree(mMaxZArrays);
    }
    int Levels() const { return mLevels; }
    // These round UP, so beware that the last tile for a given level may not be completely full
    // TODO: Verify this...
    int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
    int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
    int TileWidth(int level = 0) const { return (mTileWidth << level); }
    int TileHeight(int level = 0) const { return (mTileHeight << level); }
    // Per-tile depth bounds at the given level (no bounds checking)
    float MinZ(int level, int tileX, int tileY) const {
        return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
    }
    float MaxZ(int level, int tileX, int tileY) const {
        return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
    }
private:
    int mTileWidth;   // base (level 0) tile width in pixels
    int mTileHeight;  // base (level 0) tile height in pixels
    int mLevels;      // number of pyramid levels
    int mNumTilesX;   // level-0 tile count in X
    int mNumTilesY;   // level-0 tile count in Y
    // One array for each "level" in the tree
    float **mMinZArrays;
    float **mMaxZArrays;
};
// Global min/max Z pyramid shared by the Cilk dispatch path below.
static MinMaxZTreeCilk *gMinMaxZTreeCilk = 0;

// Allocate the global Z pyramid sized for the input framebuffer.  Must run
// before DispatchDynamicCilk, which dereferences gMinMaxZTreeCilk.
void InitDynamicCilk(InputData *input) {
    gMinMaxZTreeCilk =
        new MinMaxZTreeCilk(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
                            input->header.framebufferWidth,
                            input->header.framebufferHeight);
}
// Cilk variant of the recursive dynamic tile shader: shade the tile
// directly when the light list is small or we are at the leaf level;
// otherwise split 2x2, re-cull the lights per subtile, and recurse,
// spawning three quadrants as Cilk tasks and running the fourth inline.
// The implicit sync at function exit keeps the stack-allocated subtile
// light lists alive until the spawned children finish.
static void
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
                        int *lightIndices, int numLights,
                        Framebuffer *framebuffer) {
    const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
    // If we have few enough lights or this is the base case (last level),
    // shade this full tile directly
    if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
        int width = minMaxZTree->TileWidth(level);
        int height = minMaxZTree->TileHeight(level);
        int startX = tileX * width;
        int startY = tileY * height;
        // Clamp the tile rectangle to the framebuffer edges
        int endX = std::min(input->header.framebufferWidth, startX + width);
        int endY = std::min(input->header.framebufferHeight, startY + height);
        // Skip entirely offscreen tiles
        if (endX > startX && endY > startY) {
            ispc::ShadeTile(
                startX, endX, startY, endY,
                input->header.framebufferWidth, input->header.framebufferHeight,
                &input->arrays,
                input->header.cameraProj[0][0], input->header.cameraProj[1][1],
                input->header.cameraProj[2][2], input->header.cameraProj[3][2],
                lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
                framebuffer->r, framebuffer->g, framebuffer->b);
        }
    }
    else {
        // Otherwise, subdivide and 4-way recurse using X and Y splitting planes
        // Move down a level in the tree
        --level;
        tileX <<= 1;
        tileY <<= 1;
        int width = minMaxZTree->TileWidth(level);
        int height = minMaxZTree->TileHeight(level);
        // Work out splitting coords (pixel position of the 2x2 split)
        int midX = (tileX + 1) * width;
        int midY = (tileY + 1) * height;
        // Read subtile min/max data
        // NOTE: We must be sure to handle out-of-bounds access here since
        // sometimes we'll only have 1 or 2 subtiles for non-pow-2
        // framebuffer sizes.
        bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
        bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
        // NOTE: Order is 00, 10, 01, 11
        // Set defaults up to cull all lights if the tile doesn't exist
        // (offscreen): an empty [far, near] range rejects every light
        float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
                         input->header.cameraFar, input->header.cameraFar};
        float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
                         input->header.cameraNear, input->header.cameraNear};
        minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
        maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
        if (rightTileExists) {
            minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
            maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
            if (bottomTileExists) {
                minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
                maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
            }
        }
        if (bottomTileExists) {
            minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
            maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
        }
        // Cull lights into subtile lists
        // (platform-specific syntax for an ALIGNMENT_BYTES-aligned stack array)
#ifdef ISPC_IS_WINDOWS
        __declspec(align(ALIGNMENT_BYTES))
#endif
        int subtileLightIndices[4][MAX_LIGHTS]
#ifndef ISPC_IS_WINDOWS
        __attribute__ ((aligned(ALIGNMENT_BYTES)))
#endif
            ;
        int subtileNumLights[4];
        // Partition the parent's light list into the four quadrant lists
        ispc::SplitTileMinMax(midX, midY, minZ, maxZ,
            input->header.framebufferWidth, input->header.framebufferHeight,
            input->header.cameraProj[0][0], input->header.cameraProj[1][1],
            lightIndices, numLights, input->arrays.lightPositionView_x,
            input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
            input->arrays.lightAttenuationEnd,
            subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
        // Recurse into subtiles; three spawned, one executed inline
        _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX    , tileY,
                                            subtileLightIndices[0], subtileNumLights[0],
                                            framebuffer);
        _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
                                            subtileLightIndices[1], subtileNumLights[1],
                                            framebuffer);
        _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX    , tileY + 1,
                                            subtileLightIndices[2], subtileNumLights[2],
                                            framebuffer);
        ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
                                subtileLightIndices[3], subtileNumLights[3],
                                framebuffer);
    }
}
// Shade one root tile with the ispc kernels: read the tile's depth bounds
// from the pyramid, run the full 6-plane light cull once, then hand off to
// the recursive subdivision pass.
static void
ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
                 Framebuffer *framebuffer) {
    const MinMaxZTreeCilk *zTree = gMinMaxZTreeCilk;
    // Tile rectangle in pixels, clamped to the framebuffer edges
    const int tileW = zTree->TileWidth(level);
    const int tileH = zTree->TileHeight(level);
    const int x0 = tileX * tileW;
    const int y0 = tileY * tileH;
    const int x1 = std::min(input->header.framebufferWidth, x0 + tileW);
    const int y1 = std::min(input->header.framebufferHeight, y0 + tileH);
    // Depth bounds for this tile
    const float tileMinZ = zTree->MinZ(level, tileX, tileY);
    const float tileMaxZ = zTree->MaxZ(level, tileX, tileY);
    // This is a root tile, so first do a full 6-plane cull
    // (platform-specific syntax for an ALIGNMENT_BYTES-aligned stack array)
#ifdef ISPC_IS_WINDOWS
    __declspec(align(ALIGNMENT_BYTES))
#endif
    int lightIndices[MAX_LIGHTS]
#ifndef ISPC_IS_WINDOWS
    __attribute__ ((aligned(ALIGNMENT_BYTES)))
#endif
        ;
    const int numLights = ispc::IntersectLightsWithTileMinMax(
        x0, x1, y0, y1, tileMinZ, tileMaxZ,
        input->header.framebufferWidth, input->header.framebufferHeight,
        input->header.cameraProj[0][0], input->header.cameraProj[1][1],
        MAX_LIGHTS, input->arrays.lightPositionView_x,
        input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
        input->arrays.lightAttenuationEnd, lightIndices);
    // Now kick off the recursive process for this tile
    ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
                            numLights, framebuffer);
}
// Parallel top-level dispatch: rebuild the min/max Z pyramid for the
// frame, then shade every coarsest-level tile, one Cilk task per tile.
void
DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer)
{
    MinMaxZTreeCilk *zTree = gMinMaxZTreeCilk;
    // Refresh the depth pyramid from this frame's Z buffer
    zTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
                  input->header.cameraProj[2][2], input->header.cameraProj[3][2],
                  input->header.cameraNear, input->header.cameraFar);
    // Launch the "root" tiles. Ideally these should at least fill the
    // machine... at the moment we have a static number of "levels" to the
    // mip tree but it might make sense to compute it based on the width of
    // the machine.
    const int topLevel = zTree->Levels() - 1;
    const int tilesX = zTree->NumTilesX(topLevel);
    const int tilesY = zTree->NumTilesY(topLevel);
    const int tileCount = tilesX * tilesY;
    _Cilk_for (int tile = 0; tile < tileCount; ++tile) {
        const int ty = tile / tilesX;
        const int tx = tile % tilesX;
        ShadeDynamicTile(input, topLevel, tx, ty, framebuffer);
    }
}
#endif // __cilk

View File

@@ -0,0 +1,778 @@
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "deferred.h"
#include <stdio.h>
#include <assert.h>
#define programCount 32
#define programIndex (threadIdx.x & 31)
#define taskIndex (blockIdx.x*4 + (threadIdx.x >> 5))
#define taskCount (gridDim.x*4)
#define warpIdx (threadIdx.x >> 5)
#define int32 int
#define int16 short
#define int8 char
// Clamp v into the range [low, high].
__device__ static inline float clamp(float v, float low, float high)
{
    const float atLeastLow = max(v, low);
    return min(atLeastLow, high);
}
// SOA pointers to the per-pixel G-buffer channels and the per-light
// attribute arrays for one frame.
// NOTE(review): these presumably point into the single input chunk whose
// layout InputHeader describes -- confirm against the loader.
struct InputDataArrays
{
    float *zBuffer;                  // depth buffer, one float per pixel
    unsigned int16 *normalEncoded_x; // half float
    unsigned int16 *normalEncoded_y; // half float
    unsigned int16 *specularAmount;  // half float
    unsigned int16 *specularPower;   // half float
    unsigned int8 *albedo_x;         // unorm8
    unsigned int8 *albedo_y;         // unorm8
    unsigned int8 *albedo_z;         // unorm8
    // Per-light attributes (view-space position, attenuation, color)
    float *lightPositionView_x;
    float *lightPositionView_y;
    float *lightPositionView_z;
    float *lightAttenuationBegin;
    float *lightColor_x;
    float *lightColor_y;
    float *lightColor_z;
    float *lightAttenuationEnd;
};
// Fixed-size header describing one frame's input data chunk.
struct InputHeader
{
    float cameraProj[4][4];  // camera projection matrix
    float cameraNear;        // near clip distance
    float cameraFar;         // far clip distance
    int32 framebufferWidth;
    int32 framebufferHeight;
    int32 numLights;
    int32 inputDataChunkSize;  // size of the data blob following the header
    // Offset of each array within the chunk, one entry per array id
    // (idaNum is declared elsewhere -- presumably deferred.h)
    int32 inputDataArrayOffsets[idaNum];
};
///////////////////////////////////////////////////////////////////////////
// Common utility routines

// Dot product of the 3-vectors (x, y, z) and (a, b, c).
__device__
static inline float
dot3(float x, float y, float z, float a, float b, float c) {
    const float xyPart = x*a + y*b;
    return xyPart + z*c;
}
// Uniform<T, N>: an N-element array of T whose contents are shared by all
// 32 lanes of a warp ("uniform" in the ispc sense).  Three alternative
// implementations follow, selected by the preprocessor; only the
// "#elif 1" variant is compiled in.
#if 0
// Variant 1 (disabled): elements are spread across per-lane storage, with
// a per-warp staging area in shared memory used to broadcast indices and
// exchange values via __shfl.
static __shared__ int shdata_full[128];
template<typename T, int N>
struct Uniform
{
    T data[(N+programCount-1)/programCount];
    volatile T *shdata;
    __device__ inline Uniform()
    {
        shdata = ((T*)shdata_full) + warpIdx*32;
    }
    // Map flat index i to a (chunk, lane) pair, broadcasting the chunk
    // index through shared memory so every lane agrees on it.
    __device__ inline int2 get_chunk(const int i) const
    {
        const int elem = i & (programCount - 1);
        const int chunk = i >> 5;
        shdata[programIndex] = chunk;
        shdata[ elem] = chunk;
        return make_int2(shdata[programIndex], elem);
    }
    __device__ inline const T get(const int i) const
    {
        const int2 idx = get_chunk(i);
        return __shfl(data[idx.x], idx.y);
    }
    __device__ inline void set(const bool active, const int i, T value)
    {
        const int2 idx = get_chunk(i);
        const int chunkIdx = idx.x;
        const int elemIdx = idx.y;
        shdata[programIndex] = data[chunkIdx];
        if (active) shdata[elemIdx] = value;
        data[chunkIdx] = shdata[programIndex];
    }
};
#elif 1
// Variant 2 (active): the array lives on the device heap.  Lane 0
// allocates it and the pointer is broadcast to the other lanes as two
// 32-bit __shfl transfers through the union.
// NOTE(review): the int32_t ptr[2] overlay assumes pointers fit in two
// 32-bit words (64-bit ABI) -- confirm for the target.
template<typename T, int N>
struct Uniform
{
    union
    {
        T *data;
        int32_t ptr[2];
    };
    __device__ inline Uniform()
    {
        if (programIndex == 0)
            data = (T*)malloc(N*sizeof(T));
        ptr[0] = __shfl(ptr[0], 0);
        ptr[1] = __shfl(ptr[1], 0);
    }
    __device__ inline ~Uniform()
    {
        // Only the allocating lane frees
        if (programIndex == 0)
            free(data);
    }
    __device__ inline const T get(const int i) const
    {
        return data[i];
    }
    __device__ inline T* get_ptr(const int i) {return &data[i]; }
    __device__ inline void set(const bool active, const int i, T value)
    {
        if (active)
            data[i] = value;
    }
};
#else
// Variant 3 (disabled): the array lives in statically-sized shared
// memory, one MAX_LIGHTS-sized slice per warp.
__shared__ int shdata_full[4*MAX_LIGHTS];
template<typename T, int N>
struct Uniform
{
    /* volatile */ T *shdata;
    __device__ Uniform()
    {
        shdata = (T*)&shdata_full[warpIdx*MAX_LIGHTS];
    }
    __device__ inline const T get(const int i) const
    {
        return shdata[i];
    }
    __device__ inline void set(const bool active, const int i, T value)
    {
        if (active)
            shdata[i] = value;
    }
};
#endif
// Normalize the vector (x, y, z) into (ox, oy, oz) using the hardware
// reciprocal square root.  No guard for zero-length input.
__device__
static inline void
normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
    const float invLen = rsqrt(x*x + y*y + z*z);
    ox = x * invLen;
    oy = y * invLen;
    oz = z * invLen;
}
// Warp-wide min reduction: every lane returns the minimum of `value`
// across all 32 lanes (butterfly exchange via __shfl_xor with offsets
// 16, 8, 4, 2, 1).
__device__ inline
static float reduce_min(float value)
{
#pragma unroll
    for (int lanes = 16; lanes > 0; lanes >>= 1)
        value = fminf(value, __shfl_xor(value, lanes, 32));
    return value;
}
// Warp-wide max reduction: every lane returns the maximum of `value`
// across all 32 lanes (butterfly exchange via __shfl_xor with offsets
// 16, 8, 4, 2, 1).
__device__ inline
static float reduce_max(float value)
{
#pragma unroll
    for (int lanes = 16; lanes > 0; lanes >>= 1)
        value = fmaxf(value, __shfl_xor(value, lanes, 32));
    return value;
}
#if 0
// NOTE: the warp scan/reduction helpers below are currently compiled out;
// the ballot-based helpers further down are used instead.

// Warp-wide sum reduction (butterfly exchange via __shfl_xor).
__device__ inline
static int reduce_sum(int value)
{
#pragma unroll
    for (int i = 4; i >=0; i--)
        value += __shfl_xor(value, 1<<i, 32);
    return value;
}
// One step of a shuffle-up scan: add the partial from `up_offset` lanes
// below, where such a lane exists (guarded by the shfl predicate).
static __device__ __forceinline__ uint shfl_scan_add_step(uint partial, uint up_offset)
{
    uint result;
    asm(
        "{.reg .u32 r0;"
        ".reg .pred p;"
        "shfl.up.b32 r0|p, %1, %2, 0;"
        "@p add.u32 r0, r0, %3;"
        "mov.u32 %0, r0;}"
        : "=r"(result) : "r"(partial), "r"(up_offset), "r"(partial));
    return result;
}
// Despite the name, this returns sum - value, i.e. an EXCLUSIVE prefix
// sum of `value` across the warp.
static __device__ __forceinline__ int inclusive_scan_warp(const int value)
{
    uint sum = value;
#pragma unroll
    for(int i = 0; i < 5; ++i)
        sum = shfl_scan_add_step(sum, 1 << i);
    return sum - value;
}
#endif
// Bitmask with one bit set for every lane whose index is lower than the
// calling lane's (reads the PTX %lanemask_lt special register).
static __device__ __forceinline__ int lanemask_lt()
{
    int mask;
    asm("mov.u32 %0, %lanemask_lt;" : "=r" (mask));
    return mask;
}
// Warp-level binary exclusive scan of the predicate p.  Returns
// (x = total lanes with p set, y = this lane's rank among them).
static __device__ __forceinline__ int2 warpBinExclusiveScan(const bool p)
{
    const int ballot = __ballot(p);
    const int total  = __popc(ballot);
    const int rank   = __popc(ballot & lanemask_lt());
    return make_int2(total, rank);
}
// Compacting store: lanes with `active` set write `value` into
// consecutive slots of ptr, in lane order.  Every lane returns the total
// number of active lanes in the warp.
__device__ static inline
int packed_store_active(bool active, int* ptr, int value)
{
    const int2 scan = warpBinExclusiveScan(active);
    if (active)
        ptr[scan.y] = value;  // scan.y is this lane's rank among active lanes
    return scan.x;
}
// Decode an 8-bit unorm value to a float in [0, 1].
__device__
static inline float
Unorm8ToFloat32(unsigned int8 u) {
    const float kScale = 1.0f / 255.0f;
    return (float)u * kScale;
}
// Encode a float (expected in [0, 1]) as an 8-bit unorm.  Truncating
// conversion; no clamping or rounding.
__device__
static inline unsigned int8
Float32ToUnorm8(float f) {
    const float scaled = f * 255.0f;
    return (unsigned int8)scaled;
}
// Compute conservative view-space Z bounds over the pixel rectangle
// [tileStartX, tileEndX) x [tileStartY, tileEndY).  Each lane scans a
// strided subset of pixels; per-lane partial bounds are then combined
// with warp-wide reductions, so all 32 lanes receive the same minZ/maxZ.
__device__
static inline void
ComputeZBounds(
    int32 tileStartX, int32 tileEndX,
    int32 tileStartY, int32 tileEndY,
    // G-buffer data
    float zBuffer[],
    int32 gBufferWidth,
    // Camera data
    float cameraProj_33, float cameraProj_43,
    float cameraNear, float cameraFar,
    // Output
    float &minZ,
    float &maxZ
    )
{
    // Find Z bounds
    // Start from the opposite extremes so any valid sample tightens them
    float laneMinZ = cameraFar;
    float laneMaxZ = cameraNear;
    for ( int32 y = tileStartY; y < tileEndY; ++y) {
        // Lanes cooperatively cover a row, programCount pixels at a time
        for ( int xb = tileStartX; xb < tileEndX; xb += programCount)
        {
            const int x = xb + programIndex;
            if (x >= tileEndX) break;  // lanes past the row's end drop out
            // Unproject depth buffer Z value into view space
            float z = zBuffer[y * gBufferWidth + x];
            float viewSpaceZ = cameraProj_43 / (z - cameraProj_33);
            // Work out Z bounds for our samples
            // Avoid considering skybox/background or otherwise invalid pixels
            if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) {
                laneMinZ = min(laneMinZ, viewSpaceZ);
                laneMaxZ = max(laneMaxZ, viewSpaceZ);
            }
        }
    }
    // Combine the per-lane partials across the warp
    minZ = reduce_min(laneMinZ);
    maxZ = reduce_max(laneMaxZ);
}
// Cull the global light list against one tile's frustum, given the
// tile's precomputed view-space depth range [minZ, maxZ].  Surviving
// light indices are packed into tileLightIndices; returns their count.
__device__
static inline int32
IntersectLightsWithTileMinMax(
    int32 tileStartX, int32 tileEndX,
    int32 tileStartY, int32 tileEndY,
    // Tile data
    float minZ,
    float maxZ,
    // G-buffer data
    int32 gBufferWidth, int32 gBufferHeight,
    // Camera data
    float cameraProj_11, float cameraProj_22,
    // Light Data
    int32 numLights,
    float light_positionView_x_array[],
    float light_positionView_y_array[],
    float light_positionView_z_array[],
    float light_attenuationEnd_array[],
    // Output
    Uniform<int,MAX_LIGHTS> &tileLightIndices
    )
{
    // Derive the four side planes of the tile frustum in view space;
    // each plane is stored as an (xy, z) normal pair (x-planes use the
    // xy slot for x, y-planes use it for y).
    float gBufferScale_x = 0.5f * (float)gBufferWidth;
    float gBufferScale_y = 0.5f * (float)gBufferHeight;
    float frustumPlanes_xy[4] = {
        -(cameraProj_11 * gBufferScale_x),
        (cameraProj_11 * gBufferScale_x),
        (cameraProj_22 * gBufferScale_y),
        -(cameraProj_22 * gBufferScale_y) };
    float frustumPlanes_z[4] = {
        tileEndX - gBufferScale_x,
        -tileStartX + gBufferScale_x,
        tileEndY - gBufferScale_y,
        -tileStartY + gBufferScale_y };
    // Normalize the plane normals so the dot products below are
    // true signed distances.
    for ( int i = 0; i < 4; ++i) {
        float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
            frustumPlanes_z[i] * frustumPlanes_z[i]);
        frustumPlanes_xy[i] *= norm;
        frustumPlanes_z[i] *= norm;
    }
    int32 tileNumLights = 0;
    // Test the lights a gang-width at a time, one light per lane.
    for ( int lightIndexB = 0; lightIndexB < numLights; lightIndexB += programCount)
    {
        const int lightIndex = lightIndexB + programIndex;
        if (lightIndex >= numLights) break;
        float light_positionView_z = light_positionView_z_array[lightIndex];
        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
        float light_attenuationEndNeg = -light_attenuationEnd;
        // Depth-range test first: cheap, and rejects most lights.
        float d = light_positionView_z - minZ;
        bool inFrustum = (d >= light_attenuationEndNeg);
        d = maxZ - light_positionView_z;
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
        // This seems better than cif (!inFrustum) ccontinue; here since we
        // don't actually need to mask the rest of this function - this is
        // just a greedy early-out. Could also structure all of this as
        // nested if() statements, but this is a bit easier to read
        if (__ballot(inFrustum) > 0)
        {
            // Signed distance of the light center to each of the four
            // side planes, compared against its attenuation radius.
            float light_positionView_x = light_positionView_x_array[lightIndex];
            float light_positionView_y = light_positionView_y_array[lightIndex];
            d = light_positionView_z * frustumPlanes_z[0] +
                light_positionView_x * frustumPlanes_xy[0];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
            d = light_positionView_z * frustumPlanes_z[1] +
                light_positionView_x * frustumPlanes_xy[1];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
            d = light_positionView_z * frustumPlanes_z[2] +
                light_positionView_y * frustumPlanes_xy[2];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
            d = light_positionView_z * frustumPlanes_z[3] +
                light_positionView_y * frustumPlanes_xy[3];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
            // Pack and store intersecting lights
#if 0
            if (__ballot(active) > 0)
                tileNumLights += packed_store_active(active, tileLightIndices.get_ptr(tileNumLights), lightIndex);
#else
            const bool active = inFrustum && lightIndex < numLights;
            if (__ballot(active) > 0)
            {
                // NOTE(review): assumes warpBinExclusiveScan returns
                // x = number of active lanes and y = this lane's
                // exclusive rank among them — confirm against its
                // definition earlier in this file.
                const int2 res = warpBinExclusiveScan(active);
                const int idx = tileNumLights + res.y;
                const int nactive = res.x;
                tileLightIndices.set(active, idx, lightIndex);
                tileNumLights += nactive;
            }
#endif
        }
    }
    return tileNumLights;
}
// Cull the light list against one screen tile: first compute the tile's
// view-space depth range from the Z buffer, then run the frustum/depth
// intersection over the lights.  Returns the number of lights written
// to tileLightIndices.
__device__
static inline int32
IntersectLightsWithTile(
    int32 tileStartX, int32 tileEndX,
    int32 tileStartY, int32 tileEndY,
    int32 gBufferWidth, int32 gBufferHeight,
    // G-buffer data
    float zBuffer[],
    // Camera data
    float cameraProj_11, float cameraProj_22,
    float cameraProj_33, float cameraProj_43,
    float cameraNear, float cameraFar,
    // Light Data
    int32 numLights,
    float light_positionView_x_array[],
    float light_positionView_y_array[],
    float light_positionView_z_array[],
    float light_attenuationEnd_array[],
    // Output
    Uniform<int,MAX_LIGHTS> &tileLightIndices
    )
{
    float minZ, maxZ;
    ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY,
        zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, cameraNear, cameraFar,
        minZ, maxZ);
    // BUGFIX: forward the caller-supplied numLights instead of the
    // MAX_LIGHTS compile-time bound, so a partially-filled light array
    // is not scanned past its valid entries.  (Existing callers pass
    // numLights == MAX_LIGHTS, so their behavior is unchanged.)
    int32 tileNumLights = IntersectLightsWithTileMinMax(
        tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
        gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
        numLights, light_positionView_x_array, light_positionView_y_array,
        light_positionView_z_array, light_attenuationEnd_array,
        tileLightIndices);
    return tileNumLights;
}
// Shade every pixel of one screen tile using only the lights that were
// found to intersect it (tileLightIndices / tileNumLights).  When the
// tile has no lights, or light-count visualization is requested, a flat
// gray proportional to the light count is written instead.
__device__
static inline void
ShadeTile(
    int32 tileStartX, int32 tileEndX,
    int32 tileStartY, int32 tileEndY,
    int32 gBufferWidth, int32 gBufferHeight,
    const InputDataArrays &inputData,
    // Camera data
    float cameraProj_11, float cameraProj_22,
    float cameraProj_33, float cameraProj_43,
    // Light list
    Uniform<int,MAX_LIGHTS> &tileLightIndices,
    int32 tileNumLights,
    // UI
    bool visualizeLightCount,
    // Output
    unsigned int8 framebuffer_r[],
    unsigned int8 framebuffer_g[],
    unsigned int8 framebuffer_b[]
    )
{
    if (tileNumLights == 0 || visualizeLightCount) {
        // Flat fill: brightness scales with light count (saturating at 255).
        unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
        for ( int32 y = tileStartY; y < tileEndY; ++y) {
            for ( int xb = tileStartX ; xb < tileEndX; xb += programCount)
            {
                const int x = xb + programIndex;
                if (x >= tileEndX) continue;
                int32 framebufferIndex = (y * gBufferWidth + x);
                framebuffer_r[framebufferIndex] = c;
                framebuffer_g[framebufferIndex] = c;
                framebuffer_b[framebufferIndex] = c;
            }
        }
    } else {
        float twoOverGBufferWidth = 2.0f / gBufferWidth;
        float twoOverGBufferHeight = 2.0f / gBufferHeight;
        for ( int32 y = tileStartY; y < tileEndY; ++y) {
            float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
            for ( int xb = tileStartX ; xb < tileEndX; xb += programCount)
            {
                const int x = xb + programIndex;
                // if (x >= tileEndX) break;
                // NOTE(review): the lane bounds check above is commented
                // out, so lanes with x >= tileEndX read/write past the
                // tile's right edge — presumably safe only when the tile
                // width is a multiple of the gang width; confirm.
                int32 gBufferOffset = y * gBufferWidth + x;
                // Reconstruct position and (negative) view vector from G-buffer
                float surface_positionView_x, surface_positionView_y, surface_positionView_z;
                float Vneg_x, Vneg_y, Vneg_z;
                float z = inputData.zBuffer[gBufferOffset];
                // Compute screen/clip-space position
                // NOTE: Mind DX11 viewport transform and pixel center!
                float positionScreen_x = (0.5f + (float)(x)) *
                    twoOverGBufferWidth - 1.0f;
                // Unproject depth buffer Z value into view space
                surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
                surface_positionView_x = positionScreen_x * surface_positionView_z /
                    cameraProj_11;
                surface_positionView_y = positionScreen_y * surface_positionView_z /
                    cameraProj_22;
                // We actually end up with a vector pointing *at* the
                // surface (i.e. the negative view vector)
                normalize3(surface_positionView_x, surface_positionView_y,
                           surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
                // Reconstruct normal from G-buffer (stored as two
                // half-float encoded components; z sign recovered below)
                float surface_normal_x, surface_normal_y, surface_normal_z;
                asm("// half2float //");
                float normal_x = __half2float(inputData.normalEncoded_x[gBufferOffset]);
                float normal_y = __half2float(inputData.normalEncoded_y[gBufferOffset]);
                asm("// half2float //");
                float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
                float m = sqrt(4.0f * f - 1.0f);
                surface_normal_x = m * (4.0f * normal_x - 2.0f);
                surface_normal_y = m * (4.0f * normal_y - 2.0f);
                surface_normal_z = 3.0f - 8.0f * f;
                // Load other G-buffer parameters
                float surface_specularAmount =
                    __half2float(inputData.specularAmount[gBufferOffset]);
                float surface_specularPower =
                    __half2float(inputData.specularPower[gBufferOffset]);
                float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
                float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
                float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
                // Accumulated lit color for this pixel.
                float lit_x = 0.0f;
                float lit_y = 0.0f;
                float lit_z = 0.0f;
                for ( int32 tileLightIndex = 0; tileLightIndex < tileNumLights;
                      ++tileLightIndex) {
                    int32 lightIndex = tileLightIndices.get(tileLightIndex);
                    // Gather light data relevant to initial culling
                    float light_positionView_x =
                        __ldg(&inputData.lightPositionView_x[lightIndex]);
                    float light_positionView_y =
                        __ldg(&inputData.lightPositionView_y[lightIndex]);
                    float light_positionView_z =
                        __ldg(&inputData.lightPositionView_z[lightIndex]);
                    float light_attenuationEnd =
                        __ldg(&inputData.lightAttenuationEnd[lightIndex]);
                    // Compute light vector
                    float L_x = light_positionView_x - surface_positionView_x;
                    float L_y = light_positionView_y - surface_positionView_y;
                    float L_z = light_positionView_z - surface_positionView_z;
                    float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
                    // Clip at end of attenuation
                    float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
                    if (distanceToLight2 < light_attenutaionEnd2) {
                        float distanceToLight = sqrt(distanceToLight2);
                        // HLSL "rcp" is allowed to be fairly inaccurate
                        float distanceToLightRcp = 1.0f/distanceToLight;
                        L_x *= distanceToLightRcp;
                        L_y *= distanceToLightRcp;
                        L_z *= distanceToLightRcp;
                        // Start computing brdf
                        float NdotL = dot3(surface_normal_x, surface_normal_y,
                                           surface_normal_z, L_x, L_y, L_z);
                        // Clip back facing
                        if (NdotL > 0.0f) {
                            float light_attenuationBegin =
                                inputData.lightAttenuationBegin[lightIndex];
                            // Light distance attenuation (linstep)
                            float lightRange = (light_attenuationEnd - light_attenuationBegin);
                            float falloffPosition = (light_attenuationEnd - distanceToLight);
                            float attenuation = min(falloffPosition / lightRange, 1.0f);
                            // Blinn-Phong half vector (V = -Vneg, so H ~ L - Vneg)
                            float H_x = (L_x - Vneg_x);
                            float H_y = (L_y - Vneg_y);
                            float H_z = (L_z - Vneg_z);
                            normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
                            float NdotH = dot3(surface_normal_x, surface_normal_y,
                                               surface_normal_z, H_x, H_y, H_z);
                            NdotH = max(NdotH, 0.0f);
                            float specular = pow(NdotH, surface_specularPower);
                            float specularNorm = (surface_specularPower + 2.0f) *
                                (1.0f / 8.0f);
                            float specularContrib = surface_specularAmount *
                                specularNorm * specular;
                            float k = attenuation * NdotL * (1.0f + specularContrib);
                            float light_color_x = inputData.lightColor_x[lightIndex];
                            float light_color_y = inputData.lightColor_y[lightIndex];
                            float light_color_z = inputData.lightColor_z[lightIndex];
                            float lightContrib_x = surface_albedo_x * light_color_x;
                            float lightContrib_y = surface_albedo_y * light_color_y;
                            float lightContrib_z = surface_albedo_z * light_color_z;
                            lit_x += lightContrib_x * k;
                            lit_y += lightContrib_y * k;
                            lit_z += lightContrib_z * k;
                        }
                    }
                }
                // Gamma correct
                // These pows are pretty slow right now, but we can do
                // something faster if really necessary to squeeze every
                // last bit of performance out of it
                float gamma = 1.0 / 2.2f;
                lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
                lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
                lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
                framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
                framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
                framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
            }
        }
    }
}
///////////////////////////////////////////////////////////////////////////
// Static decomposition
// One "task" per screen tile: map taskIndex onto a tile of the static
// MIN_TILE_WIDTH x MIN_TILE_HEIGHT decomposition, cull the global light
// list against it, then shade its pixels.
__global__ void
RenderTile( int num_groups_x, int num_groups_y,
    const InputHeader *inputHeaderPtr,
    const InputDataArrays *inputDataPtr,
    int visualizeLightCount,
    // Output
    unsigned int8 framebuffer_r[],
    unsigned int8 framebuffer_g[],
    unsigned int8 framebuffer_b[]) {
    // The launch may be padded to a whole number of blocks; extra
    // tasks simply return.
    if (taskIndex >= taskCount) return;
    const InputHeader inputHeader = *inputHeaderPtr;
    const InputDataArrays inputData = *inputDataPtr;
    // Linear task index -> 2D tile coordinates.
    int32 group_y = taskIndex / num_groups_x;
    int32 group_x = taskIndex % num_groups_x;
    int32 tile_start_x = group_x * MIN_TILE_WIDTH;
    int32 tile_start_y = group_y * MIN_TILE_HEIGHT;
    int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH;
    int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT;
    int framebufferWidth = inputHeader.framebufferWidth;
    int framebufferHeight = inputHeader.framebufferHeight;
    // Projection matrix terms used for (un)projection.
    // NOTE(review): locals named _22/_32 are passed as the _33/_43
    // parameters below — row-major vs column-major naming; confirm.
    float cameraProj_00 = inputHeader.cameraProj[0][0];
    float cameraProj_11 = inputHeader.cameraProj[1][1];
    float cameraProj_22 = inputHeader.cameraProj[2][2];
    float cameraProj_32 = inputHeader.cameraProj[3][2];
    // Light intersection: figure out which lights illuminate this tile.
    Uniform<int,MAX_LIGHTS> tileLightIndices; // Light list for the tile
#if 1
    int numTileLights =
        IntersectLightsWithTile(tile_start_x, tile_end_x,
                                tile_start_y, tile_end_y,
                                framebufferWidth, framebufferHeight,
                                inputData.zBuffer,
                                cameraProj_00, cameraProj_11,
                                cameraProj_22, cameraProj_32,
                                inputHeader.cameraNear, inputHeader.cameraFar,
                                MAX_LIGHTS,
                                inputData.lightPositionView_x,
                                inputData.lightPositionView_y,
                                inputData.lightPositionView_z,
                                inputData.lightAttenuationEnd,
                                tileLightIndices);
    // And now shade the tile, using the lights in tileLightIndices
    ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
              framebufferWidth, framebufferHeight, inputData,
              cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
              tileLightIndices, numTileLights, visualizeLightCount,
              framebuffer_r, framebuffer_g, framebuffer_b);
#endif
}
// Device-side entry point mirroring the ispc export function: computes
// the tile grid and launches one RenderTile task per tile via a nested
// kernel launch (CUDA dynamic parallelism), then waits for completion.
extern "C" __global__ void
RenderStatic___export( InputHeader inputHeaderPtr[],
    InputDataArrays inputDataPtr[],
    int visualizeLightCount,
    // Output
    unsigned int8 framebuffer_r[],
    unsigned int8 framebuffer_g[],
    unsigned int8 framebuffer_b[]) {
    const InputHeader inputHeader = *inputHeaderPtr;
    const InputDataArrays inputData = *inputDataPtr;  // NOTE(review): unused here
    // Number of tiles in each dimension, rounded up so partial tiles
    // at the right/bottom edges are covered.
    int num_groups_x = (inputHeader.framebufferWidth +
                        MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
    int num_groups_y = (inputHeader.framebufferHeight +
                        MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
    int num_groups = num_groups_x * num_groups_y;
    // Launch a task to render each tile, each of which is MIN_TILE_WIDTH
    // by MIN_TILE_HEIGHT pixels.
    // 128 threads per block packs tasks 4 per block (hence the
    // round-up by 4); only lane 0 issues the launch.
    if (programIndex == 0)
        RenderTile<<<(num_groups+4-1)/4,128>>>(num_groups_x, num_groups_y,
                                               inputHeaderPtr, inputDataPtr, visualizeLightCount,
                                               framebuffer_r, framebuffer_g, framebuffer_b);
    cudaDeviceSynchronize();
}
// Host wrapper: launches the export kernel with a single gang (1 block
// of 32 threads) and blocks until all device work — including the
// nested RenderTile launches — completes.
// NOTE(review): all pointer arguments are presumably device pointers;
// confirm against the caller.
extern "C" __host__ void
RenderStatic( InputHeader inputHeaderPtr[],
    InputDataArrays inputDataPtr[],
    int visualizeLightCount,
    // Output
    unsigned int8 framebuffer_r[],
    unsigned int8 framebuffer_g[],
    unsigned int8 framebuffer_b[]) {
    RenderStatic___export<<<1,32>>>( inputHeaderPtr,
                                     inputDataPtr,
                                     visualizeLightCount,
                                     // Output
                                     framebuffer_r,
                                     framebuffer_g,
                                     framebuffer_b);
    cudaDeviceSynchronize();
}

View File

@@ -0,0 +1,717 @@
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "deferred.h"
#ifdef __NVPTX__
#define uniform_t varying
#else
#define uniform_t uniform
#endif
// Structure-of-arrays view of the deferred-shading inputs: per-pixel
// G-buffer channels followed by per-light attribute arrays.
struct InputDataArrays
{
    float *zBuffer;                    // per-pixel depth-buffer Z
    unsigned int16 *normalEncoded_x;   // half float
    unsigned int16 *normalEncoded_y;   // half float
    unsigned int16 *specularAmount;    // half float
    unsigned int16 *specularPower;     // half float
    unsigned int8 *albedo_x;           // unorm8
    unsigned int8 *albedo_y;           // unorm8
    unsigned int8 *albedo_z;           // unorm8
    // Per-light attributes (view-space position, attenuation, color).
    float *lightPositionView_x;
    float *lightPositionView_y;
    float *lightPositionView_z;
    float *lightAttenuationBegin;
    float *lightColor_x;
    float *lightColor_y;
    float *lightColor_z;
    float *lightAttenuationEnd;
};
// Fixed-size header preceding the input data: camera parameters,
// framebuffer dimensions, and layout of the serialized data chunk.
struct InputHeader
{
    float cameraProj[4][4];            // camera projection matrix
    float cameraNear;                  // near clip plane distance
    float cameraFar;                   // far clip plane distance
    int32 framebufferWidth;
    int32 framebufferHeight;
    int32 numLights;
    int32 inputDataChunkSize;          // total serialized payload size
    int32 inputDataArrayOffsets[idaNum]; // byte offsets of each array in the chunk
};
///////////////////////////////////////////////////////////////////////////
// Common utility routines
static inline float
dot3(float x, float y, float z, float a, float b, float c) {
    // 3-component dot product of (x, y, z) and (a, b, c).
    float sum = x * a;
    sum = sum + y * b;
    sum = sum + z * c;
    return sum;
}
static inline void
normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
    // Scale (x, y, z) to unit length using a reciprocal square root;
    // the normalized components are written through the output refs.
    float invLen = rsqrt(x*x + y*y + z*z);
    ox = x * invLen;
    oy = y * invLen;
    oz = z * invLen;
}
static inline float
Unorm8ToFloat32(unsigned int8 u) {
    // Expand an 8-bit unorm value in [0, 255] to a float in [0, 1].
    const float scale = 1.0f / 255.0f;
    return (float)u * scale;
}
static inline unsigned int8
Float32ToUnorm8(float f) {
    // Quantize f to an 8-bit unorm; f is expected in [0, 1] (callers
    // clamp before converting).
    float scaled = f * 255.0f;
    return (unsigned int8)scaled;
}
#if 1
inline
#endif
// Compute the conservative view-space depth range [minZ, maxZ] of one
// screen tile by scanning its region of the depth buffer.
static void
ComputeZBounds(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    // G-buffer data
    uniform float zBuffer[],
    uniform int32 gBufferWidth,
    // Camera data
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Output (single reduced value per tile)
    uniform float &minZ,
    uniform float &maxZ
    )
{
    // Find Z bounds
    // Per-lane accumulators; starting at the camera extremes means an
    // all-invalid tile yields [far, near].
    float laneMinZ = cameraFar;
    float laneMaxZ = cameraNear;
    for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
        foreach (x = tileStartX ... tileEndX) {
            // Unproject depth buffer Z value into view space
            float z = zBuffer[y * gBufferWidth + x];
            float viewSpaceZ = cameraProj_43 / (z - cameraProj_33);
            // Work out Z bounds for our samples
            // Avoid considering skybox/background or otherwise invalid pixels
            if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) {
                laneMinZ = min(laneMinZ, viewSpaceZ);
                laneMaxZ = max(laneMaxZ, viewSpaceZ);
            }
        }
    }
    // Cross-program-instance reduction to one min/max for the tile.
    minZ = reduce_min(laneMinZ);
    maxZ = reduce_max(laneMaxZ);
}
#if 1
inline
#endif
#ifndef __NVPTX__
export
#endif
// Cull the global light list against one tile's frustum, given the
// tile's precomputed view-space depth range [minZ, maxZ].  Surviving
// light indices are packed into tileLightIndices; returns their count.
uniform int32
IntersectLightsWithTileMinMax(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    // Tile data
    uniform float minZ,
    uniform float maxZ,
    // G-buffer data
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    // Light Data
    uniform int32 numLights,
    uniform float light_positionView_x_array[],
    uniform float light_positionView_y_array[],
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Output
    uniform int32 tileLightIndices[]
    )
{
    // Derive the four side planes of the tile frustum in view space;
    // each plane is an (xy, z) normal pair (x-planes use the xy slot
    // for x, y-planes for y).
    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
    uniform_t float frustumPlanes_xy[4] = {
        -(cameraProj_11 * gBufferScale_x),
        (cameraProj_11 * gBufferScale_x),
        (cameraProj_22 * gBufferScale_y),
        -(cameraProj_22 * gBufferScale_y) };
    uniform_t float frustumPlanes_z[4] = {
        tileEndX - gBufferScale_x,
        -tileStartX + gBufferScale_x,
        tileEndY - gBufferScale_y,
        -tileStartY + gBufferScale_y };
    // Normalize the plane normals so the dot products below are true
    // signed distances.
    for (uniform int i = 0; i < 4; ++i) {
        uniform_t float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
                                     frustumPlanes_z[i] * frustumPlanes_z[i]);
        frustumPlanes_xy[i] *= norm;
        frustumPlanes_z[i] *= norm;
    }
    uniform int32 tileNumLights = 0;
    // One light per program instance per iteration.
    foreach (lightIndex = 0 ... numLights) {
        float light_positionView_z = light_positionView_z_array[lightIndex];
        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
        float light_attenuationEndNeg = -light_attenuationEnd;
        // Depth-range test first: cheap, and rejects most lights.
        float d = light_positionView_z - minZ;
        bool inFrustum = (d >= light_attenuationEndNeg);
        d = maxZ - light_positionView_z;
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
        // This seems better than cif (!inFrustum) ccontinue; here since we
        // don't actually need to mask the rest of this function - this is
        // just a greedy early-out. Could also structure all of this as
        // nested if() statements, but this is a bit easier to read
        if (any(inFrustum)) {
            // Signed distance of the light center to each of the four
            // side planes, compared against its attenuation radius.
            float light_positionView_x = light_positionView_x_array[lightIndex];
            float light_positionView_y = light_positionView_y_array[lightIndex];
            d = light_positionView_z * frustumPlanes_z[0] +
                light_positionView_x * frustumPlanes_xy[0];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
            d = light_positionView_z * frustumPlanes_z[1] +
                light_positionView_x * frustumPlanes_xy[1];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
            d = light_positionView_z * frustumPlanes_z[2] +
                light_positionView_y * frustumPlanes_xy[2];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
            d = light_positionView_z * frustumPlanes_z[3] +
                light_positionView_y * frustumPlanes_xy[3];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
#if 0
            // Pack and store intersecting lights
            cif (inFrustum) {
                tileNumLights += packed_store_active(&tileLightIndices[tileNumLights],
                                                     lightIndex);
            }
#else
            // Compact the surviving lane indices contiguously into the
            // output list.
            const bool active = inFrustum && lightIndex < numLights;
            if(any(active))
                tileNumLights += packed_store_active(active, &tileLightIndices[tileNumLights], lightIndex);
#endif
        }
    }
    return tileNumLights;
}
#if 1
inline
#endif
// Cull the light list against one screen tile: first compute the tile's
// view-space depth range from the Z buffer, then run the frustum/depth
// intersection over the lights.  Returns the number of lights written
// to tileLightIndices.
static uniform int32
IntersectLightsWithTile(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    // G-buffer data
    uniform float zBuffer[],
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Light Data
    uniform int32 numLights,
    uniform float light_positionView_x_array[],
    uniform float light_positionView_y_array[],
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Output
    uniform int32 tileLightIndices[]
    )
{
    uniform float minZ, maxZ;
    ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY,
        zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, cameraNear, cameraFar,
        minZ, maxZ);
    // BUGFIX: forward the caller-supplied numLights instead of the
    // MAX_LIGHTS compile-time bound, so a partially-filled light array
    // is not scanned past its valid entries.  (Existing callers pass
    // numLights == MAX_LIGHTS, so their behavior is unchanged.)
    uniform int32 tileNumLights = IntersectLightsWithTileMinMax(
        tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
        gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
        numLights, light_positionView_x_array, light_positionView_y_array,
        light_positionView_z_array, light_attenuationEnd_array,
        tileLightIndices);
    return tileNumLights;
}
#if 1
inline
#endif
#ifndef __NVPTX__
export
#endif
// Shade every pixel of one screen tile using only the lights that were
// found to intersect it (tileLightIndices / tileNumLights).  When the
// tile has no lights, or light-count visualization is requested, a flat
// gray proportional to the light count is written instead.
void
ShadeTile(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    uniform InputDataArrays &inputData,
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    uniform float cameraProj_33, uniform float cameraProj_43,
    // Light list
    uniform int32 tileLightIndices[],
    uniform int32 tileNumLights,
    // UI
    uniform bool visualizeLightCount,
    // Output
    uniform unsigned int8 framebuffer_r[],
    uniform unsigned int8 framebuffer_g[],
    uniform unsigned int8 framebuffer_b[]
    )
{
    if (tileNumLights == 0 || visualizeLightCount) {
        // Flat fill: brightness scales with light count (saturating at 255).
        uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
        for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
            foreach (x = tileStartX ... tileEndX) {
                int32 framebufferIndex = (y * gBufferWidth + x);
                framebuffer_r[framebufferIndex] = c;
                framebuffer_g[framebufferIndex] = c;
                framebuffer_b[framebufferIndex] = c;
            }
        }
    } else {
        uniform float twoOverGBufferWidth = 2.0f / gBufferWidth;
        uniform float twoOverGBufferHeight = 2.0f / gBufferHeight;
        for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
            uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
            foreach (x = tileStartX ... tileEndX) {
                int32 gBufferOffset = y * gBufferWidth + x;
                // Reconstruct position and (negative) view vector from G-buffer
                float surface_positionView_x, surface_positionView_y, surface_positionView_z;
                float Vneg_x, Vneg_y, Vneg_z;
                float z = inputData.zBuffer[gBufferOffset];
                // Compute screen/clip-space position
                // NOTE: Mind DX11 viewport transform and pixel center!
                float positionScreen_x = (0.5f + (float)(x)) *
                    twoOverGBufferWidth - 1.0f;
                // Unproject depth buffer Z value into view space
                surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
                surface_positionView_x = positionScreen_x * surface_positionView_z /
                    cameraProj_11;
                surface_positionView_y = positionScreen_y * surface_positionView_z /
                    cameraProj_22;
                // We actually end up with a vector pointing *at* the
                // surface (i.e. the negative view vector)
                normalize3(surface_positionView_x, surface_positionView_y,
                           surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
                // Reconstruct normal from G-buffer (two half-float
                // encoded components; z recovered from the decode below)
                float surface_normal_x, surface_normal_y, surface_normal_z;
                float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]);
                float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]);
                float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
                float m = sqrt(4.0f * f - 1.0f);
                surface_normal_x = m * (4.0f * normal_x - 2.0f);
                surface_normal_y = m * (4.0f * normal_y - 2.0f);
                surface_normal_z = 3.0f - 8.0f * f;
                // Load other G-buffer parameters
                float surface_specularAmount =
                    half_to_float(inputData.specularAmount[gBufferOffset]);
                float surface_specularPower =
                    half_to_float(inputData.specularPower[gBufferOffset]);
                float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
                float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
                float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
                // Accumulated lit color for this pixel.
                float lit_x = 0.0f;
                float lit_y = 0.0f;
                float lit_z = 0.0f;
                for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights;
                     ++tileLightIndex) {
                    uniform int32 lightIndex = tileLightIndices[tileLightIndex];
                    // Gather light data relevant to initial culling
                    uniform float light_positionView_x =
                        inputData.lightPositionView_x[lightIndex];
                    uniform float light_positionView_y =
                        inputData.lightPositionView_y[lightIndex];
                    uniform float light_positionView_z =
                        inputData.lightPositionView_z[lightIndex];
                    uniform float light_attenuationEnd =
                        inputData.lightAttenuationEnd[lightIndex];
                    // Compute light vector
                    float L_x = light_positionView_x - surface_positionView_x;
                    float L_y = light_positionView_y - surface_positionView_y;
                    float L_z = light_positionView_z - surface_positionView_z;
                    float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
                    // Clip at end of attenuation
                    float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
                    cif (distanceToLight2 < light_attenutaionEnd2) {
                        float distanceToLight = sqrt(distanceToLight2);
                        // HLSL "rcp" is allowed to be fairly inaccurate
                        float distanceToLightRcp = rcp(distanceToLight);
                        L_x *= distanceToLightRcp;
                        L_y *= distanceToLightRcp;
                        L_z *= distanceToLightRcp;
                        // Start computing brdf
                        float NdotL = dot3(surface_normal_x, surface_normal_y,
                                           surface_normal_z, L_x, L_y, L_z);
                        // Clip back facing
                        cif (NdotL > 0.0f) {
                            uniform float light_attenuationBegin =
                                inputData.lightAttenuationBegin[lightIndex];
                            // Light distance attenuation (linstep)
                            float lightRange = (light_attenuationEnd - light_attenuationBegin);
                            float falloffPosition = (light_attenuationEnd - distanceToLight);
                            float attenuation = min(falloffPosition / lightRange, 1.0f);
                            // Blinn-Phong half vector (V = -Vneg, so H ~ L - Vneg)
                            float H_x = (L_x - Vneg_x);
                            float H_y = (L_y - Vneg_y);
                            float H_z = (L_z - Vneg_z);
                            normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
                            float NdotH = dot3(surface_normal_x, surface_normal_y,
                                               surface_normal_z, H_x, H_y, H_z);
                            NdotH = max(NdotH, 0.0f);
                            float specular = pow(NdotH, surface_specularPower);
                            float specularNorm = (surface_specularPower + 2.0f) *
                                (1.0f / 8.0f);
                            float specularContrib = surface_specularAmount *
                                specularNorm * specular;
                            float k = attenuation * NdotL * (1.0f + specularContrib);
                            uniform float light_color_x = inputData.lightColor_x[lightIndex];
                            uniform float light_color_y = inputData.lightColor_y[lightIndex];
                            uniform float light_color_z = inputData.lightColor_z[lightIndex];
                            float lightContrib_x = surface_albedo_x * light_color_x;
                            float lightContrib_y = surface_albedo_y * light_color_y;
                            float lightContrib_z = surface_albedo_z * light_color_z;
                            lit_x += lightContrib_x * k;
                            lit_y += lightContrib_y * k;
                            lit_z += lightContrib_z * k;
                        }
                    }
                }
                // Gamma correct
                // These pows are pretty slow right now, but we can do
                // something faster if really necessary to squeeze every
                // last bit of performance out of it
                float gamma = 1.0 / 2.2f;
                lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
                lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
                lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
                framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
                framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
                framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
            }
        }
    }
}
///////////////////////////////////////////////////////////////////////////
// Static decomposition
// One task per screen tile: map taskIndex onto a tile of the static
// MIN_TILE_WIDTH x MIN_TILE_HEIGHT decomposition, cull the global light
// list against it, then shade its pixels.
task void
RenderTile(uniform int num_groups_x, uniform int num_groups_y,
           uniform InputHeader inputHeaderPtr[],
           uniform InputDataArrays inputDataPtr[],
           uniform int visualizeLightCount,
           // Output
           uniform unsigned int8 framebuffer_r[],
           uniform unsigned int8 framebuffer_g[],
           uniform unsigned int8 framebuffer_b[]) {
    uniform InputHeader inputHeader = *inputHeaderPtr;
    uniform InputDataArrays inputData = *inputDataPtr;
    // Linear task index -> 2D tile coordinates.
    uniform int32 group_y = taskIndex / num_groups_x;
    uniform int32 group_x = taskIndex % num_groups_x;
    uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH;
    uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT;
    uniform int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH;
    uniform int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT;
    uniform int framebufferWidth = inputHeader.framebufferWidth;
    uniform int framebufferHeight = inputHeader.framebufferHeight;
    // Projection matrix terms used for (un)projection.
    // NOTE(review): locals named _22/_32 are passed as the _33/_43
    // parameters below — row-major vs column-major naming; confirm.
    uniform float cameraProj_00 = inputHeader.cameraProj[0][0];
    uniform float cameraProj_11 = inputHeader.cameraProj[1][1];
    uniform float cameraProj_22 = inputHeader.cameraProj[2][2];
    uniform float cameraProj_32 = inputHeader.cameraProj[3][2];
    // Light intersection: figure out which lights illuminate this tile.
#if 1
    // Heap-allocate the per-tile light list; freed at the end of the task.
    uniform int * uniform tileLightIndices = uniform new uniform int [MAX_LIGHTS];
#define MALLOC
#else /* shared memory doesn't fully work... why? */
    uniform int tileLightIndices[MAX_LIGHTS]; // Light list for the tile
#endif
    uniform int numTileLights =
        IntersectLightsWithTile(tile_start_x, tile_end_x,
                                tile_start_y, tile_end_y,
                                framebufferWidth, framebufferHeight,
                                inputData.zBuffer,
                                cameraProj_00, cameraProj_11,
                                cameraProj_22, cameraProj_32,
                                inputHeader.cameraNear, inputHeader.cameraFar,
                                MAX_LIGHTS,
                                inputData.lightPositionView_x,
                                inputData.lightPositionView_y,
                                inputData.lightPositionView_z,
                                inputData.lightAttenuationEnd,
                                tileLightIndices);
    // And now shade the tile, using the lights in tileLightIndices
    ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
              framebufferWidth, framebufferHeight, inputData,
              cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
              tileLightIndices, numTileLights, visualizeLightCount,
              framebuffer_r, framebuffer_g, framebuffer_b);
#ifdef MALLOC
    delete tileLightIndices;
#endif
}
// Application entry point for the static decomposition: computes the
// tile grid and launches one RenderTile task per tile.
export void
RenderStatic(uniform InputHeader inputHeaderPtr[],
             uniform InputDataArrays inputDataPtr[],
             uniform int visualizeLightCount,
             // Output
             uniform unsigned int8 framebuffer_r[],
             uniform unsigned int8 framebuffer_g[],
             uniform unsigned int8 framebuffer_b[]) {
    uniform InputHeader inputHeader = *inputHeaderPtr;
    uniform InputDataArrays inputData = *inputDataPtr;
    // Number of tiles in each dimension, rounded up so partial tiles
    // at the right/bottom edges are covered.
    uniform int num_groups_x = (inputHeader.framebufferWidth +
                                MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
    uniform int num_groups_y = (inputHeader.framebufferHeight +
                                MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
    uniform int num_groups = num_groups_x * num_groups_y;
    // Launch a task to render each tile, each of which is MIN_TILE_WIDTH
    // by MIN_TILE_HEIGHT pixels.
    launch[num_groups] RenderTile(num_groups_x, num_groups_y,
                                  inputHeaderPtr, inputDataPtr, visualizeLightCount,
                                  framebuffer_r, framebuffer_g, framebuffer_b);
}
///////////////////////////////////////////////////////////////////////////
// Routines for dynamic decomposition path
// This computes the z min/max range for a whole row worth of tiles.
// Compute the view-space min/max Z for every tile in one row of the
// tile grid; results go to minZArray/maxZArray, indexed by tile column.
export void
ComputeZBoundsRow(
    uniform int32 tileY,
    uniform int32 tileWidth, uniform int32 tileHeight,
    uniform int32 numTilesX, uniform int32 numTilesY,
    // G-buffer data
    uniform float zBuffer[],
    uniform int32 gBufferWidth,
    // Camera data
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Output
    uniform float minZArray[],
    uniform float maxZArray[]
    )
{
    // Every tile in this row covers the same vertical span; hoist it.
    uniform int32 rowStartY = tileY * tileHeight;
    uniform int32 rowEndY = rowStartY + tileHeight;
    // Walk the row left to right, one tile per iteration.
    for (uniform int32 tx = 0; tx < numTilesX; ++tx) {
        uniform int32 colStartX = tx * tileWidth;
        uniform float tileMinZ, tileMaxZ;
        ComputeZBounds(
            colStartX, colStartX + tileWidth,
            rowStartY, rowEndY,
            zBuffer, gBufferWidth,
            cameraProj_33, cameraProj_43, cameraNear, cameraFar,
            tileMinZ, tileMaxZ);
        minZArray[tx] = tileMinZ;
        maxZArray[tx] = tileMaxZ;
    }
}
// Reclassifies the lights with respect to four sub-tiles when we refine a tile.
// numLights need not be a multiple of programCount here, but the input and output arrays
// should be able to handle programCount-sized load/stores.
// Reclassifies the lights of a tile with respect to its four subtiles
// (00, 10, 01, 11) when the tile is refined.  Each light is tested against
// the subtiles' z ranges and against the two frustum split planes through
// (tileMidX, tileMidY); surviving light indices are appended to each
// subtile's list with packed_store_active.
//
// Fix: the frustum-plane locals were declared with "uniform_t", which is
// not an ispc type qualifier and does not compile; they are plain
// "uniform" declarations.
export void
SplitTileMinMax(
    uniform int32 tileMidX, uniform int32 tileMidY,
    // Subtile data (00, 10, 01, 11)
    uniform float subtileMinZ[],
    uniform float subtileMaxZ[],
    // G-buffer data
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    // Light Data
    uniform int32 lightIndices[],
    uniform int32 numLights,
    uniform float light_positionView_x_array[],
    uniform float light_positionView_y_array[],
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Outputs
    uniform int32 subtileIndices[],
    uniform int32 subtileIndicesPitch,
    uniform int32 subtileNumLights[]
    )
{
    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
    // Unnormalized split planes: [0] splits left/right, [1] splits top/bottom.
    uniform float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x),
                                          (cameraProj_22 * gBufferScale_y) };
    uniform float frustumPlanes_z[2] = { tileMidX - gBufferScale_x,
                                         tileMidY - gBufferScale_y };
    // Normalize
    uniform float norm[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] +
                                    frustumPlanes_z[0] * frustumPlanes_z[0]),
                              rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] +
                                    frustumPlanes_z[1] * frustumPlanes_z[1]) };
    frustumPlanes_xy[0] *= norm[0];
    frustumPlanes_xy[1] *= norm[1];
    frustumPlanes_z[0] *= norm[0];
    frustumPlanes_z[1] *= norm[1];
    // Initialize: each subtile writes into its own slice of subtileIndices.
    uniform int32 subtileLightOffset[4];
    subtileLightOffset[0] = 0 * subtileIndicesPitch;
    subtileLightOffset[1] = 1 * subtileIndicesPitch;
    subtileLightOffset[2] = 2 * subtileIndicesPitch;
    subtileLightOffset[3] = 3 * subtileIndicesPitch;
    foreach (i = 0 ... numLights) {
        int32 lightIndex = lightIndices[i];
        float light_positionView_x = light_positionView_x_array[lightIndex];
        float light_positionView_y = light_positionView_y_array[lightIndex];
        float light_positionView_z = light_positionView_z_array[lightIndex];
        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
        float light_attenuationEndNeg = -light_attenuationEnd;
        // Test lights against subtile z bounds
        bool inFrustum[4];
        inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
                       (subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
        inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) &&
                       (subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg);
        inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) &&
                       (subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg);
        inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) &&
                       (subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg);
        // Signed distances to the vertical (dx) and horizontal (dy) split planes.
        float dx = light_positionView_z * frustumPlanes_z[0] +
                   light_positionView_x * frustumPlanes_xy[0];
        float dy = light_positionView_z * frustumPlanes_z[1] +
                   light_positionView_y * frustumPlanes_xy[1];
        // A light farther from a split plane than its attenuation radius can
        // only affect the subtiles on one side of that plane.
        cif (abs(dx) > light_attenuationEnd) {
            bool positiveX = dx > 0.0f;
            inFrustum[0] = inFrustum[0] && positiveX;  // 00 subtile
            inFrustum[1] = inFrustum[1] && !positiveX; // 10 subtile
            inFrustum[2] = inFrustum[2] && positiveX;  // 01 subtile
            inFrustum[3] = inFrustum[3] && !positiveX; // 11 subtile
        }
        cif (abs(dy) > light_attenuationEnd) {
            bool positiveY = dy > 0.0f;
            inFrustum[0] = inFrustum[0] && positiveY;  // 00 subtile
            inFrustum[1] = inFrustum[1] && positiveY;  // 10 subtile
            inFrustum[2] = inFrustum[2] && !positiveY; // 01 subtile
            inFrustum[3] = inFrustum[3] && !positiveY; // 11 subtile
        }
        // Pack and store intersecting lights
        // TODO: Experiment with a loop here instead
        cif (inFrustum[0])
            subtileLightOffset[0] +=
                packed_store_active(&subtileIndices[subtileLightOffset[0]],
                                    lightIndex);
        cif (inFrustum[1])
            subtileLightOffset[1] +=
                packed_store_active(&subtileIndices[subtileLightOffset[1]],
                                    lightIndex);
        cif (inFrustum[2])
            subtileLightOffset[2] +=
                packed_store_active(&subtileIndices[subtileLightOffset[2]],
                                    lightIndex);
        cif (inFrustum[3])
            subtileLightOffset[3] +=
                packed_store_active(&subtileIndices[subtileLightOffset[3]],
                                    lightIndex);
    }
    // Final per-subtile light counts = end offset minus slice base.
    subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch;
    subtileNumLights[1] = subtileLightOffset[1] - 1 * subtileIndicesPitch;
    subtileNumLights[2] = subtileLightOffset[2] - 2 * subtileIndicesPitch;
    subtileNumLights[3] = subtileLightOffset[3] - 3 * subtileIndicesPitch;
}

View File

@@ -0,0 +1,107 @@
/*
Copyright (c) 2011-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _MSC_VER
#define ISPC_IS_WINDOWS
#define NOMINMAX
#elif defined(__linux__)
#define ISPC_IS_LINUX
#elif defined(__APPLE__)
#define ISPC_IS_APPLE
#endif
#include <fcntl.h>
#include <cfloat>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <sys/types.h>
#include <stdint.h>
#include <algorithm>
#include <cassert>
#include <vector>
#ifdef ISPC_IS_WINDOWS
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif
#include "deferred.h"
#include "kernels_ispc.h"
#include "timing.h"
#include "ispc_malloc.h"
///////////////////////////////////////////////////////////////////////////
// Benchmarks the static-decomposition deferred-shading path: loads a
// captured G-buffer, renders it nframes times per repetition with
// ispc::RenderStatic, reports the best per-frame time, and writes the last
// frame out as a PPM image.
int main(int argc, char** argv) {
    if (argc < 2) {
        printf("usage: deferred_shading <input_file (e.g. data/pp1280x720.bin)> [tasks iterations] [serial iterations]\n");
        return 1;
    }
    // Defaults: 5 ispc+tasks repetitions, 3 serial repetitions (not used in
    // this build), 500 frames per repetition (a scale factor for timing).
    static unsigned int test_iterations[] = {5, 3, 500}; //last value is for nframes, it is scale.
    if (argc == 5) {
        // All three counts supplied on the command line.
        for (int i = 0; i < 3; i++) {
            test_iterations[i] = atoi(argv[2 + i]);
        }
    }
    InputData *input = CreateInputDataFromFile(argv[1]);
    if (!input) {
        printf("Failed to load input file \"%s\"!\n", argv[1]);
        return 1;
    }
    Framebuffer framebuffer(input->header.framebufferWidth,
                            input->header.framebufferHeight);
    int nframes = test_iterations[2];
    // Track the best (minimum) average per-frame time over all repetitions.
    double ispcCycles = 1e30;
    for (int i = 0; i < test_iterations[0]; ++i) {
        framebuffer.clear();
        reset_and_start_timer();
        for (int j = 0; j < nframes; ++j)
            ispc::RenderStatic(&input->header, &input->arrays,
                               VISUALIZE_LIGHT_COUNT,
                               framebuffer.r, framebuffer.g, framebuffer.b);
        // Average one repetition over its nframes renders.
        double msec = get_elapsed_msec() / nframes;
        printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec [%.3f fps]\n", msec, 1.0e3/msec);
        ispcCycles = std::min(ispcCycles, msec);
    }
    printf("[ispc static + tasks]:\t\t[%.3f] msec to render "
           "%d x %d image\n", ispcCycles,
           input->header.framebufferWidth, input->header.framebufferHeight);
    WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
    DeleteInputData(input);
    return 0;
}

View File

@@ -0,0 +1,12 @@
# Build configuration for the CPU build of the mergeSort example; the shared
# rules live in ../common_cpu.mk.
EXAMPLE=mergeSort
CPP_SRC=mergeSort.cpp
ISPC_SRC=mergeSort.ispc
# ISPC code generation targets for x86 and ARM.
ISPC_IA_TARGETS=avx1-i32x8
ISPC_ARM_TARGETS=neon
#ISPC_FLAGS=-DDEBUG -g
# Host compilers build with debug info.
CXXFLAGS=-g
CCFLAGS=-g
#NVCC_FLAGS=-Xptxas=-O0
include ../common_cpu.mk

View File

@@ -0,0 +1,7 @@
# Build configuration for the KNC (Xeon Phi) build of the mergeSort example;
# compiles the ISPC source against the generic-16 target with the KNC
# intrinsics header. Shared rules live in ../common_knc.mk.
EXAMPLE=mergeSort
CXX_SRC=mergeSort.cpp
ISPC_SRC=mergeSort.ispc
ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h
ISPC_TARGET=generic-16
include ../common_knc.mk

View File

@@ -0,0 +1,15 @@
# Build configuration for the PTX (GPU) build of the mergeSort example;
# shared rules live in ../common_ptx.mk.
PROG=mergeSort
ISPC_SRC=mergeSort.ispc
CU_SRC=mergeSort.cu
# Fix: the host source was listed twice ("mergeSort.cpp mergeSort.cpp"),
# which compiles/links the same translation unit twice.
CXX_SRC=mergeSort.cpp
# Cap register usage per thread for the PTX backend.
PTXCC_REGMAX=64
#PTXCC_FLAGS= -Xptxas=-O3
#NVCC_FLAGS=-Xptxas=-O0
LLVM_GPU=1
NVVM_GPU=1
include ../common_ptx.mk

View File

@@ -0,0 +1,3 @@
// Shared key/value element types for the mergeSort example, included by the
// host (.cpp), ISPC (.ispc), and CUDA (.cu) sources so all three agree.
#pragma once
typedef float Key_t; // sort key
typedef int Val_t;   // payload carried alongside each key

View File

@@ -0,0 +1,171 @@
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <random>
#include "timing.h"
#include "ispc_malloc.h"
#include "mergeSort_ispc.h"
// Renders a one-line text progress bar such as "[=====  42 %      ]" to
// stdout, overwriting itself with '\r' until the final iteration, which
// ends with '\n'.
//   x     - current iteration, in [0, n)
//   n     - total number of iterations (must be > 1)
//   width - bar width in characters (must be > 10 so the label fits)
static void progressBar(const int x, const int n, const int width = 50)
{
    assert(n > 1);
    assert(x >= 0 && x < n);
    assert(width > 10);
    const float f = static_cast<float>(x)/(n-1);
    const int w = static_cast<int>(f * width);
    // print bar
    std::string bstr("[");
    for (int i = 0; i < width; i++)
        bstr += i < w ? '=' : ' ';
    bstr += "]";
    // Format the percentage label; snprintf bounds the write to the buffer
    // (the original sprintf had no bound).
    char pstr0[32];
    snprintf(pstr0, sizeof(pstr0), " %2d %c ", static_cast<int>(f*100.0), '%');
    const std::string pstr(pstr0);
    // Splice the label into the middle of the bar (fits because width > 10).
    std::copy(pstr.begin(), pstr.end(), bstr.begin() + (width/2-2));
    std::cout << bstr;
    std::cout << (x == n-1 ? "\n" : "\r") << std::flush;
}
#include "keyType.h"
// One sortable element: a key plus the payload value that must travel with
// it through the sort.
struct Key
{
    Key_t key; // sort key
    Val_t val; // associated payload
};
// Driver for the mergeSort example: builds a random permutation of the keys
// 0..n-1, sorts it with the ISPC task-parallel merge sort, times the best of
// m repetitions, and verifies the result against std::sort.
//
//   argv[1] (optional): element count n (default 1M)
//   argc == 3 suppresses the progress bar.
//
// Fixes versus the original: every array allocated with new[] is released
// with delete[] (plain delete on a new[] pointer is undefined behavior),
// std::random_shuffle (removed in C++17) is replaced by std::shuffle, and
// the unused locals j, l, tISPC1 and tSerial are dropped.
int main (int argc, char *argv[])
{
    // n: problem size; m: timing repetitions (single run for tiny inputs).
    int n = argc == 1 ? 1024*1024 : atoi(argv[1]), m = n < 100 ? 1 : 50;
    Key *keys = new Key[n];
#pragma omp parallel for
    for (int i = 0; i < n; i++)
    {
        keys[i].key = i; //((int)(drand48() * (1<<30)));
        keys[i].val = i;
    }
    // Shuffle into a random permutation, seeded from the cycle counter.
    std::mt19937 rng(static_cast<unsigned int>(rtc()*65536));
    std::shuffle(keys, keys + n, rng);
    // Split into parallel key/value arrays: Src is the working input, Buf is
    // scratch, Dst receives the sorted output, Gld keeps a pristine copy.
    Key_t *keysSrc = new Key_t[n];
    Val_t *valsSrc = new Val_t[n];
    Key_t *keysBuf = new Key_t[n];
    Val_t *valsBuf = new Val_t[n];
    Key_t *keysDst = new Key_t[n];
    Val_t *valsDst = new Val_t[n];
    Key_t *keysGld = new Key_t[n];
    Val_t *valsGld = new Val_t[n];
#pragma omp parallel for
    for (int i = 0; i < n; i++)
    {
        keysSrc[i] = keys[i].key;
        valsSrc[i] = keys[i].val;
        keysGld[i] = keysSrc[i];
        valsGld[i] = valsSrc[i];
    }
    delete [] keys;
    ispcSetMallocHeapLimit(1024*1024*1024);
    ispc::openMergeSort();
    double tISPC2 = 1e30; // best (minimum) time over m runs, msec
    for (int i = 0; i < m; i ++)
    {
        // Restore the unsorted input before each timed run.
        ispcMemcpy(keysSrc, keysGld, n*sizeof(Key_t));
        ispcMemcpy(valsSrc, valsGld, n*sizeof(Val_t));
        reset_and_start_timer();
        ispc::mergeSort(keysDst, valsDst, keysBuf, valsBuf, keysSrc, valsSrc, n);
        tISPC2 = std::min(tISPC2, get_elapsed_msec());
        if (argc != 3)
            progressBar (i, m);
    }
    ispc::closeMergeSort();
    printf("[sort ispc + tasks]:\t[%.3f] msec [%.3f Mpair/s]\n", tISPC2, 1.0e-3*n/tISPC2);
#if 0
    // Debug dump of the first 128 entries of each array.
    printf("\n---\n");
    for (int i = 0; i < 128; i++)
    {
        if ((i%32) == 0) printf("\n");
        printf("%d ", (int)keysSrc[i]);
    }
    printf("\n---\n");
    for (int i = 0; i < 128; i++)
    {
        if ((i%32) == 0) printf("\n");
        printf("%d ", (int)keysBuf[i]);
    }
    printf("\n---\n");
    for (int i = 0; i < 128; i++)
    {
        if ((i%32) == 0) printf("\n");
        printf("%d ", (int)keysDst[i]);
    }
    printf("\n---\n");
#endif
    // Verify against the reference: sort the gold copy and compare keys.
    std::sort(keysGld, keysGld + n);
    for (int i = 0; i < n; i++)
        assert(keysDst[i] == keysGld[i]);
    delete [] keysSrc;
    delete [] valsSrc;
    delete [] keysDst;
    delete [] valsDst;
    delete [] keysBuf;
    delete [] valsBuf;
    delete [] keysGld;
    delete [] valsGld;
    return 0;
}

View File

@@ -0,0 +1,694 @@
/*
Copyright (c) 2014, Evghenii Gaburov
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Based on mergeSort from CUDA SDK
*/
#include "keyType.h"
#include "cuda_helpers.cuh"
#include <cassert>
#define uniform
#define SAMPLE_STRIDE programCount
#define iDivUp(a,b) (((a) + (b) - 1)/(b))
#define getSampleCount(dividend) (iDivUp((dividend), (SAMPLE_STRIDE)))
#define W (/*sizeof(int)=*/4 * 8)
// Rounds x up to the next power of two; returns x unchanged when x is
// already a power of two (assumes x >= 1).
__device__ static inline
int nextPowerOfTwo(int x)
{
#if 0
    // Portable bit-smearing variant, kept for reference.
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
#else
    // W is the int bit width (32); __clz counts leading zero bits of x - 1.
    return 1U << (W - __clz(x - 1));
#endif
}
// Inclusive rank: number of entries in the ascending array data[0..L-1]
// that are <= val.  stride is the initial probe step; callers pass
// nextPowerOfTwo(L).  ("uniform" expands to nothing in this .cu file.)
__device__ static inline
int binarySearchInclusiveRanks(
    const int val,
    uniform int *data,
    const int L,
    int stride)
{
    if (L == 0)
        return 0;
    int pos = 0;
    // Binary search by halving the probe stride; advance pos while the
    // probed element is still <= val.
    for (; stride > 0; stride >>= 1)
    {
        int newPos = min(pos + stride, L);
        if (data[newPos - 1] <= val)
            pos = newPos;
    }
    return pos;
}
// Exclusive rank: number of entries in the ascending array data[0..L-1]
// that are strictly < val.  Same search scheme as the inclusive variant,
// with a strict comparison.
__device__ static inline
int binarySearchExclusiveRanks(
    const int val,
    uniform int *data,
    const int L,
    int stride)
{
    if (L == 0)
        return 0;
    int pos = 0;
    for (; stride > 0; stride >>= 1)
    {
        int newPos = min(pos + stride, L);
        if (data[newPos - 1] < val)
            pos = newPos;
    }
    return pos;
}
// Inclusive rank over keys: number of entries in the ascending array
// data[0..L-1] that are <= val.  Identical scheme to
// binarySearchInclusiveRanks but over Key_t values.
__device__ static inline
int binarySearchInclusive(
    const Key_t val,
    uniform Key_t *data,
    const int L,
    int stride)
{
    if (L == 0)
        return 0;
    int pos = 0;
    for (; stride > 0; stride >>= 1)
    {
        int newPos = min(pos + stride, L);
        if (data[newPos - 1] <= val)
            pos = newPos;
    }
    return pos;
}
// Exclusive rank over keys: number of entries in the ascending array
// data[0..L-1] that are strictly < val.
__device__ static inline
int binarySearchExclusive(
    const Key_t val,
    uniform Key_t *data,
    const int L,
    int stride)
{
    if (L == 0)
        return 0;
    int pos = 0;
    for (; stride > 0; stride >>= 1)
    {
        int newPos = min(pos + stride, L);
        if (data[newPos - 1] < val)
            pos = newPos;
    }
    return pos;
}
// Inclusive rank where the "array" is a per-lane register value: lane i of
// the warp holds element i, and shuffle(data, i) reads lane i's copy
// (shuffle is presumably a __shfl wrapper from cuda_helpers.cuh -- confirm
// there).  L is at most the warp width.
__device__ static inline
int binarySearchInclusive1(
    const Key_t val,
    Key_t data,
    const uniform int L,
    uniform int stride)
{
    if (L == 0)
        return 0;
    int pos = 0;
    for (; stride > 0; stride >>= 1)
    {
        int newPos = min(pos + stride, L);
        if (shuffle(data,newPos - 1) <= val)
            pos = newPos;
    }
    return pos;
}
// Exclusive-rank counterpart of binarySearchInclusive1: strict comparison
// against values distributed one-per-lane and read via shuffle.
__device__ static inline
int binarySearchExclusive1(
    const Key_t val,
    Key_t data,
    const uniform int L,
    uniform int stride)
{
    if (L == 0)
        return 0;
    int pos = 0;
    for (; stride > 0; stride >>= 1)
    {
        int newPos = min(pos + stride, L);
        if (shuffle(data,newPos - 1) < val)
            pos = newPos;
    }
    return pos;
}
////////////////////////////////////////////////////////////////////////////////
// Bottom-level merge sort (binary search-based)
////////////////////////////////////////////////////////////////////////////////
// Bottom-level sort: sorts each 2*programCount-element segment of the input
// in shared memory using an iterative binary-search merge (doubling stride
// each pass).  batchSize is the number of segments; taskIndex/taskCount/
// programCount/programIndex/warpIdx come from cuda_helpers.cuh and mirror
// ISPC's task/program-index model -- NOTE(review): confirm their exact
// definitions there.
__global__
void mergeSortGangKernel(
    uniform int batchSize,
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[])
{
    // Evenly partition the segments across tasks.
    const uniform int blkIdx = taskIndex;
    const uniform int blkDim = (batchSize + taskCount - 1)/taskCount;
    const uniform int blkBeg = blkIdx * blkDim;
    const uniform int blkEnd = min(blkBeg + blkDim, batchSize);
    // Shared staging sized for 4 warps per block; each warp uses its own
    // 2*programCount-element slice selected by warpIdx.
    __shared__ Key_t s_key_tmp[2*programCount*4];
    __shared__ Val_t s_val_tmp[2*programCount*4];
    Key_t *s_key = s_key_tmp + warpIdx*(2*programCount);
    Val_t *s_val = s_val_tmp + warpIdx*(2*programCount);
    for (uniform int blk = blkBeg; blk < blkEnd; blk++)
    {
        const uniform int base = blk * (programCount*2);
        // Load the segment: each lane loads two elements.
        s_key[programIndex + 0] = srcKey[base + programIndex + 0];
        s_val[programIndex + 0] = srcVal[base + programIndex + 0];
        s_key[programIndex + programCount] = srcKey[base + programIndex + programCount];
        s_val[programIndex + programCount] = srcVal[base + programIndex + programCount];
        // Merge sorted runs of length `stride` pairwise until the whole
        // segment is sorted.
        for (uniform int stride = 1; stride < 2*programCount; stride <<= 1)
        {
            const int lPos = programIndex & (stride - 1);
            uniform Key_t *baseKey = s_key + 2 * (programIndex - lPos);
            uniform Val_t *baseVal = s_val + 2 * (programIndex - lPos);
            Key_t keyA = baseKey[lPos + 0];
            Val_t valA = baseVal[lPos + 0];
            Key_t keyB = baseKey[lPos + stride];
            Val_t valB = baseVal[lPos + stride];
            // Each element's final position is its own index plus its rank
            // in the opposite run (exclusive/inclusive keeps stability).
            int posA = binarySearchExclusive(keyA, baseKey + stride, stride, stride) + lPos;
            int posB = binarySearchInclusive(keyB, baseKey + 0, stride, stride) + lPos;
            baseKey[posA] = keyA;
            baseVal[posA] = valA;
            baseKey[posB] = keyB;
            baseVal[posB] = valB;
        }
        // Write the sorted segment back out.
        dstKey[base + programIndex + 0] = s_key[programIndex + 0];
        dstVal[base + programIndex + 0] = s_val[programIndex + 0];
        dstKey[base + programIndex + programCount] = s_key[programIndex + programCount];
        dstVal[base + programIndex + programCount] = s_val[programIndex + programCount];
    }
}
// Launches mergeSortGangKernel over all batchSize segments and waits for
// completion (launch/sync are macros from cuda_helpers.cuh).
__device__ static inline
void mergeSortGang(
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int batchSize)
{
    // One task per segment.
    uniform int nTasks = batchSize;
    launch (nTasks,1,1,mergeSortGangKernel)(batchSize, dstKey, dstVal, srcKey, srcVal);
    sync;
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 1: generate sample ranks
////////////////////////////////////////////////////////////////////////////////
// Merge step 1: for every SAMPLE_STRIDE-th element of each pair of adjacent
// sorted runs (A = first `stride` elements, B = the rest of the segment),
// record its rank in its own run and its binary-searched rank in the
// opposite run.  totalProgramCount bounds the number of active work items.
__global__
void generateSampleRanksKernel(
    uniform int nBlocks,
    uniform int in_ranksA[],
    uniform int in_ranksB[],
    uniform Key_t in_srcKey[],
    uniform int stride,
    uniform int N,
    uniform int totalProgramCount)
{
    // Evenly partition the blocks across tasks.
    const uniform int blkIdx = taskIndex;
    const uniform int blkDim = (nBlocks + taskCount - 1)/taskCount;
    const uniform int blkBeg = blkIdx * blkDim;
    const uniform int blkEnd = min(blkBeg + blkDim, nBlocks);
    for (uniform int blk = blkBeg; blk < blkEnd; blk++)
    {
        const int pos = blk * programCount + programIndex;
        cif (pos >= totalProgramCount)
            return;
        // i = sample index within this segment; segmentBase = segment start.
        const int i = pos & ((stride / SAMPLE_STRIDE) - 1);
        const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
        uniform Key_t * srcKey = in_srcKey + segmentBase;
        uniform int * ranksA = in_ranksA + segmentBase / SAMPLE_STRIDE;
        uniform int * ranksB = in_ranksB + segmentBase / SAMPLE_STRIDE;
        // Run B may be short (or empty) in the last segment.
        const int segmentElementsA = stride;
        const int segmentElementsB = min(stride, N - segmentBase - stride);
        const int segmentSamplesA = getSampleCount(segmentElementsA);
        const int segmentSamplesB = getSampleCount(segmentElementsB);
        if (i < segmentSamplesA)
        {
            // Rank of A's i-th sample: trivial in A, searched in B.
            ranksA[i] = i * SAMPLE_STRIDE;
            ranksB[i] = binarySearchExclusive(
                srcKey[i * SAMPLE_STRIDE], srcKey + stride,
                segmentElementsB, nextPowerOfTwo(segmentElementsB));
        }
        if (i < segmentSamplesB)
        {
            // Rank of B's i-th sample: trivial in B, searched in A
            // (inclusive, so equal keys keep A before B).
            ranksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
            ranksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive(
                srcKey[stride + i * SAMPLE_STRIDE], srcKey + 0,
                segmentElementsA, nextPowerOfTwo(segmentElementsA));
        }
    }
}
// Host-side (device-launched) wrapper for merge step 1: computes how many
// sample work items exist for the current pass and launches the kernel.
__device__ static inline
void generateSampleRanks(
    uniform int ranksA[],
    uniform int ranksB[],
    uniform Key_t srcKey[],
    uniform int stride,
    uniform int N)
{
    // A trailing partial segment only participates if it is longer than one
    // full run (otherwise it is already sorted and is copied through).
    uniform int lastSegmentElements = N % (2 * stride);
    uniform int threadCount = (lastSegmentElements > stride) ?
        (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
        (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
    uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
    uniform int nTasks = nBlocks;
    launch (nTasks,1,1, generateSampleRanksKernel)(nBlocks, ranksA, ranksB, srcKey, stride, N, threadCount);
    sync;
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: generate sample ranks and indices
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: merges each segment's A-ranks and B-ranks into a single
// sorted `limits` array, which delimits the elementary intervals merged in
// step 3.
__global__
void mergeRanksAndIndicesKernel(
    uniform int nBlocks,
    uniform int in_Limits[],
    uniform int in_Ranks[],
    uniform int stride,
    uniform int N,
    uniform int totalProgramCount)
{
    // Evenly partition the blocks across tasks.
    const uniform int blkIdx = taskIndex;
    const uniform int blkDim = (nBlocks + taskCount - 1)/taskCount;
    const uniform int blkBeg = blkIdx * blkDim;
    const uniform int blkEnd = min(blkBeg + blkDim, nBlocks);
    for (uniform int blk = blkBeg; blk < blkEnd; blk++)
    {
        int pos = blk * programCount + programIndex;
        cif (pos >= totalProgramCount)
            return;
        // i = sample index within the segment; ranks/limits hold two runs
        // of samples (A then B) per segment.
        const int i = pos & ((stride / SAMPLE_STRIDE) - 1);
        const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
        uniform int * ranks = in_Ranks + (pos - i) * 2;
        uniform int * limits = in_Limits + (pos - i) * 2;
        const int segmentElementsA = stride;
        const int segmentElementsB = min(stride, N - segmentBase - stride);
        const int segmentSamplesA = getSampleCount(segmentElementsA);
        const int segmentSamplesB = getSampleCount(segmentElementsB);
        if (i < segmentSamplesA)
        {
            // Destination of A-rank i = i plus its rank among the B-ranks.
            int dstPos = binarySearchExclusiveRanks(ranks[i], ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB)) + i;
            limits[dstPos] = ranks[i];
        }
        if (i < segmentSamplesB)
        {
            // Destination of B-rank i = i plus its rank among the A-ranks.
            int dstPos = binarySearchInclusiveRanks(ranks[segmentSamplesA + i], ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA)) + i;
            limits[dstPos] = ranks[segmentSamplesA + i];
        }
    }
}
// Wrapper for merge step 2: runs the rank-merging kernel once for the
// A-side limits and once for the B-side limits, then waits for both.
__device__ static inline
void mergeRanksAndIndices(
    uniform int limitsA[],
    uniform int limitsB[],
    uniform int ranksA[],
    uniform int ranksB[],
    uniform int stride,
    uniform int N)
{
    // Same work-item count rule as generateSampleRanks.
    const uniform int lastSegmentElements = N % (2 * stride);
    const uniform int threadCount = (lastSegmentElements > stride) ?
        (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
        (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
    const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
    uniform int nTasks = nBlocks;
    launch (nTasks,1,1,mergeRanksAndIndicesKernel)(
        nBlocks,
        limitsA,
        ranksA,
        stride,
        N,
        threadCount);
    launch (nTasks,1,1, mergeRanksAndIndicesKernel)(
        nBlocks,
        limitsB,
        ranksB,
        stride,
        N,
        threadCount);
    sync;
}
// Merge step 3: merges each elementary interval (at most SAMPLE_STRIDE
// elements from run A and from run B, delimited by limitsA/limitsB) into
// its final position.  Each warp/gang merges one interval per iteration,
// using warp-shuffle binary searches so neither side touches memory during
// the rank computation.
__global__
void mergeElementaryIntervalsKernel(
    uniform int mergePairs,
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int limitsA[],
    uniform int limitsB[],
    uniform int stride,
    uniform int N)
{
    // Evenly partition the intervals across tasks.
    const uniform int blkIdx = taskIndex;
    const uniform int blkDim = (mergePairs + taskCount - 1)/taskCount;
    const uniform int blkBeg = blkIdx * blkDim;
    const uniform int blkEnd = min(blkBeg + blkDim, mergePairs);
    for (uniform int blk = blkBeg; blk < blkEnd; blk++)
    {
        // intervalI = interval index within its segment; segmentBase = the
        // segment's first element.
        const int uniform intervalI = blk & ((2 * stride) / SAMPLE_STRIDE - 1);
        const int uniform segmentBase = (blk - intervalI) * SAMPLE_STRIDE;
        //Set up threadblk-wide parameters
        const uniform int segmentElementsA = stride;
        const uniform int segmentElementsB = min(stride, N - segmentBase - stride);
        const uniform int segmentSamplesA = getSampleCount(segmentElementsA);
        const uniform int segmentSamplesB = getSampleCount(segmentElementsB);
        const uniform int segmentSamples = segmentSamplesA + segmentSamplesB;
        // Interval bounds in A and B; the last interval of a segment runs to
        // the end of each run.
        const uniform int startSrcA = limitsA[blk];
        const uniform int startSrcB = limitsB[blk];
        const uniform int endSrcA = (intervalI + 1 < segmentSamples) ? limitsA[blk + 1] : segmentElementsA;
        const uniform int endSrcB = (intervalI + 1 < segmentSamples) ? limitsB[blk + 1] : segmentElementsB;
        const uniform int lenSrcA = endSrcA - startSrcA;
        const uniform int lenSrcB = endSrcB - startSrcB;
        const uniform int startDstA = startSrcA + startSrcB;
        const uniform int startDstB = startDstA + lenSrcA;
        //Load main input data
        Key_t keyA, keyB;
        Val_t valA, valB;
        if (programIndex < lenSrcA)
        {
            keyA = srcKey[segmentBase + startSrcA + programIndex];
            valA = srcVal[segmentBase + startSrcA + programIndex];
        }
        if (programIndex < lenSrcB)
        {
            keyB = srcKey[segmentBase + stride + startSrcB + programIndex];
            valB = srcVal[segmentBase + stride + startSrcB + programIndex];
        }
        // Compute destination addresses for merge data
        // (rank of each element in the opposite side, searched lane-to-lane
        // via shuffle; dstA/dstB stay -1 for inactive lanes).
        int dstPosA, dstPosB, dstA = -1, dstB = -1;
        if (any(programIndex < lenSrcA))
            dstPosA = binarySearchExclusive1(keyA, keyB, lenSrcB, SAMPLE_STRIDE) + programIndex;
        if (any(programIndex < lenSrcB))
            dstPosB = binarySearchInclusive1(keyB, keyA, lenSrcA, SAMPLE_STRIDE) + programIndex;
        // Map merged positions back to absolute destinations: positions
        // below lenSrcA land in the A slot, the rest in the B slot.
        if (programIndex < lenSrcA && dstPosA < lenSrcA)
            dstA = segmentBase + startDstA + dstPosA;
        dstPosA -= lenSrcA;
        if (programIndex < lenSrcA && dstPosA < lenSrcB)
            dstA = segmentBase + startDstB + dstPosA;
        if (programIndex < lenSrcB && dstPosB < lenSrcA)
            dstB = segmentBase + startDstA + dstPosB;
        dstPosB -= lenSrcA;
        if (programIndex < lenSrcB && dstPosB < lenSrcB)
            dstB = segmentBase + startDstB + dstPosB;
        // store merge data
        if (dstA >= 0)
        {
            // int dstA = segmentBase + startSrcA + programIndex;
            dstKey[dstA] = keyA;
            dstVal[dstA] = valA;
        }
        if (dstB >= 0)
        {
            // int dstB = segmentBase + stride + startSrcB + programIndex;
            dstKey[dstB] = keyB;
            dstVal[dstB] = valB;
        }
    }
}
// Wrapper for merge step 3: derives the number of elementary intervals for
// this pass and launches the merging kernel.
// NOTE(review): the incoming nTasks value is overwritten below and never
// read -- the parameter is effectively unused; confirm before removing.
__device__ static inline
void mergeElementaryIntervals(
    uniform int nTasks,
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int limitsA[],
    uniform int limitsB[],
    uniform int stride,
    uniform int N)
{
    const uniform int lastSegmentElements = N % (2 * stride);
    const uniform int mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
    nTasks = mergePairs/(programCount);
    launch (nTasks,1,1, mergeElementaryIntervalsKernel)(
        mergePairs,
        dstKey,
        dstVal,
        srcKey,
        srcVal,
        limitsA,
        limitsB,
        stride,
        N);
    sync;
}
// Device-global scratch state, set up by openMergeSort___export and torn
// down by closeMergeSort___export: memPool backs the four rank/limit
// arrays, each MAX_SAMPLE_COUNT ints long.
__device__ static uniform int * uniform memPool = NULL;
__device__ static uniform int * uniform ranksA;
__device__ static uniform int * uniform ranksB;
__device__ static uniform int * uniform limitsA;
__device__ static uniform int * uniform limitsB;
__device__ static uniform int nTasks;          // launch width hint for the merge passes
__device__ static uniform int MAX_SAMPLE_COUNT = 0;
// One-time device-side setup: sizes and allocates the scratch pool backing
// ranksA/ranksB/limitsA/limitsB.  The constants appear tuned for a
// specific GPU (the k20m mentioned in comments below) -- NOTE(review):
// confirm before reusing on other hardware.
__global__
void openMergeSort___export()
{
    nTasks = 13*32*13;
    MAX_SAMPLE_COUNT = 8*32 * 131072 / programCount;
    assert(memPool == NULL);
    // One contiguous allocation, carved into four equal arrays.
    const uniform int nalloc = MAX_SAMPLE_COUNT * 4;
    memPool = uniform new uniform int[nalloc];
    ranksA = memPool;
    ranksB = ranksA + MAX_SAMPLE_COUNT;
    limitsA = ranksB + MAX_SAMPLE_COUNT;
    limitsB = limitsA + MAX_SAMPLE_COUNT;
}
// Host-callable entry point: runs the device-side setup on a single thread
// and waits for it to finish.  Must be called before mergeSort().
extern "C"
void openMergeSort()
{
    openMergeSort___export<<<1,1>>>();
    sync;
}
// Device-side teardown: releases the scratch pool allocated by
// openMergeSort___export.
// Fix: memPool was allocated with new[], so it must be released with
// delete[] -- plain `delete` on a new[] pointer is undefined behavior.
__global__
void closeMergeSort___export()
{
    assert(memPool != NULL);
    delete [] memPool;
    memPool = NULL;
}
// Host-callable entry point: runs the device-side teardown on a single
// thread and waits for it.  Call after the last mergeSort().
extern "C"
void closeMergeSort()
{
    closeMergeSort___export<<<1,1>>>();
    sync;
}
// Device-side merge sort driver: bottom-level gang sort, then log2 merge
// passes that ping-pong between the dst and buf arrays.  The initial
// buffer assignment is chosen from the pass-count parity so the final
// result always lands in dstKey/dstVal.  N must be a multiple of
// 2*programCount and fit the preallocated sample arrays.
// (The "k20m: ... M/s" comments are throughput measurements of the
// enclosed region on a Tesla K20m.)
__global__
void mergeSort___export(
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t bufKey[],
    uniform Val_t bufVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int N)
{
    // Count the merge passes needed above the bottom-level sort.
    uniform int stageCount = 0;
    for (uniform int stride = 2*programCount; stride < N; stride <<= 1, stageCount++);
    // iKey/iVal: current input of a pass; oKey/oVal: its output.
    uniform Key_t * uniform iKey, * uniform oKey;
    uniform Val_t * uniform iVal, * uniform oVal;
    if (stageCount & 1)
    {
        iKey = bufKey;
        iVal = bufVal;
        oKey = dstKey;
        oVal = dstVal;
    }
    else
    {
        iKey = dstKey;
        iVal = dstVal;
        oKey = bufKey;
        oVal = bufVal;
    }
    assert(N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT);
    assert(N % (programCount*2) == 0);
    // k20m: 140 M/s
    {
        // k20m: 2367 M/s
        // Sort every 2*programCount-element segment.
        mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount));
#if 1
        // Double the sorted-run length each pass.
        for (uniform int stride = 2*programCount; stride < N; stride <<= 1)
        {
            const uniform int lastSegmentElements = N % (2 * stride);
            // k20m: 271 M/s
            {
#if 1
                // k20m: 944 M/s
                {
                    // k20m: 1396 M/s
                    //Find sample ranks and prepare for limiters merge
                    generateSampleRanks(ranksA, ranksB, iKey, stride, N);
                    // k20m: 2379 M/s
                    //Merge ranks and indices
                    mergeRanksAndIndices(limitsA, limitsB, ranksA, ranksB, stride, N);
                }
#endif
                // k20m: 371 M/s
                //Merge elementary intervals
                mergeElementaryIntervals(nTasks, oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N);
            }
            // A short trailing segment (already sorted) is copied through
            // unchanged.
            if (lastSegmentElements <= stride)
                for (int i = programIndex; i < lastSegmentElements; i += programCount)
                    if (i < lastSegmentElements)
                    {
                        oKey[N-lastSegmentElements+i] = iKey[N-lastSegmentElements+i];
                        oVal[N-lastSegmentElements+i] = iVal[N-lastSegmentElements+i];
                    }
            // Swap input/output roles for the next pass.
            {
                uniform Key_t * uniform tmpKey = iKey;
                iKey = oKey;
                oKey = tmpKey;
            }
            {
                uniform Val_t * uniform tmpVal = iVal;
                iVal = oVal;
                oVal = tmpVal;
            }
        }
#endif
    }
}
// Host-callable entry point: sorts srcKey/srcVal into dstKey/dstVal using
// bufKey/bufVal as scratch.  Runs the driver on a single warp (1 block of
// 32 threads); the driver launches the parallel child kernels itself.
// Requires openMergeSort() to have been called first.
extern "C"
void mergeSort(
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t bufKey[],
    uniform Val_t bufVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int N)
{
    mergeSort___export<<<1,32>>>(
        dstKey,
        dstVal,
        bufKey,
        bufVal,
        srcKey,
        srcVal,
        N);
    sync;
}

View File

@@ -0,0 +1,658 @@
/*
Copyright (c) 2014, Evghenii Gaburov
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Based on mergeSort from CUDA SDK
*/
#include "keyType.h"
#define SAMPLE_STRIDE programCount
#define iDivUp(a,b) (((a) + (b) - 1)/(b))
#define getSampleCount(dividend) (iDivUp((dividend), (SAMPLE_STRIDE)))
#define W (/*sizeof(int)=*/4 * 8)
// Rounds x up to the next power of two; returns x unchanged when x is
// already a power of two (assumes x >= 1).
static inline
int nextPowerOfTwo(int x)
{
#if 0
    // Portable bit-smearing variant, kept for reference.
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
#else
    // W is the int bit width (32); count_leading_zeros is the ispc stdlib
    // leading-zero count of x - 1.
    return 1U << (W - count_leading_zeros(x - 1));
#endif
}
// Inclusive rank: number of entries in the ascending array data[0..L-1]
// that are <= val.  stride is the initial probe step; callers pass
// nextPowerOfTwo(L).  cif/cfor are ispc "coherent" control-flow hints.
static inline
int binarySearchInclusiveRanks(
    const int val,
    uniform int *data,
    const int L,
    int stride)
{
    cif (L == 0)
        return 0;
    int pos = 0;
    // Binary search by halving the probe stride; advance pos while the
    // probed element is still <= val.
    cfor (; stride > 0; stride >>= 1)
    {
        int newPos = min(pos + stride, L);
        cif (data[newPos - 1] <= val)
            pos = newPos;
    }
    return pos;
}
// Binary search in the rank array 'data' (length L, sorted ascending):
// returns the number of elements strictly < val.  'stride' must be a power
// of two >= L; it is halved each iteration.
static inline
int binarySearchExclusiveRanks(
    const int val,
    uniform int *data,
    const int L,
    int stride)
{
    cif (L == 0)
        return 0;
    int pos = 0;
    cfor (; stride > 0; stride >>= 1)
    {
        // Clamp the probe so we never index past the end of the array.
        int newPos = min(pos + stride, L);
        if (data[newPos - 1] < val)      // exclusive: stop at equal keys
            pos = newPos;
    }
    return pos;
}
// Binary search over sorted keys 'data' (length L): returns the number of
// keys <= val, i.e. the inclusive insertion rank of val.  'stride' must be
// a power of two >= L (callers pass nextPowerOfTwo(L)).
static inline
int binarySearchInclusive(
    const Key_t val,
    uniform Key_t *data,
    const int L,
    int stride)
{
    cif (L == 0)
        return 0;
    int pos = 0;
    cfor (; stride > 0; stride >>= 1)
    {
        // Clamp the probe so we never index past the end of the array.
        int newPos = min(pos + stride, L);
        if (data[newPos - 1] <= val)
            pos = newPos;
    }
    return pos;
}
// Binary search over sorted keys 'data' (length L): returns the number of
// keys strictly < val, i.e. the exclusive insertion rank of val.
// Inclusive/exclusive variants together make the merge stable for ties.
static inline
int binarySearchExclusive(
    const Key_t val,
    uniform Key_t *data,
    const int L,
    int stride)
{
    cif (L == 0)
        return 0;
    int pos = 0;
    cfor (; stride > 0; stride >>= 1)
    {
        // Clamp the probe so we never index past the end of the array.
        int newPos = min(pos + stride, L);
        if (data[newPos - 1] < val)
            pos = newPos;
    }
    return pos;
}
// Gang-register variant of binarySearchInclusive: 'data' is a varying value
// whose per-lane elements form the sorted array; shuffle(data, i) reads the
// value held by lane i.  L and stride are uniform here (same search shape
// across the gang), so plain if/for suffice.
static inline
int binarySearchInclusive1(
    const Key_t val,
    Key_t data,
    const uniform int L,
    uniform int stride)
{
    if (L == 0)
        return 0;
    int pos = 0;
    for (; stride > 0; stride >>= 1)
    {
        int newPos = min(pos + stride, L);
        if (shuffle(data,newPos - 1) <= val)
            pos = newPos;
    }
    return pos;
}
// Gang-register variant of binarySearchExclusive: 'data' is a varying value
// whose per-lane elements form the sorted array; shuffle(data, i) reads the
// value held by lane i.  Returns the count of elements strictly < val.
static inline
int binarySearchExclusive1(
    const Key_t val,
    Key_t data,
    const uniform int L,
    uniform int stride)
{
    if (L == 0)
        return 0;
    int pos = 0;
    for (; stride > 0; stride >>= 1)
    {
        int newPos = min(pos + stride, L);
        if (shuffle(data,newPos - 1) < val)
            pos = newPos;
    }
    return pos;
}
////////////////////////////////////////////////////////////////////////////////
// Bottom-level merge sort (binary search-based)
////////////////////////////////////////////////////////////////////////////////
// Sorts every 2*programCount-element segment of srcKey/srcVal into
// dstKey/dstVal.  Each task handles a contiguous range of segments; within
// a segment, sorted run length doubles each pass (1, 2, 4, ...) until the
// whole segment (arrayLength == 2*programCount) is sorted, using
// binary-search ranks to place each element directly in its merged slot.
// Fix vs. original: removed the unused local 'offset' (its value was
// recomputed inline for baseKey/baseVal and never read).
task
void mergeSortGangKernel(
    uniform int batchSize,
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int arrayLength)
{
    // Static partition of the batch of segments across tasks.
    const uniform int blockIdx = taskIndex;
    const uniform int blockDim = (batchSize + taskCount - 1)/taskCount;
    const uniform int blockBeg = blockIdx * blockDim;
    const uniform int blockEnd = min(blockBeg + blockDim, batchSize);
    // Gang-local staging buffers holding one full segment.
    uniform Key_t s_key[2*programCount];
    uniform Val_t s_val[2*programCount];
    for (uniform int block = blockBeg; block < blockEnd; block++)
    {
        const uniform int base = block * (programCount*2);
        // Stage the segment: each program instance loads two elements.
        s_key[programIndex + 0] = srcKey[base + programIndex + 0];
        s_val[programIndex + 0] = srcVal[base + programIndex + 0];
        s_key[programIndex + programCount] = srcKey[base + programIndex + programCount];
        s_val[programIndex + programCount] = srcVal[base + programIndex + programCount];
        for (uniform int stride = 1; stride < arrayLength; stride <<= 1)
        {
            // lPos: this lane's position inside its 'stride'-long run;
            // baseKey/baseVal point at the pair of runs this lane merges.
            const int lPos = programIndex & (stride - 1);
            uniform Key_t *baseKey = s_key + 2 * (programIndex - lPos);
            uniform Val_t *baseVal = s_val + 2 * (programIndex - lPos);
            Key_t keyA = baseKey[lPos + 0];
            Val_t valA = baseVal[lPos + 0];
            Key_t keyB = baseKey[lPos + stride];
            Val_t valB = baseVal[lPos + stride];
            // Final slot = rank in own run (lPos) + rank in sibling run;
            // exclusive/inclusive split keeps the merge stable for ties.
            int posA = binarySearchExclusive(keyA, baseKey + stride, stride, stride) + lPos;
            int posB = binarySearchInclusive(keyB, baseKey + 0, stride, stride) + lPos;
            baseKey[posA] = keyA;
            baseVal[posA] = valA;
            baseKey[posB] = keyB;
            baseVal[posB] = valB;
        }
        // Write the sorted segment back out.
        dstKey[base + programIndex + 0] = s_key[programIndex + 0];
        dstVal[base + programIndex + 0] = s_val[programIndex + 0];
        dstKey[base + programIndex + programCount] = s_key[programIndex + programCount];
        dstVal[base + programIndex + programCount] = s_val[programIndex + programCount];
    }
}
// Launches mergeSortGangKernel over the whole batch and waits for it.
// batchSize is the number of 2*programCount-element segments.
static inline
void mergeSortGang(
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int batchSize)
{
    uniform int nTasks = num_cores()*4;    // CPU: ~4 tasks per core
#ifdef __NVPTX__
    nTasks = iDivUp(batchSize,1);          // GPU: one task per segment
#endif
    launch [nTasks] mergeSortGangKernel(batchSize, dstKey, dstVal, srcKey, srcVal, 2*programCount);
    sync;
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 1: generate sample ranks
////////////////////////////////////////////////////////////////////////////////
// Merge step 1: for every pair of adjacent sorted 'stride'-long runs (A,B)
// in in_srcKey, take every SAMPLE_STRIDE-th element of A and record its
// rank in B (and vice versa) into in_ranksA/in_ranksB.
// totalProgramCount bounds the flattened work-item index.
task
void generateSampleRanksKernel(
    uniform int nBlocks,
    uniform int in_ranksA[],
    uniform int in_ranksB[],
    uniform Key_t in_srcKey[],
    uniform int stride,
    uniform int N,
    uniform int totalProgramCount)
{
    // Static partition of nBlocks across tasks; programCount items per block.
    const uniform int blockIdx = taskIndex;
    const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount;
    const uniform int blockBeg = blockIdx * blockDim;
    const uniform int blockEnd = min(blockBeg + blockDim, nBlocks);
    for (uniform int block = blockBeg; block < blockEnd; block++)
    {
        const int pos = block * programCount + programIndex;
        // pos grows with block, so once past the limit all later blocks are
        // past it too — safe to return from the loop here.
        cif (pos >= totalProgramCount)
            return;
        // i: sample index within this A/B segment pair;
        // segmentBase: offset of the pair's first element in in_srcKey.
        const int i = pos & ((stride / SAMPLE_STRIDE) - 1);
        const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
        uniform Key_t * srcKey = in_srcKey + segmentBase;
        uniform int * ranksA = in_ranksA + segmentBase / SAMPLE_STRIDE;
        uniform int * ranksB = in_ranksB + segmentBase / SAMPLE_STRIDE;
        const int segmentElementsA = stride;
        // The last B run may be truncated at the end of the array.
        const int segmentElementsB = min(stride, N - segmentBase - stride);
        const int segmentSamplesA = getSampleCount(segmentElementsA);
        const int segmentSamplesB = getSampleCount(segmentElementsB);
        if (i < segmentSamplesA)
        {
            // Rank of A's i-th sample: trivial in A, binary search in B.
            ranksA[i] = i * SAMPLE_STRIDE;
            ranksB[i] = binarySearchExclusive(
                srcKey[i * SAMPLE_STRIDE], srcKey + stride,
                segmentElementsB, nextPowerOfTwo(segmentElementsB));
        }
        if (i < segmentSamplesB)
        {
            // Rank of B's i-th sample: trivial in B, binary search in A.
            ranksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
            ranksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive(
                srcKey[stride + i * SAMPLE_STRIDE], srcKey + 0,
                segmentElementsA, nextPowerOfTwo(segmentElementsA));
        }
    }
}
// Host-side wrapper for merge step 1: computes the flattened work-item count
// (excluding any trailing partial segment that is not merged this pass),
// launches generateSampleRanksKernel, and waits for completion.
static inline
void generateSampleRanks(
    uniform int ranksA[],
    uniform int ranksB[],
    uniform Key_t srcKey[],
    uniform int stride,
    uniform int N)
{
    uniform int lastSegmentElements = N % (2 * stride);
    // If the tail still spans both runs (> stride) it participates in the
    // merge; otherwise it is excluded from this pass.
    uniform int threadCount = (lastSegmentElements > stride) ?
        (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
        (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
    uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
    uniform int nTasks = num_cores()*4;    // CPU: ~4 tasks per core
#ifdef __NVPTX__
    nTasks = iDivUp(nBlocks,1);            // GPU: one task per block
#endif
    launch [nTasks] generateSampleRanksKernel(nBlocks, ranksA, ranksB, srcKey, stride, N, threadCount);
    sync;
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: generate sample ranks and indices
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: turn the per-sample ranks from step 1 into elementary-
// interval limits.  For each A/B segment pair the A-ranks and B-ranks are
// themselves merged (by binary searching each list against the other) so
// in_Limits ends up holding, per elementary interval, the boundary within
// the segment.  Called once for the A side and once for the B side.
task
void mergeRanksAndIndicesKernel(
    uniform int nBlocks,
    uniform int in_Limits[],
    uniform int in_Ranks[],
    uniform int stride,
    uniform int N,
    uniform int totalProgramCount)
{
    // Static partition of nBlocks across tasks; programCount items per block.
    const uniform int blockIdx = taskIndex;
    const uniform int blockDim = (nBlocks + taskCount - 1)/taskCount;
    const uniform int blockBeg = blockIdx * blockDim;
    const uniform int blockEnd = min(blockBeg + blockDim, nBlocks);
    for (uniform int block = blockBeg; block < blockEnd; block++)
    {
        int pos = block * programCount + programIndex;
        // pos grows with block, so returning here also skips later blocks,
        // which are all past the limit as well.
        cif (pos >= totalProgramCount)
            return;
        // i: sample index within this segment pair; ranks/limits point at
        // the pair's slice of the rank and limit arrays.
        const int i = pos & ((stride / SAMPLE_STRIDE) - 1);
        const int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
        uniform int * ranks = in_Ranks + (pos - i) * 2;
        uniform int * limits = in_Limits + (pos - i) * 2;
        const int segmentElementsA = stride;
        // The last B run may be truncated at the end of the array.
        const int segmentElementsB = min(stride, N - segmentBase - stride);
        const int segmentSamplesA = getSampleCount(segmentElementsA);
        const int segmentSamplesB = getSampleCount(segmentElementsB);
        if (i < segmentSamplesA)
        {
            // Position of A-rank i in the merged rank list (exclusive side).
            int dstPos = binarySearchExclusiveRanks(ranks[i], ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB)) + i;
            limits[dstPos] = ranks[i];
        }
        if (i < segmentSamplesB)
        {
            // Position of B-rank i in the merged rank list (inclusive side).
            int dstPos = binarySearchInclusiveRanks(ranks[segmentSamplesA + i], ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA)) + i;
            limits[dstPos] = ranks[segmentSamplesA + i];
        }
    }
}
// Host-side wrapper for merge step 2: derives elementary-interval limits
// for both the A side and the B side by launching the kernel twice, then
// waits for both launches.
static inline
void mergeRanksAndIndices(
    uniform int limitsA[],
    uniform int limitsB[],
    uniform int ranksA[],
    uniform int ranksB[],
    uniform int stride,
    uniform int N)
{
    const uniform int lastSegmentElements = N % (2 * stride);
    // Same work-item count formula as generateSampleRanks: a tail shorter
    // than one run is excluded from this merge pass.
    const uniform int threadCount = (lastSegmentElements > stride) ?
        (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) :
        (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
    const uniform int nBlocks = iDivUp(threadCount, SAMPLE_STRIDE);
    uniform int nTasks = num_cores()*4;    // CPU: ~4 tasks per core
#ifdef __NVPTX__
    nTasks = iDivUp(nBlocks,1);            // GPU: one task per block
#endif
    launch [nTasks] mergeRanksAndIndicesKernel(
        nBlocks,
        limitsA,
        ranksA,
        stride,
        N,
        threadCount);
    launch [nTasks] mergeRanksAndIndicesKernel(
        nBlocks,
        limitsB,
        ranksB,
        stride,
        N,
        threadCount);
    sync;
}
// Merge step 3: merge each elementary interval pair.  Every "block" merges
// at most SAMPLE_STRIDE (== programCount) elements from run A with at most
// SAMPLE_STRIDE elements from run B, so one gang-wide pass with intra-gang
// binary searches (shuffle-based) places every element.
task
void mergeElementaryIntervalsKernel(
    uniform int mergePairs,
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int limitsA[],
    uniform int limitsB[],
    uniform int stride,
    uniform int N)
{
    // Static partition of the interval pairs across tasks.
    const uniform int blockIdx = taskIndex;
    const uniform int blockDim = (mergePairs + taskCount - 1)/taskCount;
    const uniform int blockBeg = blockIdx * blockDim;
    const uniform int blockEnd = min(blockBeg + blockDim, mergePairs);
    for (uniform int block = blockBeg; block < blockEnd; block++)
    {
        // intervalI: interval index within its segment pair;
        // segmentBase: offset of the segment pair in srcKey/srcVal.
        const int uniform intervalI = block & ((2 * stride) / SAMPLE_STRIDE - 1);
        const int uniform segmentBase = (block - intervalI) * SAMPLE_STRIDE;
        //Set up threadblock-wide parameters
        const uniform int segmentElementsA = stride;
        const uniform int segmentElementsB = min(stride, N - segmentBase - stride);
        const uniform int segmentSamplesA = getSampleCount(segmentElementsA);
        const uniform int segmentSamplesB = getSampleCount(segmentElementsB);
        const uniform int segmentSamples = segmentSamplesA + segmentSamplesB;
        // Source windows for this interval; the last interval of a segment
        // runs to the end of its run instead of the next limit.
        const uniform int startSrcA = limitsA[block];
        const uniform int startSrcB = limitsB[block];
        const uniform int endSrcA = (intervalI + 1 < segmentSamples) ? limitsA[block + 1] : segmentElementsA;
        const uniform int endSrcB = (intervalI + 1 < segmentSamples) ? limitsB[block + 1] : segmentElementsB;
        const uniform int lenSrcA = endSrcA - startSrcA;
        const uniform int lenSrcB = endSrcB - startSrcB;
        // Destination: A-window output starts at startSrcA+startSrcB,
        // B-window output follows immediately after A's lenSrcA elements.
        const uniform int startDstA = startSrcA + startSrcB;
        const uniform int startDstB = startDstA + lenSrcA;
        //Load main input data
        // keyA/valA (resp. keyB/valB) are only defined on lanes inside the
        // window; all later uses are guarded by the same lane predicates.
        Key_t keyA, keyB;
        Val_t valA, valB;
        if (programIndex < lenSrcA)
        {
            keyA = srcKey[segmentBase + startSrcA + programIndex];
            valA = srcVal[segmentBase + startSrcA + programIndex];
        }
        if (programIndex < lenSrcB)
        {
            keyB = srcKey[segmentBase + stride + startSrcB + programIndex];
            valB = srcVal[segmentBase + stride + startSrcB + programIndex];
        }
        // Compute destination addresses for merge data
        // dstA/dstB stay -1 on lanes that have nothing to write.
        int dstPosA, dstPosB, dstA = -1, dstB = -1;
        if (programIndex < lenSrcA)
            dstPosA = binarySearchExclusive1(keyA, keyB, lenSrcB, SAMPLE_STRIDE) + programIndex;
        if (programIndex < lenSrcB)
            dstPosB = binarySearchInclusive1(keyB, keyA, lenSrcA, SAMPLE_STRIDE) + programIndex;
        // A merged position below lenSrcA lands in the A output window,
        // otherwise (after subtracting lenSrcA) in the B output window.
        if (programIndex < lenSrcA && dstPosA < lenSrcA)
            dstA = segmentBase + startDstA + dstPosA;
        dstPosA -= lenSrcA;
        if (programIndex < lenSrcA && dstPosA < lenSrcB)
            dstA = segmentBase + startDstB + dstPosA;
        if (programIndex < lenSrcB && dstPosB < lenSrcA)
            dstB = segmentBase + startDstA + dstPosB;
        dstPosB -= lenSrcA;
        if (programIndex < lenSrcB && dstPosB < lenSrcB)
            dstB = segmentBase + startDstB + dstPosB;
        if (dstA >= 0)
        {
            dstKey[dstA] = keyA;
            dstVal[dstA] = valA;
        }
        if (dstB >= 0)
        {
            dstKey[dstB] = keyB;
            dstVal[dstB] = valB;
        }
    }
}
// Host-side wrapper for merge step 3: merges all elementary interval pairs
// from srcKey/srcVal into dstKey/dstVal, then copies any trailing partial
// segment (which the kernel never touches when lastSegmentElements <= stride)
// straight through.  The copy overlaps the launched tasks; 'sync' at the
// end waits for the kernel before returning.
static inline
void mergeElementaryIntervals(
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int limitsA[],
    uniform int limitsB[],
    uniform int stride,
    uniform int N)
{
    const uniform int lastSegmentElements = N % (2 * stride);
    const uniform int mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
    uniform int nTasks = num_cores()*4;              // CPU: ~4 tasks per core
#ifdef __NVPTX__
    nTasks = iDivUp(mergePairs,1*programCount);      // GPU: one task per gang of pairs
#endif
    launch [nTasks] mergeElementaryIntervalsKernel(
        mergePairs,
        dstKey,
        dstVal,
        srcKey,
        srcVal,
        limitsA,
        limitsB,
        stride,
        N);
    // Pass the unmerged tail through unchanged.
    if (lastSegmentElements <= stride)
        foreach (i = 0 ... lastSegmentElements)
        {
            dstKey[N-lastSegmentElements+i] = srcKey[N-lastSegmentElements+i];
            dstVal[N-lastSegmentElements+i] = srcVal[N-lastSegmentElements+i];
        }
    sync;
}
// Scratch arrays for the merge steps, carved out of a single pool that is
// allocated in openMergeSort() and released in closeMergeSort().
static uniform int * uniform memPool = NULL;
static uniform int * uniform ranksA;
static uniform int * uniform ranksB;
static uniform int * uniform limitsA;
static uniform int * uniform limitsB;
// Capacity (in samples) of each of the four arrays above; 0 until opened.
static uniform int MAX_SAMPLE_COUNT = 0;
// Allocates the shared scratch pool and slices it into the four rank/limit
// arrays.  Must be called once before mergeSort(); pairs with closeMergeSort().
export
void openMergeSort()
{
    MAX_SAMPLE_COUNT = 8*32 * 131072 / programCount;
    assert(memPool == NULL);    // guard against double-open
    const uniform int nalloc = MAX_SAMPLE_COUNT * 4;
    memPool = uniform new uniform int[nalloc];
    // Four equal-sized slices of the pool.
    ranksA  = memPool;
    ranksB  = ranksA  + MAX_SAMPLE_COUNT;
    limitsA = ranksB  + MAX_SAMPLE_COUNT;
    limitsB = limitsA + MAX_SAMPLE_COUNT;
}
// Releases the scratch pool allocated by openMergeSort().
export
void closeMergeSort()
{
    assert(memPool != NULL);    // guard against close-before-open
    delete memPool;
    memPool = NULL;
}
// Full key/value merge sort of srcKey/srcVal (N elements) into dstKey/dstVal,
// using bufKey/bufVal as the ping-pong buffer.  Requires openMergeSort()
// first, N a multiple of 2*programCount, and N within the scratch capacity.
export
void mergeSort(
    uniform Key_t dstKey[],
    uniform Val_t dstVal[],
    uniform Key_t bufKey[],
    uniform Val_t bufVal[],
    uniform Key_t srcKey[],
    uniform Val_t srcVal[],
    uniform int N)
{
    // Count the doubling merge passes after the bottom-level sort.
    uniform int stageCount = 0;
    for (uniform int stride = 2*programCount; stride < N; stride <<= 1, stageCount++);
    // Choose ping-pong order so the final pass lands in dstKey/dstVal.
    uniform Key_t * uniform iKey, * uniform oKey;
    uniform Val_t * uniform iVal, * uniform oVal;
    if (stageCount & 1)
    {
        iKey = bufKey;
        iVal = bufVal;
        oKey = dstKey;
        oVal = dstVal;
    }
    else
    {
        iKey = dstKey;
        iVal = dstVal;
        oKey = bufKey;
        oVal = bufVal;
    }
    assert(N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT);
    assert(N % (programCount*2) == 0);
    // cpu: 28 gpu: 74 M/s
    {
        // Bottom level: sort every 2*programCount-element segment.
        // cpu: 356 gpu: 534 M/s
        mergeSortGang(iKey, iVal, srcKey, srcVal, N/(2*programCount));
#if 1
        // Each pass merges pairs of 'stride'-long sorted runs, then swaps
        // the input/output buffers.
        for (uniform int stride = 2*programCount; stride < N; stride <<= 1)
        {
            // cpu: 30 gpu: 112 M/s
            {
#if 1
                // cpu: 121 gpu: 460 M/s
                {
                    // cpu: 190 gpu: 600 M/s
                    //Find sample ranks and prepare for limiters merge
                    generateSampleRanks(ranksA, ranksB, iKey, stride, N);
                    // cpu: 120 gpu: 457 M/s
                    //Merge ranks and indices
                    mergeRanksAndIndices(limitsA, limitsB, ranksA, ranksB, stride, N);
                }
#endif
                // cpu: 287 gpu: 194 M/s
                //Merge elementary intervals
                mergeElementaryIntervals(oKey, oVal, iKey, iVal, limitsA, limitsB, stride, N);
            }
            // Ping-pong: output of this pass is input to the next.
            {
                uniform Key_t * uniform tmpKey = iKey;
                iKey = oKey;
                oKey = tmpKey;
            }
            {
                uniform Val_t * uniform tmpVal = iVal;
                iVal = oVal;
                oVal = tmpVal;
            }
        }
#endif
    }
}

View File

@@ -0,0 +1,8 @@
# Build configuration for the hermite4 example on CPU targets.
EXAMPLE=hermite4
CPP_SRC=hermite4.cpp
ISPC_SRC=hermite4.ispc
ISPC_IA_TARGETS=avx1-i32x8
ISPC_ARM_TARGETS=neon
include ../common_cpu.mk

View File

@@ -0,0 +1,7 @@
# Build configuration for the hermite4 example on KNC (Xeon Phi).
EXAMPLE=hermite4
CXX_SRC=hermite4.cpp
ISPC_SRC=hermite4.ispc
ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h
ISPC_TARGET=generic-16
include ../common_knc.mk

View File

@@ -0,0 +1,14 @@
# Build configuration for the hermite4 example on the PTX (NVIDIA GPU) path.
PROG=hermite4
ISPC_SRC=hermite4.ispc
#CU_SRC=hermite4.cu
CXX_SRC=hermite4.cpp
# Cap register usage per thread for the PTX compiler.
PTXCC_REGMAX=64
#ISPC_FLAGS= --opt=disable-uniform-control-flow
#LLVM_GPU=1
NVVM_GPU=1
include ../common_ptx.mk

View File

@@ -0,0 +1,361 @@
/*
Copyright (c) 2014, Evghenii Gaburov
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Hermite4 N-body integrator */
/* Makino and Aarseth, 1992 */
/* http://adsabs.harvard.edu/abs/1992PASJ...44..141M and references there in*/
#include <cstdlib>
#include <cmath>
#include <cstdio>
#include <algorithm>
#include <vector>
#include <cassert>
#include "timing.h"
#include "ispc_malloc.h"
#include "typeReal.h"
#include "hermite4_ispc.h"
// Hermite 4th-order N-body integrator state and driver.
// Fixes vs. original: the destructor used plain 'delete' on arrays
// allocated with new[] (undefined behavior) -- now delete[]; the class is
// also made non-copyable since it owns raw arrays (a default copy would
// double-delete them).
struct Hermite4
{
    enum {PP_FLOP=44};   // flops per body-body interaction (for rate reporting)
    const int n;         // number of bodies
    const real eta;      // timestep accuracy parameter
    real eps2;           // gravitational softening length, squared
    // Per-body state in structure-of-arrays layout, as the ispc kernel expects.
    real *g_mass, *g_gpot;
    real *g_posx, *g_posy, *g_posz;
    real *g_velx, *g_vely, *g_velz;
    real *g_accx, *g_accy, *g_accz;
    real *g_jrkx, *g_jrky, *g_jrkz;
    // Acceleration & jerk saved before the predictor, used by the corrector.
    std::vector<real> accx0, accy0, accz0;
    std::vector<real> jrkx0, jrky0, jrkz0;
    // Allocates all per-body arrays and draws random initial conditions:
    // positions rejection-sampled inside a sphere of radius R0, small random
    // velocities, equal masses mp = 1/n.
    Hermite4(const int _n = 8192, const real _eta = 0.1) : n(_n), eta(_eta)
    {
        eps2 = 4.0/n; /* eps = 4/n to give Ebin = 1 KT */
        eps2 *= eps2;
        g_mass = new real[n];
        g_gpot = new real[n];
        g_posx = new real[n];
        g_posy = new real[n];
        g_posz = new real[n];
        g_velx = new real[n];
        g_vely = new real[n];
        g_velz = new real[n];
        g_accx = new real[n];
        g_accy = new real[n];
        g_accz = new real[n];
        g_jrkx = new real[n];
        g_jrky = new real[n];
        g_jrkz = new real[n];
        accx0.resize(n);
        accy0.resize(n);
        accz0.resize(n);
        jrkx0.resize(n);
        jrky0.resize(n);
        jrkz0.resize(n);
        printf("---Intializing nbody--- \n");
        const real R0 = 1;
        const real mp = 1.0/n;
#pragma omp parallel for schedule(runtime)
        for (int i = 0; i < n; i++)
        {
            // Rejection sampling: the loop always runs at least once
            // (s2 starts at 2*R0 > R0*R0), so xp..vz are assigned.
            real xp, yp, zp, s2 = 2*R0;
            real vx, vy, vz;
            while (s2 > R0*R0) {
                xp = (1.0 - 2.0*drand48())*R0;
                yp = (1.0 - 2.0*drand48())*R0;
                zp = (1.0 - 2.0*drand48())*R0;
                s2 = xp*xp + yp*yp + zp*zp;
                vx = drand48() * 0.1;
                vy = drand48() * 0.1;
                vz = drand48() * 0.1;
            }
            g_posx[i] = xp;
            g_posy[i] = yp;
            g_posz[i] = zp;
            g_velx[i] = vx;
            g_vely[i] = vy;
            g_velz[i] = vz;
            g_mass[i] = mp;
        }
    }
    // Releases the per-body arrays; they were allocated with new[], so
    // delete[] is required here.
    ~Hermite4()
    {
        delete[] g_mass;
        delete[] g_gpot;
        delete[] g_posx;
        delete[] g_posy;
        delete[] g_posz;
        delete[] g_velx;
        delete[] g_vely;
        delete[] g_velz;
        delete[] g_accx;
        delete[] g_accy;
        delete[] g_accz;
        delete[] g_jrkx;
        delete[] g_jrky;
        delete[] g_jrkz;
    }
    // Computes acc/jerk/potential for all bodies (implemented out-of-line
    // via the ispc kernel).
    void forces();
    // One predictor-corrector step of size dt.  Predicts positions and
    // velocities, recomputes forces, applies the Hermite corrector, and
    // returns the next timestep from the Aarseth criterion (or dt itself
    // when dt <= 0, i.e. on the bootstrap step).
    real step(const real dt)
    {
        const real dt2 = dt*real(1.0/2.0);
        const real dt3 = dt*real(1.0/3.0);
        real dt_min = HUGE;
#pragma omp parallel for schedule(runtime)
        for (int i = 0; i < n; i++)
        {
            // Save old acc/jerk for the corrector, then predict.
            accx0[i] = g_accx[i];
            accy0[i] = g_accy[i];
            accz0[i] = g_accz[i];
            jrkx0[i] = g_jrkx[i];
            jrky0[i] = g_jrky[i];
            jrkz0[i] = g_jrkz[i];
            g_posx[i] += dt*(g_velx[i] + dt2*(g_accx[i] + dt3*g_jrkx[i]));
            g_posy[i] += dt*(g_vely[i] + dt2*(g_accy[i] + dt3*g_jrky[i]));
            g_posz[i] += dt*(g_velz[i] + dt2*(g_accz[i] + dt3*g_jrkz[i]));
            g_velx[i] += dt*(g_accx[i] + dt2*g_jrkx[i]);
            g_vely[i] += dt*(g_accy[i] + dt2*g_jrky[i]);
            g_velz[i] += dt*(g_accz[i] + dt2*g_jrkz[i]);
        }
        forces();
        if (dt > 0.0)
        {
            const real h    = dt*real(0.5);
            const real hinv = real(1.0)/h;
            const real f1   = real(0.5)*hinv*hinv;
            const real f2   = real(3.0)*hinv*f1;
            const real dt2  = dt *dt * real(1.0/2.0);
            const real dt3  = dt2*dt * real(1.0/3.0);
            const real dt4  = dt3*dt * real(1.0/4.0);
            const real dt5  = dt4*dt * real(1.0/5.0);
#pragma omp parallel for schedule(runtime) reduction(min:dt_min)
            for (int i = 0; i < n; i++)
            {
                /* compute snp & crk */
                const real Amx = g_accx[i] - accx0[i];
                const real Amy = g_accy[i] - accy0[i];
                const real Amz = g_accz[i] - accz0[i];
                const real Jmx = h*(g_jrkx[i] - jrkx0[i]);
                const real Jmy = h*(g_jrky[i] - jrky0[i]);
                const real Jmz = h*(g_jrkz[i] - jrkz0[i]);
                const real Jpx = h*(g_jrkx[i] + jrkx0[i]);
                const real Jpy = h*(g_jrky[i] + jrky0[i]);
                const real Jpz = h*(g_jrkz[i] + jrkz0[i]);
                real snpx = f1*Jmx;
                real snpy = f1*Jmy;
                real snpz = f1*Jmz;
                real crkx = f2*(Jpx - Amx);
                real crky = f2*(Jpy - Amy);
                real crkz = f2*(Jpz - Amz);
                snpx -= h*crkx;
                snpy -= h*crky;
                snpz -= h*crkz;
                /* correct */
                g_posx[i] += dt4*snpx + dt5*crkx;
                g_posy[i] += dt4*snpy + dt5*crky;
                g_posz[i] += dt4*snpz + dt5*crkz;
                g_velx[i] += dt3*snpx + dt4*crkx;
                g_vely[i] += dt3*snpy + dt4*crky;
                g_velz[i] += dt3*snpz + dt4*crkz;
                /* compute new timestep (Aarseth criterion) */
                const real s0 = g_accx[i]*g_accx[i] + g_accy[i]*g_accy[i] + g_accz[i]*g_accz[i];
                const real s1 = g_jrkx[i]*g_jrkx[i] + g_jrky[i]*g_jrky[i] + g_jrkz[i]*g_jrkz[i];
                const real s2 = snpx*snpx + snpy*snpy + snpz*snpz;
                const real s3 = crkx*crkx + crky*crky + crkz*crkz;
                const double u = std::sqrt(s0*s2) + s1;
                const double l = std::sqrt(s1*s3) + s2;
                assert(l > 0.0f);
                const real dt_loc = eta *std::sqrt(u/l);
                dt_min = std::min(dt_min, dt_loc);
            }
        }
        if (dt_min == HUGE)
            return dt;
        else
            return dt_min;
    }
    // Sums kinetic and potential energy over all bodies.
    void energy(real &Ekin, real &Epot)
    {
        real ekin = 0, epot = 0;
#pragma omp parallel for reduction(+:ekin,epot)
        for (int i = 0; i < n; i++)
        {
            ekin += g_mass[i] * (g_velx[i]*g_velx[i] + g_vely[i]*g_vely[i] + g_velz[i]*g_velz[i]) * real(0.5f);
            epot += real(0.5f)*g_mass[i] * g_gpot[i];
        }
        Ekin = ekin;
        Epot = epot;
    }
    // Advances the system for up to 'niter' steps (or until t_end), printing
    // energy drift and flop-rate diagnostics every step / every 'ntime' steps.
    void integrate(const int niter, const real t_end = HUGE)
    {
        const double tin = rtc();
        forces();
        const double fn = n;
        printf(" mean flop rate in %g sec [%g GFLOP/s]\n", rtc() - tin,
            fn*fn*PP_FLOP/(rtc() - tin)/1e9);
        real Epot0, Ekin0;
        energy(Ekin0, Epot0);
        const real Etot0 = Epot0 + Ekin0;
        printf(" E: %g %g %g \n", Epot0, Ekin0, Etot0);
        /////////
        real t_global = 0;
        double t0 = 0;
        int iter = 0;
        int ntime = 10;
        real dt = 1.0/131072;
        real Epot, Ekin, Etot = Etot0;
        while (t_global < t_end) {
            if (iter % ntime == 0)
                t0 = rtc();
            if (iter >= niter) return;
            dt = step(dt);
            iter++;
            t_global += dt;
            const real Etot_pre = Etot;
            energy(Ekin, Epot);
            Etot = Ekin + Epot;
            if (iter % 1 == 0) {
                const real Etot = Ekin + Epot;
                printf("iter= %d: t= %g dt= %g Ekin= %g Epot= %g Etot= %g , dE = %g d(dE)= %g \n",
                    iter, t_global, dt, Ekin, Epot, Etot, (Etot - Etot0)/std::abs(Etot0),
                    (Etot - Etot_pre)/std::abs(Etot_pre) );
            }
            if (iter % ntime == 0) {
                printf(" mean flop rate in %g sec [%g GFLOP/s]\n", rtc() - t0,
                    fn*fn*PP_FLOP/(rtc() - t0)/1e9*ntime);
            }
            fflush(stdout);
        }
    }
private:
    // Non-copyable: the class owns raw arrays, so the implicit copy would
    // lead to a double delete[].  (Declared, not defined: C++03 idiom.)
    Hermite4(const Hermite4 &);
    Hermite4 &operator=(const Hermite4 &);
};
// Delegates the O(n^2) force computation (acc, jerk, potential) to the
// ispc-generated kernel, passing the structure-of-arrays body state.
void Hermite4::forces()
{
    ispc::compute_forces(
        n,
        g_mass,
        g_posx,
        g_posy,
        g_posz,
        g_velx,
        g_vely,
        g_velz,
        g_accx,
        g_accy,
        g_accz,
        g_jrkx,
        g_jrky,
        g_jrkz,
        g_gpot,
        eps2);
}
// Build an n-body system with the given size and accuracy parameter and
// advance it for 'nstep' steps.
void run(const int nbodies, const real eta, const int nstep)
{
    // The simulation object only needs to live for the duration of this
    // call, so construct it as a temporary.
    Hermite4(nbodies, eta).integrate(nstep);
}
// Entry point: parse optional [nbodies] [nsteps] [eta] arguments and run
// the simulation.
int main(int argc, char *argv[])
{
    printf(" Usage: %s [nbodies=8192] [nsteps=40] [eta=0.1] \n", argv[0]);
    const int nbodies = (argc > 1) ? atoi(argv[1]) : 8192;
    const int nstep   = (argc > 2) ? atoi(argv[2]) : 40;
    const float eta   = (argc > 3) ? atof(argv[3]) : 0.1;
    printf("nbodies= %d\n", nbodies);
    printf("nstep= %d\n", nstep);
    printf(" eta= %g \n", eta);
    run(nbodies, eta, nstep);
    return 0;
}

View File

@@ -0,0 +1,197 @@
/*
Copyright (c) 2014, Evghenii Gaburov
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "typeReal.h"
// Three-component short vector of the simulation's scalar type.
typedef real<3> vec3;
// Accumulated per-body quantities: acceleration, jerk (time derivative of
// acceleration), and potential; 'null' pads the struct.
struct Force
{
    vec3 acc, jrk;
    real pot, null;
};
// Body state consumed by the force kernel: position and velocity.
struct Predictor
{
    vec3 pos, vel;
};
// Accumulates into fi the softened gravitational acceleration, jerk, and
// potential exerted on body i (state pi) by body j (state pj, mass mj).
// eps2 is the squared softening length added to the distance.
static inline
void body_body_force(
    Force &fi,
    const Predictor &pi,
    const Predictor &pj,
    const real mj,
    const real eps2)
{
    const real dx = pj.pos.x - pi.pos.x;
    const real dy = pj.pos.y - pi.pos.y;
    const real dz = pj.pos.z - pi.pos.z;
    const real ds2 = dx*dx + dy*dy + dz*dz + eps2;
#if 1
    // Deliberately computes rsqrt in single precision for speed; the #else
    // branch keeps the full-precision alternative.
    const real inv_ds = rsqrt((float)ds2);
#else
    const real inv_ds = rsqrt(ds2);
#endif
    const real inv_ds2  = inv_ds*inv_ds;
    const real minv_ds  = inv_ds * mj;       // m / r
    const real minv_ds3 = inv_ds2 * minv_ds; // m / r^3
    fi.acc.x += minv_ds3 * dx;
    fi.acc.y += minv_ds3 * dy;
    fi.acc.z += minv_ds3 * dz;
    fi.pot -= minv_ds;
    // Jerk: d/dt of the acceleration term above.
    const real dvx = pj.vel.x - pi.vel.x;
    const real dvy = pj.vel.y - pi.vel.y;
    const real dvz = pj.vel.z - pi.vel.z;
    const real rv = dx*dvx + dy*dvy + dz*dvz;
    const real Jij = (real)(-3.0) * (rv * inv_ds2 * minv_ds3);
    fi.jrk.x += minv_ds3*dvx + Jij*dx;
    fi.jrk.y += minv_ds3*dvy + Jij*dy;
    fi.jrk.z += minv_ds3*dvz + Jij*dz;
}
// One task computes forces for bodies [taskIndex*nPerTask, +nPerTask) against
// ALL n bodies.  The j-loop is tiled by programCount: each tile of source
// bodies is staged into the gang-shared shdata buffer, then broadcast one
// body at a time to all lanes.
task void compute_forces_task(
    uniform const int n,
    uniform const int nPerTask,
    uniform const real mass[],
    uniform const real posx[],
    uniform const real posy[],
    uniform const real posz[],
    uniform const real velx[],
    uniform const real vely[],
    uniform const real velz[],
    uniform real accx[],
    uniform real accy[],
    uniform real accz[],
    uniform real jrkx[],
    uniform real jrky[],
    uniform real jrkz[],
    uniform real gpot[],
    const uniform real eps2)
{
    const uniform int nibeg = taskIndex * nPerTask;
    const uniform int niend = min(n, nibeg + nPerTask);
    if (nibeg >= n)   // task beyond the end of the body list
        return;
    // Staging buffer: rows are posx, posy, posz, mass, velx, vely, velz.
    uniform real shdata[7][programCount];
    // The j-tiling below reads full programCount-wide tiles, so n must be a
    // multiple of the gang size.
    assert((n%programCount) == 0);
    foreach (i = nibeg ... niend)
    {
        Force fi;
        fi.acc = (real)0.0;
        fi.jrk = (real)0.0;
        fi.pot = (real)0.0;
        Predictor pi;
        pi.pos.x = posx[i];
        pi.pos.y = posy[i];
        pi.pos.z = posz[i];
        pi.vel.x = velx[i];
        pi.vel.y = vely[i];
        pi.vel.z = velz[i];
        for (uniform int jb = 0; jb < n; jb += programCount)
        {
            // Each lane stages one source body of the tile.
            const int jp = jb + programIndex;
            shdata[0][programIndex] = posx[jp];
            shdata[1][programIndex] = posy[jp];
            shdata[2][programIndex] = posz[jp];
            shdata[3][programIndex] = mass[jp];
            shdata[4][programIndex] = velx[jp];
            shdata[5][programIndex] = vely[jp];
            shdata[6][programIndex] = velz[jp];
            // Broadcast each staged body to all lanes and accumulate.
            for (uniform int j = 0; j < programCount; j++)
            {
                Predictor pj;
                pj.pos.x = shdata[0][j];
                pj.pos.y = shdata[1][j];
                pj.pos.z = shdata[2][j];
                pj.vel.x = shdata[4][j];
                pj.vel.y = shdata[5][j];
                pj.vel.z = shdata[6][j];
                const real jmass = shdata[3][j];
                body_body_force(fi,pi,pj,jmass,eps2);
            }
        }
        // Write back the accumulated force data for body i.
        accx[i] = fi.acc.x;
        accy[i] = fi.acc.y;
        accz[i] = fi.acc.z;
        jrkx[i] = fi.jrk.x;
        jrky[i] = fi.jrk.y;
        jrkz[i] = fi.jrk.z;
        gpot[i] = fi.pot;
    }
}
// Entry point called from C++ (Hermite4::forces): partitions the n bodies
// into nPerTask-sized chunks and launches one task per chunk.  ISPC waits
// for all launched tasks before this function returns, so no explicit sync
// is needed.
export void compute_forces(
    uniform const int n,
    uniform const real mass[],
    uniform const real posx[],
    uniform const real posy[],
    uniform const real posz[],
    uniform const real velx[],
    uniform const real vely[],
    uniform const real velz[],
    uniform real accx[],
    uniform real accy[],
    uniform real accz[],
    uniform real jrkx[],
    uniform real jrky[],
    uniform real jrkz[],
    uniform real gpot[],
    const uniform real eps2)
{
    // Chunk size: a few gangs' worth of bodies, capped at 128.
    const uniform int nPerTask = min(128,programCount*8);
    const uniform int nTask = (n+nPerTask-1)/nPerTask;
    launch [nTask] compute_forces_task(
        n, nPerTask,
        mass,
        posx,posy,posz,
        velx,vely,velz,
        accx,accy,accz,
        jrkx,jrky,jrkz,
        gpot,eps2);
}

View File

@@ -0,0 +1,2 @@
#pragma once
// Simulation scalar type shared by the C++ driver and the ispc kernels.
typedef double real;

View File

@@ -0,0 +1,409 @@
/*
Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define DBG(x)
#include <omp.h>
#include <malloc.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <algorithm>
// Signature of ispc-generated 'task' functions: the argument blob plus the
// thread and (3-D) task index/count context the runtime supplies.
typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount,
                             int taskIndex, int taskCount,
                             int taskIndex0, int taskIndex1, int taskIndex2,
                             int taskCount0, int taskCount1, int taskCount2);
// Small structure used to hold the data for each task
#ifdef _MSC_VER
__declspec(align(16))
#endif
struct TaskInfo {
    TaskFuncType func;    // task entry point
    void *data;           // argument blob forwarded to func
    int taskIndex;        // flattened index of this task within its launch
    int taskCount3d[3];   // launch dimensions (countx, county, countz)
#if defined(ISPC_IS_WINDOWS)
    event taskEvent;
#endif
    // Total number of tasks in this launch.
    int taskCount() const { return taskCount3d[0]*taskCount3d[1]*taskCount3d[2]; }
    // Decompose the flattened taskIndex back into its 3-D coordinates.
    int taskIndex0() const
    {
        return taskIndex % taskCount3d[0];
    }
    int taskIndex1() const
    {
        return ( taskIndex / taskCount3d[0] ) % taskCount3d[1];
    }
    int taskIndex2() const
    {
        return taskIndex / ( taskCount3d[0]*taskCount3d[1] );
    }
    int taskCount0() const { return taskCount3d[0]; }
    int taskCount1() const { return taskCount3d[1]; }
    int taskCount2() const { return taskCount3d[2]; }
    // Sanity-check the 32-byte size assumption the chunked allocator relies on.
    TaskInfo() { assert(sizeof(TaskInfo) % 32 == 0); }
}
#ifndef _MSC_VER
__attribute__((aligned(32)));
#endif
;
// ispc expects these functions to have C linkage / not be mangled
extern "C" {
    // Enqueue one launch of countx*county*countz tasks of function f.
    void ISPCLaunch(void **handlePtr, void *f, void *data, int countx, int county, int countz);
    // Allocate scratch memory that lives until the matching ISPCSync().
    void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
    // Wait for all tasks launched through handle to finish.
    void ISPCSync(void *handle);
}
///////////////////////////////////////////////////////////////////////////
// TaskGroupBase
#define LOG_TASK_QUEUE_CHUNK_SIZE 14
#define MAX_TASK_QUEUE_CHUNKS 8
// TaskInfo structs per chunk (1 << 14 == 16384).
#define TASK_QUEUE_CHUNK_SIZE (1<<LOG_TASK_QUEUE_CHUNK_SIZE)
// Hard cap on tasks launched from a single function.
#define MAX_LAUNCHED_TASKS (MAX_TASK_QUEUE_CHUNKS * TASK_QUEUE_CHUNK_SIZE)
// Number of scratch-memory buffers available to AllocMemory()/ISPCAlloc().
#define NUM_MEM_BUFFERS 16
class TaskGroup;
/** The TaskGroupBase structure provides common functionality for "task
    groups"; a task group is the set of tasks launched from within a single
    ispc function.  When the function is ready to return, it waits for all
    of the tasks in its task group to finish before it actually returns.
*/
class TaskGroupBase {
public:
    // Rewind all allocation cursors so the group can be reused.
    void Reset();
    // Reserve 'count' consecutive TaskInfo slots; returns the first index.
    int AllocTaskInfo(int count);
    // Map a slot index to its TaskInfo (allocating the chunk on demand).
    TaskInfo *GetTaskInfo(int index);
    // Bump allocator backing ISPCAlloc(): carve 'size' bytes with the given
    // alignment out of the current memory buffer.
    void *AllocMemory(int64_t size, int32_t alignment);
protected:
    TaskGroupBase();
    ~TaskGroupBase();
    int nextTaskInfoIndex;   // next free TaskInfo slot
private:
    /* We allocate blocks of TASK_QUEUE_CHUNK_SIZE TaskInfo structures as
       needed by the calling function.  We hold up to MAX_TASK_QUEUE_CHUNKS
       of these (and then exit at runtime if more than this many tasks are
       launched.)
    */
    TaskInfo *taskInfo[MAX_TASK_QUEUE_CHUNKS];
    /* We also allocate chunks of memory to service ISPCAlloc() calls.  The
       memBuffers[] array holds pointers to this memory.  The first element
       of this array is initialized to point to mem and then any subsequent
       elements required are initialized with dynamic allocation.
    */
    int curMemBuffer, curMemBufferOffset;   // bump-allocator cursor
    int memBufferSize[NUM_MEM_BUFFERS];
    char *memBuffers[NUM_MEM_BUFFERS];
    char mem[256];                          // inline storage for buffer 0
};
inline TaskGroupBase::TaskGroupBase() {
    nextTaskInfoIndex = 0;
    curMemBuffer = 0;
    curMemBufferOffset = 0;

    // Buffer slot 0 serves allocations out of the embedded "mem" array;
    // every other slot starts out unallocated.
    memBuffers[0] = mem;
    memBufferSize[0] = sizeof(mem) / sizeof(mem[0]);
    int b = 1;
    while (b < NUM_MEM_BUFFERS) {
        memBuffers[b] = NULL;
        memBufferSize[b] = 0;
        ++b;
    }

    // TaskInfo chunks are allocated on demand in GetTaskInfo().
    int c = 0;
    while (c < MAX_TASK_QUEUE_CHUNKS) {
        taskInfo[c] = NULL;
        ++c;
    }
}
inline TaskGroupBase::~TaskGroupBase() {
    // memBuffers[0] aliases the embedded "mem" member, so only the
    // dynamically allocated slots 1..NUM_MEM_BUFFERS-1 are released here
    // (delete[] on a NULL pointer is a no-op).
    for (int b = NUM_MEM_BUFFERS - 1; b >= 1; --b)
        delete[] memBuffers[b];
}
inline void
TaskGroupBase::Reset() {
    // Rewind all bookkeeping; chunks and memory buffers stay allocated so
    // the next launch from this group can reuse them without reallocating.
    curMemBufferOffset = 0;
    curMemBuffer = 0;
    nextTaskInfoIndex = 0;
}
inline int
TaskGroupBase::AllocTaskInfo(int count) {
    // Bump-allocate `count` consecutive TaskInfo slots and hand back the
    // index of the first one.
    const int firstIndex = nextTaskInfoIndex;
    nextTaskInfoIndex = firstIndex + count;
    return firstIndex;
}
inline TaskInfo *
TaskGroupBase::GetTaskInfo(int index) {
    // Records live in up to MAX_TASK_QUEUE_CHUNKS lazily-allocated chunks of
    // TASK_QUEUE_CHUNK_SIZE entries; split the index into chunk + offset.
    int chunk = (index >> LOG_TASK_QUEUE_CHUNK_SIZE);
    int offset = index & (TASK_QUEUE_CHUNK_SIZE-1);

    // was `chunk == MAX_TASK_QUEUE_CHUNKS`: an index more than one full
    // chunk past the limit would sail past the equality check and read
    // out of bounds below.
    if (chunk >= MAX_TASK_QUEUE_CHUNKS) {
        fprintf(stderr, "A total of %d tasks have been launched from the "
                "current function--the simple built-in task system can handle "
                "no more. You can increase the values of TASK_QUEUE_CHUNK_SIZE "
                "and LOG_TASK_QUEUE_CHUNK_SIZE to work around this limitation. "
                "Sorry! Exiting.\n", index);
        exit(1);
    }

    // Allocate the chunk on first touch.
    if (taskInfo[chunk] == NULL)
        taskInfo[chunk] = new TaskInfo[TASK_QUEUE_CHUNK_SIZE];
    return &taskInfo[chunk][offset];
}
inline void *
TaskGroupBase::AllocMemory(int64_t size, int32_t alignment) {
    // Bump allocator over memBuffers[].  NOTE: `alignment` must be a power
    // of two for the mask arithmetic below to be valid.
    char *basePtr = memBuffers[curMemBuffer];
    intptr_t iptr = (intptr_t)(basePtr + curMemBufferOffset);
    iptr = (iptr + (alignment-1)) & ~(alignment-1);

    int newOffset = int(iptr - (intptr_t)basePtr + size);
    // was `<`: an allocation ending exactly at the end of the buffer fits,
    // so newOffset == memBufferSize must be accepted too.
    if (newOffset <= memBufferSize[curMemBuffer]) {
        curMemBufferOffset = newOffset;
        return (char *)iptr;
    }

    // Current buffer exhausted: move on to a new, geometrically growing
    // buffer and retry; the recursive call succeeds in the fresh buffer
    // because allocSize >= size + alignment.
    ++curMemBuffer;
    curMemBufferOffset = 0;
    assert(curMemBuffer < NUM_MEM_BUFFERS);

    int allocSize = 1 << (12 + curMemBuffer);
    allocSize = std::max(int(size+alignment), allocSize);
    char *newBuf = new char[allocSize];
    memBufferSize[curMemBuffer] = allocSize;
    memBuffers[curMemBuffer] = newBuf;
    return AllocMemory(size, alignment);
}
///////////////////////////////////////////////////////////////////////////
// Atomics and the like
static inline void
lMemFence() {
    // Full memory barrier: makes writes performed before publishing a
    // pointer via compare-and-swap visible to other threads.
    // Windows atomic functions already contain the fence
    // KNC doesn't need the memory barrier
#if !defined ISPC_IS_KNC && !defined ISPC_IS_WINDOWS
    __sync_synchronize();
#endif
}
// Atomically: if *v == oldValue, store newValue; returns the previous
// value of *v (so the caller can tell whether the swap happened).
static void *
lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
#ifdef ISPC_IS_WINDOWS
    return InterlockedCompareExchangePointer(v, newValue, oldValue);
#else
    void *result = __sync_val_compare_and_swap(v, oldValue, newValue);
    // Explicit fence on the GCC path; the Windows intrinsic fences itself.
    lMemFence();
    return result;
#endif // ISPC_IS_WINDOWS
}
// 32-bit compare-and-swap; returns the previous value of *v.
static int32_t
lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {
#ifdef ISPC_IS_WINDOWS
    return InterlockedCompareExchange((volatile LONG *)v, newValue, oldValue);
#else
    int32_t result = __sync_val_compare_and_swap(v, oldValue, newValue);
    lMemFence();
    return result;
#endif // ISPC_IS_WINDOWS
}
// Atomically adds `delta` to *v.
// NOTE(review): the two branches return different things — the Windows path
// returns the NEW value (InterlockedExchangeAdd yields the old value, then
// +delta is applied), while the GCC path returns the OLD value
// (__sync_fetch_and_add).  Confirm no caller depends on the return value
// before relying on it cross-platform.
static inline int32_t
lAtomicAdd(volatile int32_t *v, int32_t delta) {
#ifdef ISPC_IS_WINDOWS
    return InterlockedExchangeAdd((volatile LONG *)v, delta)+delta;
#else
    return __sync_fetch_and_add(v, delta);
#endif
}
///////////////////////////////////////////////////////////////////////////
// Task group for the OpenMP back-end: Launch() runs the group's tasks
// inside a blocking "omp parallel" region, so Sync() has nothing left to
// wait for (see its definition below).
class TaskGroup : public TaskGroupBase {
public:
    void Launch(int baseIndex, int count);
    void Sync();
};
///////////////////////////////////////////////////////////////////////////
// OpenMP
// One-time task-system setup hook, kept for symmetry with other back-ends;
// the OpenMP runtime needs no explicit initialization.
static void
InitTaskSystem() {
    // No initialization needed
}
inline void
TaskGroup::Launch(int baseIndex, int count) {
#pragma omp parallel
    {
        const int threadIndex = omp_get_thread_num();
        const int threadCount = omp_get_num_threads();

        // Each OpenMP thread works on a private copy of the single TaskInfo
        // record that ISPCLaunch() filled in at baseIndex, so patching
        // taskIndex per iteration below is race-free.
        TaskInfo ti = *GetTaskInfo(baseIndex);

        // Iteration distribution is deferred to OMP_SCHEDULE (schedule(runtime)).
#pragma omp for schedule(runtime)
        for(int i = 0; i < count; i++)
        {
            ti.taskIndex = i;

            // Actually run the task.
            ti.func(ti.data, threadIndex, threadCount, ti.taskIndex, ti.taskCount(),
                    ti.taskIndex0(), ti.taskIndex1(), ti.taskIndex2(),
                    ti.taskCount0(), ti.taskCount1(), ti.taskCount2());
        }
    }
}
inline void
TaskGroup::Sync() {
    // Nothing to wait for: Launch() uses a blocking "#pragma omp parallel"
    // region, so every task has already completed by the time it returns.
}
///////////////////////////////////////////////////////////////////////////
// Small fixed-size cache of recycled TaskGroups; slots are claimed and
// returned with atomic compare-and-swap so concurrent launches don't race.
#define MAX_FREE_TASK_GROUPS 64
static TaskGroup *freeTaskGroups[MAX_FREE_TASK_GROUPS];
static inline TaskGroup *
AllocTaskGroup()
{
    // Try to reuse a cached TaskGroup: claim a non-NULL slot by atomically
    // swapping it to NULL.  If the CAS returns NULL, another thread claimed
    // the slot first; keep scanning.
    for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) {
        TaskGroup *tg = freeTaskGroups[i];
        if (tg != NULL) {
            void *ptr = lAtomicCompareAndSwapPointer((void **)(&freeTaskGroups[i]), NULL, tg);
            if (ptr != NULL) {
                return (TaskGroup *)ptr;
            }
        }
    }

    // Cache empty (or every candidate lost to a race): allocate a fresh one.
    return new TaskGroup;
}
static inline void
FreeTaskGroup(TaskGroup *tg)
{
    tg->Reset();

    // Try to park the group in an empty cache slot (CAS NULL -> tg); on
    // success the cache now owns it and we are done.
    for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) {
        if (freeTaskGroups[i] == NULL) {
            void *ptr = lAtomicCompareAndSwapPointer((void **)&freeTaskGroups[i], tg, NULL);
            if (ptr == NULL)
                return;
        }
    }

    // Cache full: release the group outright.
    delete tg;
}
void
ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int count1, int count2)
{
    const int count = count0*count1*count2;

    // Lazily create this function's task group on its first launch.
    TaskGroup *taskGroup;
    if (*taskGroupPtr == NULL) {
        InitTaskSystem();
        taskGroup = AllocTaskGroup();
        *taskGroupPtr = taskGroup;
    }
    else
        taskGroup = (TaskGroup *)(*taskGroupPtr);

    // Reserve `count` TaskInfo slots but fill in only the first one: the
    // OpenMP TaskGroup::Launch() reads just the record at baseIndex and
    // patches taskIndex per loop iteration.
    // NOTE(review): the `i < 1` bound looks intentional for this back-end,
    // but confirm it before reusing this file with a task system that reads
    // all `count` records.
    int baseIndex = taskGroup->AllocTaskInfo(count);
    for (int i = 0; i < 1; ++i) {
        TaskInfo *ti = taskGroup->GetTaskInfo(baseIndex+i);
        ti->func = (TaskFuncType)func;
        ti->data = data;
        ti->taskIndex = i;
        ti->taskCount3d[0] = count0;
        ti->taskCount3d[1] = count1;
        ti->taskCount3d[2] = count2;
    }
    taskGroup->Launch(baseIndex, count);
}
void
ISPCSync(void *h)
{
TaskGroup *taskGroup = (TaskGroup *)h;
if (taskGroup != NULL) {
taskGroup->Sync();
FreeTaskGroup(taskGroup);
}
}
void *
ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment)
{
TaskGroup *taskGroup;
if (*taskGroupPtr == NULL) {
InitTaskSystem();
taskGroup = AllocTaskGroup();
*taskGroupPtr = taskGroup;
}
else
taskGroup = (TaskGroup *)(*taskGroupPtr);
return taskGroup->AllocMemory(size, alignment);
}

1
examples/portable/options/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
options

View File

@@ -0,0 +1,8 @@
EXAMPLE=options
CPP_SRC=options.cpp
ISPC_SRC=options.ispc
ISPC_IA_TARGETS=avx1-i32x16
ISPC_ARM_TARGETS=neon
include ../common_cpu.mk

View File

@@ -0,0 +1,7 @@
EXAMPLE=options
CXX_SRC=options.cpp
ISPC_SRC=options.ispc
ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h
ISPC_TARGET=generic-16
include ../common_knc.mk

View File

@@ -0,0 +1,14 @@
PROG=options
ISPC_SRC=options.ispc
CU_SRC=options.cu
CXX_SRC=options.cpp
PTXCC_REGMAX=128
#LLVM_GPU=1
NVVM_GPU=1
include ../common_ptx.mk

View File

@@ -0,0 +1,120 @@
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define NOMINMAX
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cassert>
#include <cmath>
#include <algorithm>
using std::max;
#include "options_defs.h"
#include "timing.h"
#include "ispc_malloc.h"
#include "options_ispc.h"
using namespace ispc;
// Prints command-line help for the options pricing example.
static void usage() {
    printf("usage: options [--count=<num options>]\n");
}
// Driver for the options pricing example: times the binomial and
// Black-Scholes ispc task implementations over `nOptions` identical options
// and prints the best-of-3 runtime plus the average result as a sanity check.
int main(int argc, char *argv[]) {
    int nOptions = 128*1024;

    // Parse --count=<n>; unrecognized arguments are ignored.
    for (int i = 1; i < argc; ++i) {
        if (strncmp(argv[i], "--count=", 8) == 0) {
            nOptions = atoi(argv[i] + 8);
            if (nOptions <= 0) {
                usage();
                exit(1);
            }
        }
    }

    float *S = new float[nOptions];
    float *X = new float[nOptions];
    float *T = new float[nOptions];
    float *r = new float[nOptions];
    float *v = new float[nOptions];
    float *result = new float[nOptions];

    // Identical inputs for every option so the averaged result is easy to
    // eyeball for correctness.
    for (int i = 0; i < nOptions; ++i) {
        S[i] = 100;  // stock price
        X[i] = 98;   // option strike price
        T[i] = 2;    // time (years)
        r[i] = .02;  // risk-free interest rate
        v[i] = 5;    // volatility
    }

    double sum;

    //
    // Binomial options pricing model, ispc implementation, tasks
    //
    double binomial_tasks = 1e30;
    // Best of 3 runs.
    for (int i = 0; i < 3; ++i) {
        reset_and_start_timer();
        binomial_put_ispc_tasks(S, X, T, r, v, result, nOptions);
        double dt = get_elapsed_msec();
        binomial_tasks = std::min(binomial_tasks, dt);
    }
    sum = 0.;
    for (int i = 0; i < nOptions; ++i)
        sum += result[i];
    printf("[binomial ispc, tasks]:\t\t[%.3f] msec (avg %f)\n",
           binomial_tasks, sum / nOptions);

    //
    // Black-Scholes options pricing model, ispc implementation, tasks
    //
    double bs_ispc_tasks = 1e30;
    for (int i = 0; i < 3; ++i) {
        reset_and_start_timer();
        black_scholes_ispc_tasks(S, X, T, r, v, result, nOptions);
        double dt = get_elapsed_msec();
        sum = 0.;
        for (int i = 0; i < nOptions; ++i)
            sum += result[i];
        bs_ispc_tasks = std::min(bs_ispc_tasks, dt);
    }
    printf("[black-scholes ispc, tasks]:\t[%.3f] msec (avg %f)\n",
           bs_ispc_tasks, sum / nOptions);

    // was: all six arrays were leaked on exit.
    delete[] S;
    delete[] X;
    delete[] T;
    delete[] r;
    delete[] v;
    delete[] result;
    return 0;
}

View File

@@ -0,0 +1,334 @@
// -*- mode: c++ -*-
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "options_defs.h"
#include "cuda_helpers.cuh"
// Splits a positive, finite float into a "reduced" mantissa with exponent -1
// and the corresponding base-2 exponent, by direct manipulation of the
// IEEE-754 bit pattern.  Helper for the polynomial log fallback below.
__device__ static inline void __range_reduce_log(float input, float * reduced,
                                                 int * exponent) {
    int int_version = __float_as_int(input); //intbits(input);
    // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
    // exponent mask = 0111 1111 1000 0000 0000 0000 0000 0000
    // 0x7 0xF 0x8 0x0 0x0 0x0 0x0 0x0
    // non-exponent = 1000 0000 0111 1111 1111 1111 1111 1111
    // = 0x8 0x0 0x7 0xF 0xF 0xF 0xF 0xF
    //const int exponent_mask(0x7F800000)
    const int nonexponent_mask = 0x807FFFFF;

    // We want the reduced version to have an exponent of -1 which is -1 + 127 after biasing or 126
    const int exponent_neg1 = (126l << 23);
    // NOTE(boulos): We don't need to mask anything out since we know
    // the sign bit has to be 0. If it's 1, we need to return infinity/nan
    // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN).
    int biased_exponent = int_version >> 23; // This number is [0, 255] but it means [-127, 128]
    int offset_exponent = biased_exponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2
    *exponent = offset_exponent - 127; // get the real value

    // Blend the offset_exponent with the original input (do this in
    // int for now, until I decide if float can have & and &not)
    int blended = (int_version & nonexponent_mask) | (exponent_neg1);
    *reduced = __int_as_float(blended); //floatbits(blended);
}
// Natural logarithm.  The active path uses the fast CUDA intrinsic; the
// disabled branch keeps a range-reduction + polynomial implementation
// (ported from the ispc stdlib) for accuracy comparisons.
__device__ static inline float __Logf(const float x_full)
{
#if 1
    return __logf(x_full);
#else
    float reduced;
    int exponent;

    const int NaN_bits = 0x7fc00000;
    const int Neg_Inf_bits = 0xFF800000;
    const float NaN = __int_as_float(NaN_bits); //floatbits(NaN_bits);
    const float neg_inf = __int_as_float(Neg_Inf_bits); //floatbits(Neg_Inf_bits);
    bool use_nan = x_full < 0.f;
    bool use_inf = x_full == 0.f;
    bool exceptional = use_nan || use_inf;
    const float one = 1.0f;

    // Replace exceptional inputs with 1 so the reduction is well defined;
    // the correct NaN/-inf is substituted back at the end.
    float patched = exceptional ? one : x_full;
    __range_reduce_log(patched, &reduced, &exponent);

    const float ln2 = 0.693147182464599609375f;

    float x1 = one - reduced;
    const float c1 = 0.50000095367431640625f;
    const float c2 = 0.33326041698455810546875f;
    const float c3 = 0.2519190013408660888671875f;
    const float c4 = 0.17541764676570892333984375f;
    const float c5 = 0.3424419462680816650390625f;
    const float c6 = -0.599632322788238525390625f;
    const float c7 = +1.98442304134368896484375f;
    const float c8 = -2.4899270534515380859375f;
    const float c9 = +1.7491014003753662109375f;

    // Horner evaluation of the polynomial in x1.
    float result = x1 * c9 + c8;
    result = x1 * result + c7;
    result = x1 * result + c6;
    result = x1 * result + c5;
    result = x1 * result + c4;
    result = x1 * result + c3;
    result = x1 * result + c2;
    result = x1 * result + c1;
    result = x1 * result + one;

    // Equation was for -(ln(red)/(1-red))
    result *= -x1;
    result += (float)(exponent) * ln2;

    return exceptional ? (use_nan ? NaN : neg_inf) : result;
#endif
}
// Exponential.  The active path uses the fast CUDA intrinsic; the disabled
// branch keeps a range-reduction + polynomial implementation for reference.
__device__ static inline float __Expf(const float x_full)
{
#if 1
    return __expf(x_full);
#else
    const float ln2_part1 = 0.6931457519f;
    const float ln2_part2 = 1.4286067653e-6f;
    const float one_over_ln2 = 1.44269502162933349609375f;

    // Split x = k*ln2 + x with x in [0, ln2) (two-part ln2 for accuracy).
    float scaled = x_full * one_over_ln2;
    float k_real = floor(scaled);
    int k = (int)k_real;

    // Reduced range version of x
    float x = x_full - k_real * ln2_part1;
    x -= k_real * ln2_part2;

    // These coefficients are for e^x in [0, ln(2)]
    const float one = 1.f;
    const float c2 = 0.4999999105930328369140625f;
    const float c3 = 0.166668415069580078125f;
    const float c4 = 4.16539050638675689697265625e-2f;
    const float c5 = 8.378830738365650177001953125e-3f;
    const float c6 = 1.304379315115511417388916015625e-3f;
    const float c7 = 2.7555381529964506626129150390625e-4f;

    // Horner evaluation.
    float result = x * c7 + c6;
    result = x * result + c5;
    result = x * result + c4;
    result = x * result + c3;
    result = x * result + c2;
    result = x * result + one;
    result = x * result + one;

    // Compute 2^k (should differ for float and double, but I'll avoid
    // it for now and just do floats)
    const int fpbias = 127;
    int biased_n = k + fpbias;
    bool overflow = k > fpbias;
    // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0)
    // we've got underflow. -127 * ln(2) -> -88.02. So the most
    // negative float input that doesn't result in zero is like -88.
    bool underflow = (biased_n <= 0);
    const int InfBits = 0x7f800000;
    biased_n <<= 23;
    // Reinterpret this thing as float
    float two_to_the_n = __int_as_float(biased_n); //floatbits(biased_n);
    // Handle both doubles and floats (hopefully eliding the copy for float)
    float elemtype_2n = two_to_the_n;
    result *= elemtype_2n;
    // result = overflow ? floatbits(InfBits) : result;
    result = overflow ? __int_as_float(InfBits) : result;
    result = underflow ? 0.0f : result;
    return result;
#endif
}
// Cumulative normal distribution function
//
// Polynomial approximation evaluated on |X|, then reflected (w -> 1-w) for
// positive X.
__device__
static inline float
CND(float X) {
    float L = fabsf(X);

    float k = 1.0f / (1.0f + 0.2316419f * L);
    float k2 = k*k;
    float k3 = k2*k;
    float k4 = k2*k2;
    float k5 = k3*k2;

    const float invSqrt2Pi = 0.39894228040f;
    float w = (0.31938153f * k - 0.356563782f * k2 + 1.781477937f * k3 +
               -1.821255978f * k4 + 1.330274429f * k5);
    w *= invSqrt2Pi * __Expf(-L * L * .5f);

    if (X > 0.f)
        w = 1.0f - w;
    return w;
}
// Black-Scholes pricing of a contiguous slice of the option arrays; one
// task (see cuda_helpers.cuh for taskIndex/taskCount/programIndex macros)
// handles `span` consecutive elements.
__global__
void bs_task( float Sa[], float Xa[], float Ta[],
              float ra[], float va[],
              float result[], int count) {
    if (taskIndex >= taskCount) return;
    // Ceiling-divide the work: the old floor-based split (count/taskCount)
    // silently skipped the trailing count % taskCount elements whenever
    // taskCount did not evenly divide count.
    int span = (count + taskCount - 1) / taskCount;
    int first = taskIndex * span;
    int last = min(count, first + span);
    for (int i = programIndex + first; i < last; i += programCount)
        if (i < last)
        {
            float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
            float d1 = (__Logf(S/X) + (r + v * v * .5f) * T) / (v * sqrtf(T));
            float d2 = d1 - v * sqrtf(T);
            result[i] = S * CND(d1) - X * __Expf(-r * T) * CND(d2);
        }
}
// Mirrors the ispc `export` entry point: fans the work out over nTasks
// child launches via the launch() helper from cuda_helpers.cuh, then waits
// for them to finish.
extern "C"
__global__ void
black_scholes_ispc_tasks___export( float Sa[], float Xa[], float Ta[],
                                   float ra[], float va[],
                                   float result[], int count) {
    int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384);
    launch(nTasks,1,1,bs_task)
        (Sa, Xa, Ta, ra, va, result, count);
    cudaDeviceSynchronize();
}
// Host-side entry: starts the single-block export kernel (which distributes
// the work itself) and blocks until the device is idle.
extern "C"
__host__ void
black_scholes_ispc_tasks( float Sa[], float Xa[], float Ta[],
                          float ra[], float va[],
                          float result[], int count) {
    black_scholes_ispc_tasks___export<<<1,32>>>(Sa,Xa,Ta,ra,va,result,count);
    cudaDeviceSynchronize();
}
/********/
/* Compile-time-unrolled helpers for binomial_put: op1 initializes the
   option-value array, op2 performs the backward-induction sweeps.
   Recurses from NBEG towards NEND with stride STEP; terminated by the
   NBEG == NEND specialization below. */
template<int NBEG, int NEND, int STEP>
struct loop
{
    __device__ static void op1(float V[], const float u, const float X, const float S)
    {
        const int j = NBEG;
        float upow = powf(u, (float)(2*j-BINOMIAL_NUM));
        V[j] = max(0.0f, X - S * upow);
        loop<j+STEP,NEND,STEP>::op1(V,u,X,S);
    }
    __device__ static void op2(float V[], const float Pu, const float disc)
    {
        const int j = NBEG;
#pragma unroll
        for ( int k = 0; k < j; ++k)
            V[k] = ((1.0f - Pu) * V[k] + Pu * V[k+ 1]) / disc;
        loop<j+STEP,NEND,STEP>::op2(V, Pu,disc);
    }
};
// Recursion terminator: both ops become empty once NBEG reaches NEND.
template<int NEND, int STEP>
struct loop<NEND,NEND,STEP>
{
    __device__ static void op1(float V[], const float u, const float X, const float S) {}
    __device__ static void op2(float V[], const float Pu, const float disc) {}
};
// Prices a European put with a BINOMIAL_NUM-step binomial lattice.
__device__
static inline float
binomial_put(float S, float X, float T, float r, float v)
{
    float V[BINOMIAL_NUM];

    float dt = T / BINOMIAL_NUM;
    float u = exp(v * sqrt(dt));
    float d = 1.f / u;
    float disc = exp(r * dt);
    float Pu = (disc - d) / (u - d);

#if 0 /* slow */
    for ( int j = 0; j < BINOMIAL_NUM; ++j) {
        float upow = powf(u, (float)(2*j-BINOMIAL_NUM));
        V[j] = max(0.0f, X - S * upow);
    }
    for ( int j = BINOMIAL_NUM-1; j >= 0; --j)
        for ( int k = 0; k < j; ++k)
            V[k] = ((1.0f - Pu) * V[k] + Pu * V[k+ 1]) / disc;
#else /* with loop unrolling, stores results in registers */
    loop<0,BINOMIAL_NUM,1>::op1(V,u,X,S);
    loop<BINOMIAL_NUM-1, -1, -1>::op2(V, Pu, disc);
#endif
    return V[0];
}
// Binomial pricing of a contiguous slice of the option arrays; layout
// mirrors bs_task above.
__global__ void
binomial_task( float Sa[], float Xa[],
               float Ta[], float ra[],
               float va[], float result[],
               int count)
{
    // Ceiling-divide the work: the old floor-based split (count/taskCount)
    // silently skipped the trailing count % taskCount elements whenever
    // taskCount did not evenly divide count.
    int span = (count + taskCount - 1) / taskCount;
    int first = taskIndex * span;
    int last = min(count, first + span);
    for (int i = programIndex + first; i < last; i += programCount)
        if (i < last)
        {
            float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
            result[i] = binomial_put(S, X, T, r, v);
        }
}
// Mirrors the ispc `export` entry point: fans the work out over nTasks
// child launches via the launch() helper, then waits for completion.
extern "C" __global__ void
binomial_put_ispc_tasks___export( float Sa[], float Xa[],
                                  float Ta[], float ra[],
                                  float va[], float result[],
                                  int count) {
    int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384);
    launch(nTasks,1,1,binomial_task)
        (Sa, Xa, Ta, ra, va, result, count);
    cudaDeviceSynchronize();
}
// Host-side entry: prefers L1 cache over shared memory (the per-thread V[]
// array in binomial_put is cache-hungry), starts the export kernel, and
// blocks until the device is idle.
extern "C"
__host__ void
binomial_put_ispc_tasks( float Sa[], float Xa[], float Ta[],
                         float ra[], float va[],
                         float result[], int count) {
    cudaDeviceSetCacheConfig (cudaFuncCachePreferL1);
    binomial_put_ispc_tasks___export<<<1,32>>>(Sa,Xa,Ta,ra,va,result,count);
    cudaDeviceSynchronize();
}

View File

@@ -0,0 +1,211 @@
// -*- mode: c++ -*-
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "options_defs.h"
// Cumulative normal distribution function
// Polynomial approximation evaluated on |X|, then reflected (w -> 1-w) for
// positive X.
static inline float
CND(float X) {
    float L = abs(X);

    float k = 1.0 / (1.0 + 0.2316419 * L);
    float k2 = k*k;
    float k3 = k2*k;
    float k4 = k2*k2;
    float k5 = k3*k2;

    const float invSqrt2Pi = 0.39894228040f;
    float w = (0.31938153f * k - 0.356563782f * k2 + 1.781477937f * k3 +
               -1.821255978f * k4 + 1.330274429f * k5);
    w *= invSqrt2Pi * exp(-L * L * .5f);

    if (X > 0.f)
        w = 1.0 - w;
    return w;
}
// Black-Scholes pricing of one task's contiguous slice of the option arrays.
task void
bs_task(uniform float Sa[], uniform float Xa[], uniform float Ta[],
        uniform float ra[], uniform float va[],
        uniform float result[], uniform int count) {
    // Ceiling-divide the work: the old floor-based split (count/taskCount)
    // silently skipped the trailing count % taskCount elements whenever
    // taskCount did not evenly divide count.
    uniform int span = (count + taskCount - 1) / taskCount;
    uniform int first = taskIndex * span;
    uniform int last = min(count, first + span);
    foreach (i = first ... last) {
        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
        float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T));
        float d2 = d1 - v * sqrt(T);
        result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2);
    }
}
// Task-parallel Black-Scholes entry point: each of nTasks tasks prices a
// contiguous slice of the arrays.
export void
black_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float Ta[],
                         uniform float ra[], uniform float va[],
                         uniform float result[], uniform int count) {
    uniform int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384);
    launch[nTasks] bs_task(Sa, Xa, Ta, ra, va, result, count);
}
/********/
// Single-gang (no tasks) Black-Scholes over the whole array.
export void
black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
                   uniform float ra[], uniform float va[],
                   uniform float result[], uniform int count) {
    foreach (i = 0 ... count) {
        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
        float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T));
        float d2 = d1 - v * sqrt(T);
        result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2);
    }
}
// Prices a European put with a BINOMIAL_NUM-step binomial lattice.  The
// __NVPTX__ path unrolls both loops via macros; see the comments below.
static inline float
binomial_put(float S, float X, float T, float r, float v) {
    float V[BINOMIAL_NUM];

    float dt = T / BINOMIAL_NUM;
    float u = exp(v * sqrt(dt));
    float d = 1. / u;
    float disc = exp(r * dt);
    float Pu = (disc - d) / (u - d);

#ifndef __NVPTX__
    // Generic version: initialize terminal payoffs, then sweep backwards.
    for (uniform int j = 0; j < BINOMIAL_NUM; ++j) {
        float upow = pow(u, (float)(2*j-BINOMIAL_NUM));
        V[j] = max(0., X - S * upow);
    }

    for (uniform int j = BINOMIAL_NUM-1; j >= 0; --j)
        for (uniform int k = 0; k < j; ++k)
            V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc;
#else
    /* loop unrolling helps NVVM to place V -> registers therefore boosting performance */
    /* takes looong time to compile... */
#if BINOMIAL_NUM != 64
#error "Cannot unroll. Please use generic version above"
#endif
    // with PTX target unroll loops which will store data in registers..

    /* first loop */
#define OP(j) { \
    float upow = pow(u, (float)(2*(j)-BINOMIAL_NUM)); \
    V[j] = max(0., X - S * upow); }
#define OP10(k) \
    OP(k+0); OP(k+1); OP(k+2); OP(k+3); OP(k+4) \
    OP(k+5); OP(k+6); OP(k+7); OP(k+8); OP(k+9);

    OP10(0)
    OP10(10)
    OP10(20)
    OP10(30)
    OP10(40)
    OP10(50)
    OP(60)
    OP(61)
    OP(62)
    OP(63)
#undef OP10
#undef OP

    /* second loop */
#define OP(j) {\
    for (uniform int k = 0; k < (j); ++k) \
        V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc; }
#define OP10(k) \
    OP(k+9); OP(k+8); OP(k+7); OP(k+6); OP(k+5); \
    OP(k+4); OP(k+3); OP(k+2); OP(k+1); OP(k+0);

    OP(63)
    OP(62)
    OP(61)
    OP(60)
    OP10(50)
    OP10(40)
    OP10(30)
    OP10(20)
    OP10(10)
    OP10(0)
#undef OP10
#undef OP
#endif
    return V[0];
}
// Single-gang (no tasks) binomial pricing over the whole array.
export void
binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
                  uniform float ra[], uniform float va[],
                  uniform float result[], uniform int count) {
    foreach (i = 0 ... count) {
        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
        result[i] = binomial_put(S, X, T, r, v);
    }
}
// Binomial pricing of one task's contiguous slice of the option arrays.
task void
binomial_task(uniform float Sa[], uniform float Xa[],
              uniform float Ta[], uniform float ra[],
              uniform float va[], uniform float result[],
              uniform int count) {
    // Ceiling-divide the work: the old floor-based split (count/taskCount)
    // silently skipped the trailing count % taskCount elements whenever
    // taskCount did not evenly divide count.
    uniform int span = (count + taskCount - 1) / taskCount;
    uniform int first = taskIndex * span;
    uniform int last = min(count, first + span);
    foreach (i = first ... last) {
        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
        result[i] = binomial_put(S, X, T, r, v);
    }
}
// Task-parallel binomial pricing entry point: each of nTasks tasks prices a
// contiguous slice of the arrays.
export void
binomial_put_ispc_tasks(uniform float Sa[], uniform float Xa[],
                        uniform float Ta[], uniform float ra[],
                        uniform float va[], uniform float result[],
                        uniform int count) {
    uniform int nTasks = 2048; //count/16384; //max((int)64, (int)count/16384);
    launch[nTasks] binomial_task(Sa, Xa, Ta, ra, va, result, count);
}

View File

@@ -0,0 +1,40 @@
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef OPTIONS_DEFS_H
#define OPTIONS_DEFS_H 1
#define BINOMIAL_NUM 64
#endif // OPTIONS_DEFS_H

View File

@@ -0,0 +1,9 @@
EXAMPLE=radixSort
CPP_SRC=radixSort.cpp
ISPC_SRC=radixSort.ispc
ISPC_IA_TARGETS=avx1-i32x8
ISPC_ARM_TARGETS=neon
#ISPC_FLAGS=-DDEBUG -g
include ../common_cpu.mk

View File

@@ -0,0 +1,7 @@
EXAMPLE=radixSort
CXX_SRC=radixSort.cpp
ISPC_SRC=radixSort.ispc
ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h
ISPC_TARGET=generic-16
include ../common_knc.mk

View File

@@ -0,0 +1,15 @@
PROG=radixSort
ISPC_SRC=radixSort.ispc
CU_SRC=radixSort.cu
# NVCC_FLAGS=-Xptxas=-O1
# was "radixSort.cpp radixSort.cpp": the source file was listed twice,
# which would compile/link it twice.
CXX_SRC=radixSort.cpp
PTXCC_REGMAX=64
LLVM_GPU=1
NVVM_GPU=1
include ../common_ptx.mk

View File

@@ -0,0 +1,154 @@
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <cstdio>
#include <cstdlib>
#include <algorithm>
#include <iostream>
#include <cassert>
#include <iomanip>
#include "timing.h"
#include "ispc_malloc.h"
#include "radixSort_ispc.h"
// Renders an in-place text progress bar for step x of n (x in [0, n)),
// overlaying the percentage in the middle of the bar; emits '\r' until the
// final step, which emits '\n'.
static void progressBar(const int x, const int n, const int width = 50)
{
    assert(n > 1);
    assert(x >= 0 && x < n);
    assert(width > 10);
    const float f = static_cast<float>(x)/(n-1);
    const int w = static_cast<int>(f * width);

    // print bar
    std::string bstr("[");
    for (int i = 0; i < width; i++)
        bstr += i < w ? '=' : ' ';
    bstr += "]";

    // print percentage
    char pstr0[32];
    // was sprintf: bound the write to the destination buffer.
    snprintf(pstr0, sizeof(pstr0), " %2d %c ", static_cast<int>(f*100.0),'%');
    const std::string pstr(pstr0);
    std::copy(pstr.begin(), pstr.end(), bstr.begin() + (width/2-2));

    std::cout << bstr;
    std::cout << (x == n-1 ? "\n" : "\r") << std::flush;
}
// Element sorted by radixSort: `key` is the 32-bit sort key and `val` the
// element's original position (assigned i at generation time).
struct Key
{
    int32_t key,val;
};
// Driver for the radixSort example: generates n random key/value pairs,
// times m runs of the ispc radix sort (keeping the best), and validates the
// result against std::sort of the bare keys.
int main (int argc, char *argv[])
{
    // n: number of pairs (argv[1], default 1e6); m: timing repetitions.
    // (unused locals j and l removed)
    int i, n = argc == 1 ? 1000000 : atoi(argv[1]), m = n < 100 ? 1 : 50;
    double tISPC1 = 0.0, tISPC2 = 0.0, tSerial = 0.0;

    Key *keys = new Key [n];
    Key *keys_orig = new Key [n];
    unsigned int *keys_gold = new unsigned int [n];

    srand48(rtc()*65536);
    int sortBits = 32;
    assert(sortBits <= 32);

    // Random keys masked to sortBits bits; val records the original index.
#pragma omp parallel for
    for (int i = 0; i < n; i++)
    {
        keys[i].key = ((int)(drand48() * (1<<30))) & ((1ULL << sortBits) - 1);
        keys[i].val = i;
    }
    std::random_shuffle(keys, keys + n);

#pragma omp parallel for
    for (int i = 0; i < n; i++)
    {
        keys_gold[i] = keys[i].key;
        keys_orig[i] = keys[i];
    }

    ispcSetMallocHeapLimit(1024*1024*1024);
    ispc::radixSort_alloc(n);

    // Keep the best (minimum) time over m runs; restore the unsorted input
    // before each run.
    tISPC2 = 1e30;
    for (i = 0; i < m; i ++)
    {
        ispcMemcpy(keys, keys_orig, n*sizeof(Key));
        reset_and_start_timer();
        ispc::radixSort(n, (int64_t*)keys, sortBits);
        tISPC2 = std::min(tISPC2, get_elapsed_msec());
        if (argc != 3)
            progressBar (i, m);
    }
    ispc::radixSort_free();
    printf("[sort ispc + tasks]:\t[%.3f] msec [%.3f Mpair/s]\n", tISPC2, 1.0e-3*n/tISPC2);

    // Validate against std::sort over the keys alone.
    std::sort(keys_gold, keys_gold + n);
    for (int i = 0; i < n; i++)
        assert(keys[i].key == keys_gold[i]);

#if 0
    for (i = 0; i < m; i ++)
    {
        ispcMemcpy(code, code_orig, n*sizeof(unsigned int));
        reset_and_start_timer();
        sort_serial (n, code, order);
        tSerial += get_elapsed_msec();
        if (argc != 3)
            progressBar (i, m);
    }
    printf("[sort serial]:\t\t[%.3f] msec [%.3f Mpair/s]\n", tSerial, 1.0e-3*n*m/tSerial);
#ifndef _CUDA_
    printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", tSerial/tISPC1, tSerial/tISPC2);
#else
    printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", tSerial/tISPC2);
#endif
#endif

    // was `delete keys` etc.: memory from new[] must be released with
    // delete[] — plain delete on a new[] pointer is undefined behavior.
    delete[] keys;
    delete[] keys_orig;
    delete[] keys_gold;
    return 0;
}

View File

@@ -0,0 +1,401 @@
/*
Copyright (c) 2014, Evghenii Gaburov
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Based on radixSort from http://www.moderngpu.com
*/
#include "cuda_helpers.cuh"
#include <cassert>
#define NUMBITS 8
#define NUMDIGITS (1<<NUMBITS)
typedef long long Key;
// Atomically adds 'value' to *ptr and returns the previous value.
// Thin wrapper over CUDA atomicAdd, mirroring the ISPC intrinsic name
// so the .cu and .ispc kernels read the same.
__forceinline__ __device__ int atomic_add_global(int* ptr, int value)
{
  return atomicAdd(ptr, value);
}
// One step of a warp-level scan: shuffles 'partial' up by 'up_offset'
// lanes and adds it in only where the shuffle source was in range
// (predicate p set by shfl.up); lower lanes keep their value.
static __device__ __forceinline__ int shfl_scan_add_step(int partial, int up_offset)
{
  int result;
  asm(
      "{.reg .u32 r0;"
      ".reg .pred p;"
      "shfl.up.b32 r0|p, %1, %2, 0;"
      "@p add.u32 r0, r0, %3;"
      "mov.u32 %0, r0;}"
      : "=r"(result) : "r"(partial), "r"(up_offset), "r"(partial));
  return result;
}
// Exclusive prefix sum of 'value' across the lanes of a warp:
// 5 shuffle-add steps (2^5 = 32 lanes) build the inclusive scan,
// then the lane's own contribution is subtracted to make it exclusive.
__forceinline__ __device__ int exclusive_scan_add(int value)
{
  int mysum = value;
#pragma unroll
  for(int i = 0; i < 5; ++i)
    mysum = shfl_scan_add_step(mysum, 1 << i);
  return mysum - value;
}
// Radix-sort pass 1: each task copies its slice of keysAll into
// sortedAll and histograms the NUMBITS-wide digit at 'bit' into a
// per-block counter array (countsAll) and the global histogram
// (countsGlobal). taskIndex/taskCount/programIndex/programCount come
// from cuda_helpers.cuh — presumably mapping to block/lane indices;
// confirm against that header.
__global__
void countPass(
    const Key keysAll[],
    Key sortedAll[],
    const int bit,
    const int numElements,
    int countsAll[],
    int countsGlobal[])
{
  const int blkIdx = taskIndex;
  const int numBlocks = taskCount;
  const int blkDim = (numElements + numBlocks - 1) / numBlocks;  // elements per block, rounded up
  const int mask = (1 << NUMBITS) - 1;
  const Key * keys = keysAll + blkIdx*blkDim;
  Key * sorted = sortedAll + blkIdx*blkDim;
  int * counts = countsAll + blkIdx*NUMDIGITS;   // this block's private histogram
  const int nloc = min(numElements - blkIdx*blkDim, blkDim);  // last block may be short
#pragma unroll 8
  for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
    counts[digit] = 0;
  for (int i = programIndex; i < nloc; i += programCount)
    if (i < nloc)
    {
      sorted[i] = keys[i];
      const int key = mask & ((unsigned int)keys[i] >> bit);  // digit value at 'bit'
      atomic_add_global(&counts[key], 1);
    }
  // fold the per-block histogram into the global one
#pragma unroll 8
  for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
    atomic_add_global(&countsGlobal[digit], counts[digit]);
}
// Radix-sort pass 3: scatters each element of this task's slice to its
// final position for the current digit, using the per-(block,digit)
// offsets produced by the exclusive-scan stage. Offsets are mutated as
// elements are placed, so equal digits stay in input order.
__global__
void sortPass(
    Key keysAll[],
    Key sorted[],
    int bit,
    int numElements,
    int digitOffsetsAll[])
{
  const int blkIdx = taskIndex;
  const int numBlocks = taskCount;
  const int blkDim = (numElements + numBlocks - 1) / numBlocks;
  const int keyIndex = blkIdx * blkDim;
  Key * keys = keysAll + keyIndex;
  const int nloc = min(numElements - keyIndex, blkDim);
  const int mask = (1 << NUMBITS) - 1;
  /* copy digit offset from Gmem to Lmem */
#if 1
  // 4 warps' worth of offsets; warpIdx (from cuda_helpers.cuh) selects
  // this warp's NUMDIGITS-entry slice
  __shared__ int digitOffsets_sh[NUMDIGITS*4];
  volatile int *digitOffsets = digitOffsets_sh + warpIdx*NUMDIGITS;
  for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
    digitOffsets[digit] = digitOffsetsAll[blkIdx*NUMDIGITS + digit];
#else
  int *digitOffsets = &digitOffsetsAll[blkIdx*NUMDIGITS];
#endif
  for (int i = programIndex; i < nloc; i += programCount)
    if (i < nloc)
    {
      const int key = mask & ((unsigned int)keys[i] >> bit);
      int scatter;
      /* serialize the lanes so each gets a unique scatter slot;
         not a vector friendly loop */
#pragma unroll 1 /* needed, otherwise compiler unroll and optimizes the result :S */
      for (int iv = 0; iv < programCount; iv++)
        if (programIndex == iv)
          scatter = digitOffsets[key]++;
      sorted [scatter] = keys[i];
    }
}
// Scan stage 1: each task computes, per digit, an exclusive prefix sum
// of the per-block counts over its contiguous range of blocks
// [bbeg, bend), and stores the range total in partialSum for the
// global stage.
__global__
void partialScanLocal(
    int numBlocks,
    int excScanAll[],
    int countsAll[],
    int partialSumAll[])
{
  const int blkIdx = taskIndex;
  const int blkDim = (numBlocks+taskCount-1)/taskCount;  // blocks handled per task
  const int bbeg = blkIdx * blkDim;
  const int bend = min(bbeg + blkDim, numBlocks);
  // reinterpret the flat arrays as [block][digit] tables
  int (* countsBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])countsAll;
  int (* excScanBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])excScanAll;
  int (* partialSum)[NUMDIGITS] = ( int (*)[NUMDIGITS])partialSumAll;
#pragma unroll 8
  for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
  {
    // block 0 is seeded with the global digit base already stored there
    int prev = bbeg == 0 ? excScanBlock[0][digit] : 0;
    for ( int block = bbeg; block < bend; block++)
    {
      const int y = countsBlock[block][digit];
      excScanBlock[block][digit] = prev;
      prev += y;
    }
    // total for this range = last exclusive value + last count
    partialSum[blkIdx][digit] = excScanBlock[bend-1][digit] + countsBlock[bend-1][digit];
  }
}
// Scan stage 2: one task per digit runs an exclusive prefix sum over
// the per-task partial sums, carrying the running total across chunks
// of programCount blocks via a warp shuffle from the last lane.
__global__
void partialScanGlobal(
    const int numBlocks,
    int partialSumAll[],
    int prefixSumAll[])
{
  int (* partialSum)[NUMDIGITS] = ( int (*)[NUMDIGITS])partialSumAll;
  int (* prefixSum)[NUMDIGITS] = ( int (*)[NUMDIGITS]) prefixSumAll;
  const int digit = taskIndex;
  int carry = 0;
  for (int block = programIndex; block < numBlocks; block += programCount)
  {
    const int value = partialSum[block][digit];
    const int scan = exclusive_scan_add(value);
    if (block < numBlocks)   // NOTE(review): redundant, loop condition already guarantees this
      prefixSum[block][digit] = scan + carry;
    carry += __shfl(scan+value, programCount-1);  // chunk total from the last lane
  }
}
// Scan stage 3: adds each range's global carry (from partialScanGlobal)
// back into the local exclusive scans, completing the per-(block,digit)
// scatter offsets.
__global__
void completeScanGlobal(
    int numBlocks,
    int excScanAll[],
    int carryValueAll[])
{
  const int blkIdx = taskIndex;
  const int blkDim = (numBlocks+taskCount-1)/taskCount;
  const int bbeg = blkIdx * blkDim;
  const int bend = min(bbeg + blkDim, numBlocks);
  int (* excScanBlock)[NUMDIGITS] = ( int (*)[NUMDIGITS])excScanAll;
  int (* carryValue)[NUMDIGITS] = ( int (*)[NUMDIGITS])carryValueAll;
#pragma unroll 8
  for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
  {
    const int carry = carryValue[blkIdx][digit];
    for ( int block = bbeg; block < bend; block++)
      excScanBlock[block][digit] += carry;
  }
}
// Runs the three-stage exclusive scan over the per-(block,digit)
// counts: local scans, a global scan of the partial sums, then adding
// the global carry back into each local range. 'launch'/'sync' are
// macros from cuda_helpers.cuh. 'scale' groups 8 blocks per scan task.
__device__ static
inline void radixExclusiveScan(
    const int numBlocks,
    int excScanPtr[],
    int countsPtr[],
    int partialSum[],
    int prefixSum[])
{
  const int scale = 8;
  launch (numBlocks/scale, 1,1, partialScanLocal)(numBlocks, excScanPtr, countsPtr, partialSum);
  sync;
  launch (NUMDIGITS,1,1,partialScanGlobal) (numBlocks/scale, partialSum, prefixSum);
  sync;
  launch (numBlocks/scale,1,1, completeScanGlobal) (numBlocks, excScanPtr, prefixSum);
  sync;
}
// Scratch state set up by radixSort_alloc___export: one device pool
// carved into the per-stage arrays below. nXxx values are element
// counts (ints), not bytes.
__device__ static int * memoryPool = NULL;
__device__ static int numBlocks;        // tasks launched per sorting pass
__device__ static int nSharedCounts;
__device__ static int nCountsGlobal;
__device__ static int nExcScan;
__device__ static int nCountsBlock;
__device__ static int nPartialSum;
__device__ static int nPrefixSum;
__device__ static int * sharedCounts;
__device__ static int * countsGlobal;   // global digit histogram (NUMDIGITS entries)
__device__ static int * excScan;        // per-(block,digit) exclusive scan / scatter offsets
__device__ static int * counts;         // per-block digit histograms
__device__ static int * partialSum;
__device__ static int * prefixSum;
__device__ static int numElementsBuf = 0;  // capacity of bufKeys (0 = not allocated)
__device__ static Key * bufKeys;           // secondary buffer for ping-ponging keys
// Sizes the scratch pool and carves it into the per-stage arrays.
// NOTE(review): parameter 'n' is unused — the scratch size depends
// only on numBlocks and NUMDIGITS, not on the element count.
__global__
void radixSort_alloc___export(const int n)
{
  assert(memoryPool == NULL);   // double allocation is a bug
  numBlocks = 13*32*4;  // NOTE(review): hard-coded, presumably for a 13-SM Kepler GPU — confirm
  nSharedCounts = NUMDIGITS*numBlocks;
  nCountsGlobal = NUMDIGITS;
  nExcScan = NUMDIGITS*numBlocks;
  nCountsBlock = NUMDIGITS*numBlocks;
  nPartialSum = NUMDIGITS*numBlocks;
  nPrefixSum = NUMDIGITS*numBlocks;
  const int nalloc =
    nSharedCounts +
    nCountsGlobal +
    nExcScan +
    nCountsBlock +
    nPartialSum +
    nPrefixSum;
  if (programIndex == 0)
    memoryPool = new int[nalloc];   // single allocation by lane 0
  // carve the pool into consecutive sub-arrays
  sharedCounts = memoryPool;
  countsGlobal = sharedCounts + nSharedCounts;
  excScan = countsGlobal + nCountsGlobal;
  counts = excScan + nExcScan;
  partialSum = counts + nCountsBlock;
  prefixSum = partialSum + nPartialSum;
}
// Host entry point: runs the device-side allocator on one 32-thread block.
extern "C"
void radixSort_alloc(const int n)
{
  radixSort_alloc___export<<<1,32>>>(n);
  sync;
}
// Releases the secondary key buffer if allocated. Lane 0 frees,
// mirroring the lane-0 allocation in radixSort___export; numElementsBuf
// is reset so the buffer is re-created on the next sort.
__device__ static
void radixSort_freeBufKeys()
{
  if (numElementsBuf > 0)
  {
    if (programIndex == 0)
      delete [] bufKeys;   // allocated with new Key[], so must use delete[]
    numElementsBuf = 0;
  }
}
// Device-side teardown: releases the scratch pool and the key buffer.
__global__ void radixSort_free___export()
{
  assert(memoryPool != NULL);   // free without alloc is a bug
  if (programIndex == 0)
    delete [] memoryPool;   // allocated with new int[], so must use delete[]
  memoryPool = NULL;
  radixSort_freeBufKeys();
}
// Host entry point: runs the device-side teardown on one 32-thread block.
extern "C"
void radixSort_free()
{
  radixSort_free___export<<<1,32>>>();
  sync;
}
// Device-side driver: LSD radix sort of 'keys' in NUMBITS-bit passes.
// Each pass copies keys into bufKeys while histogramming (countPass),
// scans the histograms into scatter offsets, then scatters bufKeys
// back into keys (sortPass) — so keys holds the result after each pass.
__global__ void radixSort___export(
    const int numElements,
    Key keys[],
    const int nBits)
{
#ifdef __NVPTX__
  assert((numBlocks & 3) == 0); /* task granularity on Kepler is 4 */
#endif
  // (re)allocate the secondary buffer if it is missing or too small
  if (numElementsBuf < numElements)
    radixSort_freeBufKeys();
  if (numElementsBuf == 0)
  {
    numElementsBuf = numElements;
    if (programIndex == 0)
      bufKeys = new Key[numElementsBuf];
  }
  const int blkDim = (numElements + numBlocks - 1) / numBlocks;  // NOTE(review): unused
  for ( int bit = 0; bit < nBits; bit += NUMBITS)
  {
    /* initialize histogram for each digit */
    for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
      countsGlobal[digit] = 0;
    /* compute histogram for each digit */
    launch (numBlocks,1,1, countPass)(keys, bufKeys, bit, numElements, counts, countsGlobal);
    sync;
    /* exclusive scan on global histogram */
    int carry = 0;
    excScan[0] = 0;
#pragma unroll 8
    for (int digit = programIndex; digit < NUMDIGITS; digit += programCount)
    {
      const int value = countsGlobal[digit];
      const int scan = exclusive_scan_add(value);
      excScan[digit] = scan + carry;
      carry += __shfl(scan+value, programCount-1);
    }
    /* computing offsets for each digit */
    radixExclusiveScan(numBlocks, excScan, counts, partialSum, prefixSum);
    /* sorting */
    launch (numBlocks,1,1,
            sortPass)(
                bufKeys,
                keys,
                bit,
                numElements,
                excScan);
    sync;
  }
}
// Host entry point: configures the cache and runs the device-side
// sort driver on one 32-thread block (which launches the real work).
extern "C"
void radixSort(
    const int numElements,
    Key keys[],
    const int nBits)
{
  cudaDeviceSetCacheConfig ( cudaFuncCachePreferEqual );
  radixSort___export<<<1,32>>>(numElements, keys, nBits);
  sync;
}

View File

@@ -0,0 +1,337 @@
/*
Copyright (c) 2014, Evghenii Gaburov
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Based on radixSort from http://www.moderngpu.com
*/
#define NUMBITS 8
#define NUMDIGITS (1<<NUMBITS)
typedef int64 Key;
// Radix-sort pass 1: each task copies its slice of keysAll into
// sortedAll and histograms the NUMBITS-wide digit at 'bit' into its
// private counts[] and the global histogram countsGlobal[].
task
void countPass(
    const uniform Key keysAll[],
    uniform Key sortedAll[],
    const uniform int bit,
    const uniform int numElements,
    uniform int countsAll[],
    uniform int countsGlobal[])
{
  const uniform int blockIdx = taskIndex;
  const uniform int numBlocks = taskCount;
  const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks;  // slice size, rounded up
  const uniform int mask = (1 << NUMBITS) - 1;
  const uniform Key * uniform keys = keysAll + blockIdx*blockDim;
  uniform Key * uniform sorted = sortedAll + blockIdx*blockDim;
  uniform int * uniform counts = countsAll + blockIdx*NUMDIGITS;  // this task's histogram
  const uniform int nloc = min(numElements - blockIdx*blockDim, blockDim);  // last slice may be short
  foreach (digit = 0 ... NUMDIGITS)
    counts[digit] = 0;
  foreach (i = 0 ... nloc)
  {
    sorted[i] = keys[i];
    const int key = mask & ((unsigned int)keys[i] >> bit);  // digit value at 'bit'
#ifdef __NVPTX__
    atomic_add_global(&counts[key], 1);
#else
    atomic_add_local(&counts[key], 1);
#endif
  }
  // fold the per-task histogram into the global one
  foreach (digit = 0 ... NUMDIGITS)
    atomic_add_global(&countsGlobal[digit], counts[digit]);
}
// Radix-sort pass 3: scatters each element of this task's slice to its
// final position for the current digit, using the per-(block,digit)
// offsets from the scan stage. foreach_active serializes the lanes so
// each element claims a unique, stable scatter slot.
task
void sortPass(
    uniform Key keysAll[],
    uniform Key sorted[],
    uniform int bit,
    uniform int numElements,
    uniform int digitOffsetsAll[])
{
  const uniform int blockIdx = taskIndex;
  const uniform int numBlocks = taskCount;
  const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks;
  const uniform int keyIndex = blockIdx * blockDim;
  uniform Key * uniform keys = keysAll + keyIndex;
  const uniform int nloc = min(numElements - keyIndex, blockDim);
  const uniform int mask = (1 << NUMBITS) - 1;
  /* copy digit offset from Gmem to Lmem */
#if 1
  uniform int digitOffsets[NUMDIGITS];
  foreach (digit = 0 ... NUMDIGITS)
    digitOffsets[digit] = digitOffsetsAll[blockIdx*NUMDIGITS + digit];
#else
  uniform int * uniform digitOffsets = &digitOffsetsAll[blockIdx*NUMDIGITS];
#endif
  foreach (i = 0 ... nloc)
  {
    const int key = mask & ((unsigned int)keys[i] >> bit);
    int scatter;
    /* not a vector friendly loop: one lane at a time takes an offset */
    foreach_active(iv)
      scatter = digitOffsets[key]++;
    sorted[scatter] = keys[i];
  }
}
// Scan stage 1: each task computes, per digit, an exclusive prefix sum
// of the per-block counts over its range of blocks [bbeg, bend), and
// stores the range total in partialSum for the global stage.
task
void partialScanLocal(
    uniform int numBlocks,
    uniform int excScanAll[],
    uniform int countsAll[],
    uniform int partialSumAll[])
{
  const uniform int blockIdx = taskIndex;
  const uniform int blockDim = (numBlocks+taskCount-1)/taskCount;  // blocks per task
  const uniform int bbeg = blockIdx * blockDim;
  const uniform int bend = min(bbeg + blockDim, numBlocks);
  // reinterpret the flat arrays as [block][digit] tables
  uniform int (* uniform countsBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])countsAll;
  uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll;
  uniform int (* uniform partialSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])partialSumAll;
  foreach (digit = 0 ... NUMDIGITS)
  {
    // block 0 is seeded with the global digit base already stored there
    int prev = bbeg == 0 ? excScanBlock[0][digit] : 0;
    for (uniform int block = bbeg; block < bend; block++)
    {
      const int y = countsBlock[block][digit];
      excScanBlock[block][digit] = prev;
      prev += y;
    }
    // range total = last exclusive value + last count
    partialSum[blockIdx][digit] = excScanBlock[bend-1][digit] + countsBlock[bend-1][digit];
  }
}
// Scan stage 2: one task per digit runs an exclusive prefix sum over
// the per-task partial sums, carrying the running total across chunks
// of programCount blocks via a broadcast from the last lane.
task
void partialScanGlobal(
    const uniform int numBlocks,
    uniform int partialSumAll[],
    uniform int prefixSumAll[])
{
  uniform int (* uniform partialSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])partialSumAll;
  uniform int (* uniform prefixSum)[NUMDIGITS] = (uniform int (*)[NUMDIGITS]) prefixSumAll;
  const uniform int digit = taskIndex;
  int carry = 0;
  foreach (block = 0 ... numBlocks)
  {
    const int value = partialSum[block][digit];
    const int scan = exclusive_scan_add(value);
    prefixSum[block][digit] = scan + carry;
    carry += broadcast(scan+value, programCount-1);  // chunk total from the last lane
  }
}
// Scan stage 3: adds each range's global carry (from partialScanGlobal)
// back into the local exclusive scans, completing the scatter offsets.
task
void completeScanGlobal(
    uniform int numBlocks,
    uniform int excScanAll[],
    uniform int carryValueAll[])
{
  const uniform int blockIdx = taskIndex;
  const uniform int blockDim = (numBlocks+taskCount-1)/taskCount;
  const uniform int bbeg = blockIdx * blockDim;
  const uniform int bend = min(bbeg + blockDim, numBlocks);
  uniform int (* uniform excScanBlock)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])excScanAll;
  uniform int (* uniform carryValue)[NUMDIGITS] = (uniform int (*)[NUMDIGITS])carryValueAll;
  foreach (digit = 0 ... NUMDIGITS)
  {
    const int carry = carryValue[blockIdx][digit];
    for (uniform int block = bbeg; block < bend; block++)
      excScanBlock[block][digit] += carry;
  }
}
// Runs the three-stage exclusive scan over the per-(block,digit)
// counts: local scans, a global scan of partial sums, then adding the
// global carry back into each range. 'scale' groups 8 blocks per task.
static
inline void radixExclusiveScan(
    const uniform int numBlocks,
    uniform int excScanPtr[],
    uniform int countsPtr[],
    uniform int partialSum[],
    uniform int prefixSum[])
{
  const uniform int scale = 8;
  launch [numBlocks/scale] partialScanLocal(numBlocks, excScanPtr, countsPtr, partialSum);
  sync;
  launch [NUMDIGITS] partialScanGlobal(numBlocks/scale, partialSum, prefixSum);
  sync;
  launch [numBlocks/scale] completeScanGlobal(numBlocks, excScanPtr, prefixSum);
  sync;
}
// Scratch state set up by radixSort_alloc: one pool carved into the
// per-stage arrays below. nXxx values are element counts, not bytes.
static uniform int * uniform memoryPool = NULL;
static uniform int numBlocks;        // tasks launched per sorting pass
static uniform int nSharedCounts;
static uniform int nCountsGlobal;
static uniform int nExcScan;
static uniform int nCountsBlock;
static uniform int nPartialSum;
static uniform int nPrefixSum;
static uniform int * uniform sharedCounts;
static uniform int * uniform countsGlobal;  // global digit histogram (NUMDIGITS entries)
static uniform int * uniform excScan;       // per-(block,digit) scatter offsets
static uniform int * uniform counts;        // per-block digit histograms
static uniform int * uniform partialSum;
static uniform int * uniform prefixSum;
static uniform int numElementsBuf = 0;      // capacity of bufKeys (0 = not allocated)
static uniform Key * uniform bufKeys;       // secondary buffer for ping-ponging keys
// Sizes the scratch pool and carves it into the per-stage arrays.
// NOTE(review): parameter 'n' is unused — the scratch size depends
// only on numBlocks and NUMDIGITS, not on the element count.
export void radixSort_alloc(const uniform int n)
{
  assert(memoryPool == NULL);   // double allocation is a bug
  numBlocks = num_cores()*4;
#ifdef __NVPTX__
  numBlocks = 13*32*4; //num_cores()*4;
#endif
  nSharedCounts = NUMDIGITS*numBlocks;
  nCountsGlobal = NUMDIGITS;
  nExcScan = NUMDIGITS*numBlocks;
  nCountsBlock = NUMDIGITS*numBlocks;
  nPartialSum = NUMDIGITS*numBlocks;
  nPrefixSum = NUMDIGITS*numBlocks;
  const uniform int nalloc =
    nSharedCounts +
    nCountsGlobal +
    nExcScan +
    nCountsBlock +
    nPartialSum +
    nPrefixSum;
  memoryPool = uniform new uniform int[nalloc];
  // carve the pool into consecutive sub-arrays
  sharedCounts = memoryPool;
  countsGlobal = sharedCounts + nSharedCounts;
  excScan = countsGlobal + nCountsGlobal;
  counts = excScan + nExcScan;
  partialSum = counts + nCountsBlock;
  prefixSum = partialSum + nPartialSum;
}
// Releases the secondary key buffer if allocated; numElementsBuf is
// reset so the buffer is re-created on the next sort.
static
void radixSort_freeBufKeys()
{
  if (numElementsBuf > 0)
  {
    delete bufKeys;
    numElementsBuf = 0;
  }
}
// Releases the scratch pool and the key buffer.
export void radixSort_free()
{
  assert(memoryPool != NULL);   // free without alloc is a bug
  delete memoryPool;
  memoryPool = NULL;
  radixSort_freeBufKeys();
}
// LSD radix sort of 'keys' in NUMBITS-bit passes. Each pass copies
// keys into bufKeys while histogramming (countPass), scans the
// histograms into scatter offsets, then scatters bufKeys back into
// keys (sortPass) — so keys holds the result after every pass.
export void radixSort(
    const uniform int numElements,
    uniform Key keys[],
    const uniform int nBits)
{
#ifdef __NVPTX__
  assert((numBlocks & 3) == 0); /* task granularity on Kepler is 4 */
#endif
  // (re)allocate the secondary buffer if it is missing or too small
  if (numElementsBuf < numElements)
    radixSort_freeBufKeys();
  if (numElementsBuf == 0)
  {
    numElementsBuf = numElements;
    bufKeys = uniform new uniform Key[numElementsBuf];
  }
  const uniform int blockDim = (numElements + numBlocks - 1) / numBlocks;  // NOTE(review): unused
  for (uniform int bit = 0; bit < nBits; bit += NUMBITS)
  {
    /* initialize histogram for each digit */
    foreach (digit = 0 ... NUMDIGITS)
      countsGlobal[digit] = 0;
    /* compute histogram for each digit */
    launch [numBlocks] countPass(keys, bufKeys, bit, numElements, counts, countsGlobal);
    sync;
    /* exclusive scan on global histogram */
    int carry = 0;
    excScan[0] = 0;
    foreach (digit = 0 ... NUMDIGITS)
    {
      const int value = countsGlobal[digit];
      const int scan = exclusive_scan_add(value);
      excScan[digit] = scan + carry;
      carry += broadcast(scan+value, programCount-1);
    }
    /* computing offsets for each digit */
    radixExclusiveScan(numBlocks, excScan, counts, partialSum, prefixSum);
    /* sorting */
    launch [numBlocks]
      sortPass(
          bufKeys,
          keys,
          bit,
          numElements,
          excScan);
    sync;
  }
}

2
examples/portable/rt/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
rt
*.ppm

View File

@@ -0,0 +1,8 @@
EXAMPLE=rt
CPP_SRC=rt.cpp
ISPC_SRC=rt.ispc
ISPC_IA_TARGETS=avx1-i32x8
ISPC_ARM_TARGETS=neon
include ../common_cpu.mk

View File

@@ -0,0 +1,7 @@
EXAMPLE=rt
CXX_SRC=rt.cpp
ISPC_SRC=rt.ispc
ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h
ISPC_TARGET=generic-16
include ../common_knc.mk

View File

@@ -0,0 +1,13 @@
PROG=rt
ISPC_SRC=rt.ispc
CU_SRC=rt.cu
CXX_SRC=rt.cpp
PTXCC_REGMAX=32
#LLVM_GPU=1
NVVM_GPU=1
include ../common_ptx.mk

View File

@@ -0,0 +1 @@
../../rt/cornell.bvh

View File

@@ -0,0 +1 @@
../../rt/cornell.camera

229
examples/portable/rt/rt.cpp Normal file
View File

@@ -0,0 +1,229 @@
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#define NOMINMAX
#pragma warning (disable: 4244)
#pragma warning (disable: 4305)
#endif
#include <cstdio>
#include <cmath>
#include <algorithm>
#include <cassert>
#include <cstring>
#include <sys/types.h>
#include "timing.h"
#include "rt_ispc.h"
#include "ispc_malloc.h"
using namespace ispc;
typedef unsigned int uint;
// Writes the object-id image as a binary PPM file, hashing each id's
// bits into a pseudo-random RGB color (bit 3i -> red, 3i+1 -> green,
// 3i+2 -> blue, filled from the high bit down). The depth image is
// accepted for symmetry but not written. Exits on fopen failure.
static void writeImage(int *idImage, float *depthImage, int width, int height,
                       const char *filename) {
    FILE *f = fopen(filename, "wb");
    if (!f) {
        perror(filename);
        exit(1);
    }
    fprintf(f, "P6\n%d %d\n255\n", width, height);
    const int nPixels = width * height;
    for (int pix = 0; pix < nPixels; ++pix) {
        // spread the id's bits over the three color channels
        const int id = idImage[pix];
        unsigned char rgb[3] = { 0, 0, 0 };
        for (int bit = 0; bit < 8; ++bit) {
            for (int c = 0; c < 3; ++c) {
                const int b = (id >> (3 * bit + c)) & 1;
                rgb[c] |= (unsigned char)(b << (7 - bit));
            }
        }
        fputc(rgb[0], f);
        fputc(rgb[1], f);
        fputc(rgb[2], f);
    }
    fclose(f);
    printf("Wrote image file %s\n", filename);
}
// Prints command-line help to stderr and exits with failure status.
static void usage() {
    fprintf(stderr, "rt <scene name base> [--scale=<factor>] [ispc iterations] [tasks iterations] [serial iterations]\n");
    exit(1);
}
int main(int argc, char *argv[]) {
static unsigned int test_iterations[] = {3, 7, 1};
float scale = 1.f;
const char *filename = NULL;
if (argc < 2) usage();
filename = argv[1];
if (argc > 2) {
if (strncmp(argv[2], "--scale=", 8) == 0) {
scale = atof(argv[2] + 8);
}
}
if ((argc == 6) || (argc == 5)) {
for (int i = 0; i < 3; i++) {
test_iterations[i] = atoi(argv[argc - 3 + i]);
}
}
#define READ(var, n) \
if (fread(&(var), sizeof(var), n, f) != (unsigned int)n) { \
fprintf(stderr, "Unexpected EOF reading scene file\n"); \
return 1; \
} else /* eat ; */
//
// Read the camera specification information from the camera file
//
char fnbuf[1024];
sprintf(fnbuf, "%s.camera", filename);
FILE *f = fopen(fnbuf, "rb");
if (!f) {
perror(fnbuf);
return 1;
}
//
// Nothing fancy, and trouble if we run on a big-endian system, just
// fread in the bits
//
int baseWidth, baseHeight;
// float camera2world[4][4], raster2camera[4][4];
float *camera2world_ispc = new float[4*4];
float *raster2camera_ispc = new float[4*4];
float (*camera2world )[4] = (float (*)[4])camera2world_ispc;
float (*raster2camera)[4] = (float (*)[4])raster2camera_ispc;
READ(baseWidth, 1);
READ(baseHeight, 1);
READ(camera2world[0][0], 16);
READ(raster2camera[0][0], 16);
//
// Read in the serialized BVH
//
sprintf(fnbuf, "%s.bvh", filename);
f = fopen(fnbuf, "rb");
if (!f) {
perror(fnbuf);
return 1;
}
// The BVH file starts with an int that gives the total number of BVH
// nodes
uint nNodes;
READ(nNodes, 1);
LinearBVHNode *nodes = new LinearBVHNode[nNodes];
for (unsigned int i = 0; i < nNodes; ++i) {
// Each node is 6x floats for a boox, then an integer for an offset
// to the second child node, then an integer that encodes the type
// of node, the total number of int it if a leaf node, etc.
float b[6];
READ(b[0], 6);
nodes[i].bounds[0][0] = b[0];
nodes[i].bounds[0][1] = b[1];
nodes[i].bounds[0][2] = b[2];
nodes[i].bounds[1][0] = b[3];
nodes[i].bounds[1][1] = b[4];
nodes[i].bounds[1][2] = b[5];
READ(nodes[i].offset, 1);
READ(nodes[i].nPrimitives, 1);
READ(nodes[i].splitAxis, 1);
READ(nodes[i].pad, 1);
}
// And then read the triangles
uint nTris;
READ(nTris, 1);
Triangle *triangles = new Triangle[nTris];
for (uint i = 0; i < nTris; ++i) {
// 9x floats for the 3 vertices
float v[9];
READ(v[0], 9);
float *vp = v;
for (int j = 0; j < 3; ++j) {
triangles[i].p[j][0] = *vp++;
triangles[i].p[j][1] = *vp++;
triangles[i].p[j][2] = *vp++;
}
// And create an object id
triangles[i].id = i+1;
}
fclose(f);
int height = int(baseHeight * scale);
int width = int(baseWidth * scale);
// allocate images; one to hold hit object ids, one to hold depth to
// the first interseciton
int *id = new int[width*height];
float *image = new float[width*height];
ispc_memset(id, 0, width*height*sizeof(int));
ispc_memset(image, 0, width*height*sizeof(float));
//
// Run 3 iterations with ispc + 1 core, record the minimum time
//
double minTimeISPCtasks = 1e30;
for (int i = 0; i < test_iterations[1]; ++i) {
reset_and_start_timer();
raytrace_ispc_tasks(width, height, baseWidth, baseHeight, raster2camera,
camera2world, image, id, nodes, triangles);
double dt = get_elapsed_msec();
printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", dt);
minTimeISPCtasks = std::min(dt, minTimeISPCtasks);
}
printf("[rt ispc + tasks]:\t\t[%.3f] msec for %d x %d image\n",
minTimeISPCtasks, width, height);
writeImage(id, image, width, height, "rt-ispc-tasks.ppm");
return 0;
}

373
examples/portable/rt/rt.cu Normal file
View File

@@ -0,0 +1,373 @@
#include "cuda_helpers.cuh"
#define float3 Float3
// Minimal float3 replacement with the element-wise arithmetic the ray
// tracer needs ('float3' is #defined to this type above).
struct Float3
{
  float x,y,z;
  // element-wise addition
  __device__ friend Float3 operator+(const Float3 a, const Float3 b)
  {
    Float3 c;
    c.x = a.x+b.x;
    c.y = a.y+b.y;
    c.z = a.z+b.z;
    return c;
  }
  // element-wise subtraction
  __device__ friend Float3 operator-(const Float3 a, const Float3 b)
  {
    Float3 c;
    c.x = a.x-b.x;
    c.y = a.y-b.y;
    c.z = a.z-b.z;
    return c;
  }
  // element-wise division
  __device__ friend Float3 operator/(const Float3 a, const Float3 b)
  {
    Float3 c;
    c.x = a.x/b.x;
    c.y = a.y/b.y;
    c.z = a.z/b.z;
    return c;
  }
  // scalar-over-vector division (used for reciprocal directions)
  __device__ friend Float3 operator/(const float a, const Float3 b)
  {
    Float3 c;
    c.x = a/b.x;
    c.y = a/b.y;
    c.z = a/b.z;
    return c;
  }
  // element-wise multiplication
  __device__ friend Float3 operator*(const Float3 a, const Float3 b)
  {
    Float3 c;
    c.x = a.x*b.x;
    c.y = a.y*b.y;
    c.z = a.z*b.z;
    return c;
  }
  // vector-times-scalar scaling
  __device__ friend Float3 operator*(const Float3 a, const float b)
  {
    Float3 c;
    c.x = a.x*b;
    c.y = a.y*b;
    c.z = a.z*b;
    return c;
  }
};
#define int8 char
#define int16 short
// A ray with precomputed reciprocal direction (invDir) and per-axis
// direction-sign flags (dirIsNeg*) used to order BVH child traversal;
// [mint, maxt] is the active parametric range and hitId records the id
// of the closest hit primitive (0 = no hit).
struct Ray {
  float3 origin, dir, invDir;
  unsigned int dirIsNeg0, dirIsNeg1, dirIsNeg2;
  float mint, maxt;
  int hitId;
};
// A triangle: 3 vertices padded to 4 floats each, plus an object id;
// pad brings the struct to 64 bytes (presumably for aligned loads —
// confirm against the loader in rt.cpp, which fills only p[j][0..2]).
struct Triangle {
  float p[3][4];
  int id;
  int pad[3];
};
// Flattened BVH node: bounds is the AABB ([0]=min corner, [1]=max);
// 'offset' is the first-primitive index when nPrimitives > 0 (leaf)
// or the second-child index for interior nodes (see BVHIntersect).
struct LinearBVHNode {
  float bounds[2][3];
  unsigned int offset; // num primitives for leaf, second child for interior
  unsigned int8 nPrimitives;
  unsigned int8 splitAxis;
  unsigned int16 pad;
};
// 3D cross product v1 x v2.
__device__
static inline float3 Cross(const float3 v1, const float3 v2) {
  float v1x = v1.x, v1y = v1.y, v1z = v1.z;
  float v2x = v2.x, v2y = v2.y, v2z = v2.z;
  float3 ret;
  ret.x = (v1y * v2z) - (v1z * v2y);
  ret.y = (v1z * v2x) - (v1x * v2z);
  ret.z = (v1x * v2y) - (v1y * v2x);
  return ret;
}
// 3D dot product a . b.
__device__
static inline float Dot(const float3 a, const float3 b) {
  return a.x * b.x + a.y * b.y + a.z * b.z;
}
// Builds the camera ray for raster position (x, y): transforms the
// raster coordinate into camera space (with homogeneous divide), then
// rotates the direction and reads the origin out of camera2world.
// Also precomputes invDir and the per-axis sign flags for traversal.
__device__
inline
static void generateRay( const float raster2camera[4][4],
    const float camera2world[4][4],
    float x, float y, Ray &ray) {
  ray.mint = 0.f;
  ray.maxt = 1e30f;
  ray.hitId = 0;
  // transform raster coordinate (x, y, 0) to camera space
  float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3];
  float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3];
  float camz = raster2camera[2][3];
  float camw = raster2camera[3][3];
  camx /= camw;
  camy /= camw;
  camz /= camw;
  // rotate the camera-space direction into world space
  ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy +
    camera2world[0][2] * camz;
  ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy +
    camera2world[1][2] * camz;
  ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy +
    camera2world[2][2] * camz;
  // the ray origin is the camera position (translation column, w-divided)
  ray.origin.x = camera2world[0][3] / camera2world[3][3];
  ray.origin.y = camera2world[1][3] / camera2world[3][3];
  ray.origin.z = camera2world[2][3] / camera2world[3][3];
  ray.invDir = 1.f / ray.dir;
#if 0
  ray.dirIsNeg[0] = any(ray.invDir.x < 0) ? 1 : 0;
  ray.dirIsNeg[1] = any(ray.invDir.y < 0) ? 1 : 0;
  ray.dirIsNeg[2] = any(ray.invDir.z < 0) ? 1 : 0;
#else
  // any() comes from cuda_helpers.cuh — flag is set if any lane's
  // direction component is negative
  ray.dirIsNeg0 = any(ray.invDir.x < 0) ? 1 : 0;
  ray.dirIsNeg1 = any(ray.invDir.y < 0) ? 1 : 0;
  ray.dirIsNeg2 = any(ray.invDir.z < 0) ? 1 : 0;
#endif
}
// Ray/AABB test: intersects the ray's [mint, maxt] range against the
// three axis-aligned slabs of 'bounds' and returns true if a non-empty
// overlap remains.
__device__
inline
static bool BBoxIntersect(const float bounds[2][3],
    const Ray &ray) {
  float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
  float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
  float t0 = ray.mint, t1 = ray.maxt;
  // Check all three axis-aligned slabs. Don't try to early out; it's
  // not worth the trouble
  float3 tNear = (bounds0 - ray.origin) * ray.invDir;
  float3 tFar = (bounds1 - ray.origin) * ray.invDir;
  // swap so tNear <= tFar on each axis (handles negative directions)
  if (tNear.x > tFar.x) {
    float tmp = tNear.x;
    tNear.x = tFar.x;
    tFar.x = tmp;
  }
  t0 = max(tNear.x, t0);
  t1 = min(tFar.x, t1);
  if (tNear.y > tFar.y) {
    float tmp = tNear.y;
    tNear.y = tFar.y;
    tFar.y = tmp;
  }
  t0 = max(tNear.y, t0);
  t1 = min(tFar.y, t1);
  if (tNear.z > tFar.z) {
    float tmp = tNear.z;
    tNear.z = tFar.z;
    tFar.z = tmp;
  }
  t0 = max(tNear.z, t0);
  t1 = min(tFar.z, t1);
  return (t0 <= t1);
}
// Ray/triangle test via barycentric coordinates: computes (b1, b2) and
// the hit parameter t; on a hit inside [mint, maxt] it shortens
// ray.maxt and records the triangle's id. All reject conditions are
// evaluated branch-free-ish by accumulating into 'hit'.
__device__
inline
static bool TriIntersect(const Triangle &tri, Ray &ray) {
  float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
  float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
  float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
  float3 e1 = p1 - p0;
  float3 e2 = p2 - p0;
  float3 s1 = Cross(ray.dir, e2);
  float divisor = Dot(s1, e1);
  bool hit = true;
  if (divisor == 0.)   // ray parallel to the triangle plane
    hit = false;
  float invDivisor = 1.f / divisor;
  // Compute first barycentric coordinate
  float3 d = ray.origin - p0;
  float b1 = Dot(d, s1) * invDivisor;
  if (b1 < 0. || b1 > 1.)
    hit = false;
  // Compute second barycentric coordinate
  float3 s2 = Cross(d, e1);
  float b2 = Dot(ray.dir, s2) * invDivisor;
  if (b2 < 0. || b1 + b2 > 1.)
    hit = false;
  // Compute _t_ to intersection point
  float t = Dot(e2, s2) * invDivisor;
  if (t < ray.mint || t > ray.maxt)
    hit = false;
  if (hit) {
    ray.maxt = t;
    ray.hitId = tri.id;
  }
  return hit;
}
// Traverses the flattened BVH with an explicit stack ('todo', caller
// provides 64 entries — see raytrace_tile) and tests the ray against
// each leaf's triangles. The near child (chosen via the ray's per-axis
// sign flag) is visited first; the far child is pushed. On return,
// r.maxt/r.hitId hold the closest hit. Returns whether anything hit.
__device__
inline
bool BVHIntersect(const LinearBVHNode nodes[],
    const Triangle tris[], Ray &r,
    int todo[]) {
  Ray ray = r;   // work on a local copy; write back maxt/hitId at the end
  bool hit = false;
  // Follow ray through BVH nodes to find primitive intersections
  int todoOffset = 0, nodeNum = 0;
  while (true) {
    // Check ray against BVH node
    LinearBVHNode node = nodes[nodeNum];
    if (any(BBoxIntersect(node.bounds, ray))) {
      unsigned int nPrimitives = node.nPrimitives;
      if (nPrimitives > 0) {
        // Intersect ray with primitives in leaf BVH node
        unsigned int primitivesOffset = node.offset;
        for ( unsigned int i = 0; i < nPrimitives; ++i) {
          if (TriIntersect(tris[primitivesOffset+i], ray))
            hit = true;
        }
        if (todoOffset == 0)
          break;
        nodeNum = todo[--todoOffset];
      }
      else {
        // Put far BVH node on _todo_ stack, advance to near node
        int dirIsNeg;
        if (node.splitAxis == 0) dirIsNeg = r.dirIsNeg0;
        if (node.splitAxis == 1) dirIsNeg = r.dirIsNeg1;
        if (node.splitAxis == 2) dirIsNeg = r.dirIsNeg2;
        if (dirIsNeg) {
          todo[todoOffset++] = nodeNum + 1;
          nodeNum = node.offset;
        }
        else {
          todo[todoOffset++] = node.offset;
          nodeNum = nodeNum + 1;
        }
      }
    }
    else {
      // missed this node's bounds: pop the next node or finish
      if (todoOffset == 0)
        break;
      nodeNum = todo[--todoOffset];
    }
  }
  r.maxt = ray.maxt;
  r.hitId = ray.hitId;
  return hit;
}
__device__
inline
// Render the tile [x0,x1) x [y0,y1): one ray per pixel, with pixel
// coordinates scaled so a (width x height) image samples the same view as
// a (baseWidth x baseHeight) one.  Hit distance goes to image[], the hit
// primitive id to id[].
static void raytrace_tile( int x0, int x1,
                           int y0, int y1,
                           int width, int height,
                           int baseWidth, int baseHeight,
                           const float raster2camera[4][4],
                           const float camera2world[4][4],
                           float image[], int id[],
                           const LinearBVHNode nodes[],
                           const Triangle triangles[]) {
    float widthScale = (float)(baseWidth) / (float)(width);
    float heightScale = (float)(baseHeight) / (float)(height);
#if 0
    // Heap-allocated per-thread BVH traversal stack (disabled).
    int * todo = new int[64];
#define ALLOC
#else
    int todo[64];   // per-thread BVH traversal stack
#endif
    for (int y = y0; y < y1; y++)
        for (int x = x0 + programIndex; x < x1; x += programCount)
            if (x < x1)
            {
                Ray ray;
                generateRay(raster2camera, camera2world, x*widthScale,
                            y*heightScale, ray);
                BVHIntersect(nodes, triangles, ray, todo);
                int offset = y * width + x;
                image[offset] = ray.maxt;   // distance of nearest hit
                id[offset] = ray.hitId;     // id of the primitive hit
            }
#ifdef ALLOC
    // Fixed: was 'delete todo' -- memory from new[] must be released with
    // the array form, delete[].
    delete[] todo;
#endif
}
__global__
// Task body: map taskIndex onto one (tileW x tileH) tile of the image and
// render it; tiles on the right/bottom edge are clamped to the image size.
void raytrace_tile_task( int width, int height,
                         int baseWidth, int baseHeight,
                         const float raster2camera[4][4],
                         const float camera2world[4][4],
                         float image[], int id[],
                         const LinearBVHNode nodes[],
                         const Triangle triangles[]) {
    const int tileW = 64, tileH = 8;   // must match dx, dy in the launcher below
    const int tilesPerRow = (width + (tileW - 1)) / tileW;
    const int tileCol = taskIndex % tilesPerRow;
    const int tileRow = taskIndex / tilesPerRow;
    const int x0 = tileCol * tileW;
    const int y0 = tileRow * tileH;
    const int x1 = min(x0 + tileW, width);
    const int y1 = min(y0 + tileH, height);
    raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
                  raster2camera, camera2world, image,
                  id, nodes, triangles);
}
// Device-side launcher: partition the image into 64x8 tiles and spawn one
// raytrace_tile_task per tile, then wait for them all to finish.
extern "C" __global__ void raytrace_ispc_tasks___export( int width, int height,
                                                         int baseWidth, int baseHeight,
                                                         const float raster2camera[4][4],
                                                         const float camera2world[4][4],
                                                         float image[], int id[],
                                                         const LinearBVHNode nodes[],
                                                         const Triangle triangles[]) {
    const int tileW = 64, tileH = 8;   // must match raytrace_tile_task
    const int tilesX = (width + (tileW - 1)) / tileW;
    const int tilesY = (height + (tileH - 1)) / tileH;
    const int taskCount = tilesX * tilesY;
    launch(taskCount,1,1,raytrace_tile_task)
        (width, height, baseWidth, baseHeight,
         raster2camera, camera2world,
         image, id, nodes, triangles);
    cudaDeviceSynchronize();
}
// Host entry point: run the device-side launcher with a single 32-thread
// block and wait until the device (and all tasks it spawned) is idle.
extern "C" __host__ void raytrace_ispc_tasks( int width, int height,
                                              int baseWidth, int baseHeight,
                                              const float raster2camera[4][4],
                                              const float camera2world[4][4],
                                              float image[], int id[],
                                              const LinearBVHNode nodes[],
                                              const Triangle triangles[]) {
    raytrace_ispc_tasks___export<<<1,32>>>(
        width, height, baseWidth, baseHeight,
        raster2camera, camera2world,
        image, id, nodes, triangles);
    cudaDeviceSynchronize();
}

View File

@@ -0,0 +1,351 @@
/*
Copyright (c) 2010-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Boolean type used across the kernels; int keeps the in-memory layout
// explicit (the 'bool' alternative is disabled).
#if 1
typedef int bool_t;
#else
typedef bool bool_t;
#endif
typedef float<3> float3;
// On the NVPTX target, data marked uniform_t is treated as varying;
// elsewhere it is plain uniform.
#ifdef __NVPTX__
#define uniform_t varying
#else
#define uniform_t uniform
#endif
// Three-component integer vector.
struct int3
{
    int x,y,z;
};
struct Ray {
    // invDir caches 1/dir for the slab tests in BBoxIntersect.
    float3 origin, dir, invDir;
    // Per axis: 1 if any lane's direction component is negative (set in
    // generateRay), used to pick BVH child traversal order.
    uniform unsigned int dirIsNeg[3];
    // Valid parametric range along the ray; maxt shrinks as hits are found.
    float mint, maxt;
    // Id of the closest primitive hit so far (0 = none yet).
    int hitId;
};
struct Triangle {
    float p[3][4];   // three vertices, each padded to 4 floats
    int id;          // primitive id reported on intersection
    int pad[3];      // pad the struct size to a 16-byte multiple
};
struct LinearBVHNode {
    float bounds[2][3];        // the two corners of the node's AABB
    // Leaf: offset of the node's first primitive in the triangle array.
    // Interior: index of the second child (first child follows the node).
    unsigned int offset;
    unsigned int8 nPrimitives; // primitive count; 0 marks an interior node
    unsigned int8 splitAxis;   // interior: partition axis (0/1/2)
    unsigned int16 pad;
};
// Cross product of two 3-vectors.
static inline float3 Cross(const float3 v1, const float3 v2) {
    float3 result;
    result.x = v1.y * v2.z - v1.z * v2.y;
    result.y = v1.z * v2.x - v1.x * v2.z;
    result.z = v1.x * v2.y - v1.y * v2.x;
    return result;
}
// Dot product of two 3-vectors.
static inline float Dot(const float3 a, const float3 b) {
    const float xyPart = a.x * b.x + a.y * b.y;
    return xyPart + a.z * b.z;
}
#if 1
inline
#endif
// Build the camera ray for raster-space position (x, y): initializes
// origin/dir in world space, the cached inverse direction, the per-axis
// direction signs, and the valid [mint, maxt] range.
static void generateRay(uniform const float raster2camera[4][4],
                        uniform const float camera2world[4][4],
                        float x, float y, Ray &ray) {
    ray.mint = 0.f;
    ray.maxt = 1e30f;
    ray.hitId = 0;
    // transform raster coordinate (x, y, 0) to camera space
    float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3];
    float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3];
    // NOTE(review): z and w take only the translation column -- assumes
    // raster2camera[2][0..1] and [3][0..1] are zero for these cameras.
    float camz = raster2camera[2][3];
    float camw = raster2camera[3][3];
    camx /= camw;   // homogeneous divide
    camy /= camw;
    camz /= camw;
    // Rotate the camera-space direction into world space.
    ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy +
        camera2world[0][2] * camz;
    ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy +
        camera2world[1][2] * camz;
    ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy +
        camera2world[2][2] * camz;
    // The camera position (translation column) is the ray origin.
    ray.origin.x = camera2world[0][3] / camera2world[3][3];
    ray.origin.y = camera2world[1][3] / camera2world[3][3];
    ray.origin.z = camera2world[2][3] / camera2world[3][3];
    ray.invDir = 1.f / ray.dir;   // cached for the slab tests
    // any() reduces over the gang: flag an axis if any lane is negative.
    ray.dirIsNeg[0] = any(ray.invDir.x < 0) ? 1 : 0;
    ray.dirIsNeg[1] = any(ray.invDir.y < 0) ? 1 : 0;
    ray.dirIsNeg[2] = any(ray.invDir.z < 0) ? 1 : 0;
}
#if 1
inline
#endif
// Slab test: returns nonzero iff the ray's [mint, maxt] interval overlaps
// the axis-aligned box 'bounds'.
static bool_t BBoxIntersect(const uniform float bounds[2][3],
                            const Ray &ray) {
    const uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
    const uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
    float t0 = ray.mint, t1 = ray.maxt;
    // Check all three axis-aligned slabs.  Don't try to early out; it's
    // not worth the trouble
    float3 tNear = (bounds0 - ray.origin) * ray.invDir;
    float3 tFar = (bounds1 - ray.origin) * ray.invDir;
    // Per axis: order the two plane hits, then shrink [t0,t1] to the overlap.
    if (tNear.x > tFar.x) {
        float tmp = tNear.x;
        tNear.x = tFar.x;
        tFar.x = tmp;
    }
    t0 = max(tNear.x, t0);
    t1 = min(tFar.x, t1);
    if (tNear.y > tFar.y) {
        float tmp = tNear.y;
        tNear.y = tFar.y;
        tFar.y = tmp;
    }
    t0 = max(tNear.y, t0);
    t1 = min(tFar.y, t1);
    if (tNear.z > tFar.z) {
        float tmp = tNear.z;
        tNear.z = tFar.z;
        tFar.z = tmp;
    }
    t0 = max(tNear.z, t0);
    t1 = min(tFar.z, t1);
    // Non-empty overlap interval means the box is hit.
    return (t0 <= t1);
}
#if 1
inline
#endif
// Moeller-Trumbore ray/triangle test.  Evaluates all terms with no early
// exit; on a hit, shortens ray.maxt and records the triangle id.
static bool_t TriIntersect(const uniform_t Triangle tri, Ray &ray) {
    const uniform_t float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
    const uniform_t float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
    const uniform_t float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
    const uniform_t float3 e1 = p1 - p0;   // triangle edge vectors
    const uniform_t float3 e2 = p2 - p0;
    float3 s1 = Cross(ray.dir, e2);
    float divisor = Dot(s1, e1);
    bool_t hit = true;
    // Degenerate / edge-on case: invDivisor below may be inf, but the
    // result is already rejected.
    if (divisor == 0.)
        hit = false;
    float invDivisor = 1.f / divisor;
    // Compute first barycentric coordinate
    float3 d = ray.origin - p0;
    float b1 = Dot(d, s1) * invDivisor;
    if (b1 < 0. || b1 > 1.)
        hit = false;
    // Compute second barycentric coordinate
    float3 s2 = Cross(d, e1);
    float b2 = Dot(ray.dir, s2) * invDivisor;
    if (b2 < 0. || b1 + b2 > 1.)
        hit = false;
    // Compute _t_ to intersection point
    float t = Dot(e2, s2) * invDivisor;
    if (t < ray.mint || t > ray.maxt)
        hit = false;
    if (hit) {
        ray.maxt = t;     // later hits must now be closer
        ray.hitId = tri.id;
    }
    return hit;
}
#if 1
inline
#endif
// Iterative BVH traversal over a fixed 64-entry node stack.  Works on a
// local copy of the ray (so maxt can shrink as closer hits are found) and
// writes maxt/hitId back to 'r'.  Returns nonzero on any hit.
bool_t
BVHIntersect(const uniform LinearBVHNode nodes[],
             const uniform Triangle tris[], Ray &r) {
    Ray ray = r;
    bool_t hit = false;
    // Follow ray through BVH nodes to find primitive intersections
    uniform int todoOffset = 0, nodeNum = 0;
    uniform int todo[64];   // stack of node indices still to visit
    while (true) {
        // Check ray against BVH node
        const uniform LinearBVHNode node = nodes[nodeNum];
        // any(): descend if the box is hit in at least one lane.
        if (any(BBoxIntersect(node.bounds, ray))) {
            const uniform unsigned int nPrimitives = node.nPrimitives;
            if (nPrimitives > 0) {
                // Intersect ray with primitives in leaf BVH node
                const uniform unsigned int primitivesOffset = node.offset;
                for (uniform_t unsigned int i = 0; i < nPrimitives; ++i) {
                    if (TriIntersect(tris[primitivesOffset+i], ray))
                        hit = true;
                }
                // Leaf done: pop the next node, or stop if the stack is empty.
                if (todoOffset == 0)
                    break;
                nodeNum = todo[--todoOffset];
            }
            else {
                // Put far BVH node on _todo_ stack, advance to near node
#if 0 /* fails */
                int dirIsNeg = r.dirIsNeg[node.splitAxis];
#else
                // The indexed form above is disabled ("fails") --
                // presumably it misbehaved on some target; pick the axis
                // with explicit comparisons instead.
                int dirIsNeg;
                if (node.splitAxis == 0) dirIsNeg = r.dirIsNeg[0];
                if (node.splitAxis == 1) dirIsNeg = r.dirIsNeg[1];
                if (node.splitAxis == 2) dirIsNeg = r.dirIsNeg[2];
#endif
                if (dirIsNeg) {
                    // Negative direction: second child (node.offset) is nearer.
                    todo[todoOffset++] = nodeNum + 1;
                    nodeNum = node.offset;
                }
                else {
                    todo[todoOffset++] = node.offset;
                    nodeNum = nodeNum + 1;
                }
            }
        }
        else {
            // Missed the node's bounds in every lane: pop or finish.
            if (todoOffset == 0)
                break;
            nodeNum = todo[--todoOffset];
        }
    }
    r.maxt = ray.maxt;
    r.hitId = ray.hitId;
    return hit;
}
#if 1
inline
#endif
// Render the tile [x0,x1) x [y0,y1) of a (width x height) image, with
// pixel coordinates scaled so the view matches a (baseWidth x baseHeight)
// image.  Hit distance goes to image[], hit primitive id to id[].
static void raytrace_tile(uniform int x0, uniform int x1,
                          uniform int y0, uniform int y1,
                          uniform int width, uniform int height,
                          uniform int baseWidth, uniform int baseHeight,
                          const uniform float raster2camera[4][4],
                          const uniform float camera2world[4][4],
                          uniform float image[], uniform int id[],
                          const uniform LinearBVHNode nodes[],
                          const uniform Triangle triangles[]) {
    const uniform float widthScale = (float)(baseWidth) / (float)(width);
    const uniform float heightScale = (float)(baseHeight) / (float)(height);
    // One ray per pixel; foreach_tiled groups nearby pixels into a gang.
    foreach_tiled (y = y0 ... y1, x = x0 ... x1) {
        Ray ray;
        generateRay(raster2camera, camera2world, x*widthScale,
                    y*heightScale, ray);
        BVHIntersect(nodes, triangles, ray);
        int offset = y * width + x;
        image[offset] = ray.maxt;   // distance to closest hit (1e30 if none)
        id[offset] = ray.hitId;     // 0 if nothing was hit
    }
}
// Single-launch entry point: render the whole image as one big tile.
export void raytrace_ispc(uniform int width, uniform int height,
                          uniform int baseWidth, uniform int baseHeight,
                          const uniform float raster2camera[4][4],
                          const uniform float camera2world[4][4],
                          uniform float image[], uniform int id[],
                          const uniform LinearBVHNode nodes[],
                          const uniform Triangle triangles[]) {
    raytrace_tile(0, width, 0, height, width, height, baseWidth, baseHeight,
                  raster2camera, camera2world, image,
                  id, nodes, triangles);
}
// Task body: renders the 64x8 tile selected by taskIndex; edge tiles are
// clamped to the image size.
task void raytrace_tile_task(uniform int width, uniform int height,
                             uniform int baseWidth, uniform int baseHeight,
                             const uniform float raster2camera[4][4],
                             const uniform float camera2world[4][4],
                             uniform float image[], uniform int id[],
                             const uniform LinearBVHNode nodes[],
                             const uniform Triangle triangles[]) {
    const uniform int dx = 64, dy = 8; // must match dx, dy below
    const uniform int xBuckets = (width + (dx-1)) / dx;   // tiles per row
    const uniform int x0 = (taskIndex % xBuckets) * dx;
    const uniform int x1 = min(x0 + dx, width);
    const uniform int y0 = (taskIndex / xBuckets) * dy;
    const uniform int y1 = min(y0 + dy, height);
    raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
                  raster2camera, camera2world, image,
                  id, nodes, triangles);
}
// Task-parallel entry point: launches one task per 64x8 tile of the image.
export void raytrace_ispc_tasks(uniform int width, uniform int height,
                                uniform int baseWidth, uniform int baseHeight,
                                const uniform float raster2camera[4][4],
                                const uniform float camera2world[4][4],
                                uniform float image[], uniform int id[],
                                const uniform LinearBVHNode nodes[],
                                const uniform Triangle triangles[]) {
    const uniform int dx = 64, dy = 8;
    const uniform int xBuckets = (width + (dx-1)) / dx;
    const uniform int yBuckets = (height + (dy-1)) / dy;
    const uniform int nTasks = xBuckets * yBuckets;
    launch[nTasks] raytrace_tile_task(width, height, baseWidth, baseHeight,
                                      raster2camera, camera2world,
                                      image, id, nodes, triangles);
}

View File

@@ -0,0 +1 @@
../../rt/sponza.bvh

View File

@@ -0,0 +1 @@
../../rt/sponza.camera

View File

@@ -0,0 +1 @@
../../rt/teapot.bvh

View File

@@ -0,0 +1 @@
../../rt/teapot.camera

View File

@@ -0,0 +1,2 @@
mandelbrot
*.ppm

View File

@@ -0,0 +1,8 @@
EXAMPLE=volume
CPP_SRC=volume.cpp
ISPC_SRC=volume.ispc
ISPC_IA_TARGETS=avx1-i32x8
ISPC_ARM_TARGETS=neon
include ../common_cpu.mk

View File

@@ -0,0 +1,7 @@
EXAMPLE=volume
CXX_SRC=volume.cpp
ISPC_SRC=volume.ispc
ISPC_INTRINSICS=../../intrinsics/knc-i1x16.h
ISPC_TARGET=generic-16
include ../common_knc.mk

View File

@@ -0,0 +1,13 @@
PROG=volume
ISPC_SRC=volume.ispc
CU_SRC=volume.cu
CXX_SRC=volume.cpp
PTXCC_REGMAX=64
#LLVM_GPU=1
NVVM_GPU=1
include ../common_ptx.mk

View File

@@ -0,0 +1,11 @@
896 1184
0.000155 0.000000 0.000000 -0.069927
0.000000 -0.000155 0.000000 0.093236
0.000000 0.000000 0.000000 1.000000
0.000000 0.000000 -99.999001 100.000000
1.000000 0.000000 0.000000 1.000000
0.000000 0.980129 -0.198360 2.900000
0.000000 0.198360 0.980129 -10.500000
0.000000 0.000000 0.000000 1.000000

View File

@@ -0,0 +1 @@
../../volume_rendering/density_highres.vol

View File

@@ -0,0 +1 @@
../../volume_rendering/density_lowres.vol

View File

@@ -0,0 +1,183 @@
/*
Copyright (c) 2011-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#define NOMINMAX
#pragma warning (disable: 4244)
#pragma warning (disable: 4305)
#endif
#include <algorithm>
#include <cstdio>
#include <cstdlib>

#include "timing.h"
#include "ispc_malloc.h"
#include "volume_ispc.h"
using namespace ispc;
/* Write the image as a binary PPM file: each float in buf is scaled by
   255, clamped to [0, 255], and replicated to the R, G, B channels. */
static void
writePPM(float *buf, int width, int height, const char *fn) {
    FILE *fp = fopen(fn, "wb");
    if (!fp) {
        // Fixed: fopen was unchecked, so an unwritable path crashed on the
        // first fprintf(NULL, ...).  Matches loadCamera/loadVolume handling.
        perror(fn);
        exit(1);
    }
    fprintf(fp, "P6\n");
    fprintf(fp, "%d %d\n", width, height);
    fprintf(fp, "255\n");
    for (int i = 0; i < width*height; ++i) {
        float v = buf[i] * 255.f;
        if (v < 0.f) v = 0.f;
        else if (v > 255.f) v = 255.f;
        unsigned char c = (unsigned char)v;
        for (int j = 0; j < 3; ++j)   // grayscale: same byte for R, G, B
            fputc(c, fp);
    }
    fclose(fp);
    printf("Wrote image file %s\n", fn);
}
/* Load image and viewing parameters from a camera data file.
   FIXME: we should add support to be able to specify viewing parameters
   in the program here directly.
   File layout: "width height" followed by the 16 raster2camera values and
   then the 16 camera2world values, row-major.  Exits on any read error. */
static void
loadCamera(const char *fn, int *width, int *height, float raster2camera[4][4],
           float camera2world[4][4]) {
    FILE *f = fopen(fn, "r");
    if (!f) {
        perror(fn);
        exit(1);
    }
    if (fscanf(f, "%d %d", width, height) != 2) {
        fprintf(stderr, "Unexpected end of file in camera file\n");
        exit(1);
    }
    // Read both matrices with one loop: 16 floats each, row-major.
    float *dest[2] = { &raster2camera[0][0], &camera2world[0][0] };
    for (int m = 0; m < 2; ++m) {
        for (int k = 0; k < 16; ++k) {
            if (fscanf(f, "%f", &dest[m][k]) != 1) {
                fprintf(stderr, "Unexpected end of file in camera file\n");
                exit(1);
            }
        }
    }
    fclose(f);
}
/* Load a volume density file.  Expects the number of x, y, and z samples
   as the first three values (as integer strings), then x*y*z
   floating-point values (also as strings) to give the densities.
   Returns a newly allocated array of n[0]*n[1]*n[2] floats (caller owns
   it); exits on any read error. */
static float *
loadVolume(const char *fn, int n[3]) {
    FILE *f = fopen(fn, "r");
    if (!f) {
        perror(fn);
        exit(1);
    }
    if (fscanf(f, "%d %d %d", &n[0], &n[1], &n[2]) != 3) {
        fprintf(stderr, "Couldn't find resolution at start of density file\n");
        exit(1);
    }
    int count = n[0] * n[1] * n[2];
    float *v = new float[count];
    for (int i = 0; i < count; ++i) {
        if (fscanf(f, "%f", &v[i]) != 1) {
            fprintf(stderr, "Unexpected end of file at %d'th density value\n", i);
            exit(1);
        }
    }
    fclose(f);   // fixed: the file handle was leaked (loadCamera closes its file)
    return v;
}
// Driver: load camera + density volume from the files named on the command
// line, time the ISPC/CUDA task-parallel renderer, and write the result as
// a PPM image.
int main(int argc, char *argv[]) {
    // Iteration counts for {ispc, ispc+tasks, serial}; only the tasks
    // variant is actually run below.
    static unsigned int test_iterations[] = {3, 7, 1};
    if (argc < 3) {
        fprintf(stderr, "usage: volume <camera.dat> <volume_density.vol> [ispc iterations] [tasks iterations] [serial iterations]\n");
        return 1;
    }
    if (argc == 6) {
        // All three counts must be supplied together to override defaults.
        for (int i = 0; i < 3; i++) {
            test_iterations[i] = atoi(argv[3 + i]);
        }
    }
    //
    // Load viewing data and the volume density data
    //
    int width, height;
    // Matrices are allocated flat and reinterpreted as float[4][4]
    // (presumably so ispc_malloc-style allocation could be swapped in --
    // confirm against ispc_malloc.h).
    float *camera2world_ispc = new float[4*4];
    float *raster2camera_ispc = new float[4*4];
    float (*camera2world )[4] = (float (*)[4])camera2world_ispc;
    float (*raster2camera)[4] = (float (*)[4])raster2camera_ispc;
    loadCamera(argv[1], &width, &height, raster2camera, camera2world);
    float *image = new float[width*height];
    int *n = new int[3];
    float *density = loadVolume(argv[2], n);
    // Clear out the buffer
    for (int i = 0; i < width * height; ++i)
        image[i] = 0.;
    //
    // Compute the image using the ispc implementation that also uses
    // tasks; report the minimum time over the runs.
    //
    double minISPCtasks = 1e30;
    // NOTE(review): signed 'i' compared against unsigned iteration count.
    for (int i = 0; i < test_iterations[1]; ++i) {
        reset_and_start_timer();
        volume_ispc_tasks(density, n, raster2camera, camera2world,
                          width, height, image);
        double dt = get_elapsed_msec();
        printf("@time of ISPC + TASKS run:\t\t\t[%.3f] msec\n", dt);
        minISPCtasks = std::min(minISPCtasks, dt);
    }
    printf("[volume ispc + tasks]:\t\t[%.3f] msec\n", minISPCtasks);
    writePPM(image, width, height, "volume-ispc-tasks.ppm");
    return 0;
}

View File

@@ -0,0 +1,454 @@
/*
Copyright (c) 2011-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "cuda_helpers.cuh"
// Clamp v to the inclusive range [low, high].
__device__ static inline float clamp(float v, float low, float high)
{
    const float atLeastLow = max(v, low);
    return min(atLeastLow, high);
}
#define float3 Float3
// Minimal three-component float vector with componentwise arithmetic.
struct Float3
{
    float x,y,z;
    __device__ friend Float3 operator+(const Float3 a, const Float3 b)
    {
        Float3 r = { a.x + b.x, a.y + b.y, a.z + b.z };
        return r;
    }
    __device__ friend Float3 operator-(const Float3 a, const Float3 b)
    {
        Float3 r = { a.x - b.x, a.y - b.y, a.z - b.z };
        return r;
    }
    __device__ friend Float3 operator/(const Float3 a, const Float3 b)
    {
        Float3 r = { a.x / b.x, a.y / b.y, a.z / b.z };
        return r;
    }
    __device__ friend Float3 operator*(const Float3 a, const Float3 b)
    {
        Float3 r = { a.x * b.x, a.y * b.y, a.z * b.z };
        return r;
    }
    // Uniform scale by a scalar.
    __device__ friend Float3 operator*(const Float3 a, const float b)
    {
        Float3 r = { a.x * b, a.y * b, a.z * b };
        return r;
    }
};
struct Ray {
float3 origin, dir;
};
// Build the world-space camera ray for raster position (x, y).
__device__ static void
generateRay(const float raster2camera[4][4],
            const float camera2world[4][4],
            float x, float y, Ray &ray) {
    // transform raster coordinate (x, y, 0) to camera space
    float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3];
    float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3];
    // NOTE(review): z/w use only the translation column -- assumes the
    // remaining [2][*]/[3][*] matrix terms are zero for these camera files.
    float camz = raster2camera[2][3];
    float camw = raster2camera[3][3];
    camx /= camw;   // homogeneous divide
    camy /= camw;
    camz /= camw;
    // Rotate the camera-space direction into world space.
    ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz;
    ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz;
    ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz;
    // The camera position (translation column) is the ray origin.
    ray.origin.x = camera2world[0][3] / camera2world[3][3];
    ray.origin.y = camera2world[1][3] / camera2world[3][3];
    ray.origin.z = camera2world[2][3] / camera2world[3][3];
}
// True iff p lies within the closed box [pMin, pMax] on every axis.
__device__ static inline bool
Inside(float3 p, float3 pMin, float3 pMax) {
    const bool inX = p.x >= pMin.x && p.x <= pMax.x;
    const bool inY = p.y >= pMin.y && p.y <= pMax.y;
    const bool inZ = p.z >= pMin.z && p.z <= pMax.z;
    return inX && inY && inZ;
}
// Slab test: intersect 'ray' against the axis-aligned box [pMin, pMax].
// Returns true on overlap and stores the parametric entry/exit distances
// in hit0/hit1.  All three slabs are always evaluated (no early-out).
__device__ static bool
IntersectP(Ray ray, float3 pMin, float3 pMax, float &hit0, float &hit1) {
    float t0 = -1e30f, t1 = 1e30f;
    float3 tNear = (pMin - ray.origin) / ray.dir;   // per-axis plane distances
    float3 tFar = (pMax - ray.origin) / ray.dir;
    // Per axis: order the two plane hits, then shrink [t0,t1] to the overlap.
    if (tNear.x > tFar.x) {
        float tmp = tNear.x;
        tNear.x = tFar.x;
        tFar.x = tmp;
    }
    t0 = max(tNear.x, t0);
    t1 = min(tFar.x, t1);
    if (tNear.y > tFar.y) {
        float tmp = tNear.y;
        tNear.y = tFar.y;
        tFar.y = tmp;
    }
    t0 = max(tNear.y, t0);
    t1 = min(tFar.y, t1);
    if (tNear.z > tFar.z) {
        float tmp = tNear.z;
        tNear.z = tFar.z;
        tFar.z = tmp;
    }
    t0 = max(tNear.z, t0);
    t1 = min(tFar.z, t1);
    if (t0 <= t1) {
        hit0 = t0;
        hit1 = t1;
        return true;
    }
    else
        return false;
}
// Linear interpolation: returns a at t == 0 and b at t == 1.
__device__ static inline float Lerp(float t, float a, float b) {
    const float weightA = 1.f - t;
    return weightA * a + t * b;
}
// Density sample at voxel (x, y, z); coordinates are clamped to the grid,
// so out-of-range lookups return the nearest edge voxel.
__device__ static inline float D(int x, int y, int z, int nVoxels[3],
                                 float density[]) {
    const int cx = clamp(x, 0, nVoxels[0]-1);
    const int cy = clamp(y, 0, nVoxels[1]-1);
    const int cz = clamp(z, 0, nVoxels[2]-1);
    return density[cz*nVoxels[0]*nVoxels[1] + cy*nVoxels[0] + cx];
}
// Position of p relative to the box [pMin, pMax], normalized per axis
// (components in [0, 1] when p is inside the box).
__device__ static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
    const float3 rel = p - pMin;
    const float3 extent = pMax - pMin;
    return rel / extent;
}
// Trilinearly interpolated density at point Pobj; zero outside the box.
__device__ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
                                       float density[], int nVoxels[3]) {
    if (!Inside(Pobj, pMin, pMax))
        return 0;
    // Compute voxel coordinates and offsets for _Pobj_
    float3 vox = Offset(Pobj, pMin, pMax);
    vox.x = vox.x * nVoxels[0] - .5f;   // continuous voxel-space coordinates
    vox.y = vox.y * nVoxels[1] - .5f;
    vox.z = vox.z * nVoxels[2] - .5f;
    int vx = (int)(vox.x), vy = (int)(vox.y), vz = (int)(vox.z);
    // Fractional position within the 8-voxel cell.
    float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz;
    // Trilinearly interpolate density values to compute local density:
    // first along x for the four cell edges, then y, then z.
    float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
                     D(vx+1, vy, vz, nVoxels, density));
    float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
                     D(vx+1, vy+1, vz, nVoxels, density));
    float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
                     D(vx+1, vy, vz+1, nVoxels, density));
    float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
                     D(vx+1, vy+1, vz+1, nVoxels, density));
    float d0 = Lerp(dy, d00, d10);
    float d1 = Lerp(dy, d01, d11);
    return Lerp(dz, d0, d1);
}
/* Returns the transmittance between two points p0 and p1, in a volume
   with extent (pMin,pMax) with transmittance coefficient sigma_t,
   defined by nVoxels[3] voxels in each dimension in the given density
   array. */
__device__ static inline float
transmittance(float3 p0, float3 p1, float3 pMin,
              float3 pMax, float sigma_t,
              float density[], int nVoxels[3]) {
    float rayT0, rayT1;
    Ray ray;
    ray.origin = p1;
    ray.dir = p0 - p1;   // unnormalized, so t in [0,1] spans p1 -> p0
    // Find the parametric t range along the ray that is inside the volume.
    if (!IntersectP(ray, pMin, pMax, rayT0, rayT1))
        return 1.f;      // segment misses the volume: fully transparent
    rayT0 = max(rayT0, 0.f);   // don't march behind the start point
    // Accumulate beam transmittance in tau
    float tau = 0.0f;
    float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
                           ray.dir.z * ray.dir.z);
    float stepDist = 0.2f;           // world-space march step (coarse)
    float stepT = stepDist / rayLength;
    float t = rayT0;
    float3 pos = ray.origin + ray.dir * rayT0;
    float3 dirStep = ray.dir * stepT;
    while (t < rayT1) {
        tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels);
        pos = pos + dirStep;
        t += stepT;
    }
    return exp(-tau);   // Beer-Lambert attenuation
}
// Squared Euclidean distance between a and b (avoids a sqrt).
__device__ static inline float
distanceSquared(float3 a, float3 b) {
    const float3 diff = a - b;
    const float xx = diff.x * diff.x;
    const float yy = diff.y * diff.y;
    const float zz = diff.z * diff.z;
    return xx + yy + zz;
}
// March a camera ray through the volume, accumulating emitted plus
// single-scattered light; returns gamma-corrected radiance (0 on a miss).
__device__ static inline float
raymarch(float density[], int nVoxels[3], Ray ray) {
    float rayT0, rayT1;
    // Hard-coded volume bounds and single point-light position.
    float3 pMin = {.3f, -.2f, .3f}, pMax = {1.8f, 2.3f, 1.8f};
    float3 lightPos = { -1.f, 4., 1.5f };
    if (!IntersectP(ray, pMin, pMax, rayT0, rayT1))
        return 0.f;
    rayT0 = max(rayT0, 0.f);   // start no earlier than the ray origin
    // Parameters that define the volume scattering characteristics and
    // sampling rate for raymarching
    float Le = .25f; // Emission coefficient
    float sigma_a = 10.f; // Absorption coefficient
    float sigma_s = 10.f; // Scattering coefficient
    float stepDist = 0.025f; // Ray step amount
    float lightIntensity = 40.0f; // Light source intensity
    float tau = 0.f; // accumulated beam transmittance
    float L = 0.f; // radiance along the ray
    float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
                           ray.dir.z * ray.dir.z);
    float stepT = stepDist / rayLength;   // step in parametric t units
    float t = rayT0;
    float3 pos = ray.origin + ray.dir * rayT0;
    float3 dirStep = ray.dir * stepT;
    while (t < rayT1)
    {
        float d = Density(pos, pMin, pMax, density, nVoxels);
        // terminate once attenuation is high
        float atten = exp(-tau);
        if (atten < .005f)
            break;
        // direct lighting: inverse-square falloff shadowed by the volume
        float Li = lightIntensity / distanceSquared(lightPos, pos) *
            transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s,
                          density, nVoxels);
        L += stepDist * atten * d * sigma_s * (Li + Le);
        // update beam transmittance
        tau += stepDist * (sigma_a + sigma_s) * d;
        pos = pos + dirStep;
        t += stepT;
    }
    // Gamma correction
    return pow(L, 1.f / 2.2f);
}
/* Utility routine used by both the task-based and the single-core entrypoints.
   Renders a tile of the image, covering [x0,x1) * [y0, y1), storing the
   result into the image[] array.
*/
__device__ static void
volume_tile(int x0, int y0, int x1,
            int y1, float density[], int nVoxels[3],
            const float raster2camera[4][4],
            const float camera2world[4][4],
            int width, int height, float image[]) {
    // Work on 8x8 = 64 pixel tiles, processed programCount pixels at a
    // time.  xblock/yblock split the tile into four 4x4 sub-blocks;
    // xoffsets/yoffsets order the 16 pixels within a sub-block.
    for (int y = y0; y < y1; y += 8) {
        for (int x = x0; x < x1; x += 8) {
            for (int ob = 0; ob < 64; ob += programCount)
            {
                // This lane's pixel index within the 8x8 tile.
                const int o = ob + programIndex;
                const int xoffsets[16] = { 0, 1, 0, 1, 2, 3, 2, 3,
                                           0, 1, 0, 1, 2, 3, 2, 3 };
                const int yoffsets[16] = { 0, 0, 1, 1, 0, 0, 1, 1,
                                           2, 2, 3, 3, 2, 2, 3, 3 };
                const int xblock[4] = {0, 4, 0, 4};
                const int yblock[4] = {0, 0, 4, 4};
                // Figure out the pixel to render for this program instance
                const int xo = x + xblock[o/16] + xoffsets[o&15];
                const int yo = y + yblock[o/16] + yoffsets[o&15];
                // Use viewing parameters to compute the corresponding ray
                // for the pixel
                Ray ray;
                generateRay(raster2camera, camera2world, xo, yo, ray);
                // And raymarch through the volume to compute the pixel's
                // value; the bounds check skips pixels past the tile edge.
                int offset = yo * width + xo;
                if (xo < x1 && yo < y1)
                    image[offset] = raymarch(density, nVoxels, ray);
            }
        }
    }
}
// Task body: map taskIndex onto one 8x8 tile of the image and render it.
__global__ void
volume_task(float density[], int _nVoxels[3],
            const float _raster2camera[4][4],
            const float _camera2world[4][4],
            int width, int height, float image[]) {
    // Guard against launches rounded up past the requested task count.
    if (taskIndex0 >= taskCount0) return;
#if 0
    // Disabled: copy the parameter arrays element-by-element into locals.
    int nVoxels[3];
    nVoxels[0] = _nVoxels[0];
    nVoxels[1] = _nVoxels[1];
    nVoxels[2] = _nVoxels[2];
    float raster2camera[4][4];
    raster2camera[0][0] = _raster2camera[0][0];
    raster2camera[0][1] = _raster2camera[0][1];
    raster2camera[0][2] = _raster2camera[0][2];
    raster2camera[0][3] = _raster2camera[0][3];
    raster2camera[1][0] = _raster2camera[1][0];
    raster2camera[1][1] = _raster2camera[1][1];
    raster2camera[1][2] = _raster2camera[1][2];
    raster2camera[1][3] = _raster2camera[1][3];
    raster2camera[2][0] = _raster2camera[2][0];
    raster2camera[2][1] = _raster2camera[2][1];
    raster2camera[2][2] = _raster2camera[2][2];
    raster2camera[2][3] = _raster2camera[2][3];
    raster2camera[3][0] = _raster2camera[3][0];
    raster2camera[3][1] = _raster2camera[3][1];
    raster2camera[3][2] = _raster2camera[3][2];
    raster2camera[3][3] = _raster2camera[3][3];
    float camera2world[4][4];
    camera2world[0][0] = _camera2world[0][0];
    camera2world[0][1] = _camera2world[0][1];
    camera2world[0][2] = _camera2world[0][2];
    camera2world[0][3] = _camera2world[0][3];
    camera2world[1][0] = _camera2world[1][0];
    camera2world[1][1] = _camera2world[1][1];
    camera2world[1][2] = _camera2world[1][2];
    camera2world[1][3] = _camera2world[1][3];
    camera2world[2][0] = _camera2world[2][0];
    camera2world[2][1] = _camera2world[2][1];
    camera2world[2][2] = _camera2world[2][2];
    camera2world[2][3] = _camera2world[2][3];
    camera2world[3][0] = _camera2world[3][0];
    camera2world[3][1] = _camera2world[3][1];
    camera2world[3][2] = _camera2world[3][2];
    camera2world[3][3] = _camera2world[3][3];
#else
    // Use the parameter arrays directly via aliases.
#define nVoxels _nVoxels
#define raster2camera _raster2camera
#define camera2world _camera2world
#endif
    int dx = 8, dy = 8; // must match value in volume_ispc_tasks
    int xbuckets = (width + (dx-1)) / dx;
    int ybuckets = (height + (dy-1)) / dy;   // NOTE(review): computed but unused
    int x0 = (taskIndex % xbuckets) * dx;
    int y0 = (taskIndex / xbuckets) * dy;
    int x1 = x0 + dx, y1 = y0 + dy;
    // Clamp the last row/column of tiles to the image size.
    x1 = min(x1, width);
    y1 = min(y1, height);
    volume_tile(x0, y0, x1, y1, density, nVoxels, raster2camera,
                camera2world, width, height, image);
}
// Device-side launcher: spawn one volume_task per 8x8 tile of the image
// and wait for them all to complete.
extern "C"
__global__ void
volume_ispc_tasks___export( float density[], int nVoxels[3],
                            const float raster2camera[4][4],
                            const float camera2world[4][4],
                            int width, int height, float image[]) {
    // Launch tasks to work on (dx,dy)-sized tiles of the image
    int dx = 8, dy = 8;
    int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy);
    launch(nTasks,1,1,volume_task)
        (density, nVoxels, raster2camera, camera2world,
         width, height, image);
    cudaDeviceSynchronize();
}
// Host entry point: run the device-side launcher with one 32-thread block,
// then block until the device (and all tasks it spawned) is idle.
extern "C"
__host__ void
volume_ispc_tasks( float density[], int nVoxels[3],
                   const float raster2camera[4][4],
                   const float camera2world[4][4],
                   int width, int height, float image[]) {
    volume_ispc_tasks___export<<<1,32>>>(density, nVoxels, raster2camera, camera2world, width, height,image);
    cudaDeviceSynchronize();
}

View File

@@ -0,0 +1,413 @@
/*
Copyright (c) 2011-2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// 3-wide ISPC short-vector type used for points, directions, and offsets.
typedef float<3> float3;
// A ray with an origin point and a (not necessarily normalized) direction.
struct Ray {
float3 origin, dir;
};
// Build the camera ray for raster-space pixel (x, y) and store it in `ray`.
// raster2camera maps raster space to camera space; camera2world maps camera
// space to world space (both 4x4 homogeneous matrices).
static inline void
generateRay(const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
float x, float y, Ray &ray) {
// transform raster coordinate (x, y, 0) to camera space
// Only columns 0, 1 and 3 contribute — consistent with a raster-space
// point whose z component is 0 (column 2 would multiply z).
float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3];
float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3];
float camz = raster2camera[2][3];
float camw = raster2camera[3][3];
// Homogeneous divide to get the camera-space point.
camx /= camw;
camy /= camw;
camz /= camw;
// Rotate the camera-space point into the world-space ray direction
// (translation column is deliberately excluded for a direction).
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz;
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz;
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz;
// The ray origin is the camera position: the translation column of
// camera2world after the homogeneous divide.
ray.origin.x = camera2world[0][3] / camera2world[3][3];
ray.origin.y = camera2world[1][3] / camera2world[3][3];
ray.origin.z = camera2world[2][3] / camera2world[3][3];
}
// True when point p lies within the closed axis-aligned box [pMin, pMax]
// on every axis.
static inline bool
Inside(float3 p, float3 pMin, float3 pMax) {
bool inX = (p.x >= pMin.x) && (p.x <= pMax.x);
bool inY = (p.y >= pMin.y) && (p.y <= pMax.y);
bool inZ = (p.z >= pMin.z) && (p.z <= pMax.z);
return inX && inY && inZ;
}
// Ray/axis-aligned-box intersection (per-axis slab test). On a hit,
// returns true and stores the parametric entry/exit distances in
// hit0/hit1 (hit0 <= hit1); returns false (outputs untouched) on a miss.
static inline bool
IntersectP(Ray ray, float3 pMin, float3 pMax, float &hit0, float &hit1) {
// Running parametric interval; starts effectively unbounded.
float t0 = -1e30, t1 = 1e30;
// Per-axis distances to the two slab planes. NOTE(review): relies on
// IEEE inf semantics when a ray.dir component is 0 — confirm the target
// preserves that.
float3 tNear = (pMin - ray.origin) / ray.dir;
float3 tFar = (pMax - ray.origin) / ray.dir;
// For each axis: order the pair so tNear <= tFar, then intersect the
// axis interval with the running [t0, t1] interval.
if (tNear.x > tFar.x) {
float tmp = tNear.x;
tNear.x = tFar.x;
tFar.x = tmp;
}
t0 = max(tNear.x, t0);
t1 = min(tFar.x, t1);
if (tNear.y > tFar.y) {
float tmp = tNear.y;
tNear.y = tFar.y;
tFar.y = tmp;
}
t0 = max(tNear.y, t0);
t1 = min(tFar.y, t1);
if (tNear.z > tFar.z) {
float tmp = tNear.z;
tNear.z = tFar.z;
tFar.z = tmp;
}
t0 = max(tNear.z, t0);
t1 = min(tFar.z, t1);
// Non-empty interval means the ray crosses all three slabs at once.
if (t0 <= t1) {
hit0 = t0;
hit1 = t1;
return true;
}
else
return false;
}
// Linear interpolation: returns a at t == 0 and b at t == 1.
static inline float Lerp(float t, float a, float b) {
return a * (1.f - t) + b * t;
}
// Fetch the density at integer voxel coordinate (x, y, z), clamping the
// coordinate to the grid bounds (clamp-to-edge addressing).
static inline float D(int x, int y, int z, uniform int nVoxels[3],
uniform float density[]) {
const int xc = clamp(x, 0, nVoxels[0]-1);
const int yc = clamp(y, 0, nVoxels[1]-1);
const int zc = clamp(z, 0, nVoxels[2]-1);
// Row-major layout: z-major, then y, then x.
const int index = (zc * nVoxels[1] + yc) * nVoxels[0] + xc;
return density[index];
}
// Normalized [0,1]^3 position of p within the box [pMin, pMax].
static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
float3 rel = p - pMin;
float3 extent = pMax - pMin;
return rel / extent;
}
// Trilinearly-interpolated density of the volume at world-space point Pobj.
// Returns 0 for points outside the box [pMin, pMax].
static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
uniform float density[], uniform int nVoxels[3]) {
if (!Inside(Pobj, pMin, pMax))
return 0;
// Compute voxel coordinates and offsets for _Pobj_
float3 vox = Offset(Pobj, pMin, pMax);
// Shift by half a voxel so samples sit at voxel centers.
vox.x = vox.x * nVoxels[0] - .5f;
vox.y = vox.y * nVoxels[1] - .5f;
vox.z = vox.z * nVoxels[2] - .5f;
// Integer base voxel and the fractional offsets within its cell.
int vx = (int)(vox.x), vy = (int)(vox.y), vz = (int)(vox.z);
float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz;
// Trilinearly interpolate density values to compute local density
// (lerp along x on the cell's four edges, then along y, then z).
float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
D(vx+1, vy, vz, nVoxels, density));
float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
D(vx+1, vy+1, vz, nVoxels, density));
float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
D(vx+1, vy, vz+1, nVoxels, density));
float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
D(vx+1, vy+1, vz+1, nVoxels, density));
float d0 = Lerp(dy, d00, d10);
float d1 = Lerp(dy, d01, d11);
return Lerp(dz, d0, d1);
}
/* Returns the transmittance between two points p0 and p1, in a volume
with extent (pMin,pMax) with transmittance coefficient sigma_t,
defined by nVoxels[3] voxels in each dimension in the given density
array. Returns 1 (fully transparent) if the segment misses the volume. */
static inline float
transmittance(uniform float3 p0, float3 p1, uniform float3 pMin,
uniform float3 pMax, uniform float sigma_t,
uniform float density[], uniform int nVoxels[3]) {
float rayT0, rayT1;
// March from p1 toward p0: dir has length |p0 - p1|, so t in [0,1]
// covers the whole segment.
Ray ray;
ray.origin = p1;
ray.dir = p0 - p1;
// Find the parametric t range along the ray that is inside the volume.
if (!IntersectP(ray, pMin, pMax, rayT0, rayT1))
return 1.;
// Clamp the entry to the segment start (don't march behind p1).
rayT0 = max(rayT0, 0.f);
// Accumulate beam transmittance in tau
float tau = 0;
float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
ray.dir.z * ray.dir.z);
// Fixed world-space step, converted to a parametric step below.
const uniform float stepDist = 0.2;
float stepT = stepDist / rayLength;
float t = rayT0;
float3 pos = ray.origin + ray.dir * rayT0;
float3 dirStep = ray.dir * stepT;
while (t < rayT1) {
tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels);
pos = pos + dirStep;
t += stepT;
}
// Beer-Lambert attenuation from the accumulated optical depth.
return exp(-tau);
}
// Squared Euclidean distance between points a and b.
static inline float
distanceSquared(float3 a, float3 b) {
const float dx = a.x - b.x;
const float dy = a.y - b.y;
const float dz = a.z - b.z;
return dx*dx + dy*dy + dz*dz;
}
// March the given camera ray through the volume, accumulating emitted and
// in-scattered radiance attenuated by the medium, and return the
// gamma-corrected pixel value (0 if the ray misses the volume).
static inline float
raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
float rayT0, rayT1;
// Hard-coded volume bounds and point-light position for this scene.
const uniform float3 pMin = {.3, -.2, .3}, pMax = {1.8, 2.3, 1.8};
const uniform float3 lightPos = { -1, 4, 1.5 };
if (!IntersectP(ray, pMin, pMax, rayT0, rayT1))
return 0.;
// Start marching no earlier than the ray origin.
rayT0 = max(rayT0, 0.f);
// Parameters that define the volume scattering characteristics and
// sampling rate for raymarching
const uniform float Le = .25; // Emission coefficient
const uniform float sigma_a = 10; // Absorption coefficient
const uniform float sigma_s = 10; // Scattering coefficient
const uniform float stepDist = 0.025; // Ray step amount
const uniform float lightIntensity = 40; // Light source intensity
float tau = 0.f; // accumulated beam transmittance
float L = 0; // radiance along the ray
float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
ray.dir.z * ray.dir.z);
// Convert the world-space step to a parametric step along this ray.
float stepT = stepDist / rayLength;
float t = rayT0;
float3 pos = ray.origin + ray.dir * rayT0;
float3 dirStep = ray.dir * stepT;
while (t < rayT1)
{
float d = Density(pos, pMin, pMax, density, nVoxels);
// terminate once attenuation is high
float atten = exp(-tau);
if (atten < .005)
break;
// direct lighting
// Inverse-square falloff from the light, attenuated by the medium
// between the light and the sample point.
float Li = lightIntensity / distanceSquared(lightPos, pos) *
transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s,
density, nVoxels);
L += stepDist * atten * d * sigma_s * (Li + Le);
// update beam transmittance
tau += stepDist * (sigma_a + sigma_s) * d;
pos = pos + dirStep;
t += stepT;
}
// Gamma correction
return pow(L, 1.f / 2.2f);
}
/* Utility routine used by both the task-based and the single-core entrypoints.
Renders a tile of the image, covering [x0, x1) * [y0, y1), storing the
result into the image[] array.
*/
static inline void
volume_tile(uniform int x0, uniform int y0, uniform int x1,
uniform int y1, uniform float density[], uniform int nVoxels[3],
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
uniform int width, uniform int height, uniform float image[]) {
// The disabled path below works on 8x8 = 64 pixel blocks of the image,
// mapping one program instance to each pixel of a block; the active path
// simply lets foreach_tiled partition the tile. (The original comment
// claiming 4x4 blocks evenly divisible by 4 described an older variant.)
#if 0
for (uniform int y = y0; y < y1; y += 8)
for (uniform int x = x0; x < x1; x += 8)
foreach (o = 0 ... 64)
{
// These two arrays encode the mapping from [0,15] to
// offsets within the 4x4 pixel block so that we render
// each pixel inside the block
const uniform int xoffsets[16] = { 0, 1, 0, 1, 2, 3, 2, 3,
0, 1, 0, 1, 2, 3, 2, 3 };
const uniform int yoffsets[16] = { 0, 0, 1, 1, 0, 0, 1, 1,
2, 2, 3, 3, 2, 2, 3, 3 };
const uniform int xblock[4] = {0, 4, 0, 4};
const uniform int yblock[4] = {0, 0, 4, 4};
// Figure out the pixel to render for this program instance
const int xo = x + xblock[o/16] + xoffsets[o&15];
const int yo = y + yblock[o/16] + yoffsets[o&15];
// Use viewing parameters to compute the corresponding ray
// for the pixel
Ray ray;
generateRay(raster2camera, camera2world, xo, yo, ray);
// And raymarch through the volume to compute the pixel's
// value
int offset = yo * width + xo;
// Guard against pixels past the tile edge when the tile size is
// not a multiple of 8.
if (xo < x1 && yo < y1)
image[offset] = raymarch(density, nVoxels, ray);
}
#else
foreach_tiled (y = y0 ... y1, x = x0 ... x1)
{
// Use viewing parameters to compute the corresponding ray
// for the pixel
Ray ray;
generateRay(raster2camera, camera2world, x, y, ray);
// And raymarch through the volume to compute the pixel's
// value
int offset = y * width + x;
image[offset] = raymarch(density, nVoxels, ray);
}
#endif
}
// One task: render a single (dx x dy) tile of the image, selected by
// taskIndex. Extra tasks beyond the tile count return immediately.
task void
volume_task(uniform float density[], uniform int _nVoxels[3],
const uniform float _raster2camera[4][4],
const uniform float _camera2world[4][4],
uniform int width, uniform int height, uniform float image[])
{
if (taskIndex >= taskCount) return;
#if 1 /* cannot pass shared memory pointers to functions, need to find a way to solve this one :S */
// Copy the small parameter arrays element-by-element into local uniform
// storage so that plain pointers can be passed to volume_tile (per the
// comment above, the incoming pointers cannot be handed on directly).
uniform int nVoxels[3];
nVoxels[0] = _nVoxels[0];
nVoxels[1] = _nVoxels[1];
nVoxels[2] = _nVoxels[2];
uniform float raster2camera[4][4];
raster2camera[0][0] = _raster2camera[0][0];
raster2camera[0][1] = _raster2camera[0][1];
raster2camera[0][2] = _raster2camera[0][2];
raster2camera[0][3] = _raster2camera[0][3];
raster2camera[1][0] = _raster2camera[1][0];
raster2camera[1][1] = _raster2camera[1][1];
raster2camera[1][2] = _raster2camera[1][2];
raster2camera[1][3] = _raster2camera[1][3];
raster2camera[2][0] = _raster2camera[2][0];
raster2camera[2][1] = _raster2camera[2][1];
raster2camera[2][2] = _raster2camera[2][2];
raster2camera[2][3] = _raster2camera[2][3];
raster2camera[3][0] = _raster2camera[3][0];
raster2camera[3][1] = _raster2camera[3][1];
raster2camera[3][2] = _raster2camera[3][2];
raster2camera[3][3] = _raster2camera[3][3];
uniform float camera2world[4][4];
camera2world[0][0] = _camera2world[0][0];
camera2world[0][1] = _camera2world[0][1];
camera2world[0][2] = _camera2world[0][2];
camera2world[0][3] = _camera2world[0][3];
camera2world[1][0] = _camera2world[1][0];
camera2world[1][1] = _camera2world[1][1];
camera2world[1][2] = _camera2world[1][2];
camera2world[1][3] = _camera2world[1][3];
camera2world[2][0] = _camera2world[2][0];
camera2world[2][1] = _camera2world[2][1];
camera2world[2][2] = _camera2world[2][2];
camera2world[2][3] = _camera2world[2][3];
camera2world[3][0] = _camera2world[3][0];
camera2world[3][1] = _camera2world[3][1];
camera2world[3][2] = _camera2world[3][2];
camera2world[3][3] = _camera2world[3][3];
#else
// Alternative: alias the parameters directly (disabled, see above).
#define nVoxels _nVoxels
#define raster2camera _raster2camera
#define camera2world _camera2world
#endif
const uniform int dx = 8, dy = 8; // must match value in volume_ispc_tasks
// Map the linear taskIndex onto a 2D grid of tiles, clamping the last
// row/column of tiles to the image bounds.
const uniform int xbuckets = (width + (dx-1)) / dx;
const uniform int ybuckets = (height + (dy-1)) / dy;
const uniform int x0 = (taskIndex % xbuckets) * dx;
const uniform int y0 = (taskIndex / xbuckets) * dy;
const uniform int x1 = min(x0 + dx, width);
const uniform int y1 = min(y0 + dy, height);
volume_tile(x0, y0, x1, y1, density, nVoxels, raster2camera,
camera2world, width, height, image);
}
// Single-threaded entry point: render the entire image as one tile,
// without launching tasks.
export void
volume_ispc(uniform float density[], uniform int nVoxels[3],
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
uniform int width, uniform int height, uniform float image[]) {
volume_tile(0, 0, width, height, density, nVoxels, raster2camera,
camera2world, width, height, image);
}
// Task-parallel entry point: partition the image into (dx x dy) tiles,
// launch one task per tile, and wait for all of them to finish.
export void
volume_ispc_tasks(uniform float density[], uniform int nVoxels[3],
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
uniform int width, uniform int height, uniform float image[]) {
// Launch tasks to work on (dx,dy)-sized tiles of the image
const uniform int dx = 8, dy = 8;
// Round up so partial tiles at the right/bottom edges are covered.
const uniform int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy);
launch[nTasks] volume_task(density, nVoxels, raster2camera, camera2world,
width, height, image);
// Block until every launched task has completed.
sync;
}

View File

@@ -37,6 +37,7 @@
#include <stdlib.h>
#include <algorithm>
#include <iostream>
#include <cassert>
#include <iomanip>
#include "../timing.h"
#include "sort_ispc.h"
@@ -45,26 +46,28 @@ using namespace ispc;
extern void sort_serial (int n, unsigned int code[], int order[]);
/* progress bar by Ross Hemsley;
* http://www.rosshemsley.co.uk/2011/02/creating-a-progress-bar-in-c-or-any-other-console-app/ */
static inline void progressbar (unsigned int x, unsigned int n, unsigned int w = 50)
static void progressBar(const int x, const int n, const int width = 50)
{
if (n < 100)
{
x *= 100/n;
n = 100;
}
assert(n > 1);
assert(x >= 0 && x < n);
assert(width > 10);
const float f = static_cast<float>(x)/(n-1);
const int w = static_cast<int>(f * width);
if ((x != n) && (x % (n/100) != 0)) return;
// print bar
std::string bstr("[");
for (int i = 0; i < width; i++)
bstr += i < w ? '=' : ' ';
bstr += "]";
using namespace std;
float ratio = x/(float)n;
int c = ratio * w;
// print percentage
char pstr0[32];
sprintf(pstr0, " %2d %c ", static_cast<int>(f*100.0),'%');
const std::string pstr(pstr0);
std::copy(pstr.begin(), pstr.end(), bstr.begin() + (width/2-2));
cout << setw(3) << (int)(ratio*100) << "% [";
for (int x=0; x<c; x++) cout << "=";
for (int x=c; x<w; x++) cout << " ";
cout << "]\r" << flush;
std::cout << bstr;
std::cout << (x == n-1 ? "\n" : "\r") << std::flush;
}
int main (int argc, char *argv[])
@@ -87,7 +90,7 @@ int main (int argc, char *argv[])
tISPC1 += get_elapsed_mcycles();
if (argc != 3)
progressbar (i, m);
progressBar (i, m);
}
printf("[sort ispc]:\t[%.3f] million cycles\n", tISPC1);
@@ -105,7 +108,7 @@ int main (int argc, char *argv[])
tISPC2 += get_elapsed_mcycles();
if (argc != 3)
progressbar (i, m);
progressBar (i, m);
}
printf("[sort ispc + tasks]:\t[%.3f] million cycles\n", tISPC2);
@@ -123,7 +126,7 @@ int main (int argc, char *argv[])
tSerial += get_elapsed_mcycles();
if (argc != 3)
progressbar (i, m);
progressBar (i, m);
}
printf("[sort serial]:\t\t[%.3f] million cycles\n", tSerial);

View File

@@ -960,17 +960,22 @@ InitTaskSystem() {
inline void
TaskGroup::Launch(int baseIndex, int count) {
#pragma omp parallel for
for(int i = 0; i < count; i++) {
#pragma omp parallel
{
const int threadIndex = omp_get_thread_num();
const int threadCount = omp_get_num_threads();
#pragma omp for schedule(runtime)
for(int i = 0; i < count; i++)
{
TaskInfo *ti = GetTaskInfo(baseIndex + i);
// Actually run the task.
int threadIndex = omp_get_thread_num();
int threadCount = omp_get_num_threads();
ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(),
ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(),
ti->taskCount0(), ti->taskCount1(), ti->taskCount2());
}
}
}
inline void

View File

@@ -58,6 +58,7 @@ __inline__ uint64_t rdtsc() {
#ifdef WIN32
#include <windows.h>
double rtc();
#define rdtsc __rdtsc
#else // WIN32
__inline__ uint64_t rdtsc() {
@@ -72,14 +73,30 @@ __inline__ uint64_t rdtsc() {
__asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high));
return (uint64_t)high << 32 | low;
}
#include <sys/time.h>
static inline double rtc(void)
{
struct timeval Tvalue;
double etime;
struct timezone dummy;
gettimeofday(&Tvalue,&dummy);
etime = (double) Tvalue.tv_sec +
1.e-6*((double) Tvalue.tv_usec);
return etime;
}
#endif // !WIN32
#endif // !__arm__
static uint64_t start, end;
static uint64_t start, end;
static double tstart, tend;
// Record the current TSC value and wall-clock time in the file-scope
// `start`/`tstart` globals; the matching get_elapsed_* functions measure
// from this point.
static inline void reset_and_start_timer()
{
start = rdtsc();
tstart = rtc();
}
/* Returns the number of millions of elapsed processor cycles since the
@@ -89,3 +106,9 @@ static inline double get_elapsed_mcycles()
end = rdtsc();
return (end-start) / (1024. * 1024.);
}
// Milliseconds of wall-clock time since the last reset_and_start_timer()
// call (reads the file-scope `tstart`, updates `tend`).
static inline double get_elapsed_msec()
{
tend = rtc();
return (tend - tstart)*1e3;
}

View File

@@ -0,0 +1,58 @@
/*
Copyright (c) 2014, Evghenii Gaburov
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
// Compatibility macros that map ISPC's execution-model identifiers onto
// CUDA built-ins: a "program" is one thread of a 32-thread warp
// (the &31 / >>5 masks below assume a 32-wide warp).
#define programCount 32
#define programIndex (threadIdx.x & 31)
// Task indices: each thread block holds 4 warps (128 threads), and each
// warp is one task, so taskIndex0 interleaves block and warp indices.
#define taskIndex0 (blockIdx.x*4 + (threadIdx.x >> 5))
#define taskCount0 (gridDim.x*4)
#define taskIndex1 (blockIdx.y)
#define taskCount1 (gridDim.y)
#define taskIndex2 (blockIdx.z)
#define taskCount2 (gridDim.z)
// Flattened linear task index/count over the 3D task grid.
#define taskIndex (taskIndex0 + taskCount0*(taskIndex1 + taskCount1*taskIndex2))
#define taskCount (taskCount0*taskCount1*taskCount2)
#define warpIdx (threadIdx.x >> 5)
// Launch ntx*nty*ntz tasks: 4 warps per 128-thread block, so the x grid
// dimension is rounded up by 4. Only lane 0 issues the child launch.
#define launch(ntx,nty,ntz,func) if (programIndex==0) func<<<dim3(((ntx)+4-1)/4,nty,ntz),128>>>
#define sync cudaDeviceSynchronize()
#define cif if
// Double-precision warp shuffle built from two 32-bit shuffles of the
// high and low words.
// NOTE(review): this overload uses __shfl_xor (butterfly exchange with a
// lane mask) while CUDA's builtin __shfl(x, lane) for int/float reads
// from an absolute source lane — so shuffle()/broadcast() below have
// different semantics for double than for other types. Confirm intended.
__device__ __forceinline__ static double __shfl(double x, int lane)
{
return __hiloint2double(
__shfl_xor(__double2hiint(x), lane),
__shfl_xor(__double2loint(x), lane));
}
#define shuffle(x,y) __shfl(x,y)
#define broadcast(x,y) __shfl(x,y)

View File

@@ -0,0 +1,87 @@
/*
Copyright (c) 2014, Evghenii Gaburov
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <cstring>
#include "ispc_malloc.h"
#ifdef _CUDA_
// Global replacement operator new that routes all allocations through
// ispc_malloc (CUDA managed memory in the _CUDA_ build).
// A throwing operator new must never return NULL — that is undefined
// behavior — so report allocation failure by throwing std::bad_alloc.
void * operator new(size_t size) throw(std::bad_alloc)
{
  void *ptr = NULL;           // ispc_malloc may leave it untouched on failure
  ispc_malloc(&ptr, size);
  if (ptr == NULL)
    throw std::bad_alloc();
  return ptr;
}
// Matching global replacement operator delete: releases memory obtained
// from the operator new above via ispc_free. Must not throw.
void operator delete(void *ptr) throw()
{
ispc_free(ptr);
}
#else
// CPU fallback: allocate `size` bytes with plain malloc and return the
// block through *ptr (NULL on failure, as malloc reports it).
void ispc_malloc(void **ptr, const size_t size)
{
  void *block = malloc(size);
  *ptr = block;
}
// CPU fallback: release a block previously returned by ispc_malloc.
// Passing NULL is a no-op, matching free()'s contract.
void ispc_free(void *ptr)
{
  if (ptr != NULL)
    free(ptr);
}
// CPU fallback: fill `size` bytes at ptr with `value` (plain memset).
void ispc_memset(void *ptr, int value, size_t size)
{
memset(ptr, value, size);
}
// CPU fallback: heap/stack limits only apply to the CUDA device runtime,
// so both setters are deliberate no-ops here (`value` is ignored).
void ispcSetMallocHeapLimit(size_t value)
{
}
void ispcSetStackLimit(size_t value)
{
}
// CPU fallback: no heap/stack limit is tracked, so both getters report
// the all-ones sentinel (equal to (unsigned long long)-1).
unsigned long long ispcGetMallocHeapLimit()
{
  return ~0ull;
}
unsigned long long ispcGetStackLimit()
{
  return ~0ull;
}
// CPU fallback: copy `num` bytes from src to dest and return dest,
// mirroring memcpy's own contract.
void * ispcMemcpy(void *dest, void *src, size_t num)
{
  return memcpy(dest, src, num);
}
#endif

View File

@@ -0,0 +1,43 @@
/*
Copyright (c) 2014, Evghenii Gaburov
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
// Allocation shims shared by the CPU and CUDA example builds; the CUDA
// build maps these onto managed-memory / device-limit APIs, the CPU
// build onto plain libc.
// Allocate `size` bytes; the block is returned through *ptr.
extern void ispc_malloc(void **ptr, const size_t size);
// Release a block previously obtained from ispc_malloc.
extern void ispc_free(void *ptr);
// Fill `size` bytes at ptr with `value`.
extern void ispc_memset(void *ptr, int value, size_t size);
// Set the allocator heap / stack size limits (no-ops on the CPU build).
extern void ispcSetMallocHeapLimit(size_t value);
extern void ispcSetStackLimit(size_t value);
// Query the current limits; the CPU build returns an all-ones sentinel.
extern unsigned long long ispcGetMallocHeapLimit();
extern unsigned long long ispcGetStackLimit();
// Copy `num` bytes from src to dest; returns dest.
extern void * ispcMemcpy(void *dest, void *src, size_t num);

View File

@@ -0,0 +1,76 @@
/*
Copyright (c) 2014, Evghenii Gaburov
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _CUDA_
#error "Something went wrong..."
#endif
// CUDA build: allocate unified (managed) memory reachable from both the
// host and the device.
void ispc_malloc(void **ptr, const size_t size)
{
cudaMallocManaged(ptr, size);
}
// Release memory obtained from ispc_malloc.
void ispc_free(void *ptr)
{
cudaFree(ptr);
}
// Fill `size` bytes of device-visible memory with `value`.
void ispc_memset(void *ptr, int value, size_t size)
{
cudaMemset(ptr, value, size);
}
// Set the device-side malloc heap size limit.
void ispcSetMallocHeapLimit(size_t value)
{
cudaDeviceSetLimit(cudaLimitMallocHeapSize,value);
}
// Set the per-thread device stack size limit.
void ispcSetStackLimit(size_t value)
{
cudaDeviceSetLimit(cudaLimitStackSize,value);
}
// Query the device-side malloc heap limit.
// Note: the size_t result is returned as unsigned long long.
unsigned long long ispcGetMallocHeapLimit()
{
size_t value;
cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize);
return value;
}
// Query the per-thread device stack limit.
unsigned long long ispcGetStackLimit()
{
size_t value;
cudaDeviceGetLimit(&value, cudaLimitStackSize);
return value;
}
// Copy `num` bytes; cudaMemcpyDefault lets the runtime infer the
// direction from the pointer values (requires unified virtual addressing).
void * ispcMemcpy(void *dest, void *src, size_t num)
{
cudaMemcpy(dest, src, num, cudaMemcpyDefault);
return dest;
}

View File

@@ -7872,6 +7872,14 @@ SizeOfExpr::TypeCheck() {
"struct type \"%s\".", type->GetString().c_str());
return NULL;
}
#ifdef ISPC_NVPTX_ENABLED
if (type != NULL)
if (g->target->getISA() == Target::NVPTX && type->IsVaryingType())
{
Error(pos, "\"sizeof\" with varying data types is not yet supported with \"nvptx\" target.");
return NULL;
}
#endif /* ISPC_NVPTX_ENABLED */
return this;
}
@@ -8704,6 +8712,13 @@ NewExpr::TypeCheck() {
AssertPos(pos, m->errorCount > 0);
return NULL;
}
#ifdef ISPC_NVPTX_ENABLED
if (g->target->getISA() == Target::NVPTX && allocType->IsVaryingType())
{
Error(pos, "\"new\" with varying data types is not yet supported with \"nvptx\" target.");
return NULL;
}
#endif /* ISPC_NVPTX_ENABLED */
if (CastType<UndefinedStructType>(allocType) != NULL) {
Error(pos, "Can't dynamically allocate storage for declared "
"but not defined type \"%s\".", allocType->GetString().c_str());

View File

@@ -47,6 +47,9 @@
#include <stdio.h>
#if defined(LLVM_3_2)
#ifdef ISPC_NVPTX_ENABLED
#include <llvm/Metadata.h>
#endif /* ISPC_NVPTX_ENABLED */
#include <llvm/LLVMContext.h>
#include <llvm/Module.h>
#include <llvm/Type.h>
@@ -54,6 +57,9 @@
#include <llvm/Intrinsics.h>
#include <llvm/DerivedTypes.h>
#else
#ifdef ISPC_NVPTX_ENABLED
#include <llvm/IR/Metadata.h>
#endif /* ISPC_NVPTX_ENABLED */
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Module.h>
#include <llvm/IR/Type.h>
@@ -129,7 +135,11 @@ Function::Function(Symbol *s, Stmt *c) {
sym->parentFunction = this;
}
if (type->isTask) {
if (type->isTask
#ifdef ISPC_NVPTX_ENABLED
&& (g->target->getISA() != Target::NVPTX)
#endif
){
threadIndexSym = m->symbolTable->LookupVariable("threadIndex");
Assert(threadIndexSym);
threadCountSym = m->symbolTable->LookupVariable("threadCount");
@@ -240,7 +250,11 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
#endif
const FunctionType *type = CastType<FunctionType>(sym->type);
Assert(type != NULL);
if (type->isTask == true) {
if (type->isTask == true
#ifdef ISPC_NVPTX_ENABLED
&& (g->target->getISA() != Target::NVPTX)
#endif
){
// For tasks, there should always be three parameters: the
// pointer to the structure that holds all of the arguments, the
// thread index, and the thread count variables.
@@ -338,6 +352,18 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
ctx->SetFunctionMask(argIter);
Assert(++argIter == function->arg_end());
}
#ifdef ISPC_NVPTX_ENABLED
if (type->isTask == true && g->target->getISA() == Target::NVPTX)
{
llvm::NamedMDNode* annotations =
m->module->getOrInsertNamedMetadata("nvvm.annotations");
llvm::SmallVector<llvm::Value*, 3> av;
av.push_back(function);
av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
av.push_back(LLVMInt32(1));
annotations->addOperand(llvm::MDNode::get(*g->ctx, av));
}
#endif /* ISPC_NVPTX_ENABLED */
}
// Finally, we can generate code for the function
@@ -499,6 +525,21 @@ Function::GenerateIR() {
std::string functionName = sym->name;
if (g->mangleFunctionsWithTarget)
functionName += std::string("_") + g->target->GetISAString();
#ifdef ISPC_NVPTX_ENABLED
if (g->target->getISA() == Target::NVPTX)
{
functionName += std::string("___export"); /* add ___export to the end, for ptxcc to recognize it is exported */
#if 0
llvm::NamedMDNode* annotations =
m->module->getOrInsertNamedMetadata("nvvm.annotations");
llvm::SmallVector<llvm::Value*, 3> av;
av.push_back(function);
av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1));
annotations->addOperand(llvm::MDNode::get(*g->ctx, av));
#endif
}
#endif /* ISPC_NVPTX_ENABLED */
llvm::Function *appFunction =
llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module);
appFunction->setDoesNotThrow();
@@ -536,6 +577,18 @@ Function::GenerateIR() {
FATAL("Function verificication failed");
}
}
#ifdef ISPC_NVPTX_ENABLED
if (g->target->getISA() == Target::NVPTX)
{
llvm::NamedMDNode* annotations =
m->module->getOrInsertNamedMetadata("nvvm.annotations");
llvm::SmallVector<llvm::Value*, 3> av;
av.push_back(appFunction);
av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1));
annotations->addOperand(llvm::MDNode::get(*g->ctx, av));
}
#endif /* ISPC_NVPTX_ENABLED */
}
}
}

View File

@@ -243,6 +243,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
arch = "arm";
else
#endif
#ifdef ISPC_NVPTX_ENABLED
if(!strncmp(isa, "nvptx", 5))
arch = "nvptx64";
else
#endif /* ISPC_NVPTX_ENABLED */
arch = "x86-64";
}
@@ -582,6 +587,23 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
this->m_maskBitCount = 32;
}
#endif
#ifdef ISPC_NVPTX_ENABLED
else if (!strcasecmp(isa, "nvptx"))
{
this->m_isa = Target::NVPTX;
this->m_cpu = "sm_35";
this->m_nativeVectorWidth = 32;
this->m_nativeVectorAlignment = 32;
this->m_vectorWidth = 1;
this->m_hasHalf = true;
this->m_maskingIsFree = true;
this->m_maskBitCount = 1;
this->m_hasTranscendentals = true;
this->m_hasTrigonometry = true;
this->m_hasGather = this->m_hasScatter = false;
cpuFromIsa = "sm_35";
}
#endif /* ISPC_NVPTX_ENABLED */
else {
Error(SourcePos(), "Target \"%s\" is unknown. Choices are: %s.",
isa, SupportedTargets());
@@ -679,6 +701,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
"i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-"
"f80:128:128-n8:16:32:64-S128-v16:16:16-v32:32:32-v4:128:128";
}
else if (m_isa == Target::NVPTX)
{
dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64";
}
// 3. Finally set member data
m_dataLayout = new llvm::DataLayout(dl_string);
@@ -695,6 +721,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
// Initialize target-specific "target-feature" attribute.
if (!m_attributes.empty()) {
llvm::AttrBuilder attrBuilder;
#ifdef ISPC_NVPTX_ENABLED
if (m_isa != Target::NVPTX)
#endif
attrBuilder.addAttribute("target-cpu", this->m_cpu);
attrBuilder.addAttribute("target-features", this->m_attributes);
this->m_tf_attributes = new llvm::AttributeSet(
@@ -742,6 +771,9 @@ Target::SupportedTargets() {
return
#ifdef ISPC_ARM_ENABLED
"neon-i8x16, neon-i16x8, neon-i32x4, "
#endif
#ifdef ISPC_NVPTX_ENABLED
"nvptx, "
#endif
"sse2-i32x4, sse2-i32x8, "
"sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, "
@@ -777,6 +809,10 @@ Target::GetTripleString() const {
triple.setArchName("i386");
else if (m_arch == "x86-64")
triple.setArchName("x86_64");
#ifdef ISPC_NVPTX_ENABLED
else if (m_arch == "nvptx64")
triple = llvm::Triple("nvptx64", "nvidia", "cuda");
#endif /* ISPC_NVPTX_ENABLED */
else
triple.setArchName(m_arch);
}
@@ -809,6 +845,10 @@ Target::ISAToString(ISA isa) {
return "avx2";
case Target::GENERIC:
return "generic";
#ifdef ISPC_NVPTX_ENABLED
case Target::NVPTX:
return "nvptx";
#endif /* ISPC_NVPTX_ENABLED */
default:
FATAL("Unhandled target in ISAToString()");
}
@@ -847,6 +887,10 @@ Target::ISAToTargetString(ISA isa) {
return "avx2-i32x8";
case Target::GENERIC:
return "generic-4";
#ifdef ISPC_NVPTX_ENABLED
case Target::NVPTX:
return "nvptx";
#endif /* ISPC_NVPTX_ENABLED */
default:
FATAL("Unhandled target in ISAToTargetString()");
}

Some files were not shown because too many files have changed in this diff Show More