Compare commits

82 commits:

a9ec745275, c2ecc15b93, 83c8650b36, 89cb809922, fdb4eaf437, 0432f97555,
8d1631b714, dac091552d, ea027a95a8, f73abb05a7, d71c49494f, 25665f0841,
1eec27f890, 950f86200b, e19f4931d1, 0575b1f38d, f6cd01f7cf, f2fbc168af,
b50f6f1730, f8a7120d9c, 20dbf59420, c67a286aa6, c96fef6bc8, bba02f87ea,
12dc3f5c28, 0f01a5dcbe, 664dc3bdda, bdba3cd97d, d9c0f9315a, b7f17d435f,
37cdc18639, 5893a9c49d, 24f58fa16a, 56ffc78fa4, 061e68bc77, 177e6312b4,
1acf4032c2, 9c5444698e, 65f3252760, e612abe4ba, 34352e4e0e, 1867b5b317,
a5b7fca7e0, 7be2c399b1, d6337b3b22, d2f8b0ace5, d805e8b183, 1f0f2ec05f,
91ac3b9d7c, d65bf2eb2f, 1bba9d4307, 4388338dad, 2fb59c90cf, 68f6ea8def,
3f89295d10, 748b292e77, 6451c3d99d, d14a2de168, 642150095d, 3bf3ac7922,
c6d1cebad4, 08189ce08c, 7013d7d52f, 7045b76f84, 58a0b4a20d, 0f8eee9809,
0740299860, 652215861e, 602209e5a8, b60f8b4f70, b67446d998, 9670ab0887,
0223bb85ee, fd81255db1, 8a8e1a7f73, ef05fbf424, fa01b63fa5, 63d3d25030,
a8db866228, 0519eea951, 5d67252ed0, 59f4c9985e
**Makefile** (9 changes)

```diff
@@ -58,11 +58,7 @@ LDFLAGS=
 ifeq ($(ARCH_OS),Linux)
   # try to link everything statically under Linux (including libstdc++) so
   # that the binaries we generate will be portable across distributions...
-  ifeq ($(ARCH_TYPE),x86_64)
-    LDFLAGS=-static -L/usr/lib/gcc/x86_64-linux-gnu/4.4
-  else
-    LDFLAGS=-L/usr/lib/gcc/i686-redhat-linux/4.6.0
-  endif
+  LDFLAGS=-static
 endif
 
 LEX=flex
@@ -75,7 +71,8 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
 	type.cpp util.cpp
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
-TARGETS=avx avx-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 generic-16
+TARGETS=avx1 avx1-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 \
+	generic-16 generic-1
 BUILTINS_SRC=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) \
 	builtins/dispatch.ll
 BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC:.ll=.o))) \
```
```diff
@@ -47,7 +47,7 @@ remarkable `LLVM Compiler Infrastructure <http://llvm.org>`_ for back-end
 code generation and optimization and is `hosted on
 github <http://github.com/ispc/ispc/>`_.  It supports Windows, Mac, and
 Linux, with both x86 and x86-64 targets.  It currently supports the SSE2,
-SSE4, and AVX instruction sets.
+SSE4, AVX1, and AVX2 instruction sets.
 
 Features
 --------
```
**ast.cpp** (134 changes)

```diff
@@ -90,11 +90,15 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
     DoStmt *dos;
     ForStmt *fs;
     ForeachStmt *fes;
+    CaseStmt *cs;
+    DefaultStmt *defs;
+    SwitchStmt *ss;
     ReturnStmt *rs;
     LabeledStmt *ls;
     StmtList *sl;
     PrintStmt *ps;
     AssertStmt *as;
+    DeleteStmt *dels;
 
     if ((es = dynamic_cast<ExprStmt *>(node)) != NULL)
         es->expr = (Expr *)WalkAST(es->expr, preFunc, postFunc, data);
@@ -131,6 +135,14 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
                                    postFunc, data);
         fes->stmts = (Stmt *)WalkAST(fes->stmts, preFunc, postFunc, data);
     }
+    else if ((cs = dynamic_cast<CaseStmt *>(node)) != NULL)
+        cs->stmts = (Stmt *)WalkAST(cs->stmts, preFunc, postFunc, data);
+    else if ((defs = dynamic_cast<DefaultStmt *>(node)) != NULL)
+        defs->stmts = (Stmt *)WalkAST(defs->stmts, preFunc, postFunc, data);
+    else if ((ss = dynamic_cast<SwitchStmt *>(node)) != NULL) {
+        ss->expr = (Expr *)WalkAST(ss->expr, preFunc, postFunc, data);
+        ss->stmts = (Stmt *)WalkAST(ss->stmts, preFunc, postFunc, data);
+    }
     else if (dynamic_cast<BreakStmt *>(node) != NULL ||
              dynamic_cast<ContinueStmt *>(node) != NULL ||
              dynamic_cast<GotoStmt *>(node) != NULL) {
@@ -149,6 +161,8 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
         ps->values = (Expr *)WalkAST(ps->values, preFunc, postFunc, data);
     else if ((as = dynamic_cast<AssertStmt *>(node)) != NULL)
         as->expr = (Expr *)WalkAST(as->expr, preFunc, postFunc, data);
+    else if ((dels = dynamic_cast<DeleteStmt *>(node)) != NULL)
+        dels->expr = (Expr *)WalkAST(dels->expr, preFunc, postFunc, data);
     else
         FATAL("Unhandled statement type in WalkAST()");
 }
@@ -169,6 +183,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
     DereferenceExpr *dre;
     SizeOfExpr *soe;
     AddressOfExpr *aoe;
+    NewExpr *newe;
 
     if ((ue = dynamic_cast<UnaryExpr *>(node)) != NULL)
         ue->expr = (Expr *)WalkAST(ue->expr, preFunc, postFunc, data);
@@ -212,6 +227,12 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
         soe->expr = (Expr *)WalkAST(soe->expr, preFunc, postFunc, data);
     else if ((aoe = dynamic_cast<AddressOfExpr *>(node)) != NULL)
         aoe->expr = (Expr *)WalkAST(aoe->expr, preFunc, postFunc, data);
+    else if ((newe = dynamic_cast<NewExpr *>(node)) != NULL) {
+        newe->countExpr = (Expr *)WalkAST(newe->countExpr, preFunc,
+                                          postFunc, data);
+        newe->initExpr = (Expr *)WalkAST(newe->initExpr, preFunc,
+                                         postFunc, data);
+    }
     else if (dynamic_cast<SymbolExpr *>(node) != NULL ||
              dynamic_cast<ConstExpr *>(node) != NULL ||
              dynamic_cast<FunctionSymbolExpr *>(node) != NULL ||
@@ -294,3 +315,116 @@ EstimateCost(ASTNode *root) {
     return cost;
 }
 
+
+/** Given an AST node, check to see if it's safe if we happen to run the
+    code for that node with the execution mask all off.
+ */
+static bool
+lCheckAllOffSafety(ASTNode *node, void *data) {
+    bool *okPtr = (bool *)data;
+
+    if (dynamic_cast<FunctionCallExpr *>(node) != NULL) {
+        // FIXME: If we could somehow determine that the function being
+        // called was safe (and all of the args Exprs were safe), then it'd
+        // be nice to be able to return true here.  (Consider a call to
+        // e.g. floatbits() in the stdlib.)  Unfortunately for now we just
+        // have to be conservative.
+        *okPtr = false;
+        return false;
+    }
+
+    if (dynamic_cast<AssertStmt *>(node) != NULL) {
+        // While it's fine to run the assert for varying tests, it's not
+        // desirable to check an assert on a uniform variable if all of the
+        // lanes are off.
+        *okPtr = false;
+        return false;
+    }
+
+    if (dynamic_cast<NewExpr *>(node) != NULL ||
+        dynamic_cast<DeleteStmt *>(node) != NULL) {
+        // We definitely don't want to run the uniform variants of these if
+        // the mask is all off.  It's also worth skipping the overhead of
+        // executing the varying versions of them in the all-off mask case.
+        *okPtr = false;
+        return false;
+    }
+
+    if (g->target.allOffMaskIsSafe == true)
+        // Don't worry about memory accesses if we have a target that can
+        // safely run them with the mask all off
+        return true;
+
+    IndexExpr *ie;
+    if ((ie = dynamic_cast<IndexExpr *>(node)) != NULL && ie->baseExpr != NULL) {
+        const Type *type = ie->baseExpr->GetType();
+        if (type == NULL)
+            return true;
+        if (dynamic_cast<const ReferenceType *>(type) != NULL)
+            type = type->GetReferenceTarget();
+
+        ConstExpr *ce = dynamic_cast<ConstExpr *>(ie->index);
+        if (ce == NULL) {
+            // indexing with a variable... -> not safe
+            *okPtr = false;
+            return false;
+        }
+
+        const PointerType *pointerType =
+            dynamic_cast<const PointerType *>(type);
+        if (pointerType != NULL) {
+            // pointer[index] -> can't be sure -> not safe
+            *okPtr = false;
+            return false;
+        }
+
+        const SequentialType *seqType =
+            dynamic_cast<const SequentialType *>(type);
+        Assert(seqType != NULL);
+        int nElements = seqType->GetElementCount();
+        if (nElements == 0) {
+            // Unsized array, so we can't be sure -> not safe
+            *okPtr = false;
+            return false;
+        }
+
+        int32_t indices[ISPC_MAX_NVEC];
+        int count = ce->AsInt32(indices);
+        for (int i = 0; i < count; ++i) {
+            if (indices[i] < 0 || indices[i] >= nElements) {
+                // Index is out of bounds -> not safe
+                *okPtr = false;
+                return false;
+            }
+        }
+
+        // All indices are in-bounds
+        return true;
+    }
+
+    MemberExpr *me;
+    if ((me = dynamic_cast<MemberExpr *>(node)) != NULL &&
+        me->dereferenceExpr) {
+        *okPtr = false;
+        return false;
+    }
+
+    DereferenceExpr *de;
+    if ((de = dynamic_cast<DereferenceExpr *>(node)) != NULL) {
+        const Type *exprType = de->expr->GetType();
+        if (dynamic_cast<const PointerType *>(exprType) != NULL) {
+            *okPtr = false;
+            return false;
+        }
+    }
+
+    return true;
+}
+
+
+bool
+SafeToRunWithMaskAllOff(ASTNode *root) {
+    bool safe = true;
+    WalkAST(root, lCheckAllOffSafety, NULL, &safe);
+    return safe;
+}
```
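`SafeToRunWithMaskAllOff()` is a thin wrapper over the `WalkAST()` traversal: the pre-callback runs on every node, writes its verdict through the opaque `data` pointer, and returns `false` to stop descending once an unsafe node is found. A minimal sketch of the same pattern with a hypothetical callback (`lCountNodes` and `CountASTNodes` are illustrations, not part of the ispc sources):

```cpp
// Hypothetical example of the WalkAST() callback pattern used above: the
// pre-callback receives each node plus an opaque data pointer and returns
// true to keep descending into that node's children.
static bool
lCountNodes(ASTNode *node, void *data) {
    int *count = (int *)data;
    ++(*count);
    return true;  // unlike lCheckAllOffSafety, never cut the walk short
}

int
CountASTNodes(ASTNode *root) {
    int count = 0;
    WalkAST(root, lCountNodes, NULL, &count);  // NULL: no post-callback needed
    return count;
}
```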
**ast.h** (4 changes)

```diff
@@ -144,4 +144,8 @@ extern Stmt *TypeCheck(Stmt *);
     the given root. */
 extern int EstimateCost(ASTNode *root);
 
+/** Returns true if it would be safe to run the given code with an "all
+    off" mask. */
+extern bool SafeToRunWithMaskAllOff(ASTNode *root);
+
 #endif // ISPC_AST_H
```
**bitcode2cpp.py**

```diff
@@ -26,17 +26,21 @@ if platform.system() == 'Windows' or string.find(platform.system(), "CYGWIN_NT")
 try:
     as_out=subprocess.Popen([llvm_as, "-", "-o", "-"], stdout=subprocess.PIPE)
 except IOError:
-    print >> sys.stderr, "Couldn't open " + src
+    sys.stderr.write("Couldn't open " + src)
     sys.exit(1)
 
-print "unsigned char builtins_bitcode_" + target + "[] = {"
-for line in as_out.stdout.readlines():
-    length = length + len(line)
-    for c in line:
-        print ord(c)
-        print ", "
-print " 0 };\n\n"
-print "int builtins_bitcode_" + target + "_length = " + str(length) + ";\n"
+width = 16;
+sys.stdout.write("unsigned char builtins_bitcode_" + target + "[] = {\n")
+
+data = as_out.stdout.read()
+for i in range(0, len(data), 1):
+    sys.stdout.write("0x%0.2X, " % ord(data[i:i+1]))
+
+    if i%width == (width-1):
+        sys.stdout.write("\n")
+
+sys.stdout.write("0x00 };\n\n")
+sys.stdout.write("int builtins_bitcode_" + target + "_length = " + str(i+1) + ";\n")
 
 as_out.wait()
```
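For reference, this is the shape of the C source the script emits: a byte array of the target's LLVM bitcode plus its length, which `DefineStdlib()` in builtins.cpp later hands to `AddBitcodeToModule()`. The snippet below is an illustration only, for a hypothetical "sse4" target, with placeholder byte values and length:

```cpp
/* Illustrative output only (byte values and length are placeholders).
   The first four bytes shown are the LLVM bitcode magic 'B' 'C' 0xC0 0xDE. */
unsigned char builtins_bitcode_sse4[] = {
0x42, 0x43, 0xC0, 0xDE, 0x21, 0x0C, 0x00, 0x00, 0xE5, 0x01, 0x00, 0x00, 0x0B, 0x82, 0x20, 0x00,
/* ...sixteen bytes per line, as the new width = 16 loop writes... */
0x00 };
int builtins_bitcode_sse4_length = 4096;  /* placeholder byte count */
```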
**builtins.cpp** (59 changes)

```diff
@@ -386,10 +386,13 @@ lSetInternalFunctions(llvm::Module *module) {
         "__ceil_uniform_float",
         "__ceil_varying_double",
         "__ceil_varying_float",
+        "__clock",
         "__count_trailing_zeros_i32",
         "__count_trailing_zeros_i64",
         "__count_leading_zeros_i32",
         "__count_leading_zeros_i64",
+        "__delete_uniform",
+        "__delete_varying",
         "__do_assert_uniform",
         "__do_assert_varying",
         "__do_print",
@@ -448,6 +451,9 @@ lSetInternalFunctions(llvm::Module *module) {
         "__min_varying_uint32",
         "__min_varying_uint64",
         "__movmsk",
+        "__new_uniform",
+        "__new_varying32",
+        "__new_varying64",
         "__num_cores",
         "__packed_load_active",
         "__packed_store_active",
@@ -717,11 +723,13 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
         extern int builtins_bitcode_sse4_x2_length;
         switch (g->target.vectorWidth) {
         case 4:
-            AddBitcodeToModule(builtins_bitcode_sse4, builtins_bitcode_sse4_length,
+            AddBitcodeToModule(builtins_bitcode_sse4,
+                               builtins_bitcode_sse4_length,
                                module, symbolTable);
             break;
         case 8:
-            AddBitcodeToModule(builtins_bitcode_sse4_x2, builtins_bitcode_sse4_x2_length,
+            AddBitcodeToModule(builtins_bitcode_sse4_x2,
+                               builtins_bitcode_sse4_x2_length,
                                module, symbolTable);
             break;
         default:
@@ -729,18 +737,39 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
         }
         break;
     case Target::AVX:
-    case Target::AVX2:
         switch (g->target.vectorWidth) {
         case 8:
-            extern unsigned char builtins_bitcode_avx[];
-            extern int builtins_bitcode_avx_length;
-            AddBitcodeToModule(builtins_bitcode_avx, builtins_bitcode_avx_length,
+            extern unsigned char builtins_bitcode_avx1[];
+            extern int builtins_bitcode_avx1_length;
+            AddBitcodeToModule(builtins_bitcode_avx1,
+                               builtins_bitcode_avx1_length,
                                module, symbolTable);
             break;
         case 16:
-            extern unsigned char builtins_bitcode_avx_x2[];
-            extern int builtins_bitcode_avx_x2_length;
-            AddBitcodeToModule(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
+            extern unsigned char builtins_bitcode_avx1_x2[];
+            extern int builtins_bitcode_avx1_x2_length;
+            AddBitcodeToModule(builtins_bitcode_avx1_x2,
+                               builtins_bitcode_avx1_x2_length,
                                module, symbolTable);
             break;
         default:
+            FATAL("logic error in DefineStdlib");
+        }
+        break;
+    case Target::AVX2:
+        switch (g->target.vectorWidth) {
+        case 8:
+            extern unsigned char builtins_bitcode_avx2[];
+            extern int builtins_bitcode_avx2_length;
+            AddBitcodeToModule(builtins_bitcode_avx2,
+                               builtins_bitcode_avx2_length,
+                               module, symbolTable);
+            break;
+        case 16:
+            extern unsigned char builtins_bitcode_avx2_x2[];
+            extern int builtins_bitcode_avx2_x2_length;
+            AddBitcodeToModule(builtins_bitcode_avx2_x2,
+                               builtins_bitcode_avx2_x2_length,
+                               module, symbolTable);
+            break;
+        default:
@@ -770,6 +799,13 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
                                builtins_bitcode_generic_16_length,
                                module, symbolTable);
             break;
+        case 1:
+            extern unsigned char builtins_bitcode_generic_1[];
+            extern int builtins_bitcode_generic_1_length;
+            AddBitcodeToModule(builtins_bitcode_generic_1,
+                               builtins_bitcode_generic_1_length,
+                               module, symbolTable);
+            break;
         default:
             FATAL("logic error in DefineStdlib");
         }
@@ -798,11 +834,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
     lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
                            symbolTable);
 
+    lDefineConstantInt("__have_native_half", (g->target.isa == Target::AVX2),
+                       module, symbolTable);
+
     if (includeStdlibISPC) {
         // If the user wants the standard library to be included, parse the
         // serialized version of the stdlib.ispc file to get its
         // definitions added.
-        if (g->target.isa == Target::GENERIC) {
+        if (g->target.isa == Target::GENERIC&&g->target.vectorWidth!=1) { // 1 wide uses x86 stdlib
            extern char stdlib_generic_code[];
            yy_scan_string(stdlib_generic_code);
            yyparse();
```
**builtins/dispatch.ll**

```diff
@@ -48,23 +48,42 @@ declare void @abort() noreturn
 ;; corresponding to one of the Target::ISA enumerant values that gives the
 ;; most capable ISA that the curremt system can run.
 ;;
-;; #ifdef _MSC_VER
-;; extern void __stdcall __cpuid(int info[4], int infoType);
-;; #else
+;; Note: clang from LLVM 2.9 should be used if this is updated, for maximum
+;; backwards compatibility for anyone building ispc with LLVM 2.9.
+;;
+;; #include <stdint.h>
+;; #include <stdlib.h>
+;;
 ;; static void __cpuid(int info[4], int infoType) {
 ;;     __asm__ __volatile__ ("cpuid"
 ;;                           : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
 ;;                           : "0" (infoType));
 ;; }
-;; #endif
+;;
+;; /* Save %ebx in case it's the PIC register */
+;; static void __cpuid_count(int info[4], int level, int count) {
+;;     __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t"
+;;                           "cpuid\n\t"
+;;                           "xchg{l}\t{%%}ebx, %1\n\t"
+;;                           : "=a" (info[0]), "=r" (info[1]), "=c" (info[2]), "=d" (info[3])
+;;                           : "0" (level), "2" (count));
+;; }
 ;;
 ;; int32_t __get_system_isa() {
 ;;     int info[4];
 ;;     __cpuid(info, 1);
 ;;
 ;;     /* NOTE: the values returned below must be the same as the
 ;;        corresponding enumerant values in Target::ISA. */
-;;     if ((info[2] & (1 << 28)) != 0)
-;;         return 2; // AVX
+;;     if ((info[2] & (1 << 28)) != 0) {
+;;         // AVX1 for sure.  Do we have AVX2?
+;;         // Call cpuid with eax=7, ecx=0
+;;         __cpuid_count(info, 7, 0);
+;;         if ((info[1] & (1 << 5)) != 0)
+;;             return 3; // AVX2
+;;         else
+;;             return 2; // AVX1
+;;     }
 ;;     else if ((info[2] & (1 << 19)) != 0)
 ;;         return 1; // SSE4
 ;;     else if ((info[3] & (1 << 26)) != 0)
@@ -76,33 +95,42 @@ declare void @abort() noreturn
 %0 = type { i32, i32, i32, i32 }
 
 define i32 @__get_system_isa() nounwind ssp {
-  %1 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
-  %2 = extractvalue %0 %1, 2
-  %3 = extractvalue %0 %1, 3
-  %4 = and i32 %2, 268435456
-  %5 = icmp eq i32 %4, 0
-  br i1 %5, label %6, label %13
+entry:
+  %0 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
+  %asmresult9.i = extractvalue %0 %0, 2
+  %asmresult10.i = extractvalue %0 %0, 3
+  %and = and i32 %asmresult9.i, 268435456
+  %cmp = icmp eq i32 %and, 0
+  br i1 %cmp, label %if.else7, label %if.then
 
-; <label>:6                                       ; preds = %0
-  %7 = and i32 %2, 524288
-  %8 = icmp eq i32 %7, 0
-  br i1 %8, label %9, label %13
+if.then:                                          ; preds = %entry
+  %1 = tail call %0 asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
+  %asmresult9.i24 = extractvalue %0 %1, 1
+  %and4 = lshr i32 %asmresult9.i24, 5
+  %2 = and i32 %and4, 1
+  %3 = or i32 %2, 2
+  br label %return
 
-; <label>:9                                       ; preds = %6
-  %10 = and i32 %3, 67108864
-  %11 = icmp eq i32 %10, 0
-  br i1 %11, label %12, label %13
+if.else7:                                         ; preds = %entry
+  %and10 = and i32 %asmresult9.i, 524288
+  %cmp11 = icmp eq i32 %and10, 0
+  br i1 %cmp11, label %if.else13, label %return
 
-; <label>:12                                      ; preds = %9
+if.else13:                                        ; preds = %if.else7
+  %and16 = and i32 %asmresult10.i, 67108864
+  %cmp17 = icmp eq i32 %and16, 0
+  br i1 %cmp17, label %if.else19, label %return
+
+if.else19:                                        ; preds = %if.else13
   tail call void @abort() noreturn nounwind
   unreachable
 
-; <label>:13                                      ; preds = %9, %6, %0
-  %.0 = phi i32 [ 2, %0 ], [ 1, %6 ], [ 0, %9 ]
-  ret i32 %.0
+return:                                           ; preds = %if.else13, %if.else7, %if.then
+  %retval.0 = phi i32 [ %3, %if.then ], [ 1, %if.else7 ], [ 0, %if.else13 ]
+  ret i32 %retval.0
 }
 
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; This function is called by each of the dispatch functions we generate;
 ;; it sets @__system_best_isa if it is unset.
```
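The per-function dispatch trampolines that call `__get_system_isa()` are generated by the compiler and do not appear in this diff. The following C++ sketch shows the control flow the comment above describes, under the assumption that the generated dispatchers follow this shape; the `foo_*` names are hypothetical, and the case values match the 0–3 enumerants returned above:

```cpp
// Assumed shape of a generated dispatcher; foo and foo_* are hypothetical
// per-ISA variants, not symbols from the ispc sources.
extern "C" int __get_system_isa();

static int __system_best_isa = -1;

void foo_sse2(float *d, int n);
void foo_sse4(float *d, int n);
void foo_avx1(float *d, int n);
void foo_avx2(float *d, int n);

void foo(float *d, int n) {
    if (__system_best_isa == -1)              // probe cpuid once, then cache
        __system_best_isa = __get_system_isa();
    switch (__system_best_isa) {
    case 3:  foo_avx2(d, n); break;           // AVX2
    case 2:  foo_avx1(d, n); break;           // AVX1
    case 1:  foo_sse4(d, n); break;           // SSE4
    default: foo_sse2(d, n); break;           // SSE2
    }
}
```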
**builtins/target-avx-x2.ll**

```diff
@@ -170,33 +170,6 @@ define <16 x float> @__min_varying_float(<16 x float>,
 }
 
 
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; int min/max
-
-define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-  binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
-  ret <16 x i32> %ret
-}
-
-define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-  binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
-  ret <16 x i32> %ret
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; unsigned int min/max
-
-define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-  binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
-  ret <16 x i32> %ret
-}
-
-define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-  binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
-  ret <16 x i32> %ret
-}
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops
 
@@ -622,12 +595,7 @@ define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %new
 }
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; gather/scatter
-
-gen_gather(16, i8)
-gen_gather(16, i16)
-gen_gather(16, i32)
-gen_gather(16, i64)
+;; scatter
 
 gen_scatter(16, i8)
 gen_scatter(16, i16)
```
**builtins/target-avx.ll**

```diff
@@ -170,33 +170,6 @@ define <8 x float> @__min_varying_float(<8 x float>,
 }
 
 
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; int min/max
-
-define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-  binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
-  ret <8 x i32> %ret
-}
-
-define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-  binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
-  ret <8 x i32> %ret
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; unsigned int min/max
-
-define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-  binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
-  ret <8 x i32> %ret
-}
-
-define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-  binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
-  ret <8 x i32> %ret
-}
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops
 
@@ -403,9 +376,6 @@ define <8 x i64> @__masked_load_64(i8 *, <8 x i32> %mask) nounwind alwaysinline
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store
 
-; FIXME: there is no AVX instruction for these, but we could be clever
-; by packing the bits down and setting the last 3/4 or half, respectively,
-; of the mask to zero... Not sure if this would be a win in the end
 gen_masked_store(8, i8, 8)
 gen_masked_store(8, i16, 16)
@@ -520,12 +490,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
 
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; gather/scatter
-
-gen_gather(8, i8)
-gen_gather(8, i16)
-gen_gather(8, i32)
-gen_gather(8, i64)
+;; scatter
 
 gen_scatter(8, i8)
 gen_scatter(8, i16)
```
**builtins/target-avx1-x2.ll** (new file, 77 lines)

```llvm
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;   * Redistributions of source code must retain the above copyright
;;     notice, this list of conditions and the following disclaimer.
;;
;;   * Redistributions in binary form must reproduce the above copyright
;;     notice, this list of conditions and the following disclaimer in the
;;     documentation and/or other materials provided with the distribution.
;;
;;   * Neither the name of Intel Corporation nor the names of its
;;     contributors may be used to endorse or promote products derived from
;;     this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

include(`target-avx-x2.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
  ret <16 x i32> %ret
}

define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
  ret <16 x i32> %ret
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max

define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
  ret <16 x i32> %ret
}

define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
  ret <16 x i32> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

gen_gather(16, i8)
gen_gather(16, i16)
gen_gather(16, i32)
gen_gather(16, i64)
```
**builtins/target-avx1.ll** (new file, 75 lines)

```llvm
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; (same three-clause BSD license header as builtins/target-avx1-x2.ll)

include(`target-avx.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
  ret <8 x i32> %ret
}

define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
  ret <8 x i32> %ret
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max

define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
  ret <8 x i32> %ret
}

define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
  ret <8 x i32> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)
```
**builtins/target-avx2-x2.ll** (new file, 129 lines)

```llvm
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; (same three-clause BSD license header as builtins/target-avx1-x2.ll)

include(`target-avx-x2.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly

define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary8to16(m, i32, @llvm.x86.avx2.pmins.d, %0, %1)
  ret <16 x i32> %m
}

define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary8to16(m, i32, @llvm.x86.avx2.pmaxs.d, %0, %1)
  ret <16 x i32> %m
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max

declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly

define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary8to16(m, i32, @llvm.x86.avx2.pminu.d, %0, %1)
  ret <16 x i32> %m
}

define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary8to16(m, i32, @llvm.x86.avx2.pmaxu.d, %0, %1)
  ret <16 x i32> %m
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions

declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone

define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
  %r_0 = shufflevector <16 x i16> %v, <16 x i16> undef,
           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %vr_0 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_0)
  %r_1 = shufflevector <16 x i16> %v, <16 x i16> undef,
           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vr_1 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_1)
  %r = shufflevector <8 x float> %vr_0, <8 x float> %vr_1,
           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x float> %r
}

define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
  %r_0 = shufflevector <16 x float> %v, <16 x float> undef,
           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %vr_0 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_0, i32 0)
  %r_1 = shufflevector <16 x float> %v, <16 x float> undef,
           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vr_1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_1, i32 0)
  %r = shufflevector <8 x i16> %vr_0, <8 x i16> %vr_1,
           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %r
}

define float @__half_to_float_uniform(i16 %v) nounwind readnone {
  %v1 = bitcast i16 %v to <1 x i16>
  %vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                     i32 undef, i32 undef, i32 undef, i32 undef>
  %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
  %r = extractelement <8 x float> %rv, i32 0
  ret float %r
}

define i16 @__float_to_half_uniform(float %v) nounwind readnone {
  %v1 = bitcast float %v to <1 x float>
  %vv = shufflevector <1 x float> %v1, <1 x float> undef,
          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                     i32 undef, i32 undef, i32 undef, i32 undef>
  ; round to nearest even
  %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
  %r = extractelement <8 x i16> %rv, i32 0
  ret i16 %r
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

gen_gather(16, i8)
gen_gather(16, i16)
gen_gather(16, i32)
gen_gather(16, i64)
```
**builtins/target-avx2.ll** (new file, 110 lines)

```llvm
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; (same three-clause BSD license header as builtins/target-avx1-x2.ll)

include(`target-avx.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly

define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  %m = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %0, <8 x i32> %1)
  ret <8 x i32> %m
}

define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  %m = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %0, <8 x i32> %1)
  ret <8 x i32> %m
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max

declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly

define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  %m = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %0, <8 x i32> %1)
  ret <8 x i32> %m
}

define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  %m = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %0, <8 x i32> %1)
  ret <8 x i32> %m
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions

declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone

define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
  %r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
  ret <8 x float> %r
}

define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
  %r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
  ret <8 x i16> %r
}

define float @__half_to_float_uniform(i16 %v) nounwind readnone {
  %v1 = bitcast i16 %v to <1 x i16>
  %vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                     i32 undef, i32 undef, i32 undef, i32 undef>
  %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
  %r = extractelement <8 x float> %rv, i32 0
  ret float %r
}

define i16 @__float_to_half_uniform(float %v) nounwind readnone {
  %v1 = bitcast float %v to <1 x float>
  %vv = shufflevector <1 x float> %v1, <1 x float> undef,
          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                     i32 undef, i32 undef, i32 undef, i32 undef>
  ; round to nearest even
  %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
  %r = extractelement <8 x i16> %rv, i32 0
  ret i16 %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)
```
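The uniform (scalar) half conversions above piggyback on the 8-wide F16C instructions by converting in lane 0 and extracting the result. The same round trip written with compiler intrinsics, as a sketch under the assumption of an F16C-capable toolchain (e.g. compiled with `-mf16c`; not taken from the ispc sources):

```cpp
#include <immintrin.h>

// vcvtph2ps: widen one half-precision value through lane 0 of an 8-wide op.
float half_to_float_uniform(unsigned short h) {
    __m128i v = _mm_cvtsi32_si128(h);   // half value in lane 0, rest zero
    __m256  f = _mm256_cvtph_ps(v);     // 8 halves -> 8 floats
    return _mm256_cvtss_f32(f);         // take lane 0 back out
}

// vcvtps2ph with rounding-mode immediate 0 = round to nearest even,
// matching the i32 0 operand in the IR above.
unsigned short float_to_half_uniform(float f) {
    __m256  v = _mm256_set1_ps(f);      // broadcast, so lane 0 holds f
    __m128i h = _mm256_cvtps_ph(v, 0);  // 8 floats -> 8 halves
    return (unsigned short)_mm_extract_epi16(h, 0);
}
```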
935
builtins/target-generic-1.ll
Executable file
935
builtins/target-generic-1.ll
Executable file
@@ -0,0 +1,935 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; Define the standard library builtins for the NOVEC target
|
||||
define(`MASK',`i32')
|
||||
define(`WIDTH',`1')
|
||||
include(`util.m4')
|
||||
; Define some basics for a 1-wide target
|
||||
stdlib_core()
|
||||
packed_load_and_store()
|
||||
scans()
|
||||
int64minmax()
|
||||
aossoa()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
gen_masked_store(1, i8, 8)
|
||||
gen_masked_store(1, i16, 16)
|
||||
gen_masked_store(1, i32, 32)
|
||||
gen_masked_store(1, i64, 64)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(1, i8, 8)
|
||||
load_and_broadcast(1, i16, 16)
|
||||
load_and_broadcast(1, i32, 32)
|
||||
load_and_broadcast(1, i64, 64)
|
||||
|
||||
masked_load(1, i8, 8, 1)
|
||||
masked_load(1, i16, 16, 2)
|
||||
masked_load(1, i32, 32, 4)
|
||||
masked_load(1, i64, 64, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
; define these with the macros from stdlib.m4
|
||||
|
||||
gen_gather(1, i8)
|
||||
gen_gather(1, i16)
|
||||
gen_gather(1, i32)
|
||||
gen_gather(1, i64)
|
||||
|
||||
gen_scatter(1, i8)
|
||||
gen_scatter(1, i16)
|
||||
gen_scatter(1, i32)
|
||||
gen_scatter(1, i64)
|
||||
|
||||
|
||||
define <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8> ,
|
||||
<1 x i32> %mask) nounwind readnone alwaysinline {
|
||||
; %mv = trunc <1 x i32> %mask to <1 x i8>
|
||||
; %notmask = xor <1 x i8> %mv, <i8 -1>
|
||||
; %cleared_old = and <1 x i8> %0, %notmask
|
||||
; %masked_new = and <1 x i8> %1, %mv
|
||||
; %new = or <1 x i8> %cleared_old, %masked_new
|
||||
; ret <1 x i8> %new
|
||||
|
||||
; not doing this the easy way because of problems with LLVM's scalarizer
|
||||
; %cmp = icmp eq <1 x i32> %mask, <i32 0>
|
||||
; %sel = select <1 x i1> %cmp, <1 x i8> %0, <1 x i8> %1
|
||||
%m = extractelement <1 x i32> %mask, i32 0
|
||||
%cmp = icmp eq i32 %m, 0
|
||||
%d0 = extractelement <1 x i8> %0, i32 0
|
||||
%d1 = extractelement <1 x i8> %1, i32 0
|
||||
%sel = select i1 %cmp, i8 %d0, i8 %d1
|
||||
%r = insertelement <1 x i8> undef, i8 %sel, i32 0
|
||||
ret <1 x i8> %r
|
||||
}
|
||||
|
||||
define <1 x i16> @__vselect_i16(<1 x i16>, <1 x i16> ,
|
||||
<1 x i32> %mask) nounwind readnone alwaysinline {
|
||||
; %mv = trunc <1 x i32> %mask to <1 x i16>
|
||||
; %notmask = xor <1 x i16> %mv, <i16 -1>
|
||||
; %cleared_old = and <1 x i16> %0, %notmask
|
||||
; %masked_new = and <1 x i16> %1, %mv
|
||||
; %new = or <1 x i16> %cleared_old, %masked_new
|
||||
; ret <1 x i16> %new
|
||||
; %cmp = icmp eq <1 x i32> %mask, <i32 0>
|
||||
; %sel = select <1 x i1> %cmp, <1 x i16> %0, <1 x i16> %1
|
||||
%m = extractelement <1 x i32> %mask, i32 0
|
||||
%cmp = icmp eq i32 %m, 0
|
||||
%d0 = extractelement <1 x i16> %0, i32 0
|
||||
%d1 = extractelement <1 x i16> %1, i32 0
|
||||
%sel = select i1 %cmp, i16 %d0, i16 %d1
|
||||
%r = insertelement <1 x i16> undef, i16 %sel, i32 0
|
||||
ret <1 x i16> %r
|
||||
|
||||
; ret <1 x i16> %sel
|
||||
}
|
||||
|
||||
|
||||
define <1 x i32> @__vselect_i32(<1 x i32>, <1 x i32> ,
|
||||
<1 x i32> %mask) nounwind readnone alwaysinline {
|
||||
; %notmask = xor <1 x i32> %mask, <i32 -1>
|
||||
; %cleared_old = and <1 x i32> %0, %notmask
|
||||
; %masked_new = and <1 x i32> %1, %mask
|
||||
; %new = or <1 x i32> %cleared_old, %masked_new
|
||||
; ret <1 x i32> %new
|
||||
; %cmp = icmp eq <1 x i32> %mask, <i32 0>
|
||||
; %sel = select <1 x i1> %cmp, <1 x i32> %0, <1 x i32> %1
|
||||
; ret <1 x i32> %sel
|
||||
%m = extractelement <1 x i32> %mask, i32 0
|
||||
%cmp = icmp eq i32 %m, 0
|
||||
%d0 = extractelement <1 x i32> %0, i32 0
|
||||
%d1 = extractelement <1 x i32> %1, i32 0
|
||||
%sel = select i1 %cmp, i32 %d0, i32 %d1
|
||||
%r = insertelement <1 x i32> undef, i32 %sel, i32 0
|
||||
ret <1 x i32> %r
|
||||
|
||||
}
|
||||
define <1 x i64> @__vselect_i64(<1 x i64>, <1 x i64> ,
|
||||
<1 x i32> %mask) nounwind readnone alwaysinline {
|
||||
; %newmask = zext <1 x i32> %mask to <1 x i64>
|
||||
; %notmask = xor <1 x i64> %newmask, <i64 -1>
|
||||
; %cleared_old = and <1 x i64> %0, %notmask
|
||||
; %masked_new = and <1 x i64> %1, %newmask
|
||||
; %new = or <1 x i64> %cleared_old, %masked_new
|
||||
; ret <1 x i64> %new
|
||||
; %cmp = icmp eq <1 x i32> %mask, <i32 0>
|
||||
; %sel = select <1 x i1> %cmp, <1 x i64> %0, <1 x i64> %1
|
||||
; ret <1 x i64> %sel
|
||||
%m = extractelement <1 x i32> %mask, i32 0
|
||||
%cmp = icmp eq i32 %m, 0
|
||||
%d0 = extractelement <1 x i64> %0, i32 0
|
||||
%d1 = extractelement <1 x i64> %1, i32 0
|
||||
%sel = select i1 %cmp, i64 %d0, i64 %d1
|
||||
%r = insertelement <1 x i64> undef, i64 %sel, i32 0
|
||||
ret <1 x i64> %r
|
||||
|
||||
}
|
||||
|
||||
define <1 x float> @__vselect_float(<1 x float>, <1 x float>,
|
||||
<1 x i32> %mask) nounwind readnone alwaysinline {
|
||||
; %v0 = bitcast <1 x float> %0 to <1 x i32>
|
||||
; %v1 = bitcast <1 x float> %1 to <1 x i32>
|
||||
; %r = call <1 x i32> @__vselect_i32(<1 x i32> %v0, <1 x i32> %v1, <1 x i32> %mask)
|
||||
; %rf = bitcast <1 x i32> %r to <1 x float>
|
||||
; ret <1 x float> %rf
|
||||
; %cmp = icmp eq <1 x i32> %mask, <i32 0>
|
||||
; %sel = select <1 x i1> %cmp, <1 x float> %0, <1 x float> %1
|
||||
; ret <1 x float> %sel
|
||||
%m = extractelement <1 x i32> %mask, i32 0
|
||||
%cmp = icmp eq i32 %m, 0
|
||||
%d0 = extractelement <1 x float> %0, i32 0
|
||||
%d1 = extractelement <1 x float> %1, i32 0
|
||||
%sel = select i1 %cmp, float %d0, float %d1
|
||||
%r = insertelement <1 x float> undef, float %sel, i32 0
|
||||
ret <1 x float> %r
|
||||
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
define void @__masked_store_blend_8(<1 x i8>* nocapture, <1 x i8>,
|
||||
<1 x i32> %mask) nounwind alwaysinline {
|
||||
%val = load <1 x i8> * %0, align 4
|
||||
%newval = call <1 x i8> @__vselect_i8(<1 x i8> %val, <1 x i8> %1, <1 x i32> %mask)
|
||||
store <1 x i8> %newval, <1 x i8> * %0, align 4
|
||||
ret void
|
||||
}
|
||||
define void @__masked_store_blend_16(<1 x i16>* nocapture, <1 x i16>,
|
||||
<1 x i32> %mask) nounwind alwaysinline {
|
||||
%val = load <1 x i16> * %0, align 4
|
||||
%newval = call <1 x i16> @__vselect_i16(<1 x i16> %val, <1 x i16> %1, <1 x i32> %mask)
|
||||
store <1 x i16> %newval, <1 x i16> * %0, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
define void @__masked_store_blend_32(<1 x i32>* nocapture, <1 x i32>,
|
||||
<1 x i32> %mask) nounwind alwaysinline {
|
||||
%val = load <1 x i32> * %0, align 4
|
||||
%newval = call <1 x i32> @__vselect_i32(<1 x i32> %val, <1 x i32> %1, <1 x i32> %mask)
|
||||
store <1 x i32> %newval, <1 x i32> * %0, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_64(<1 x i64>* nocapture, <1 x i64>,
|
||||
<1 x i32> %mask) nounwind alwaysinline {
|
||||
%val = load <1 x i64> * %0, align 4
|
||||
%newval = call <1 x i64> @__vselect_i64(<1 x i64> %val, <1 x i64> %1, <1 x i32> %mask)
|
||||
store <1 x i64> %newval, <1 x i64> * %0, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define i32 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
|
||||
%item = extractelement <1 x i32> %0, i32 0
|
||||
%v = lshr i32 %item, 31
|
||||
ret i32 %v
|
||||
}
|
||||
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding
|
||||
;;
|
||||
;; There are not any rounding instructions in SSE2, so we have to emulate
|
||||
;; the functionality with multiple instructions...
|
||||
|
||||
; The code for __round_* is the result of compiling the following source
|
||||
; code.
|
||||
;
|
||||
; export float Round(float x) {
|
||||
; unsigned int sign = signbits(x);
|
||||
; unsigned int ix = intbits(x);
|
||||
; ix ^= sign;
|
||||
; x = floatbits(ix);
|
||||
; x += 0x1.0p23f;
|
||||
; x -= 0x1.0p23f;
|
||||
; ix = intbits(x);
|
||||
; ix ^= sign;
|
||||
; x = floatbits(ix);
|
||||
; return x;
|
||||
;}
|
||||
|
||||
define <1 x float> @__round_varying_float(<1 x float>) nounwind readonly alwaysinline {
|
||||
%float_to_int_bitcast.i.i.i.i = bitcast <1 x float> %0 to <1 x i32>
|
||||
%bitop.i.i = and <1 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648>
|
||||
%bitop.i = xor <1 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
|
||||
%int_to_float_bitcast.i.i40.i = bitcast <1 x i32> %bitop.i to <1 x float>
|
||||
%binop.i = fadd <1 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06>
|
||||
%binop21.i = fadd <1 x float> %binop.i, <float -8.388608e+06>
|
||||
%float_to_int_bitcast.i.i.i = bitcast <1 x float> %binop21.i to <1 x i32>
|
||||
%bitop31.i = xor <1 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
|
||||
%int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop31.i to <1 x float>
|
||||
ret <1 x float> %int_to_float_bitcast.i.i.i
|
||||
}
|
||||
|
||||
;; Similarly, for implementations of the __floor* functions below, we have the
|
||||
;; bitcode from compiling the following source code...
|
||||
|
||||
;export float Floor(float x) {
|
||||
; float y = Round(x);
|
||||
; unsigned int cmp = y > x ? 0xffffffff : 0;
|
||||
; float delta = -1.f;
|
||||
; unsigned int idelta = intbits(delta);
|
||||
; idelta &= cmp;
|
||||
; delta = floatbits(idelta);
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
define <1 x float> @__floor_varying_float(<1 x float>) nounwind readonly alwaysinline {
|
||||
%calltmp.i = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
|
||||
%bincmp.i = fcmp ogt <1 x float> %calltmp.i, %0
|
||||
%val_to_boolvec32.i = sext <1 x i1> %bincmp.i to <1 x i32>
|
||||
%bitop.i = and <1 x i32> %val_to_boolvec32.i, <i32 -1082130432>
|
||||
%int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop.i to <1 x float>
|
||||
%binop.i = fadd <1 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
|
||||
ret <1 x float> %binop.i
|
||||
}
|
||||
|
||||
;; And here is the code we compiled to get the __ceil* functions below
|
||||
;
|
||||
;export uniform float Ceil(uniform float x) {
|
||||
; uniform float y = Round(x);
|
||||
; uniform int yltx = y < x ? 0xffffffff : 0;
|
||||
; uniform float delta = 1.f;
|
||||
; uniform int idelta = intbits(delta);
|
||||
; idelta &= yltx;
|
||||
; delta = floatbits(idelta);
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
define <1 x float> @__ceil_varying_float(<1 x float>) nounwind readonly alwaysinline {
|
||||
%calltmp.i = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
|
||||
%bincmp.i = fcmp olt <1 x float> %calltmp.i, %0
|
||||
%val_to_boolvec32.i = sext <1 x i1> %bincmp.i to <1 x i32>
|
||||
%bitop.i = and <1 x i32> %val_to_boolvec32.i, <i32 1065353216>
|
||||
%int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop.i to <1 x float>
|
||||
%binop.i = fadd <1 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
|
||||
ret <1 x float> %binop.i
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
; expecting math lib to provide this
|
||||
declare double @ceil (double) nounwind readnone
|
||||
declare double @floor (double) nounwind readnone
|
||||
declare double @round (double) nounwind readnone
|
||||
;declare float @llvm.sqrt.f32(float %Val)
|
||||
declare double @llvm.sqrt.f64(double %Val)
|
||||
declare float @llvm.sin.f32(float %Val)
|
||||
declare float @llvm.cos.f32(float %Val)
|
||||
declare float @llvm.sqrt.f32(float %Val)
|
||||
declare float @llvm.exp.f32(float %Val)
|
||||
declare float @llvm.log.f32(float %Val)
|
||||
declare float @llvm.pow.f32(float %f, float %e)
|
||||
|
||||
|
||||
|
||||
|
||||
;; stuff that could be in builtins ...
|
||||
|
||||
define(`unary1to1', `
|
||||
%v_0 = extractelement <1 x $1> %0, i32 0
|
||||
%r_0 = call $1 $2($1 %v_0)
|
||||
%ret_0 = insertelement <1 x $1> undef, $1 %r_0, i32 0
|
||||
ret <1 x $1> %ret_0
|
||||
')


;; dummy 1 wide vector ops

define void
@__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2,
<1 x float> %v3, <1 x float> * noalias %out0,
<1 x float> * noalias %out1, <1 x float> * noalias %out2,
<1 x float> * noalias %out3) nounwind alwaysinline {

store <1 x float> %v0, <1 x float > * %out0
store <1 x float> %v1, <1 x float > * %out1
store <1 x float> %v2, <1 x float > * %out2
store <1 x float> %v3, <1 x float > * %out3

ret void
}

define void
@__soa_to_aos4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2,
<1 x float> %v3, <1 x float> * noalias %out0,
<1 x float> * noalias %out1, <1 x float> * noalias %out2,
<1 x float> * noalias %out3) nounwind alwaysinline {
call void @__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1,
<1 x float> %v2, <1 x float> %v3, <1 x float> * %out0,
<1 x float> * %out1, <1 x float> * %out2, <1 x float> * %out3)
ret void
}

define void
@__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1,
<1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
<1 x float> * %out2) {
store <1 x float> %v0, <1 x float > * %out0
store <1 x float> %v1, <1 x float > * %out1
store <1 x float> %v2, <1 x float > * %out2

ret void
}

define void
@__soa_to_aos3_float1(<1 x float> %v0, <1 x float> %v1,
<1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
<1 x float> * %out2) {
call void @__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1,
<1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
<1 x float> * %out2)
ret void
}


;; end builtins


define <1 x double> @__round_varying_double(<1 x double>) nounwind readonly alwaysinline {
unary1to1(double, @round)
}

define <1 x double> @__floor_varying_double(<1 x double>) nounwind readonly alwaysinline {
unary1to1(double, @floor)
}


define <1 x double> @__ceil_varying_double(<1 x double>) nounwind readonly alwaysinline {
unary1to1(double, @ceil)
}

; To do vector integer min and max, we do the vector compare and then sign
; extend the i1 vector result to an i32 mask. The __vselect does the
; rest...

define <1 x i32> @__min_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
%c = icmp slt <1 x i32> %0, %1
%mask = sext <1 x i1> %c to <1 x i32>
%v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
ret <1 x i32> %v
}

define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
%c = icmp slt i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}

define <1 x i32> @__max_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
%c = icmp sgt <1 x i32> %0, %1
%mask = sext <1 x i1> %c to <1 x i32>
%v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
ret <1 x i32> %v
}

define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
%c = icmp sgt i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}

; The functions for unsigned ints are similar, just with unsigned
; comparison functions...

define <1 x i32> @__min_varying_uint32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
%c = icmp ult <1 x i32> %0, %1
%mask = sext <1 x i1> %c to <1 x i32>
%v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
ret <1 x i32> %v
}

define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
%c = icmp ult i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}

define <1 x i32> @__max_varying_uint32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
%c = icmp ugt <1 x i32> %0, %1
%mask = sext <1 x i1> %c to <1 x i32>
%v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
ret <1 x i32> %v
}

define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
%c = icmp ugt i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
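; (With a 1-wide vector there is nothing to reduce across program instances,
; so each of the reductions below just returns element 0 of its input.)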

declare i32 @llvm.ctpop.i32(i32) nounwind readnone

define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
%call = call i32 @llvm.ctpop.i32(i32 %0)
ret i32 %call
}

declare i64 @llvm.ctpop.i64(i64) nounwind readnone

define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
%call = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %call
}


define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline {
%r = extractelement <1 x float> %v, i32 0
ret float %r
}

define float @__reduce_min_float(<1 x float>) nounwind readnone {
%r = extractelement <1 x float> %0, i32 0
ret float %r
}

define float @__reduce_max_float(<1 x float>) nounwind readnone {
%r = extractelement <1 x float> %0, i32 0
ret float %r
}

define i32 @__reduce_add_int32(<1 x i32> %v) nounwind readnone {
%r = extractelement <1 x i32> %v, i32 0
ret i32 %r
}

define i32 @__reduce_min_int32(<1 x i32>) nounwind readnone {
%r = extractelement <1 x i32> %0, i32 0
ret i32 %r
}

define i32 @__reduce_max_int32(<1 x i32>) nounwind readnone {
%r = extractelement <1 x i32> %0, i32 0
ret i32 %r
}

define i32 @__reduce_add_uint32(<1 x i32> %v) nounwind readnone {
%r = call i32 @__reduce_add_int32(<1 x i32> %v)
ret i32 %r
}

define i32 @__reduce_min_uint32(<1 x i32>) nounwind readnone {
%r = extractelement <1 x i32> %0, i32 0
ret i32 %r
}

define i32 @__reduce_max_uint32(<1 x i32>) nounwind readnone {
%r = extractelement <1 x i32> %0, i32 0
ret i32 %r
}


define double @__reduce_add_double(<1 x double>) nounwind readnone {
%m = extractelement <1 x double> %0, i32 0
ret double %m
}

define double @__reduce_min_double(<1 x double>) nounwind readnone {
%m = extractelement <1 x double> %0, i32 0
ret double %m
}

define double @__reduce_max_double(<1 x double>) nounwind readnone {
%m = extractelement <1 x double> %0, i32 0
ret double %m
}

define i64 @__reduce_add_int64(<1 x i64>) nounwind readnone {
%m = extractelement <1 x i64> %0, i32 0
ret i64 %m
}

define i64 @__reduce_min_int64(<1 x i64>) nounwind readnone {
%m = extractelement <1 x i64> %0, i32 0
ret i64 %m
}

define i64 @__reduce_max_int64(<1 x i64>) nounwind readnone {
%m = extractelement <1 x i64> %0, i32 0
ret i64 %m
}

define i64 @__reduce_min_uint64(<1 x i64>) nounwind readnone {
%m = extractelement <1 x i64> %0, i32 0
ret i64 %m
}

define i64 @__reduce_max_uint64(<1 x i64>) nounwind readnone {
%m = extractelement <1 x i64> %0, i32 0
ret i64 %m
}

define i1 @__reduce_equal_int32(<1 x i32> %vv, i32 * %samevalue,
<1 x i32> %mask) nounwind alwaysinline {
%v = extractelement <1 x i32> %vv, i32 0
store i32 %v, i32 * %samevalue
ret i1 true
}

define i1 @__reduce_equal_float(<1 x float> %vv, float * %samevalue,
<1 x i32> %mask) nounwind alwaysinline {
%v = extractelement <1 x float> %vv, i32 0
store float %v, float * %samevalue
ret i1 true
}

define i1 @__reduce_equal_int64(<1 x i64> %vv, i64 * %samevalue,
<1 x i32> %mask) nounwind alwaysinline {
%v = extractelement <1 x i64> %vv, i32 0
store i64 %v, i64 * %samevalue
ret i1 true
}

define i1 @__reduce_equal_double(<1 x double> %vv, double * %samevalue,
<1 x i32> %mask) nounwind alwaysinline {
%v = extractelement <1 x double> %vv, i32 0
store double %v, double * %samevalue
ret i1 true
}

; extracting/reinserting elements because I want to be able to remove vectors later on

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

define <1 x float> @__rcp_varying_float(<1 x float>) nounwind readonly alwaysinline {
;%call = call <1 x float> @llvm.x86.sse.rcp.ps(<1 x float> %0)
; do one N-R iteration to improve precision
; float iv = __rcp_v(v);
; return iv * (2. - v * iv);
;%v_iv = fmul <1 x float> %0, %call
;%two_minus = fsub <1 x float> <float 2., float 2., float 2., float 2.>, %v_iv
;%iv_mul = fmul <1 x float> %call, %two_minus
;ret <1 x float> %iv_mul
%d = extractelement <1 x float> %0, i32 0
%r = fdiv float 1., %d
%rv = insertelement <1 x float> undef, float %r, i32 0
ret <1 x float> %rv
}
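; (For this 1-wide target an exact scalar divide replaces the rcp estimate
; plus Newton-Raphson refinement sketched in the commented-out code above.)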


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt

define <1 x float> @__sqrt_varying_float(<1 x float>) nounwind readonly alwaysinline {
;%call = call <1 x float> @llvm.x86.sse.sqrt.ps(<1 x float> %0)
;ret <1 x float> %call
%d = extractelement <1 x float> %0, i32 0
%r = call float @llvm.sqrt.f32(float %d)
%rv = insertelement <1 x float> undef, float %r, i32 0
ret <1 x float> %rv
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt

define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
;%is = call <1 x float> @llvm.x86.sse.rsqrt.ps(<1 x float> %v)
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
;%v_is = fmul <1 x float> %v, %is
;%v_is_is = fmul <1 x float> %v_is, %is
;%three_sub = fsub <1 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
;%is_mul = fmul <1 x float> %is, %three_sub
;%half_scale = fmul <1 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
;ret <1 x float> %half_scale
%s = call <1 x float> @__sqrt_varying_float(<1 x float> %v)
%r = call <1 x float> @__rcp_varying_float(<1 x float> %s)
ret <1 x float> %r
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff

define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm.sin.f32(float %r)
;%rv = insertelement <1 x float> undef, float %r, i32 0
;ret <1 x float> %rv
unary1to1(float, @llvm.sin.f32)
}

define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm.cos.f32(float %r)
;%rv = insertelement <1 x float> undef, float %r, i32 0
;ret <1 x float> %rv
unary1to1(float, @llvm.cos.f32)
}

define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline {
; %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0)
; store <1 x float> %s, <1 x float> * %1
; ret void
%sin = call <1 x float> @__svml_sin (<1 x float> %0)
%cos = call <1 x float> @__svml_cos (<1 x float> %0)
store <1 x float> %sin, <1 x float> * %1
store <1 x float> %cos, <1 x float> * %2
ret void
}

define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm_tan_f32(float %r)
;%rv = insertelement <1 x float> undef, float %r, i32 0
;ret <1 x float> %rv
;unary1to1(float, @llvm.tan.f32)
; UNSUPPORTED!
ret <1 x float > %0
}

define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline {
; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0)
; ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm_atan_f32(float %r)
;%rv = insertelement <1 x float> undef, float %r, i32 0
;ret <1 x float> %rv
;unary1to1(float, @llvm.atan.f32)
;UNSUPPORTED!
ret <1 x float > %0
}

define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1)
;ret <1 x float> %ret
;%y = extractelement <1 x float> %0, i32 0
;%x = extractelement <1 x float> %1, i32 0
;%q = fdiv float %y, %x
;%a = call float @llvm.atan.f32 (float %q)
;%rv = insertelement <1 x float> undef, float %a, i32 0
;ret <1 x float> %rv
; UNSUPPORTED!
ret <1 x float > %0
}

define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_expf4(<1 x float> %0)
;ret <1 x float> %ret
unary1to1(float, @llvm.exp.f32)
}

define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_logf4(<1 x float> %0)
;ret <1 x float> %ret
unary1to1(float, @llvm.log.f32)
}

define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1)
;ret <1 x float> %ret
%r = extractelement <1 x float> %0, i32 0
%e = extractelement <1 x float> %1, i32 0
%s = call float @llvm.pow.f32(float %r, float %e)
%rv = insertelement <1 x float> undef, float %s, i32 0
ret <1 x float> %rv
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max

define <1 x float> @__max_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline {
; %call = call <1 x float> @llvm.x86.sse.max.ps(<1 x float> %0, <1 x float> %1)
; ret <1 x float> %call
%a = extractelement <1 x float> %0, i32 0
%b = extractelement <1 x float> %1, i32 0
%d = fcmp ogt float %a, %b
%r = select i1 %d, float %a, float %b
%rv = insertelement <1 x float> undef, float %r, i32 0
ret <1 x float> %rv
}

define <1 x float> @__min_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline {
; %call = call <1 x float> @llvm.x86.sse.min.ps(<1 x float> %0, <1 x float> %1)
; ret <1 x float> %call
%a = extractelement <1 x float> %0, i32 0
%b = extractelement <1 x float> %1, i32 0
%d = fcmp olt float %a, %b
%r = select i1 %d, float %a, float %b
%rv = insertelement <1 x float> undef, float %r, i32 0
ret <1 x float> %rv
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt

;declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone

define <1 x double> @__sqrt_varying_double(<1 x double>) nounwind alwaysinline {
;unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
;ret <1 x double> %ret
unary1to1(double, @llvm.sqrt.f64)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max

;declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
;declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone

define <1 x double> @__min_varying_double(<1 x double>, <1 x double>) nounwind readnone {
;binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
;ret <1 x double> %ret
%a = extractelement <1 x double> %0, i32 0
%b = extractelement <1 x double> %1, i32 0
%d = fcmp olt double %a, %b
%r = select i1 %d, double %a, double %b
%rv = insertelement <1 x double> undef, double %r, i32 0
ret <1 x double> %rv
}

define <1 x double> @__max_varying_double(<1 x double>, <1 x double>) nounwind readnone {
;binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
;ret <1 x double> %ret
%a = extractelement <1 x double> %0, i32 0
%b = extractelement <1 x double> %1, i32 0
%d = fcmp ogt double %a, %b
%r = select i1 %d, double %a, double %b
%rv = insertelement <1 x double> undef, double %r, i32 0
ret <1 x double> %rv
}


define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
; uniform float iv = extract(__rcp_u(v), 0);
; return iv * (2. - v * iv);
%r = fdiv float 1., %0
ret float %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats

define float @__round_uniform_float(float) nounwind readonly alwaysinline {
; (There is no 1-wide hardware rounding instruction to dispatch to here,
; so wrap the value in a <1 x float> and reuse the varying rounding
; implementation above.)
%v = insertelement <1 x float> undef, float %0, i32 0
%rv = call <1 x float> @__round_varying_float(<1 x float> %v)
%r = extractelement <1 x float> %rv, i32 0
ret float %r
}

define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
%v = insertelement <1 x float> undef, float %0, i32 0
%rv = call <1 x float> @__floor_varying_float(<1 x float> %v)
%r = extractelement <1 x float> %rv, i32 0
ret float %r
}

define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
%v = insertelement <1 x float> undef, float %0, i32 0
%rv = call <1 x float> @__ceil_varying_float(<1 x float> %v)
%r = extractelement <1 x float> %rv, i32 0
ret float %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles

define double @__round_uniform_double(double) nounwind readonly alwaysinline {
%rs = call double @round(double %0)
ret double %rs
}

define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
%rs = call double @floor(double %0)
ret double %rs
}

define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
%rs = call double @ceil(double %0)
ret double %rs
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; sqrt

define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
%ret = call float @llvm.sqrt.f32(float %0)
ret float %ret
}

define double @__sqrt_uniform_double(double) nounwind readonly alwaysinline {
%ret = call double @llvm.sqrt.f64(double %0)
ret double %ret
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt

define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
%s = call float @__sqrt_uniform_float(float %0)
%r = call float @__rcp_uniform_float(float %s)
ret float %r
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; fastmath

define void @__fastmath() nounwind alwaysinline {
; no-op
ret void
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max

define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
%d = fcmp ogt float %0, %1
%r = select i1 %d, float %0, float %1
ret float %r
}

define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
%d = fcmp olt float %0, %1
%r = select i1 %d, float %0, float %1
ret float %r
}

define double @__max_uniform_double(double, double) nounwind readonly alwaysinline {
%d = fcmp ogt double %0, %1
%r = select i1 %d, double %0, double %1
ret double %r
}

define double @__min_uniform_double(double, double) nounwind readonly alwaysinline {
%d = fcmp olt double %0, %1
%r = select i1 %d, double %0, double %1
ret double %r
}

define_shuffles()

ctlztz()

define_prefetches()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

@@ -98,6 +98,14 @@ declare void @__aos_to_soa4_float(float * noalias %p, <WIDTH x float> * noalias
<WIDTH x float> * noalias %out2,
<WIDTH x float> * noalias %out3) nounwind

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; math

@@ -241,8 +249,9 @@ declare void @__masked_store_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
declare void @__masked_store_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind

ifelse(LLVM_VERSION, `LLVM_3_1svn',`
define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind {
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i8> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i8> %1, <WIDTH x i8> %v
store <WIDTH x i8> %v1, <WIDTH x i8> * %0
@@ -250,7 +259,7 @@ define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
}

define void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind {
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i16> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i16> %1, <WIDTH x i16> %v
store <WIDTH x i16> %v1, <WIDTH x i16> * %0
@@ -258,7 +267,7 @@ define void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
}

define void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind {
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i32> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i32> %1, <WIDTH x i32> %v
store <WIDTH x i32> %v1, <WIDTH x i32> * %0
@@ -266,30 +275,40 @@ define void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
}

define void @__masked_store_blend_64(<WIDTH x i64>* nocapture,
<WIDTH x i64>, <WIDTH x i1>) nounwind {
<WIDTH x i64>, <WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i64> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i64> %1, <WIDTH x i64> %v
store <WIDTH x i64> %v1, <WIDTH x i64> * %0
ret void
}
',`
declare void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind
')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter

define(`gather_scatter', `
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, <WIDTH x i32>,
i32, <WIDTH x i1>) nounwind readonly
i32, <WIDTH x i32>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, <WIDTH x i64>,
i32, <WIDTH x i1>) nounwind readonly
i32, <WIDTH x i64>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64>,
<WIDTH x i1>) nounwind readonly

declare void @__scatter_base_offsets32_$1(i8* nocapture, <WIDTH x i32>,
i32, <WIDTH x $1>, <WIDTH x i1>) nounwind
i32, <WIDTH x i32>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets64_$1(i8* nocapture, <WIDTH x i64>,
i32, <WIDTH x $1>, <WIDTH x i1>) nounwind
i32, <WIDTH x i64>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter32_$1(<WIDTH x i32>, <WIDTH x $1>,
<WIDTH x i1>) nounwind
declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,

@@ -47,6 +47,14 @@ int64minmax()

include(`target-sse2-common.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

@@ -44,6 +44,14 @@ int64minmax()

include(`target-sse2-common.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding
;;

@@ -47,6 +47,14 @@ int64minmax()

include(`target-sse4-common.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

@@ -44,6 +44,14 @@ int64minmax()

include(`target-sse4-common.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

317
builtins/util.m4
@@ -760,14 +760,12 @@ define(`global_atomic_uniform', `
ifelse(LLVM_VERSION, `LLVM_2_9',`
declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)

define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val,
<$1 x MASK> %mask) nounwind alwaysinline {
define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline {
%r = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %val)
ret $3 %r
}
', `
define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val,
<$1 x MASK> %mask) nounwind alwaysinline {
define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline {
%r = atomicrmw $2 $3 * %ptr, $3 %val seq_cst
ret $3 %r
}
@@ -786,26 +784,7 @@ declare i32 @llvm.atomic.swap.i32.p0i32(i32 * %ptr, i32 %val)
declare i64 @llvm.atomic.swap.i64.p0i64(i64 * %ptr, i64 %val)')

define(`global_swap', `

define <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val,
<$1 x MASK> %mask) nounwind alwaysinline {
%rptr = alloca <$1 x $2>
%rptr32 = bitcast <$1 x $2> * %rptr to $2 *

per_lane($1, <$1 x MASK> %mask, `
%val_LANE_ID = extractelement <$1 x $2> %val, i32 LANE
ifelse(LLVM_VERSION, `LLVM_2_9',`
%r_LANE_ID = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val_LANE_ID)', `
%r_LANE_ID = atomicrmw xchg $2 * %ptr, $2 %val_LANE_ID seq_cst')
%rp_LANE_ID = getelementptr $2 * %rptr32, i32 LANE
store $2 %r_LANE_ID, $2 * %rp_LANE_ID')

%r = load <$1 x $2> * %rptr
ret <$1 x $2> %r
}

define $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val,
<$1 x MASK> %mask) nounwind alwaysinline {
define $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val) nounwind alwaysinline {
ifelse(LLVM_VERSION, `LLVM_2_9',`
%r = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val)', `
%r = atomicrmw xchg $2 * %ptr, $2 %val seq_cst')
@@ -845,7 +824,7 @@ ifelse(LLVM_VERSION, `LLVM_2_9',`
}

define $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
$2 %val, <$1 x MASK> %mask) nounwind alwaysinline {
$2 %val) nounwind alwaysinline {
ifelse(LLVM_VERSION, `LLVM_2_9',`
%r = call $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val)', `
%r = cmpxchg $2 * %ptr, $2 %cmp, $2 %val seq_cst')
@@ -1586,17 +1565,15 @@ declare void @__pseudo_masked_store_64(<WIDTH x i64> * nocapture, <WIDTH x i64>,
; these represent gathers from a common base pointer with offsets. The
; offset_scale factor scales the offsets before they are added to the base
; pointer--it should have the value 1, 2, 4, or 8. (It can always just be 1.)
; The 2, 4, 8 cases are used to match LLVM patterns that use the free 2/4/8 scaling
; available in x86 addressing calculations...
; Then, the offset delta_value (guaranteed to be a compile-time constant value),
; is added to the final address. The 2, 4, 8 scales are used to match LLVM patterns
; that use the free 2/4/8 scaling available in x86 addressing calculations, and
; offset_delta feeds into the free offset calculation.
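; (In other words, the per-lane effective address works out to
;   base + offsets[lane] * offset_scale + offset_delta[lane].)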
;
; varying int8 __pseudo_gather_base_offsets{32,64}_8(uniform int8 *base,
; int{32,64} offsets, int32 offset_scale, mask)
; varying int16 __pseudo_gather_base_offsets{32,64}_16(uniform int16 *base,
; int{32,64} offsets, int32 offset_scale, mask)
; varying int32 __pseudo_gather_base_offsets{32,64}_32(uniform int32 *base,
; int{32,64} offsets, int32 offset_scale, mask)
; varying int64 __pseudo_gather_base_offsets{32,64}_64(uniform int64 *base,
; int{32,64} offsets, int32 offset_scale, mask)
; varying int{8,16,32,64}
; __pseudo_gather_base_offsets{32,64}_{8,16,32,64}(uniform int8 *base,
; int{32,64} offsets, uniform int32 offset_scale,
; int{32,64} offset_delta, mask)
;
; Then, the GSImprovementsPass optimization finds these and either
; converts them to native gather functions or converts them to vector
@@ -1612,22 +1589,22 @@ declare <WIDTH x i16> @__pseudo_gather64_16(<WIDTH x i64>, <WIDTH x MASK>) nounw
declare <WIDTH x i32> @__pseudo_gather64_32(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather64_64(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly

declare <WIDTH x i8> @__pseudo_gather_base_offsets32_8(i8 *, <WIDTH x i32>, i32,
declare <WIDTH x i8> @__pseudo_gather_base_offsets32_8(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16> @__pseudo_gather_base_offsets32_16(i8 *, <WIDTH x i32>, i32,
declare <WIDTH x i16> @__pseudo_gather_base_offsets32_16(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32> @__pseudo_gather_base_offsets32_32(i8 *, <WIDTH x i32>, i32,
declare <WIDTH x i32> @__pseudo_gather_base_offsets32_32(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather_base_offsets32_64(i8 *, <WIDTH x i32>, i32,
declare <WIDTH x i64> @__pseudo_gather_base_offsets32_64(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly

declare <WIDTH x i8> @__pseudo_gather_base_offsets64_8(i8 *, <WIDTH x i64>, i32,
declare <WIDTH x i8> @__pseudo_gather_base_offsets64_8(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16> @__pseudo_gather_base_offsets64_16(i8 *, <WIDTH x i64>, i32,
declare <WIDTH x i16> @__pseudo_gather_base_offsets64_16(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32> @__pseudo_gather_base_offsets64_32(i8 *, <WIDTH x i64>, i32,
declare <WIDTH x i32> @__pseudo_gather_base_offsets64_32(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather_base_offsets64_64(i8 *, <WIDTH x i64>, i32,
declare <WIDTH x i64> @__pseudo_gather_base_offsets64_64(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly

; Similarly to the pseudo-gathers defined above, we also declare undefined
@@ -1642,13 +1619,9 @@ declare <WIDTH x i64> @__pseudo_gather_base_offsets64_64(i8 *, <WIDTH x i64>, i3
; transforms them to scatters like:
;
; void __pseudo_scatter_base_offsets{32,64}_8(uniform int8 *base,
; varying int32 offsets, int32 offset_scale, varying int8 values, mask)
; void __pseudo_scatter_base_offsets{32,64}_16(uniform int16 *base,
; varying int32 offsets, int32 offset_scale, varying int16 values, mask)
; void __pseudo_scatter_base_offsets{32,64}_32(uniform int32 *base,
; varying int32 offsets, int32 offset_scale, varying int32 values, mask)
; void __pseudo_scatter_base_offsets{32,64}_64(uniform int64 *base,
; varying int32 offsets, int32 offset_scale, varying int64 values, mask)
; varying int32 offsets, uniform int32 offset_scale,
; varying int{32,64} offset_delta, varying int8 values, mask)
; (and similarly for 16/32/64 bit values)
;
; And the GSImprovementsPass in turn converts these to actual native
; scatters or masked stores.
@@ -1663,22 +1636,22 @@ declare void @__pseudo_scatter64_16(<WIDTH x i64>, <WIDTH x i16>, <WIDTH x MASK>
declare void @__pseudo_scatter64_32(<WIDTH x i64>, <WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter64_64(<WIDTH x i64>, <WIDTH x i64>, <WIDTH x MASK>) nounwind

declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <WIDTH x i32>, i32,
declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <WIDTH x i32>, i32,
declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <WIDTH x i32>, i32,
declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <WIDTH x i32>, i32,
declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind

declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <WIDTH x i64>, i32,
declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <WIDTH x i64>, i32,
declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <WIDTH x i64>, i32,
declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <WIDTH x i64>, i32,
declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -1832,6 +1805,81 @@ ok:
ret void
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; new/delete

declare i8 * @malloc(i64)
declare void @free(i8 *)
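;; (Uniform new/delete map directly onto malloc()/free(); the varying
;; variants below call malloc()/free() once per active program instance and
;; pass the resulting pointers around as a vector of i64 values.)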

define i8 * @__new_uniform(i64 %size) {
%a = call i8 * @malloc(i64 %size)
ret i8 * %a
}

define <WIDTH x i64> @__new_varying32(<WIDTH x i32> %size, <WIDTH x MASK> %mask) {
%ret = alloca <WIDTH x i64>
store <WIDTH x i64> zeroinitializer, <WIDTH x i64> * %ret
%ret64 = bitcast <WIDTH x i64> * %ret to i64 *

per_lane(WIDTH, <WIDTH x MASK> %mask, `
%sz_LANE_ID = extractelement <WIDTH x i32> %size, i32 LANE
%sz64_LANE_ID = zext i32 %sz_LANE_ID to i64
%ptr_LANE_ID = call i8 * @malloc(i64 %sz64_LANE_ID)
%ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64
%store_LANE_ID = getelementptr i64 * %ret64, i32 LANE
store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID')

%r = load <WIDTH x i64> * %ret
ret <WIDTH x i64> %r
}

define <WIDTH x i64> @__new_varying64(<WIDTH x i64> %size, <WIDTH x MASK> %mask) {
%ret = alloca <WIDTH x i64>
store <WIDTH x i64> zeroinitializer, <WIDTH x i64> * %ret
%ret64 = bitcast <WIDTH x i64> * %ret to i64 *

per_lane(WIDTH, <WIDTH x MASK> %mask, `
%sz_LANE_ID = extractelement <WIDTH x i64> %size, i32 LANE
%ptr_LANE_ID = call i8 * @malloc(i64 %sz_LANE_ID)
%ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64
%store_LANE_ID = getelementptr i64 * %ret64, i32 LANE
store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID')

%r = load <WIDTH x i64> * %ret
ret <WIDTH x i64> %r
}

define void @__delete_uniform(i8 * %ptr) {
call void @free(i8 * %ptr)
ret void
}

define void @__delete_varying(<WIDTH x i64> %ptr, <WIDTH x MASK> %mask) {
per_lane(WIDTH, <WIDTH x MASK> %mask, `
%iptr_LANE_ID = extractelement <WIDTH x i64> %ptr, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8 *
call void @free(i8 * %ptr_LANE_ID)
')
ret void
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; read hw clock

define i64 @__clock() nounwind uwtable ssp {
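; (cpuid is issued first to serialize the instruction stream so that rdtsc
; isn't reordered; rdtsc then leaves the 64-bit timestamp counter in edx:eax,
; which the code below reassembles into a single i64.)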
entry:
tail call void asm sideeffect "xorl %eax,%eax \0A cpuid", "~{rax},~{rbx},~{rcx},~{rdx},~{dirflag},~{fpsr},~{flags}"() nounwind
%0 = tail call { i32, i32 } asm sideeffect "rdtsc", "={ax},={dx},~{dirflag},~{fpsr},~{flags}"() nounwind
%asmresult = extractvalue { i32, i32 } %0, 0
%asmresult1 = extractvalue { i32, i32 } %0, 1
%conv = zext i32 %asmresult1 to i64
%shl = shl nuw i64 %conv, 32
%conv2 = zext i32 %asmresult to i64
%or = or i64 %shl, %conv2
ret i64 %or
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; stdlib transcendentals
;;
@@ -1997,38 +2045,18 @@ global_atomic_uniform(WIDTH, umax, i64, uint64)
global_swap(WIDTH, i32, int32)
global_swap(WIDTH, i64, int64)

define <WIDTH x float> @__atomic_swap_float_global(float * %ptr, <WIDTH x float> %val,
<WIDTH x MASK> %mask) nounwind alwaysinline {
%iptr = bitcast float * %ptr to i32 *
%ival = bitcast <WIDTH x float> %val to <WIDTH x i32>
%iret = call <WIDTH x i32> @__atomic_swap_int32_global(i32 * %iptr, <WIDTH x i32> %ival, <WIDTH x MASK> %mask)
%ret = bitcast <WIDTH x i32> %iret to <WIDTH x float>
ret <WIDTH x float> %ret
}

define <WIDTH x double> @__atomic_swap_double_global(double * %ptr, <WIDTH x double> %val,
<WIDTH x MASK> %mask) nounwind alwaysinline {
%iptr = bitcast double * %ptr to i64 *
%ival = bitcast <WIDTH x double> %val to <WIDTH x i64>
%iret = call <WIDTH x i64> @__atomic_swap_int64_global(i64 * %iptr, <WIDTH x i64> %ival, <WIDTH x MASK> %mask)
%ret = bitcast <WIDTH x i64> %iret to <WIDTH x double>
ret <WIDTH x double> %ret
}

define float @__atomic_swap_uniform_float_global(float * %ptr, float %val,
<WIDTH x MASK> %mask) nounwind alwaysinline {
define float @__atomic_swap_uniform_float_global(float * %ptr, float %val) nounwind alwaysinline {
%iptr = bitcast float * %ptr to i32 *
%ival = bitcast float %val to i32
%iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival, <WIDTH x MASK> %mask)
%iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival)
%ret = bitcast i32 %iret to float
ret float %ret
}

define double @__atomic_swap_uniform_double_global(double * %ptr, double %val,
<WIDTH x MASK> %mask) nounwind alwaysinline {
define double @__atomic_swap_uniform_double_global(double * %ptr, double %val) nounwind alwaysinline {
%iptr = bitcast double * %ptr to i64 *
%ival = bitcast double %val to i64
%iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival, <WIDTH x MASK> %mask)
%iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival)
%ret = bitcast i64 %iret to double
ret double %ret
}
@@ -2058,24 +2086,23 @@ define <WIDTH x double> @__atomic_compare_exchange_double_global(double * %ptr,
ret <WIDTH x double> %ret
}

define float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp, float %val,
<WIDTH x MASK> %mask) nounwind alwaysinline {
define float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp,
float %val) nounwind alwaysinline {
%iptr = bitcast float * %ptr to i32 *
%icmp = bitcast float %cmp to i32
%ival = bitcast float %val to i32
%iret = call i32 @__atomic_compare_exchange_uniform_int32_global(i32 * %iptr, i32 %icmp,
i32 %ival, <WIDTH x MASK> %mask)
i32 %ival)
%ret = bitcast i32 %iret to float
ret float %ret
}

define double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp,
double %val, <WIDTH x MASK> %mask) nounwind alwaysinline {
double %val) nounwind alwaysinline {
%iptr = bitcast double * %ptr to i64 *
%icmp = bitcast double %cmp to i64
%ival = bitcast double %val to i64
%iret = call i64 @__atomic_compare_exchange_uniform_int64_global(i64 * %iptr, i64 %icmp,
i64 %ival, <WIDTH x MASK> %mask)
%iret = call i64 @__atomic_compare_exchange_uniform_int64_global(i64 * %iptr, i64 %icmp, i64 %ival)
%ret = bitcast i64 %iret to double
ret double %ret
}
@@ -2219,9 +2246,9 @@ return:
define(`gen_masked_store', `
define void @__masked_store_$3(<$1 x $2>* nocapture, <$1 x $2>, <$1 x i32>) nounwind alwaysinline {
per_lane($1, <$1 x i32> %2, `
%ptr_ID = getelementptr <$1 x $2> * %0, i32 0, i32 LANE
%storeval_ID = extractelement <$1 x $2> %1, i32 LANE
store $2 %storeval_ID, $2 * %ptr_ID')
%ptr_LANE_ID = getelementptr <$1 x $2> * %0, i32 0, i32 LANE
%storeval_LANE_ID = extractelement <$1 x $2> %1, i32 LANE
store $2 %storeval_LANE_ID, $2 * %ptr_LANE_ID')
ret void
}
')
@@ -2676,7 +2703,7 @@ pl_known_mask:
pl_all_on:
;; the mask is all on--just expand the code for each lane sequentially
forloop(i, 0, eval($1-1),
`patsubst(`$3', `ID\|LANE', i)')
`patsubst(`$3', `LANE', i)')
br label %pl_done

pl_unknown_mask:
@@ -2727,7 +2754,8 @@ define(`gen_gather', `
;; Define the utility function to do the gather operation for a single element
;; of the type
define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x $2> %ret, i32 %lane) nounwind readonly alwaysinline {
<$1 x i32> %offset_delta, <$1 x $2> %ret,
i32 %lane) nounwind readonly alwaysinline {
; compute address for this one from the base
%offset32 = extractelement <$1 x i32> %offsets, i32 %lane
; the order and details of the next 4 lines are important--they match LLVM's
@@ -2737,15 +2765,20 @@ define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_
%offset = mul i64 %offset64, %scale64
%ptroffset = getelementptr i8 * %ptr, i64 %offset

%delta = extractelement <$1 x i32> %offset_delta, i32 %lane
%delta64 = sext i32 %delta to i64
%finalptr = getelementptr i8 * %ptroffset, i64 %delta64

; load value and insert into returned value
%ptrcast = bitcast i8 * %ptroffset to $2 *
%ptrcast = bitcast i8 * %finalptr to $2 *
%val = load $2 *%ptrcast
%updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane
ret <$1 x $2> %updatedret
}

define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x $2> %ret, i32 %lane) nounwind readonly alwaysinline {
<$1 x i64> %offset_delta, <$1 x $2> %ret,
i32 %lane) nounwind readonly alwaysinline {
; compute address for this one from the base
%offset64 = extractelement <$1 x i64> %offsets, i32 %lane
; the order and details of the next 4 lines are important--they match LLVM's
@@ -2754,8 +2787,11 @@ define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_
%offset = mul i64 %offset64, %offset_scale64
%ptroffset = getelementptr i8 * %ptr, i64 %offset

%delta64 = extractelement <$1 x i64> %offset_delta, i32 %lane
%finalptr = getelementptr i8 * %ptroffset, i64 %delta64

; load value and insert into returned value
%ptrcast = bitcast i8 * %ptroffset to $2 *
%ptrcast = bitcast i8 * %finalptr to $2 *
%val = load $2 *%ptrcast
%updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane
ret <$1 x $2> %updatedret
@@ -2763,6 +2799,7 @@ define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_


define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x i32> %offset_delta,
<$1 x i32> %vecmask) nounwind readonly alwaysinline {
; We can be clever and avoid the per-lane stuff for gathers if we are willing
; to require that the 0th element of the array being gathered from is always
@@ -2775,16 +2812,25 @@ define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32
<$1 x i32> %vecmask)
%newOffsets = load <$1 x i32> * %offsetsPtr

%deltaPtr = alloca <$1 x i32>
store <$1 x i32> zeroinitializer, <$1 x i32> * %deltaPtr
call void @__masked_store_blend_32(<$1 x i32> * %deltaPtr, <$1 x i32> %offset_delta,
<$1 x i32> %vecmask)
%newDelta = load <$1 x i32> * %deltaPtr

%ret0 = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %newOffsets,
i32 %offset_scale, <$1 x $2> undef, i32 0)
i32 %offset_scale, <$1 x i32> %offset_delta,
<$1 x $2> undef, i32 0)
forloop(lane, 1, eval($1-1),
`patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr,
<$1 x i32> %newOffsets, i32 %offset_scale, <$1 x $2> %retPREV, i32 LANE)
<$1 x i32> %newOffsets, i32 %offset_scale, <$1 x i32> %offset_delta,
<$1 x $2> %retPREV, i32 LANE)
', `LANE', lane), `PREV', eval(lane-1))')
ret <$1 x $2> %ret`'eval($1-1)
}

define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x i64> %offset_delta,
<$1 x i32> %vecmask) nounwind readonly alwaysinline {
; We can be clever and avoid the per-lane stuff for gathers if we are willing
; to require that the 0th element of the array being gathered from is always
@@ -2797,11 +2843,19 @@ define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets, i32
<$1 x i32> %vecmask)
%newOffsets = load <$1 x i64> * %offsetsPtr

%deltaPtr = alloca <$1 x i64>
store <$1 x i64> zeroinitializer, <$1 x i64> * %deltaPtr
call void @__masked_store_blend_64(<$1 x i64> * %deltaPtr, <$1 x i64> %offset_delta,
<$1 x i32> %vecmask)
%newDelta = load <$1 x i64> * %deltaPtr

%ret0 = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %newOffsets,
i32 %offset_scale, <$1 x $2> undef, i32 0)
i32 %offset_scale, <$1 x i64> %newDelta,
<$1 x $2> undef, i32 0)
forloop(lane, 1, eval($1-1),
`patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr,
<$1 x i64> %newOffsets, i32 %offset_scale, <$1 x $2> %retPREV, i32 LANE)
<$1 x i64> %newOffsets, i32 %offset_scale, <$1 x i64> %newDelta,
<$1 x $2> %retPREV, i32 LANE)
', `LANE', lane), `PREV', eval(lane-1))')
ret <$1 x $2> %ret`'eval($1-1)
}
@@ -2811,11 +2865,11 @@ define <$1 x $2> @__gather32_$2(<$1 x i32> %ptrs,
<$1 x i32> %vecmask) nounwind readonly alwaysinline {
%ret_ptr = alloca <$1 x $2>
per_lane($1, <$1 x i32> %vecmask, `
%iptr_ID = extractelement <$1 x i32> %ptrs, i32 LANE
%ptr_ID = inttoptr i32 %iptr_ID to $2 *
%val_ID = load $2 * %ptr_ID
%store_ptr_ID = getelementptr <$1 x $2> * %ret_ptr, i32 0, i32 LANE
store $2 %val_ID, $2 * %store_ptr_ID
%iptr_LANE_ID = extractelement <$1 x i32> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $2 *
%val_LANE_ID = load $2 * %ptr_LANE_ID
%store_ptr_LANE_ID = getelementptr <$1 x $2> * %ret_ptr, i32 0, i32 LANE
store $2 %val_LANE_ID, $2 * %store_ptr_LANE_ID
')

%ret = load <$1 x $2> * %ret_ptr
@@ -2827,11 +2881,11 @@ define <$1 x $2> @__gather64_$2(<$1 x i64> %ptrs,
<$1 x i32> %vecmask) nounwind readonly alwaysinline {
%ret_ptr = alloca <$1 x $2>
per_lane($1, <$1 x i32> %vecmask, `
%iptr_ID = extractelement <$1 x i64> %ptrs, i32 LANE
%ptr_ID = inttoptr i64 %iptr_ID to $2 *
%val_ID = load $2 * %ptr_ID
%store_ptr_ID = getelementptr <$1 x $2> * %ret_ptr, i32 0, i32 LANE
store $2 %val_ID, $2 * %store_ptr_ID
%iptr_LANE_ID = extractelement <$1 x i64> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $2 *
%val_LANE_ID = load $2 * %ptr_LANE_ID
%store_ptr_LANE_ID = getelementptr <$1 x $2> * %ret_ptr, i32 0, i32 LANE
store $2 %val_LANE_ID, $2 * %store_ptr_LANE_ID
')

%ret = load <$1 x $2> * %ret_ptr
@@ -2852,7 +2906,8 @@ define(`gen_scatter', `
;; Define the function that describes the work to do to scatter a single
;; value
define void @__scatter_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x $2> %values, i32 %lane) nounwind alwaysinline {
<$1 x i32> %offset_delta, <$1 x $2> %values,
i32 %lane) nounwind alwaysinline {
%offset32 = extractelement <$1 x i32> %offsets, i32 %lane
; the order and details of the next 4 lines are important--they match LLVM's
; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
@@ -2861,42 +2916,52 @@ define void @__scatter_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scal
%offset = mul i64 %offset64, %scale64
%ptroffset = getelementptr i8 * %ptr, i64 %offset

%ptrcast = bitcast i8 * %ptroffset to $2 *
%delta = extractelement <$1 x i32> %offset_delta, i32 %lane
%delta64 = sext i32 %delta to i64
%finalptr = getelementptr i8 * %ptroffset, i64 %delta64

%ptrcast = bitcast i8 * %finalptr to $2 *
%storeval = extractelement <$1 x $2> %values, i32 %lane
store $2 %storeval, $2 * %ptrcast
ret void
}

define void @__scatter_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x $2> %values, i32 %lane) nounwind alwaysinline {
<$1 x i64> %offset_delta, <$1 x $2> %values,
i32 %lane) nounwind alwaysinline {
%offset64 = extractelement <$1 x i64> %offsets, i32 %lane
; the order and details of the next 4 lines are important--they match LLVM's
; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
%scale64 = sext i32 %offset_scale to i64
%offset = mul i64 %offset64, %scale64
%ptroffset = getelementptr i8 * %ptr, i64 %offset
%ptrcast = bitcast i8 * %ptroffset to $2 *

%delta64 = extractelement <$1 x i64> %offset_delta, i32 %lane
%finalptr = getelementptr i8 * %ptroffset, i64 %delta64

%ptrcast = bitcast i8 * %finalptr to $2 *
%storeval = extractelement <$1 x $2> %values, i32 %lane
store $2 %storeval, $2 * %ptrcast
ret void
}

define void @__scatter_base_offsets32_$2(i8* %base, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x $2> %values, <$1 x i32> %mask) nounwind alwaysinline {
<$1 x i32> %offset_delta, <$1 x $2> %values,
<$1 x i32> %mask) nounwind alwaysinline {
;; And use the `per_lane' macro to do all of the per-lane work for scatter...
per_lane($1, <$1 x i32> %mask, `
call void @__scatter_elt32_$2(i8 * %base, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x $2> %values, i32 LANE)')
<$1 x i32> %offset_delta, <$1 x $2> %values, i32 LANE)')
ret void
}

define void @__scatter_base_offsets64_$2(i8* %base, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x $2> %values, <$1 x i32> %mask) nounwind alwaysinline {
<$1 x i64> %offset_delta, <$1 x $2> %values,
<$1 x i32> %mask) nounwind alwaysinline {
;; And use the `per_lane' macro to do all of the per-lane work for scatter...
per_lane($1, <$1 x i32> %mask, `
call void @__scatter_elt64_$2(i8 * %base, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x $2> %values, i32 LANE)')
<$1 x i64> %offset_delta, <$1 x $2> %values, i32 LANE)')
ret void
}

@@ -2904,10 +2969,10 @@ define void @__scatter_base_offsets64_$2(i8* %base, <$1 x i64> %offsets, i32 %of
define void @__scatter32_$2(<$1 x i32> %ptrs, <$1 x $2> %values,
<$1 x i32> %mask) nounwind alwaysinline {
per_lane($1, <$1 x i32> %mask, `
%iptr_ID = extractelement <$1 x i32> %ptrs, i32 LANE
%ptr_ID = inttoptr i32 %iptr_ID to $2 *
%val_ID = extractelement <$1 x $2> %values, i32 LANE
store $2 %val_ID, $2 * %ptr_ID
%iptr_LANE_ID = extractelement <$1 x i32> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $2 *
%val_LANE_ID = extractelement <$1 x $2> %values, i32 LANE
store $2 %val_LANE_ID, $2 * %ptr_LANE_ID
')
ret void
}
@@ -2916,10 +2981,10 @@ define void @__scatter32_$2(<$1 x i32> %ptrs, <$1 x $2> %values,
define void @__scatter64_$2(<$1 x i64> %ptrs, <$1 x $2> %values,
<$1 x i32> %mask) nounwind alwaysinline {
per_lane($1, <$1 x i32> %mask, `
%iptr_ID = extractelement <$1 x i64> %ptrs, i32 LANE
%ptr_ID = inttoptr i64 %iptr_ID to $2 *
%val_ID = extractelement <$1 x $2> %values, i32 LANE
store $2 %val_ID, $2 * %ptr_ID
%iptr_LANE_ID = extractelement <$1 x i64> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $2 *
%val_LANE_ID = extractelement <$1 x $2> %values, i32 LANE
store $2 %val_LANE_ID, $2 * %ptr_LANE_ID
')
ret void
}

55
cbackend.cpp
55
cbackend.cpp
@@ -24,6 +24,8 @@
#define PRIx64 "llx"
#endif

#include "llvmutil.h"

#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
@@ -232,6 +234,7 @@ namespace {
    unsigned NextAnonValueNumber;

    std::string includeName;
    int vectorWidth;

    /// UnnamedStructIDs - This contains a unique ID for each struct that is
    /// either anonymous or has no name.
@@ -240,11 +243,13 @@ namespace {

  public:
    static char ID;
    explicit CWriter(formatted_raw_ostream &o, const char *incname)
    explicit CWriter(formatted_raw_ostream &o, const char *incname,
                     int vecwidth)
      : FunctionPass(ID), Out(o), IL(0), Mang(0), LI(0),
        TheModule(0), TAsm(0), MRI(0), MOFI(0), TCtx(0), TD(0),
        OpaqueCounter(0), NextAnonValueNumber(0),
        includeName(incname ? incname : "generic_defs.h") {
        includeName(incname ? incname : "generic_defs.h"),
        vectorWidth(vecwidth) {
      initializeLoopInfoPass(*PassRegistry::getPassRegistry());
      FPCounter = 0;
    }
@@ -773,6 +778,16 @@ raw_ostream &CWriter::printType(raw_ostream &Out, Type *Ty,
    Out << " return ret;\n";
    Out << " }\n ";

    // if it's an array of i8s, also provide a version that takes a const
    // char *
    if (ATy->getElementType() == LLVMTypes::Int8Type) {
        Out << " static " << NameSoFar << " init(const char *p) {\n";
        Out << " " << NameSoFar << " ret;\n";
        Out << " strncpy((char *)ret.array, p, " << NumElements << ");\n";
        Out << " return ret;\n";
        Out << " }\n";
    }

    printType(Out, ATy->getElementType(), false,
              "array[" + utostr(NumElements) + "]");
    return Out << ";\n} ";
@@ -842,6 +857,7 @@ void CWriter::printConstantArray(ConstantArray *CPA, bool Static) {
    }
    Out << '\"';
  } else {
    if (Static)
      Out << '{';
    if (CPA->getNumOperands()) {
      Out << ' ';
@@ -851,6 +867,7 @@ void CWriter::printConstantArray(ConstantArray *CPA, bool Static) {
        printConstant(cast<Constant>(CPA->getOperand(i)), Static);
      }
    }
    if (Static)
      Out << " }";
  }
}
@@ -1321,7 +1338,8 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
    break;
  }

  case Type::ArrayTyID:
  case Type::ArrayTyID: {
    ArrayType *AT = cast<ArrayType>(CPV->getType());
    if (Static)
      // arrays are wrapped in structs...
      Out << "{ ";
@@ -1334,7 +1352,6 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
      printConstantArray(CA, Static);
    } else {
      assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV));
      ArrayType *AT = cast<ArrayType>(CPV->getType());
      if (AT->getNumElements()) {
        Out << ' ';
        Constant *CZ = Constant::getNullValue(AT->getElementType());
@@ -1350,7 +1367,7 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
    else
      Out << ")";
    break;

  }
  case Type::VectorTyID:
    printType(Out, CPV->getType());
    Out << "(";
@@ -2097,7 +2114,8 @@ bool CWriter::doInitialization(Module &M) {
        I->getName() == "memset" || I->getName() == "memset_pattern16" ||
        I->getName() == "puts" ||
        I->getName() == "printf" || I->getName() == "putchar" ||
        I->getName() == "fflush")
        I->getName() == "fflush" || I->getName() == "malloc" ||
        I->getName() == "free")
      continue;

    // Don't redeclare ispc's own intrinsics
@@ -2203,7 +2221,7 @@ bool CWriter::doInitialization(Module &M) {
      // FIXME common linkage should avoid this problem.
      if (!I->getInitializer()->isNullValue()) {
        Out << " = " ;
        writeOperand(I->getInitializer(), true);
        writeOperand(I->getInitializer(), false);
      } else if (I->hasWeakLinkage()) {
        // We have to specify an initializer, but it doesn't have to be
        // complete. If the value is an aggregate, print out { 0 }, and let
@@ -2218,7 +2236,7 @@ bool CWriter::doInitialization(Module &M) {
          Out << "{ { 0 } }";
        } else {
          // Just print it out normally.
          writeOperand(I->getInitializer(), true);
          writeOperand(I->getInitializer(), false);
        }
      }
      Out << ";\n";
@@ -2892,6 +2910,20 @@ void CWriter::visitBinaryOperator(Instruction &I) {
    Out << "(";
    writeOperand(I.getOperand(0));
    Out << ", ";
    if ((I.getOpcode() == Instruction::Shl ||
         I.getOpcode() == Instruction::LShr ||
         I.getOpcode() == Instruction::AShr)) {
        std::vector<PHINode *> phis;
        if (LLVMVectorValuesAllEqual(I.getOperand(1),
                                     vectorWidth, phis)) {
            Out << "__extract_element(";
            writeOperand(I.getOperand(1));
            Out << ", 0) ";
        }
        else
            writeOperand(I.getOperand(1));
    }
    else
        writeOperand(I.getOperand(1));
    Out << ")";
    return;
@@ -3406,6 +3438,9 @@ void CWriter::visitCallInst(CallInst &I) {
      Callee = RF;
    }

    if (Callee->getName() == "malloc")
      Out << "(uint8_t *)";

    if (NeedsCast) {
      // Ok, just cast the pointer type.
      Out << "((";
@@ -3633,7 +3668,7 @@ std::string CWriter::InterpretASMConstraint(InlineAsm::ConstraintInfo& c) {
#endif

  std::string E;
  if (const Target *Match = TargetRegistry::lookupTarget(Triple, E))
  if (const llvm::Target *Match = TargetRegistry::lookupTarget(Triple, E))
    TargetAsm = Match->createMCAsmInfo(Triple);
  else
    return c.Codes[0];
@@ -4335,7 +4370,7 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth,
  pm.add(new BitcastCleanupPass);
  pm.add(createDeadCodeEliminationPass()); // clean up after smear pass
  //CO pm.add(createPrintModulePass(&fos));
  pm.add(new CWriter(fos, includeName));
  pm.add(new CWriter(fos, includeName, vectorWidth));
  pm.add(createGCInfoDeleter());
  //CO pm.add(createVerifierPass());
515
ctx.cpp
515
ctx.cpp
@@ -74,18 +74,35 @@ struct CFInfo {
                            llvm::Value *savedContinueLanesPtr,
                            llvm::Value *savedMask, llvm::Value *savedLoopMask);

    static CFInfo *GetSwitch(bool isUniform, llvm::BasicBlock *breakTarget,
                             llvm::BasicBlock *continueTarget,
                             llvm::Value *savedBreakLanesPtr,
                             llvm::Value *savedContinueLanesPtr,
                             llvm::Value *savedMask, llvm::Value *savedLoopMask,
                             llvm::Value *switchExpr,
                             llvm::BasicBlock *bbDefault,
                             const std::vector<std::pair<int, llvm::BasicBlock *> > *bbCases,
                             const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *bbNext,
                             bool scUniform);

    bool IsIf() { return type == If; }
    bool IsLoop() { return type == Loop; }
    bool IsForeach() { return type == Foreach; }
    bool IsSwitch() { return type == Switch; }
    bool IsVarying() { return !isUniform; }
    bool IsUniform() { return isUniform; }

    enum CFType { If, Loop, Foreach };
    enum CFType { If, Loop, Foreach, Switch };
    CFType type;
    bool isUniform;
    llvm::BasicBlock *savedBreakTarget, *savedContinueTarget;
    llvm::Value *savedBreakLanesPtr, *savedContinueLanesPtr;
    llvm::Value *savedMask, *savedLoopMask;
    llvm::Value *savedSwitchExpr;
    llvm::BasicBlock *savedDefaultBlock;
    const std::vector<std::pair<int, llvm::BasicBlock *> > *savedCaseBlocks;
    const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *savedNextBlocks;
    bool savedSwitchConditionWasUniform;

private:
    CFInfo(CFType t, bool uniformIf, llvm::Value *sm) {
@@ -95,11 +112,18 @@ private:
        savedBreakTarget = savedContinueTarget = NULL;
        savedBreakLanesPtr = savedContinueLanesPtr = NULL;
        savedMask = savedLoopMask = sm;
        savedSwitchExpr = NULL;
        savedDefaultBlock = NULL;
        savedCaseBlocks = NULL;
        savedNextBlocks = NULL;
    }
    CFInfo(CFType t, bool iu, llvm::BasicBlock *bt, llvm::BasicBlock *ct,
           llvm::Value *sb, llvm::Value *sc, llvm::Value *sm,
           llvm::Value *lm) {
        Assert(t == Loop);
           llvm::Value *lm, llvm::Value *sse = NULL, llvm::BasicBlock *bbd = NULL,
           const std::vector<std::pair<int, llvm::BasicBlock *> > *bbc = NULL,
           const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *bbn = NULL,
           bool scu = false) {
        Assert(t == Loop || t == Switch);
        type = t;
        isUniform = iu;
        savedBreakTarget = bt;
@@ -108,6 +132,11 @@ private:
        savedContinueLanesPtr = sc;
        savedMask = sm;
        savedLoopMask = lm;
        savedSwitchExpr = sse;
        savedDefaultBlock = bbd;
        savedCaseBlocks = bbc;
        savedNextBlocks = bbn;
        savedSwitchConditionWasUniform = scu;
    }
    CFInfo(CFType t, llvm::BasicBlock *bt, llvm::BasicBlock *ct,
           llvm::Value *sb, llvm::Value *sc, llvm::Value *sm,
@@ -121,6 +150,10 @@ private:
        savedContinueLanesPtr = sc;
        savedMask = sm;
        savedLoopMask = lm;
        savedSwitchExpr = NULL;
        savedDefaultBlock = NULL;
        savedCaseBlocks = NULL;
        savedNextBlocks = NULL;
    }
};

@@ -154,6 +187,23 @@ CFInfo::GetForeach(llvm::BasicBlock *breakTarget,
                       savedMask, savedForeachMask);
}


CFInfo *
CFInfo::GetSwitch(bool isUniform, llvm::BasicBlock *breakTarget,
                  llvm::BasicBlock *continueTarget,
                  llvm::Value *savedBreakLanesPtr,
                  llvm::Value *savedContinueLanesPtr, llvm::Value *savedMask,
                  llvm::Value *savedLoopMask, llvm::Value *savedSwitchExpr,
                  llvm::BasicBlock *savedDefaultBlock,
                  const std::vector<std::pair<int, llvm::BasicBlock *> > *savedCases,
                  const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *savedNext,
                  bool savedSwitchConditionUniform) {
    return new CFInfo(Switch, isUniform, breakTarget, continueTarget,
                      savedBreakLanesPtr, savedContinueLanesPtr,
                      savedMask, savedLoopMask, savedSwitchExpr, savedDefaultBlock,
                      savedCases, savedNext, savedSwitchConditionUniform);
}

///////////////////////////////////////////////////////////////////////////

FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
@@ -182,6 +232,11 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
    breakLanesPtr = continueLanesPtr = NULL;
    breakTarget = continueTarget = NULL;

    switchExpr = NULL;
    caseBlocks = NULL;
    defaultBlock = NULL;
    nextBlocks = NULL;

    returnedLanesPtr = AllocaInst(LLVMTypes::MaskType, "returned_lanes_memory");
    StoreInst(LLVMMaskAllOff, returnedLanesPtr);

@@ -422,14 +477,15 @@ FunctionEmitContext::StartVaryingIf(llvm::Value *oldMask) {

void
FunctionEmitContext::EndIf() {
    CFInfo *ci = popCFState();
    // Make sure we match up with a Start{Uniform,Varying}If().
    Assert(controlFlowInfo.size() > 0 && controlFlowInfo.back()->IsIf());
    CFInfo *ci = controlFlowInfo.back();
    controlFlowInfo.pop_back();
    Assert(ci->IsIf());

    // 'uniform' ifs don't change the mask so we only need to restore the
    // mask going into the if for 'varying' if statements
    if (!ci->IsUniform() && bblock != NULL) {
    if (ci->IsUniform() || bblock == NULL)
        return;

    // We can't just restore the mask as it was going into the 'if'
    // statement. First we have to take into account any program
    // instances that have executed 'return' statements; the restored
@@ -437,7 +493,7 @@ FunctionEmitContext::EndIf() {
    restoreMaskGivenReturns(ci->savedMask);

    // If the 'if' statement is inside a loop with a 'varying'
    // consdition, we also need to account for any break or continue
    // condition, we also need to account for any break or continue
    // statements that executed inside the 'if' statement; we also must
    // leave the lane masks for the program instances that ran those
    // off after we restore the mask after the 'if'. The code below
@@ -445,30 +501,39 @@ FunctionEmitContext::EndIf() {
    // or continue statements (and breakLanesPtr and continueLanesPtr
    // have their initial 'all off' values), so we don't need to check
    // for that here.
    if (continueLanesPtr != NULL) {
    //
    // There are three general cases to deal with here:
    // - Loops: both break and continue are allowed, and thus the corresponding
    //   lane mask pointers are non-NULL
    // - Foreach: only continueLanesPtr may be non-NULL
    // - Switch: only breakLanesPtr may be non-NULL
    if (continueLanesPtr != NULL || breakLanesPtr != NULL) {
        // We want to compute:
        // newMask = (oldMask & ~(breakLanes | continueLanes))
        llvm::Value *oldMask = GetInternalMask();
        llvm::Value *continueLanes = LoadInst(continueLanesPtr,
                                              "continue_lanes");
        llvm::Value *bcLanes = continueLanes;
        // newMask = (oldMask & ~(breakLanes | continueLanes)),
        // treating breakLanes or continueLanes as "all off" if the
        // corresponding pointer is NULL.
        llvm::Value *bcLanes = NULL;

        if (continueLanesPtr != NULL)
            bcLanes = LoadInst(continueLanesPtr, "continue_lanes");
        else
            bcLanes = LLVMMaskAllOff;

        if (breakLanesPtr != NULL) {
            // breakLanesPtr will be NULL if we're inside a 'foreach' loop
            llvm::Value *breakLanes = LoadInst(breakLanesPtr, "break_lanes");
            bcLanes = BinaryOperator(llvm::Instruction::Or, breakLanes,
                                     continueLanes, "break|continue_lanes");
            bcLanes = BinaryOperator(llvm::Instruction::Or, bcLanes,
                                     breakLanes, "|break_lanes");
        }

        llvm::Value *notBreakOrContinue =
            NotOperator(bcLanes, "!(break|continue)_lanes");
        llvm::Value *oldMask = GetInternalMask();
        llvm::Value *newMask =
            BinaryOperator(llvm::Instruction::And, oldMask,
                           notBreakOrContinue, "new_mask");
        SetInternalMask(newMask);
    }
    }
}


void
@@ -502,17 +567,8 @@ FunctionEmitContext::StartLoop(llvm::BasicBlock *bt, llvm::BasicBlock *ct,

void
FunctionEmitContext::EndLoop() {
    Assert(controlFlowInfo.size() && controlFlowInfo.back()->IsLoop());
    CFInfo *ci = controlFlowInfo.back();
    controlFlowInfo.pop_back();

    // Restore the break/continue state information to what it was before
    // we went into this loop.
    breakTarget = ci->savedBreakTarget;
    continueTarget = ci->savedContinueTarget;
    breakLanesPtr = ci->savedBreakLanesPtr;
    continueLanesPtr = ci->savedContinueLanesPtr;
    loopMask = ci->savedLoopMask;
    CFInfo *ci = popCFState();
    Assert(ci->IsLoop());

    if (!ci->IsUniform())
        // If the loop had a 'uniform' test, then it didn't make any
@@ -525,7 +581,7 @@


void
FunctionEmitContext::StartForeach(llvm::BasicBlock *ct) {
FunctionEmitContext::StartForeach() {
    // Store the current values of various loop-related state so that we
    // can restore it when we exit this loop.
    llvm::Value *oldMask = GetInternalMask();
@@ -537,7 +593,7 @@ FunctionEmitContext::StartForeach(llvm::BasicBlock *ct) {

    continueLanesPtr = AllocaInst(LLVMTypes::MaskType, "foreach_continue_lanes");
    StoreInst(LLVMMaskAllOff, continueLanesPtr);
    continueTarget = ct;
    continueTarget = NULL; // should be set by SetContinueTarget()

    loopMask = NULL;
}
@@ -545,17 +601,8 @@

void
FunctionEmitContext::EndForeach() {
    Assert(controlFlowInfo.size() && controlFlowInfo.back()->IsForeach());
    CFInfo *ci = controlFlowInfo.back();
    controlFlowInfo.pop_back();

    // Restore the break/continue state information to what it was before
    // we went into this loop.
    breakTarget = ci->savedBreakTarget;
    continueTarget = ci->savedContinueTarget;
    breakLanesPtr = ci->savedBreakLanesPtr;
    continueLanesPtr = ci->savedContinueLanesPtr;
    loopMask = ci->savedLoopMask;
    CFInfo *ci = popCFState();
    Assert(ci->IsForeach());
}


@@ -576,28 +623,64 @@ FunctionEmitContext::restoreMaskGivenReturns(llvm::Value *oldMask) {
}


/** Returns "true" if the first enclosing non-if control flow expression is
    a "switch" statement.
*/
bool
FunctionEmitContext::inSwitchStatement() const {
    // Go backwards through controlFlowInfo, since we add new nested scopes
    // to the back.
    int i = controlFlowInfo.size() - 1;
    while (i >= 0 && controlFlowInfo[i]->IsIf())
        --i;
    // Got to the first non-if (or end of CF info)
    if (i == -1)
        return false;
    return controlFlowInfo[i]->IsSwitch();
}


void
FunctionEmitContext::Break(bool doCoherenceCheck) {
    Assert(controlFlowInfo.size() > 0);
    if (breakTarget == NULL) {
        Error(currentPos, "\"break\" statement is illegal outside of "
              "for/while/do loops.");
              "for/while/do loops and \"switch\" statements.");
        return;
    }

    if (bblock == NULL)
        return;

    if (inSwitchStatement() == true &&
        switchConditionWasUniform == true &&
        ifsInCFAllUniform(CFInfo::Switch)) {
        // We know that all program instances are executing the break, so
        // just jump to the block immediately after the switch.
        Assert(breakTarget != NULL);
        BranchInst(breakTarget);
        bblock = NULL;
        return;
    }

    // If all of the enclosing 'if' tests in the loop have uniform control
    // flow or if we can tell that the mask is all on, then we can just
    // jump to the break location.
    if (ifsInLoopAllUniform() || GetInternalMask() == LLVMMaskAllOn) {
    if (inSwitchStatement() == false &&
        (ifsInCFAllUniform(CFInfo::Loop) ||
         GetInternalMask() == LLVMMaskAllOn)) {
        BranchInst(breakTarget);
        if (ifsInLoopAllUniform() && doCoherenceCheck)
            Warning(currentPos, "Coherent break statement not necessary in fully uniform "
                    "control flow.");
        if (ifsInCFAllUniform(CFInfo::Loop) && doCoherenceCheck)
            Warning(currentPos, "Coherent break statement not necessary in "
                    "fully uniform control flow.");
        // Set bblock to NULL since the jump has terminated the basic block
        bblock = NULL;
    }
    else {
        // Otherwise we need to update the mask of the lanes that have
        // executed a 'break' statement:
        // Varying switch, uniform switch where the 'break' is under
        // varying control flow, or a loop with varying 'if's above the
        // break. In these cases, we need to update the mask of the lanes
        // that have executed a 'break' statement:
        // breakLanes = breakLanes | mask
        Assert(breakLanesPtr != NULL);
        llvm::Value *mask = GetInternalMask();
@@ -613,16 +696,20 @@ FunctionEmitContext::Break(bool doCoherenceCheck) {
        // an 'if' statement and restore the mask then.
        SetInternalMask(LLVMMaskAllOff);

        if (doCoherenceCheck)
            // If the user has indicated that this is a 'coherent' break
            // statement, then check to see if the mask is all off. If so,
            // we have to conservatively jump to the continueTarget, not
            // the breakTarget, since part of the reason the mask is all
            // off may be due to 'continue' statements that executed in the
            // current loop iteration.
            // FIXME: if the loop only has break statements and no
            // continues, we can jump to breakTarget in that case.
        if (doCoherenceCheck) {
            if (continueTarget != NULL)
                // If the user has indicated that this is a 'coherent'
                // break statement, then check to see if the mask is all
                // off. If so, we have to conservatively jump to the
                // continueTarget, not the breakTarget, since part of the
                // reason the mask is all off may be due to 'continue'
                // statements that executed in the current loop iteration.
                jumpIfAllLoopLanesAreDone(continueTarget);
            else if (breakTarget != NULL)
                // Similarly handle these for switch statements, where we
                // only have a break target.
                jumpIfAllLoopLanesAreDone(breakTarget);
        }
    }
}

@@ -635,12 +722,12 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
        return;
    }

    if (ifsInLoopAllUniform() || GetInternalMask() == LLVMMaskAllOn) {
    if (ifsInCFAllUniform(CFInfo::Loop) || GetInternalMask() == LLVMMaskAllOn) {
        // Similarly to 'break' statements, we can immediately jump to the
        // continue target if we're only in 'uniform' control flow within
        // loop or if we can tell that the mask is all on.
        AddInstrumentationPoint("continue: uniform CF, jumped");
        if (ifsInLoopAllUniform() && doCoherenceCheck)
        if (ifsInCFAllUniform(CFInfo::Loop) && doCoherenceCheck)
            Warning(currentPos, "Coherent continue statement not necessary in "
                    "fully uniform control flow.");
        BranchInst(continueTarget);
@@ -653,8 +740,9 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
        llvm::Value *mask = GetInternalMask();
        llvm::Value *continueMask =
            LoadInst(continueLanesPtr, "continue_mask");
        llvm::Value *newMask = BinaryOperator(llvm::Instruction::Or,
                                              mask, continueMask, "mask|continueMask");
        llvm::Value *newMask =
            BinaryOperator(llvm::Instruction::Or, mask, continueMask,
                           "mask|continueMask");
        StoreInst(newMask, continueLanesPtr);

        // And set the current mask to be all off in case there are any
@@ -671,22 +759,23 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {


/** This function checks to see if all of the 'if' statements (if any)
    between the current scope and the first enclosing loop have 'uniform'
    tests.
    between the current scope and the first enclosing loop/switch of given
    control flow type have 'uniform' tests.
*/
bool
FunctionEmitContext::ifsInLoopAllUniform() const {
FunctionEmitContext::ifsInCFAllUniform(int type) const {
    Assert(controlFlowInfo.size() > 0);
    // Go backwards through controlFlowInfo, since we add new nested scopes
    // to the back. Stop once we come to the first enclosing loop.
    // to the back. Stop once we come to the first enclosing control flow
    // structure of the desired type.
    int i = controlFlowInfo.size() - 1;
    while (i >= 0 && controlFlowInfo[i]->type != CFInfo::Loop) {
    while (i >= 0 && controlFlowInfo[i]->type != type) {
        if (controlFlowInfo[i]->isUniform == false)
            // Found a scope due to an 'if' statement with a varying test
            return false;
        --i;
    }
    Assert(i >= 0); // else we didn't find a loop!
    Assert(i >= 0); // else we didn't find the expected control flow type!
    return true;
}

@@ -759,6 +848,244 @@ FunctionEmitContext::RestoreContinuedLanes() {
}


void
FunctionEmitContext::StartSwitch(bool cfIsUniform, llvm::BasicBlock *bbBreak) {
    llvm::Value *oldMask = GetInternalMask();
    controlFlowInfo.push_back(CFInfo::GetSwitch(cfIsUniform, breakTarget,
                                                continueTarget, breakLanesPtr,
                                                continueLanesPtr, oldMask,
                                                loopMask, switchExpr, defaultBlock,
                                                caseBlocks, nextBlocks,
                                                switchConditionWasUniform));

    breakLanesPtr = AllocaInst(LLVMTypes::MaskType, "break_lanes_memory");
    StoreInst(LLVMMaskAllOff, breakLanesPtr);
    breakTarget = bbBreak;

    continueLanesPtr = NULL;
    continueTarget = NULL;
    loopMask = NULL;

    // These will be set by the SwitchInst() method
    switchExpr = NULL;
    defaultBlock = NULL;
    caseBlocks = NULL;
    nextBlocks = NULL;
}


void
FunctionEmitContext::EndSwitch() {
    Assert(bblock != NULL);

    CFInfo *ci = popCFState();
    if (ci->IsVarying() && bblock != NULL)
        restoreMaskGivenReturns(ci->savedMask);
}


/** Emit code to check for an "all off" mask before the code for a
    case or default label in a "switch" statement.
*/
void
FunctionEmitContext::addSwitchMaskCheck(llvm::Value *mask) {
    llvm::Value *allOff = None(mask);
    llvm::BasicBlock *bbSome = CreateBasicBlock("case_default_on");

    // Find the basic block for the case or default label immediately after
    // the current one in the switch statement--that's where we want to
    // jump if the mask is all off at this label.
    Assert(nextBlocks->find(bblock) != nextBlocks->end());
    llvm::BasicBlock *bbNext = nextBlocks->find(bblock)->second;

    // Jump to the next one if the mask is all off; otherwise jump to the
    // newly created block that will hold the actual code for this label.
    BranchInst(bbNext, bbSome, allOff);
    SetCurrentBasicBlock(bbSome);
}


/** Returns the execution mask at entry to the first enclosing "switch"
    statement. */
llvm::Value *
FunctionEmitContext::getMaskAtSwitchEntry() {
    Assert(controlFlowInfo.size() > 0);
    int i = controlFlowInfo.size() - 1;
    while (i >= 0 && controlFlowInfo[i]->type != CFInfo::Switch)
        --i;
    Assert(i != -1);
    return controlFlowInfo[i]->savedMask;
}


void
FunctionEmitContext::EmitDefaultLabel(bool checkMask, SourcePos pos) {
    if (inSwitchStatement() == false) {
        Error(pos, "\"default\" label illegal outside of \"switch\" "
              "statement.");
        return;
    }

    // If there's a default label in the switch, a basic block for it
    // should have been provided in the previous call to SwitchInst().
    Assert(defaultBlock != NULL);

    if (bblock != NULL)
        // The previous case in the switch fell through, or we're in a
        // varying switch; terminate the current block with a jump to the
        // block for the code for the default label.
        BranchInst(defaultBlock);
    SetCurrentBasicBlock(defaultBlock);

    if (switchConditionWasUniform)
        // Nothing more to do for this case; return back to the caller,
        // which will then emit the code for the default case.
        return;

    // For a varying switch, we need to update the execution mask.
    //
    // First, compute the mask that corresponds to which program instances
    // should execute the "default" code; this corresponds to the set of
    // program instances that don't match any of the case statements.
    // Therefore, we generate code that compares the value of the switch
    // expression to the value associated with each of the "case"
    // statements such that the surviving lanes didn't match any of them.
    llvm::Value *matchesDefault = getMaskAtSwitchEntry();
    for (int i = 0; i < (int)caseBlocks->size(); ++i) {
        int value = (*caseBlocks)[i].first;
        llvm::Value *valueVec = (switchExpr->getType() == LLVMTypes::Int32VectorType) ?
            LLVMInt32Vector(value) : LLVMInt64Vector(value);
        // TODO: for AVX2 at least, the following generates better code
        // than doing ICMP_NE and skipping the NotOperator() below; file an
        // LLVM bug?
        llvm::Value *matchesCaseValue =
            CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, switchExpr,
                    valueVec, "cmp_case_value");
        matchesCaseValue = I1VecToBoolVec(matchesCaseValue);

        llvm::Value *notMatchesCaseValue = NotOperator(matchesCaseValue);
        matchesDefault = BinaryOperator(llvm::Instruction::And, matchesDefault,
                                        notMatchesCaseValue, "default&~case_match");
    }

    // The mask may have some lanes on, which corresponds to the previous
    // label falling through; compute the updated mask by ORing with the
    // current mask.
    llvm::Value *oldMask = GetInternalMask();
    llvm::Value *newMask = BinaryOperator(llvm::Instruction::Or, oldMask,
                                          matchesDefault, "old_mask|matches_default");
    SetInternalMask(newMask);

    if (checkMask)
        addSwitchMaskCheck(newMask);
}


void
FunctionEmitContext::EmitCaseLabel(int value, bool checkMask, SourcePos pos) {
    if (inSwitchStatement() == false) {
        Error(pos, "\"case\" label illegal outside of \"switch\" statement.");
        return;
    }

    // Find the basic block for this case statement.
    llvm::BasicBlock *bbCase = NULL;
    Assert(caseBlocks != NULL);
    for (int i = 0; i < (int)caseBlocks->size(); ++i)
        if ((*caseBlocks)[i].first == value) {
            bbCase = (*caseBlocks)[i].second;
            break;
        }
    Assert(bbCase != NULL);

    if (bblock != NULL)
        // fall through from the previous case
        BranchInst(bbCase);
    SetCurrentBasicBlock(bbCase);

    if (switchConditionWasUniform)
        return;

    // update the mask: first, get a mask that indicates which program
    // instances have a value for the switch expression that matches this
    // case statement.
    llvm::Value *valueVec = (switchExpr->getType() == LLVMTypes::Int32VectorType) ?
        LLVMInt32Vector(value) : LLVMInt64Vector(value);
    llvm::Value *matchesCaseValue =
        CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, switchExpr,
                valueVec, "cmp_case_value");
    matchesCaseValue = I1VecToBoolVec(matchesCaseValue);

    // If a lane was off going into the switch, we don't care if it has a
    // value in the switch expression that happens to match this case.
    llvm::Value *entryMask = getMaskAtSwitchEntry();
    matchesCaseValue = BinaryOperator(llvm::Instruction::And, entryMask,
                                      matchesCaseValue, "entry_mask&case_match");

    // Take the surviving lanes and turn on the mask for them.
    llvm::Value *oldMask = GetInternalMask();
    llvm::Value *newMask = BinaryOperator(llvm::Instruction::Or, oldMask,
                                          matchesCaseValue, "mask|case_match");
    SetInternalMask(newMask);

    if (checkMask)
        addSwitchMaskCheck(newMask);
}


void
FunctionEmitContext::SwitchInst(llvm::Value *expr, llvm::BasicBlock *bbDefault,
                                const std::vector<std::pair<int, llvm::BasicBlock *> > &bbCases,
                                const std::map<llvm::BasicBlock *, llvm::BasicBlock *> &bbNext) {
    // The calling code should have called StartSwitch() before calling
    // SwitchInst().
    Assert(controlFlowInfo.size() &&
           controlFlowInfo.back()->IsSwitch());

    switchExpr = expr;
    defaultBlock = bbDefault;
    caseBlocks = new std::vector<std::pair<int, llvm::BasicBlock *> >(bbCases);
    nextBlocks = new std::map<llvm::BasicBlock *, llvm::BasicBlock *>(bbNext);
    switchConditionWasUniform =
        (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(expr->getType()) == false);

    if (switchConditionWasUniform == true) {
        // For a uniform switch condition, just wire things up to the LLVM
        // switch instruction.
        llvm::SwitchInst *s = llvm::SwitchInst::Create(expr, bbDefault,
                                                       bbCases.size(), bblock);
        for (int i = 0; i < (int)bbCases.size(); ++i) {
            if (expr->getType() == LLVMTypes::Int32Type)
                s->addCase(LLVMInt32(bbCases[i].first), bbCases[i].second);
            else {
                Assert(expr->getType() == LLVMTypes::Int64Type);
                s->addCase(LLVMInt64(bbCases[i].first), bbCases[i].second);
            }
        }

        AddDebugPos(s);
        // switch is a terminator
        bblock = NULL;
    }
    else {
        // For a varying switch, we first turn off all lanes of the mask
        SetInternalMask(LLVMMaskAllOff);

        if (nextBlocks->size() > 0) {
            // If there are any labels inside the switch, jump to the first
            // one; any code before the first label won't be executed by
            // anyone.
            std::map<llvm::BasicBlock *, llvm::BasicBlock *>::const_iterator iter;
            iter = nextBlocks->find(NULL);
            Assert(iter != nextBlocks->end());
            llvm::BasicBlock *bbFirst = iter->second;
            BranchInst(bbFirst);
            bblock = NULL;
        }
    }
}


int
FunctionEmitContext::VaryingCFDepth() const {
    int sum = 0;
@@ -905,6 +1232,14 @@ FunctionEmitContext::All(llvm::Value *mask) {
}


llvm::Value *
FunctionEmitContext::None(llvm::Value *mask) {
    llvm::Value *mmval = LaneMask(mask);
    return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mmval,
                   LLVMInt32(0), "none_mm_cmp");
}


llvm::Value *
FunctionEmitContext::LaneMask(llvm::Value *v) {
    // Call the target-dependent movmsk function to turn the vector mask
@@ -944,7 +1279,11 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {

llvm::Value *
FunctionEmitContext::GetStringPtr(const std::string &str) {
#ifdef LLVM_3_1svn
    llvm::Constant *lstr = llvm::ConstantDataArray::getString(*g->ctx, str);
#else
    llvm::Constant *lstr = llvm::ConstantArray::get(*g->ctx, str);
#endif
    llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::InternalLinkage;
    llvm::Value *lstrPtr = new llvm::GlobalVariable(*m->module, lstr->getType(),
                                                    true /*isConst*/,
@@ -994,7 +1333,11 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) {

static llvm::Value *
lGetStringAsValue(llvm::BasicBlock *bblock, const char *s) {
#ifdef LLVM_3_1svn
    llvm::Constant *sConstant = llvm::ConstantDataArray::getString(*g->ctx, s);
#else
    llvm::Constant *sConstant = llvm::ConstantArray::get(*g->ctx, s);
#endif
    llvm::Value *sPtr = new llvm::GlobalVariable(*m->module, sConstant->getType(),
                                                 true /* const */,
                                                 llvm::GlobalValue::InternalLinkage,
@@ -2588,7 +2931,7 @@ FunctionEmitContext::SyncInst() {


/** When we're gathering from or scattering to a varying atomic type, we need
    to add an appropraite offset to the final address for each lane right
    to add an appropriate offset to the final address for each lane right
    before we use it. Given a varying pointer we're about to use and its
    type, this function determines whether these offsets are needed and
    returns an updated pointer that incorporates these offsets if needed.
@@ -2632,3 +2975,37 @@ FunctionEmitContext::addVaryingOffsetsIfNeeded(llvm::Value *ptr,

    return BinaryOperator(llvm::Instruction::Add, ptr, offset);
}


CFInfo *
FunctionEmitContext::popCFState() {
    Assert(controlFlowInfo.size() > 0);
    CFInfo *ci = controlFlowInfo.back();
    controlFlowInfo.pop_back();

    if (ci->IsSwitch()) {
        breakTarget = ci->savedBreakTarget;
        continueTarget = ci->savedContinueTarget;
        breakLanesPtr = ci->savedBreakLanesPtr;
        continueLanesPtr = ci->savedContinueLanesPtr;
        loopMask = ci->savedLoopMask;
        switchExpr = ci->savedSwitchExpr;
        defaultBlock = ci->savedDefaultBlock;
        caseBlocks = ci->savedCaseBlocks;
        nextBlocks = ci->savedNextBlocks;
        switchConditionWasUniform = ci->savedSwitchConditionWasUniform;
    }
    else if (ci->IsLoop() || ci->IsForeach()) {
        breakTarget = ci->savedBreakTarget;
        continueTarget = ci->savedContinueTarget;
        breakLanesPtr = ci->savedBreakLanesPtr;
        continueLanesPtr = ci->savedContinueLanesPtr;
        loopMask = ci->savedLoopMask;
    }
    else {
        Assert(ci->IsIf());
        // nothing to do
    }

    return ci;
}
107
ctx.h
107
ctx.h
@@ -161,10 +161,8 @@ public:
    void EndLoop();

    /** Indicates that code generation for a 'foreach' or 'foreach_tiled'
        loop is about to start. The provided basic block pointer indicates
        where control flow should go if a 'continue' statement is executed
        in the loop. */
    void StartForeach(llvm::BasicBlock *continueTarget);
        loop is about to start. */
    void StartForeach();
    void EndForeach();

    /** Emit code for a 'break' statement in a loop. If doCoherenceCheck
@@ -187,12 +185,53 @@ public:
        previous iteration. */
    void RestoreContinuedLanes();

    /** Indicates that code generation for a "switch" statement is about to
        start. isUniform indicates whether the "switch" value is uniform,
        and bbAfterSwitch gives the basic block immediately following the
        "switch" statement. (For example, if the switch condition is
        uniform, we jump here upon executing a "break" statement.) */
    void StartSwitch(bool isUniform, llvm::BasicBlock *bbAfterSwitch);
    /** Indicates the end of code generation for a "switch" statement. */
    void EndSwitch();

    /** Emits code for a "switch" statement in the program.
        @param expr         Gives the value of the expression after the "switch"
        @param defaultBlock Basic block to execute for the "default" case. This
                            should be NULL if there is no "default" label inside
                            the switch.
        @param caseBlocks   vector that stores the mapping from label values
                            after "case" statements to basic blocks corresponding
                            to the "case" labels.
        @param nextBlocks   For each basic block for a "case" or "default"
                            label, this gives the basic block for the
                            immediately-following "case" or "default" label (or
                            the basic block after the "switch" statement for the
                            last label.)
    */
    void SwitchInst(llvm::Value *expr, llvm::BasicBlock *defaultBlock,
                    const std::vector<std::pair<int, llvm::BasicBlock *> > &caseBlocks,
                    const std::map<llvm::BasicBlock *, llvm::BasicBlock *> &nextBlocks);

    /** Generates code for a "default" label after a "switch" statement.
        The checkMask parameter indicates whether additional code should be
        generated to check to see if the execution mask is all off after
        the default label (in which case a jump to the following label will
        be issued). */
    void EmitDefaultLabel(bool checkMask, SourcePos pos);

    /** Generates code for a "case" label after a "switch" statement. See
        the documentation for EmitDefaultLabel() for discussion of the
        checkMask parameter. */
    void EmitCaseLabel(int value, bool checkMask, SourcePos pos);

    /** Returns the current number of nested levels of 'varying' control
        flow */
    int VaryingCFDepth() const;

    bool InForeachLoop() const;

    void SetContinueTarget(llvm::BasicBlock *bb) { continueTarget = bb; }

    /** Step through the code and find label statements; create a basic
        block for each one, so that subsequent calls to
        GetLabeledBasicBlock() return the corresponding basic block. */
@@ -221,6 +260,10 @@ public:
        i1 value that indicates if all of the mask lanes are on. */
    llvm::Value *All(llvm::Value *mask);

    /** Given a boolean mask value of type LLVMTypes::MaskType, return an
        i1 value that indicates if all of the mask lanes are off. */
    llvm::Value *None(llvm::Value *mask);

    /** Given a boolean mask value of type LLVMTypes::MaskType, return an
        i32 value wherein the i'th bit is on if and only if the i'th lane
        of the mask is on. */
@@ -492,10 +535,10 @@ private:
        the loop. */
    llvm::Value *loopMask;

    /** If currently in a loop body, this is a pointer to memory to store a
        mask value that represents which of the lanes have executed a
        'break' statement. If we're not in a loop body, this should be
        NULL. */
    /** If currently in a loop body or switch statement, this is a pointer
        to memory to store a mask value that represents which of the lanes
        have executed a 'break' statement. If we're not in a loop body or
        switch, this should be NULL. */
    llvm::Value *breakLanesPtr;

    /** Similar to breakLanesPtr, if we're inside a loop, this is a pointer
@@ -503,16 +546,49 @@ private:
        'continue' statement. */
    llvm::Value *continueLanesPtr;

    /** If we're inside a loop, this gives the basic block immediately
        after the current loop, which we will jump to if all of the lanes
        have executed a break statement or are otherwise done with the
        loop. */
    /** If we're inside a loop or switch statement, this gives the basic
        block immediately after the current loop or switch, which we will
        jump to if all of the lanes have executed a break statement or are
        otherwise done with it. */
    llvm::BasicBlock *breakTarget;

    /** If we're inside a loop, this gives the block to jump to if all of
        the running lanes have executed a 'continue' statement. */
    llvm::BasicBlock *continueTarget;

    /** @name Switch statement state

        These variables store various state that's active when we're
        generating code for a switch statement. They should all be NULL
        outside of a switch.
        @{
    */

    /** The value of the expression used to determine which case in the
        statements after the switch to execute. */
    llvm::Value *switchExpr;

    /** Map from case label numbers to the basic block that will hold code
        for that case. */
    const std::vector<std::pair<int, llvm::BasicBlock *> > *caseBlocks;

    /** The basic block of code to run for the "default" label in the
        switch statement. */
    llvm::BasicBlock *defaultBlock;

    /** For each basic block for the code for cases (and the default label,
        if present), this map gives the basic block for the immediately
        following case/default label. */
    const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *nextBlocks;

    /** Records whether the switch condition was uniform; this is a
        distinct notion from whether the switch represents uniform or
        varying control flow; we may have varying control flow from a
        uniform switch condition if there is a 'break' inside the switch
        that's under varying control flow. */
    bool switchConditionWasUniform;
    /** @} */

    /** A pointer to memory that records which of the program instances
        have executed a 'return' statement (and are thus really truly done
        running any more instructions in this function). */
@@ -556,7 +632,7 @@ private:

    llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
    static void addGSMetadata(llvm::Value *inst, SourcePos pos);
    bool ifsInLoopAllUniform() const;
    bool ifsInCFAllUniform(int cfType) const;
    void jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target);
    llvm::Value *emitGatherCallback(llvm::Value *lvalue, llvm::Value *retPtr);

@@ -564,6 +640,11 @@ private:
                 const Type *ptrType);

    void restoreMaskGivenReturns(llvm::Value *oldMask);
    void addSwitchMaskCheck(llvm::Value *mask);
    bool inSwitchStatement() const;
    llvm::Value *getMaskAtSwitchEntry();

    CFInfo *popCFState();

    void scatter(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
                 llvm::Value *mask);
1
decl.cpp
1
decl.cpp
@@ -266,6 +266,7 @@ Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
        funArgs->push_back(sym);
    }

    if (funSym != NULL)
        funSym->type = funSym->type->ResolveUnboundVariability(Type::Varying);

    return funSym;

@@ -1,3 +1,61 @@
=== v1.1.4 === (4 February 2012)

There are two major bugfixes for Windows in this release. First, a number
of failures in AVX code generation on Windows have been fixed; AVX on
Windows now has no known issues. Second, a longstanding bug in parsing 64-bit
integer constants on Windows has been fixed.

This release features a new experimental scalar target, contributed by Gabe
Weisz <gweisz@cs.cmu.edu>. This target ("--target=generic-1") compiles
gangs of single program instances (i.e. programCount == 1); it can be
useful for debugging ispc programs.

The compiler now supports dynamic memory allocation in ispc programs (with
"new" and "delete" operators based on C++). See
http://ispc.github.com/ispc.html#dynamic-memory-allocation in the
documentation for more information.
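
For example, the following sketch (the element type and count here are
arbitrary illustrative choices, not taken from the release notes) has each
program instance allocate, use, and then free its own array:

    int count = 16;
    // each program instance gets its own "count"-element array
    int *ptr = new uniform int[count];
    // ... use ptr ...
    delete[] ptr;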

ispc now performs "short circuit" evaluation of the || and && logical
operators and the ? : selection operator. (This represents the correction
of a major incompatibility with C.) Code like "(index < arraySize &&
array[index] == 1)" thus now executes as in C, where "array[index]" won't
be evaluated unless "index" is less than "arraySize".

The standard library now provides "local" atomic operations, which are
atomic across the gang of program instances (but not across other gangs or
other hardware threads). See the updated documentation on atomics for more
information:
http://ispc.github.com/ispc.html#atomic-operations-and-memory-fences.
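
As a rough illustration (a sketch assuming the atomic_add_local() variant
described at the link above; the counter here is purely illustrative):

    uniform int32 counter = 0;
    // each executing program instance adds 1; the update is atomic with
    // respect to the other program instances in the gang, but not with
    // respect to other gangs or hardware threads
    atomic_add_local(&counter, 1);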

The standard library now offers a clock() function, which returns a uniform
int64 value that counts processor cycles; it can be used for
fine-resolution timing measurements.
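
For instance (a minimal sketch; compute() stands in for an arbitrary
workload):

    uniform int64 start = clock();
    compute();
    // clock() counts processor cycles, so the difference gives the
    // cycle count for compute()
    print("compute() took % cycles\n", clock() - start);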

Finally (of limited interest now): ispc now supports the forthcoming AVX2
instruction set, due with Haswell-generation CPUs. All tests and examples
compile and execute correctly with AVX2. (Thanks specifically to Craig
Topper and Nadav Rotem for work on AVX2 support in LLVM, which made this
possible.)

=== v1.1.3 === (20 January 2012)

With this release, the language now supports "switch" statements, with the
same semantics and syntax as in C.

This release includes fixes for two important performance related issues:
the quality of code generated for "foreach" statements has been
substantially improved (https://github.com/ispc/ispc/issues/151), and a
performance regression with code for "gathers" that was introduced in
v1.1.2 has been fixed in this release.

A number of other small bugs were fixed in this release as well, including
one where invalid memory would sometimes be incorrectly accessed
(https://github.com/ispc/ispc/issues/160).

Thanks to Jean-Luc Duprat for a number of patches that improve support for
building on various platforms, and to Pierre-Antoine Lacaze for patches so
that ispc builds under MinGW.

=== v1.1.2 === (9 January 2012)

The major new feature in this release is support for "generic" C++

@@ -1,10 +1,10 @@
=============================================================
Intel® SPMD Program Compiler Frequently Asked Questions (FAQ)
=============================================================
=====================================
Frequently Asked Questions About ispc
=====================================

This document includes a number of frequently (and not frequently) asked
questions about ispc, the Intel® SPMD Program Compiler. The source to this
document is in the file ``docs/faq.txt`` in the ``ispc`` source
document is in the file ``docs/faq.rst`` in the ``ispc`` source
distribution.

* Understanding ispc's Output
350
docs/ispc.rst
350
docs/ispc.rst
@@ -96,9 +96,13 @@ Contents:
|
||||
|
||||
+ `Declarations and Initializers`_
|
||||
+ `Expressions`_
|
||||
|
||||
* `Dynamic Memory Allocation`_
|
||||
|
||||
+ `Control Flow`_
|
||||
|
||||
* `Conditional Statements: "if"`_
|
||||
* `Conditional Statements: "switch"`_
|
||||
* `Basic Iteration Statements: "for", "while", and "do"`_
|
||||
* `Unstructured Control Flow: "goto"`_
|
||||
* `"Coherent" Control Flow Statements: "cif" and Friends`_
|
||||
@@ -1141,12 +1145,13 @@ in C:
|
||||
|
||||
* Expression syntax and basic types
|
||||
* Syntax for variable declarations
|
||||
* Control flow structures: if, for, while, do
|
||||
* Control flow structures: ``if``, ``for``, ``while``, ``do``, and ``switch``.
|
||||
* Pointers, including function pointers, ``void *``, and C's array/pointer
|
||||
duality (arrays are converted to pointers when passed to functions, etc.)
|
||||
* Structs and arrays
|
||||
* Support for recursive function calls
|
||||
* Support for separate compilation of source files
|
||||
* "Short-circuit" evaluation of ``||``, ``&&`` and ``? :`` operators
|
||||
* The preprocessor
|
||||
|
||||
``ispc`` adds a number of features from C++ and C99 to this base:
|
||||
@@ -1161,6 +1166,7 @@ in C:
|
||||
* The ``inline`` qualifier to indicate that a function should be inlined
|
||||
* Function overloading by parameter type
|
||||
* Hexadecimal floating-point constants
|
||||
* Dynamic memory allocation with ``new`` and ``delete``.
|
||||
|
||||
``ispc`` also adds a number of new features that aren't in C89, C99, or
|
||||
C++:
|
||||
@@ -1179,13 +1185,11 @@ C++:
|
||||
There are a number of features of C89 that are not supported in ``ispc``
|
||||
but are likely to be supported in future releases:
|
||||
|
||||
* Short circuiting of logical operations
|
||||
* There are no types named ``char``, ``short``, or ``long`` (or ``long
|
||||
double``). However, there are built-in ``int8``, ``int16``, and
|
||||
``int64`` types
|
||||
* Character constants
|
||||
* String constants and arrays of characters as strings
|
||||
* ``switch`` statements
|
||||
* ``goto`` statements are partially supported (see `Unstructured Control Flow: "goto"`_)
|
||||
* ``union`` types
|
||||
* Bitfield members of ``struct`` types
|
||||
@@ -1965,19 +1969,137 @@ operator also work as expected.
|
||||
(*fp).a = 0;
|
||||
fp->b = 1;
|
||||
|
||||
As in C and C++, evaluation of the ``||`` and ``&&`` logical operators as
|
||||
well as the selection operator ``? :`` is "short-circuited"; the right hand
|
||||
side won't be evaluated if the value from the left-hand side determines the
|
||||
logical operator's value. For example, in the following code,
|
||||
``array[index]`` won't be evaluated for values of ``index`` that are
|
||||
greater than or equal to ``NUM_ITEMS``.
|
||||
|
||||
::
|
||||
|
||||
if (index < NUM_ITEMS && array[index] > 0) {
|
||||
// ...
|
||||
}
|
||||
|
||||
|
||||
Dynamic Memory Allocation
|
||||
-------------------------
|
||||
|
||||
``ispc`` programs can dynamically allocate (and free) memory, using syntax
|
||||
based on C++'s ``new`` and ``delete`` operators:
|
||||
|
||||
::
|
||||
|
||||
int count = ...;
|
||||
int *ptr = new uniform int[count];
|
||||
// use ptr...
|
||||
delete[] ptr;
|
||||
|
||||
In the above code, each program instance allocates its own ``count`-sized
|
||||
array of ``uniform int`` values, uses that memory, and then deallocates
|
||||
that memory. Uses of ``new`` and ``delete`` in ``ispc`` programs are
|
||||
serviced by corresponding calls the system C library's ``malloc()`` and
|
||||
``free()`` functions.
|
||||
|
||||
After a pointer has been deleted, it is illegal to access the memory it
|
||||
points to. However, note that deletion happens on a per-program-instance
|
||||
basis. In other words, consider the following code:
|
||||
|
||||
::
|
||||
|
||||
int *ptr = new uniform int[count];
|
||||
// use ptr
|
||||
if (count > 1000)
|
||||
delete[] ptr;
|
||||
// ...
|
||||
|
||||
Here, the program instances where ``count`` is greater than 1000 have
|
||||
deleted the dynamically allocated memory pointed to by ``ptr``, but the
|
||||
other program instances have not. As such, it's illegal for the former set
|
||||
of program instances to access ``*ptr``, but it's perfectly fine for the
|
||||
latter set to continue to use the memory ``ptr`` points to. Note that it
|
||||
is illegal to delete a pointer value returned by ``new`` more than one
|
||||
time.
|
||||
|
||||
Sometimes, it's useful to be able to do a single allocation for the entire
|
||||
gang of program instances. A ``new`` statement can be qualified with
|
||||
``uniform`` to indicate a single memory allocation:
|
||||
|
||||
::
|
||||
|
||||
float * uniform ptr = uniform new float[10];
|
||||
|
||||
While a regular call to ``new`` returns a ``varying`` pointer (i.e. a
|
||||
distinct pointer to separately-allocated memory for each program instance),
|
||||
a ``uniform new`` performs a single allocation and returns a ``uniform``
|
||||
pointer.
|

When using ``uniform new``, it's important to be aware of a subtlety: if
the returned pointer is stored in a varying pointer variable (as may be
appropriate and useful for the particular program being written), then the
varying pointer may inadvertently be passed to a subsequent ``delete``
statement, which is an error:

::

    float *ptr = uniform new float[10];
    // use ptr...
    delete ptr;   // ERROR: varying pointer is deleted

In this case, ``ptr`` will be deleted multiple times, once for each
executing program instance, which is an error (unless it happens that only
a single program instance is active in the above code.)
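
One way to avoid this hazard (a sketch, assuming the program can keep the
pointer ``uniform`` for its whole lifetime) is to store the result of
``uniform new`` in a ``uniform`` pointer and delete through that, so the
deletion happens just once for the whole gang:

::

    float * uniform ptr = uniform new float[10];
    // use ptr...
    delete ptr;  // single deletion through the uniform pointer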

When using ``new`` statements, it's important to make an appropriate choice
of ``uniform`` or ``varying`` (as always, the default), for both the
``new`` operator itself as well as the type of data being allocated, based
on the program's needs. Consider the following four memory allocations:

::

    uniform float * uniform p1 = uniform new uniform float[10];
    float * uniform p2 = uniform new float[10];
    uniform float * p3 = new uniform float[10];
    float * p4 = new float[10];

Assuming that a ``float`` is 4 bytes in memory and that the gang size is 8
program instances, the first allocation represents a single allocation
of 40 bytes, the second is a single allocation of 8*4*10 = 320 bytes, the
third is 8 allocations of 40 bytes, and the last performs 8 allocations of
8*4*10 = 320 bytes each.

Note in particular that varying allocations of varying data types, as in
the last case, are rarely desirable in practice: each program instance
performs a separate allocation of ``varying float`` memory, yet it's likely
that each program instance will only ever access a single element of each
``varying float``, which is wasteful.

Although ``ispc`` doesn't support constructors or destructors like C++, it
is possible to provide initializer values with ``new`` statements:

::

    struct Point { float x, y, z; };
    Point *pptr = new Point(10, 20, 30);

Here for example, the "x" element of the returned ``Point`` is initialized
to have the value 10 and so forth. In general, initializer values provided
in ``new`` statements are used to initialize complex data types following
the same rules as initializers for variables, described in
`Declarations and Initializers`_.

Control Flow
------------

``ispc`` supports most of C's control flow constructs, including ``if``,
``for``, ``while``, ``do``. It also supports variants of C's control flow
``switch``, ``for``, ``while``, ``do``. It has limited support for
``goto``, detailed below. It also supports variants of C's control flow
constructs that provide hints about the expected runtime coherence of the
control flow at that statement. It also provides parallel looping
constructs, ``foreach`` and ``foreach_tiled``, all of which will be
detailed in this section.

``ispc`` does not currently support ``switch`` statements or ``goto``.

Conditional Statements: "if"
----------------------------

@@ -1994,6 +2116,31 @@ executes if the condition is false.

    else
        x *= 2.;

Conditional Statements: "switch"
--------------------------------

The ``switch`` conditional statement is also available, again with the same
behavior as in C; the expression used in the ``switch`` must be of integer
type (but it can be uniform or varying). As in C, if there is no ``break``
statement at the end of the code for a given case, execution "falls
through" to the following case. These features are demonstrated in the
code below.

::

    int x = ...;
    switch (x) {
    case 0:
    case 1:
        foo(x);
        /* fall through */
    case 5:
        x = 0;
        break;
    default:
        x *= x;
    }
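
As an illustrative sketch of the ``uniform`` case (``processLinear()`` and
``processGeneric()`` are hypothetical helpers): when the ``switch``
expression is ``uniform``, all program instances take the same case, so
the statement can compile to ordinary unmasked control flow:

::

    uniform int mode = ...;
    switch (mode) {
    case 0:
        processLinear();
        break;
    default:
        processGeneric();
    }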

Basic Iteration Statements: "for", "while", and "do"
----------------------------------------------------

@@ -3242,24 +3389,53 @@ Systems Programming Support

Atomic Operations and Memory Fences
-----------------------------------

The usual range of atomic memory operations are provided in ``ispc``,
including variants to handle both uniform and varying types. As a first
example, consider one variant of the 32-bit integer atomic add routine:
The standard range of atomic memory operations are provided by the
``ispc`` standard library, including variants to handle both uniform and
varying types as well as "local" and "global" atomics.

Local atomics provide atomic behavior across the program instances in a
gang, but not across multiple gangs or memory operations in different
hardware threads. To see why they are needed, consider a histogram
calculation where each program instance in the gang computes which bucket a
value lies in and then increments a corresponding counter. If the code is
written like this:

::

    int32 atomic_add_global(uniform int32 * uniform ptr, int32 delta)
    uniform int count[N_BUCKETS] = ...;
    float value = ...;
    int bucket = clamp(value / N_BUCKETS, 0, N_BUCKETS - 1);
    ++count[bucket];  // ERROR: undefined behavior if collisions

The semantics are the expected ones for an atomic add function: the pointer
points to a single location in memory (the same one for all program
instances), and for each executing program instance, the value stored in
the location that ``ptr`` points to has that program instance's value
"delta" added to it atomically, and the old value at that location is
returned from the function. (Thus, if multiple processors simultaneously
issue atomic adds to the same memory location, the adds will be serialized
by the hardware so that the correct result is computed in the end.
Furthermore, the atomic adds are serialized across the running program
instances.)
then the program's behavior is undefined: whenever multiple program
instances have values that map to the same value of ``bucket``, the
effect of the increment is undefined. (See the discussion in the `Data
Races Within a Gang`_ section; in the case here, there isn't a sequence
point between one program instance updating ``count[bucket]`` and the other
program instance reading its value.)

The ``atomic_add_local()`` function can be used in this case; as a local
atomic it is atomic across the gang of program instances, such that the
expected result is computed.

::

    ...
    int bucket = clamp(value / N_BUCKETS, 0, N_BUCKETS - 1);
    atomic_add_local(&count[bucket], 1);

It uses this variant of the 32-bit integer atomic add routine:

::

    int32 atomic_add_local(uniform int32 * uniform ptr, int32 delta)

The semantics of this routine are typical for an atomic add function: the
pointer here points to a single location in memory (the same one for all
program instances), and for each executing program instance, the value
stored in the location that ``ptr`` points to has that program instance's
value "delta" added to it atomically, and the old value at that location is
returned from the function.
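
As a small sketch of these semantics (not from the manual): if the target
location starts at zero and each of four active program instances adds
one, the per-instance return values are some permutation of 0, 1, 2, and
3, and the location holds 4 afterward:

::

    uniform int32 total = 0;
    int32 old = atomic_add_local(&total, 1);
    // with four active instances, old is a permutation of {0,1,2,3}
    // across the gang, and total == 4 afterward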

One thing to note is that the type of the value being added to is a
``uniform`` integer, while the increment amount and the return value are
@@ -3270,44 +3446,75 @@ atomics for the running program instances may be issued in arbitrary order;
it's not guaranteed that they will be issued in ``programIndex`` order, for
example.

Here are the declarations of the ``int32`` variants of these functions.
There are also ``int64`` equivalents as well as variants that take
``unsigned`` ``int32`` and ``int64`` values. (The ``atomic_swap_global()``
function can be used with ``float`` and ``double`` types as well.)
Global atomics are more powerful than local atomics; they are atomic across
both the program instances in the gang as well as across different
gangs and different hardware threads. For example, for the global variant
of the atomic used above,

::

    int32 atomic_add_global(uniform int32 * uniform ptr, int32 value)
    int32 atomic_subtract_global(uniform int32 * uniform ptr, int32 value)
    int32 atomic_min_global(uniform int32 * uniform ptr, int32 value)
    int32 atomic_max_global(uniform int32 * uniform ptr, int32 value)
    int32 atomic_and_global(uniform int32 * uniform ptr, int32 value)
    int32 atomic_or_global(uniform int32 * uniform ptr, int32 value)
    int32 atomic_xor_global(uniform int32 * uniform ptr, int32 value)
    int32 atomic_swap_global(uniform int32 * uniform ptr, int32 value)
    int32 atomic_add_global(uniform int32 * uniform ptr, int32 delta)

There are also variants of these functions that take ``uniform`` values for
the operand and return a ``uniform`` result. These correspond to a single
if multiple processors simultaneously issue atomic adds to the same memory
location, the adds will be serialized by the hardware so that the correct
result is computed in the end.

Here are the declarations of the ``int32`` variants of these functions.
There are also ``int64`` equivalents as well as variants that take
``unsigned`` ``int32`` and ``int64`` values.

::

    int32 atomic_add_{local,global}(uniform int32 * uniform ptr, int32 value)
    int32 atomic_subtract_{local,global}(uniform int32 * uniform ptr, int32 value)
    int32 atomic_min_{local,global}(uniform int32 * uniform ptr, int32 value)
    int32 atomic_max_{local,global}(uniform int32 * uniform ptr, int32 value)
    int32 atomic_and_{local,global}(uniform int32 * uniform ptr, int32 value)
    int32 atomic_or_{local,global}(uniform int32 * uniform ptr, int32 value)
    int32 atomic_xor_{local,global}(uniform int32 * uniform ptr, int32 value)
    int32 atomic_swap_{local,global}(uniform int32 * uniform ptr, int32 value)

Support for ``float`` and ``double`` types is also available. For local
atomics, all but the logical operations are available. (There are
corresponding ``double`` variants of these, not listed here.)

::

    float atomic_add_local(uniform float * uniform ptr, float value)
    float atomic_subtract_local(uniform float * uniform ptr, float value)
    float atomic_min_local(uniform float * uniform ptr, float value)
    float atomic_max_local(uniform float * uniform ptr, float value)
    float atomic_swap_local(uniform float * uniform ptr, float value)

For global atomics, only atomic swap is available for these types:

::

    float atomic_swap_global(uniform float * uniform ptr, float value)
    double atomic_swap_global(uniform double * uniform ptr, double value)

There are also variants of the atomic that take ``uniform`` values for the
operand and return a ``uniform`` result. These correspond to a single
atomic operation being performed for the entire gang of program instances,
rather than one per program instance.

::

    uniform int32 atomic_add_global(uniform int32 * uniform ptr,
    uniform int32 atomic_add_{local,global}(uniform int32 * uniform ptr,
                                            uniform int32 value)
    uniform int32 atomic_subtract_global(uniform int32 * uniform ptr,
    uniform int32 atomic_subtract_{local,global}(uniform int32 * uniform ptr,
                                                 uniform int32 value)
    uniform int32 atomic_min_global(uniform int32 * uniform ptr,
    uniform int32 atomic_min_{local,global}(uniform int32 * uniform ptr,
                                            uniform int32 value)
    uniform int32 atomic_max_global(uniform int32 * uniform ptr,
    uniform int32 atomic_max_{local,global}(uniform int32 * uniform ptr,
                                            uniform int32 value)
    uniform int32 atomic_and_global(uniform int32 * uniform ptr,
    uniform int32 atomic_and_{local,global}(uniform int32 * uniform ptr,
                                            uniform int32 value)
    uniform int32 atomic_or_global(uniform int32 * uniform ptr,
    uniform int32 atomic_or_{local,global}(uniform int32 * uniform ptr,
                                           uniform int32 value)
    uniform int32 atomic_xor_global(uniform int32 * uniform ptr,
    uniform int32 atomic_xor_{local,global}(uniform int32 * uniform ptr,
                                            uniform int32 value)
    uniform int32 atomic_swap_global(uniform int32 * uniform ptr,
    uniform int32 atomic_swap_{local,global}(uniform int32 * uniform ptr,
                                             uniform int32 newval)

Be careful that you use the atomic function that you mean to; consider the
@@ -3332,8 +3539,7 @@ will cause the desired atomic add function to be called.

::

    extern uniform int32 counter;
    int32 one = 1;
    int32 myCounter = atomic_add_global(&counter, one);
    int32 myCounter = atomic_add_global(&counter, (varying int32)1);

There is a third variant of each of these atomic functions that takes a
``varying`` pointer; this allows each program instance to issue an atomic
@@ -3343,30 +3549,27 @@ the same location in memory!)

::

    int32 atomic_add_global(uniform int32 * varying ptr, int32 value)
    int32 atomic_subtract_global(uniform int32 * varying ptr, int32 value)
    int32 atomic_min_global(uniform int32 * varying ptr, int32 value)
    int32 atomic_max_global(uniform int32 * varying ptr, int32 value)
    int32 atomic_and_global(uniform int32 * varying ptr, int32 value)
    int32 atomic_or_global(uniform int32 * varying ptr, int32 value)
    int32 atomic_xor_global(uniform int32 * varying ptr, int32 value)
    int32 atomic_swap_global(uniform int32 * varying ptr, int32 value)
    int32 atomic_add_{local,global}(uniform int32 * varying ptr, int32 value)
    int32 atomic_subtract_{local,global}(uniform int32 * varying ptr, int32 value)
    int32 atomic_min_{local,global}(uniform int32 * varying ptr, int32 value)
    int32 atomic_max_{local,global}(uniform int32 * varying ptr, int32 value)
    int32 atomic_and_{local,global}(uniform int32 * varying ptr, int32 value)
    int32 atomic_or_{local,global}(uniform int32 * varying ptr, int32 value)
    int32 atomic_xor_{local,global}(uniform int32 * varying ptr, int32 value)
    int32 atomic_swap_{local,global}(uniform int32 * varying ptr, int32 value)
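
As an illustrative sketch (reusing the hypothetical histogram variables
from earlier in this section): indexing an array with a ``varying`` value
and taking the element's address yields exactly this kind of ``varying``
pointer, so each program instance atomically increments its own bucket:

::

    uniform int count[N_BUCKETS] = ...;
    int bucket = clamp(value / N_BUCKETS, 0, N_BUCKETS - 1);
    atomic_add_global(&count[bucket], 1);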

There are also atomic swap and "compare and exchange" functions.
Compare and exchange atomically compares the value in "val" to
"compare"--if they match, it assigns "newval" to "val". In either case,
the old value of "val" is returned. (As with the other atomic operations,
there are also ``unsigned`` and 64-bit variants of this function.
Furthermore, there are ``float`` and ``double`` variants as well.)
There are also atomic "compare and exchange" functions. Compare and
exchange atomically compares the value in "val" to "compare"--if they
match, it assigns "newval" to "val". In either case, the old value of
"val" is returned. (As with the other atomic operations, there are also
``unsigned`` and 64-bit variants of this function. Furthermore, there are
``float`` and ``double`` variants as well.)

::

    int32 atomic_swap_global(uniform int32 * uniform ptr, int32 newvalue)
    uniform int32 atomic_swap_global(uniform int32 * uniform ptr,
                                     uniform int32 newvalue)
    int32 atomic_compare_exchange_global(uniform int32 * uniform ptr,
    int32 atomic_compare_exchange_{local,global}(uniform int32 * uniform ptr,
                                                 int32 compare, int32 newval)
    uniform int32 atomic_compare_exchange_global(uniform int32 * uniform ptr,
    uniform int32 atomic_compare_exchange_{local,global}(uniform int32 * uniform ptr,
                                                         uniform int32 compare, uniform int32 newval)
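
As an illustrative sketch (assuming the ``float`` variant of
``atomic_compare_exchange_global()`` mentioned above), a compare-and-exchange
loop can synthesize atomics the library doesn't provide directly, such as
a global ``float`` minimum:

::

    uniform float atomic_min_float_global(uniform float * uniform ptr,
                                          uniform float value) {
        uniform float old = *ptr;
        while (value < old) {
            uniform float prev = atomic_compare_exchange_global(ptr, old, value);
            if (prev == old)
                break;    // our value was stored
            old = prev;   // another thread updated the location; retry
        }
        return old;
    }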

``ispc`` also has a standard library routine that inserts a memory barrier
@@ -3419,12 +3622,27 @@ pointer types.

System Information
------------------

A routine is available to find the number of CPU cores available in the
system:
The value of a high-precision hardware clock counter is returned by the
``clock()`` routine; its value increments by one each processor cycle.
Thus, taking the difference between the values returned by ``clock()`` at
different points in program execution gives the number of cycles between
those points in the program.

::

    int num_cores()
    uniform int64 clock()

Note that ``clock()`` flushes the processor pipeline. It has an overhead
of a hundred or so cycles, so for very fine-grained measurements, it may be
worthwhile to measure the cost of calling ``clock()`` and subtracting that
value from reported results.
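
A minimal sketch of the measurement idiom described above (``compute()``
stands in for a hypothetical workload being timed):

::

    uniform int64 start = clock();
    compute();
    uniform int64 cycles = clock() - start;  // includes clock()'s own overhead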

A routine is also available to find the number of CPU cores available in
the system:

::

    uniform int num_cores()

This value can be useful for adapting the granularity of parallel task
decomposition depending on the number of processors in the system.
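
For example, a sketch (assuming ``ispc``'s ``launch`` tasking construct
and a hypothetical ``doChunk()`` task) that scales the number of launched
tasks with the available cores:

::

    uniform int nTasks = 4 * num_cores();  // a few tasks per core for load balancing
    launch[nTasks] doChunk();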

@@ -45,8 +45,7 @@
      developers mailing list</a></li>
    <li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
    <li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
    <li><a href="doxygen/index.html">Doxygen documentation of
      <tt>ispc</tt> source code</a></li>
    <li><a href="doxygen/index.html">Doxygen</a></li>
   </ul>
  </div>
 </div>

@@ -45,8 +45,7 @@
      developers mailing list</a></li>
    <li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
    <li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
    <li><a href="doxygen/index.html">Doxygen documentation of
      <tt>ispc</tt> source code</a></li>
    <li><a href="doxygen/index.html">Doxygen</a></li>
   </ul>
  </div>
 </div>

@@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler"
# This could be handy for archiving the generated documentation or
# if some version control system is used.

PROJECT_NUMBER = 1.1.2
PROJECT_NUMBER = 1.1.4

# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
# base path where the generated documentation will be put.

@@ -82,7 +82,7 @@ static inline void vnormalize(vec &v) {
}


static inline void
static void
ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
    float d = -dot(plane.p, plane.n);
    float v = dot(ray.dir, plane.n);
@@ -124,7 +124,7 @@ ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
}


static inline void
static void
orthoBasis(vec basis[3], vec n) {
    basis[2] = n;
    basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
@@ -147,7 +147,7 @@ orthoBasis(vec basis[3], vec n) {
}


static inline float
static float
ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
                  RNGState &rngstate) {
    float eps = 0.0001f;
@@ -212,49 +212,12 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
    RNGState rngstate;

    seed_rng(&rngstate, y0);
    float invSamples = 1.f / nsubsamples;

    // Compute the mapping between the 'programCount'-wide program
    // instances running in parallel and samples in the image.
    //
    // For now, we'll always take four samples per pixel, so start by
    // initializing du and dv with offsets into subpixel samples.  We'll
    // take care of further updating du and dv for the case where we're
    // doing more than 4 program instances in parallel shortly.
    uniform float uSteps[4] = { 0, 1, 0, 1 };
    uniform float vSteps[4] = { 0, 0, 1, 1 };
    float du = uSteps[programIndex % 4] / nsubsamples;
    float dv = vSteps[programIndex % 4] / nsubsamples;
    foreach_tiled(y = y0 ... y1, x = 0 ... w,
                  u = 0 ... nsubsamples, v = 0 ... nsubsamples) {
        float du = (float)u * invSamples, dv = (float)v * invSamples;

    // Now handle the case where we are able to do more than one pixel's
    // worth of work at once.  nx records the number of pixels in the x
    // direction we do per iteration and ny the number in y.
    uniform int nx = 1, ny = 1;

    // FIXME: We actually need ny to be 1 regardless of the decomposition,
    // since the task decomposition is one scanline high.

    if (programCount == 8) {
        // Do two pixels at once in the x direction
        nx = 2;
        if (programIndex >= 4)
            // And shift the offsets for the second pixel's worth of work
            ++du;
    }
    else if (programCount == 16) {
        nx = 4;
        ny = 1;
        if (programIndex >= 4 && programIndex < 8)
            ++du;
        if (programIndex >= 8 && programIndex < 12)
            du += 2;
        if (programIndex >= 12)
            du += 3;
    }

    // Now loop over all of the pixels, stepping in x and y as calculated
    // above.  (Assumes that ny divides y and nx divides x...)
    for (uniform int y = y0; y < y1; y += ny) {
        for (uniform int x = 0; x < w; x += nx) {
            // Figure out x,y pixel in NDC
            float px = (x + du - (w / 2.0f)) / (w / 2.0f);
            float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
@@ -279,37 +242,14 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,

            // Note use of 'coherent' if statement; the set of rays we
            // trace will often all hit or all miss the scene
            cif (isect.hit)
            cif (isect.hit) {
                ret = ambient_occlusion(isect, plane, spheres, rngstate);
                ret *= invSamples * invSamples;

                // This is a little grungy; we have results for
                // programCount-worth of values.  Because we're doing 2x2
                // subsamples, we need to peel them off in groups of four,
                // average the four values for each pixel, and update the
                // output image.
                //
                // Store the varying value to a uniform array of the same size.
                // See the discussion about communication among program
                // instances in the ispc user's manual for more discussion on
                // this idiom.
                uniform float retArray[programCount];
                retArray[programIndex] = ret;

                // offset to the first pixel in the image
                uniform int offset = 3 * (y * w + x);
                for (uniform int p = 0; p < programCount; p += 4, offset += 3) {
                    // Get the four sample values for this pixel
                    uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
                                           retArray[p+3];

                    // Normalize by number of samples taken
                    sumret /= nsubsamples * nsubsamples;

                    // Store result in the image
                    image[offset+0] = sumret;
                    image[offset+1] = sumret;
                    image[offset+2] = sumret;
                }
                int offset = 3 * (y * w + x);
                atomic_add_local(&image[offset], ret);
                atomic_add_local(&image[offset+1], ret);
                atomic_add_local(&image[offset+2], ret);
            }
        }
    }

@@ -14,7 +14,7 @@ CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o) $(TASK_OBJ))

default: $(EXAMPLE)

all: $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16
all: $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 $(EXAMPLE)-scalar

.PHONY: dirs clean

@@ -57,3 +57,9 @@ objs/$(ISPC_SRC:.ispc=)_generic16.o: objs/$(ISPC_SRC:.ispc=)_generic16.cpp

$(EXAMPLE)-generic16: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_generic16.o
	$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)

objs/$(ISPC_SRC:.ispc=)_scalar.o: $(ISPC_SRC)
	$(ISPC) $< -o $@ --target=generic-1

$(EXAMPLE)-scalar: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_scalar.o
	$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)

@@ -158,38 +158,22 @@ IntersectLightsWithTileMinMax(
    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;

    // Parallelize across frustum planes.
    // We really only have four side planes here, but write the code to
    // handle programCount > 4 robustly
    uniform float frustumPlanes_xy[programCount];
    uniform float frustumPlanes_z[programCount];
    uniform float frustumPlanes_xy[4] = {
        -(cameraProj_11 * gBufferScale_x),
         (cameraProj_11 * gBufferScale_x),
         (cameraProj_22 * gBufferScale_y),
        -(cameraProj_22 * gBufferScale_y) };
    uniform float frustumPlanes_z[4] = {
         tileEndX - gBufferScale_x,
        -tileStartX + gBufferScale_x,
         tileEndY - gBufferScale_y,
        -tileStartY + gBufferScale_y };

    // TODO: If programIndex < 4 here?  Don't care about masking off the
    // rest but if interleaving ("x2" modes) the other lanes should ideally
    // not be emitted...
    {
        // This one is totally constant over the whole screen... worth pulling it up at all?
        float frustumPlanes_xy_v;
        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1,  (cameraProj_11 * gBufferScale_x));
        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 2,  (cameraProj_22 * gBufferScale_y));
        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 3, -(cameraProj_22 * gBufferScale_y));

        float frustumPlanes_z_v;
        frustumPlanes_z_v = insert(frustumPlanes_z_v, 0, tileEndX - gBufferScale_x);
        frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, -tileStartX + gBufferScale_x);
        frustumPlanes_z_v = insert(frustumPlanes_z_v, 2, tileEndY - gBufferScale_y);
        frustumPlanes_z_v = insert(frustumPlanes_z_v, 3, -tileStartY + gBufferScale_y);

        // Normalize
        float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v +
                           frustumPlanes_z_v * frustumPlanes_z_v);
        frustumPlanes_xy_v *= norm;
        frustumPlanes_z_v *= norm;

        // Save out for uniform use later
        frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
        frustumPlanes_z[programIndex] = frustumPlanes_z_v;
    for (uniform int i = 0; i < 4; ++i) {
        uniform float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
                                   frustumPlanes_z[i] * frustumPlanes_z[i]);
        frustumPlanes_xy[i] *= norm;
        frustumPlanes_z[i] *= norm;
    }

    uniform int32 tileNumLights = 0;
@@ -601,30 +585,20 @@ SplitTileMinMax(
    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;

    // Parallelize across frustum planes
    // Only have 2 frustum split planes here so may not be worth it, but
    // we'll do it for now for consistency
    uniform float frustumPlanes_xy[programCount];
    uniform float frustumPlanes_z[programCount];

    // This one is totally constant over the whole screen... worth pulling it up at all?
    float frustumPlanes_xy_v;
    frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
    frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1,  (cameraProj_22 * gBufferScale_y));

    float frustumPlanes_z_v;
    frustumPlanes_z_v = insert(frustumPlanes_z_v, 0, tileMidX - gBufferScale_x);
    frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, tileMidY - gBufferScale_y);
    uniform float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x),
                                           (cameraProj_22 * gBufferScale_y) };
    uniform float frustumPlanes_z[2] = { tileMidX - gBufferScale_x,
                                         tileMidY - gBufferScale_y };

    // Normalize
    float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v +
                       frustumPlanes_z_v * frustumPlanes_z_v);
    frustumPlanes_xy_v *= norm;
    frustumPlanes_z_v *= norm;

    // Save out for uniform use later
    frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
    frustumPlanes_z[programIndex] = frustumPlanes_z_v;
    uniform float norm[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] +
                                    frustumPlanes_z[0] * frustumPlanes_z[0]),
                              rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] +
                                    frustumPlanes_z[1] * frustumPlanes_z[1]) };
    frustumPlanes_xy[0] *= norm[0];
    frustumPlanes_xy[1] *= norm[1];
    frustumPlanes_z[0] *= norm[0];
    frustumPlanes_z[1] *= norm[1];

    // Initialize
    uniform int32 subtileLightOffset[4];

@@ -251,6 +251,14 @@ static FORCEINLINE TYPE __select(bool cond, TYPE a, TYPE b) { \
    return cond ? a : b; \
}
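
// Shift-by-uniform-scalar variants: unlike the BINARY_OP_CAST shifts above,
// which take a per-lane vector of shift counts, these apply a single int32
// shift amount to all 16 lanes.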
#define SHIFT_UNIFORM(TYPE, CAST, NAME, OP)         \
static FORCEINLINE TYPE NAME(TYPE a, int32_t b) {   \
    TYPE ret;                                       \
    for (int i = 0; i < 16; ++i)                    \
        ret.v[i] = (CAST)(a.v[i]) OP b;             \
    return ret;                                     \
}

#define SMEAR(VTYPE, NAME, STYPE)                   \
static FORCEINLINE VTYPE __smear_##NAME(STYPE v) {  \
    VTYPE ret;                                      \
@@ -386,6 +394,10 @@ BINARY_OP_CAST(__vec16_i8, int8_t, __srem, %)
BINARY_OP_CAST(__vec16_i8, uint8_t, __lshr, >>)
BINARY_OP_CAST(__vec16_i8, int8_t, __ashr, >>)

SHIFT_UNIFORM(__vec16_i8, uint8_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i8, int8_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i8, int8_t, __shl, <<)

CMP_OP(__vec16_i8, int8_t, __equal, ==)
CMP_OP(__vec16_i8, int8_t, __not_equal, !=)
CMP_OP(__vec16_i8, uint8_t, __unsigned_less_equal, <=)
@@ -425,6 +437,10 @@ BINARY_OP_CAST(__vec16_i16, int16_t, __srem, %)
BINARY_OP_CAST(__vec16_i16, uint16_t, __lshr, >>)
BINARY_OP_CAST(__vec16_i16, int16_t, __ashr, >>)

SHIFT_UNIFORM(__vec16_i16, uint16_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i16, int16_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i16, int16_t, __shl, <<)

CMP_OP(__vec16_i16, int16_t, __equal, ==)
CMP_OP(__vec16_i16, int16_t, __not_equal, !=)
CMP_OP(__vec16_i16, uint16_t, __unsigned_less_equal, <=)
@@ -464,6 +480,10 @@ BINARY_OP_CAST(__vec16_i32, int32_t, __srem, %)
BINARY_OP_CAST(__vec16_i32, uint32_t, __lshr, >>)
BINARY_OP_CAST(__vec16_i32, int32_t, __ashr, >>)

SHIFT_UNIFORM(__vec16_i32, uint32_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i32, int32_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i32, int32_t, __shl, <<)

CMP_OP(__vec16_i32, int32_t, __equal, ==)
CMP_OP(__vec16_i32, int32_t, __not_equal, !=)
CMP_OP(__vec16_i32, uint32_t, __unsigned_less_equal, <=)
@@ -503,6 +523,10 @@ BINARY_OP_CAST(__vec16_i64, int64_t, __srem, %)
BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>)
BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>)

SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<)

CMP_OP(__vec16_i64, int64_t, __equal, ==)
CMP_OP(__vec16_i64, int64_t, __not_equal, !=)
CMP_OP(__vec16_i64, uint64_t, __unsigned_less_equal, <=)
@@ -938,7 +962,7 @@ REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >)
///////////////////////////////////////////////////////////////////////////
// masked load/store

static FORCEINLINE __vec16_i8 __masked_load_8(unsigned char *p,
static FORCEINLINE __vec16_i8 __masked_load_8(void *p,
                                              __vec16_i1 mask) {
    __vec16_i8 ret;
    int8_t *ptr = (int8_t *)p;
@@ -948,7 +972,7 @@ static FORCEINLINE __vec16_i8 __masked_load_8(unsigned char *p,
    return ret;
}

static FORCEINLINE __vec16_i16 __masked_load_16(unsigned char *p,
static FORCEINLINE __vec16_i16 __masked_load_16(void *p,
                                                __vec16_i1 mask) {
    __vec16_i16 ret;
    int16_t *ptr = (int16_t *)p;
@@ -958,7 +982,7 @@ static FORCEINLINE __vec16_i16 __masked_load_16(unsigned char *p,
    return ret;
}

static FORCEINLINE __vec16_i32 __masked_load_32(unsigned char *p,
static FORCEINLINE __vec16_i32 __masked_load_32(void *p,
                                                __vec16_i1 mask) {
    __vec16_i32 ret;
    int32_t *ptr = (int32_t *)p;
@@ -968,7 +992,7 @@ static FORCEINLINE __vec16_i32 __masked_load_32(unsigned char *p,
    return ret;
}

static FORCEINLINE __vec16_i64 __masked_load_64(unsigned char *p,
static FORCEINLINE __vec16_i64 __masked_load_64(void *p,
                                                __vec16_i1 mask) {
    __vec16_i64 ret;
    int64_t *ptr = (int64_t *)p;
@@ -978,7 +1002,7 @@ static FORCEINLINE __vec16_i64 __masked_load_64(unsigned char *p,
    return ret;
}

static FORCEINLINE void __masked_store_8(unsigned char *p, __vec16_i8 val,
static FORCEINLINE void __masked_store_8(void *p, __vec16_i8 val,
                                         __vec16_i1 mask) {
    int8_t *ptr = (int8_t *)p;
    for (int i = 0; i < 16; ++i)
@@ -986,7 +1010,7 @@ static FORCEINLINE void __masked_store_8(unsigned char *p, __vec16_i8 val,
            ptr[i] = val.v[i];
}

static FORCEINLINE void __masked_store_16(unsigned char *p, __vec16_i16 val,
static FORCEINLINE void __masked_store_16(void *p, __vec16_i16 val,
                                          __vec16_i1 mask) {
    int16_t *ptr = (int16_t *)p;
    for (int i = 0; i < 16; ++i)
@@ -994,7 +1018,7 @@ static FORCEINLINE void __masked_store_16(unsigned char *p, __vec16_i16 val,
            ptr[i] = val.v[i];
}

static FORCEINLINE void __masked_store_32(unsigned char *p, __vec16_i32 val,
static FORCEINLINE void __masked_store_32(void *p, __vec16_i32 val,
                                          __vec16_i1 mask) {
    int32_t *ptr = (int32_t *)p;
    for (int i = 0; i < 16; ++i)
@@ -1002,7 +1026,7 @@ static FORCEINLINE void __masked_store_32(unsigned char *p, __vec16_i32 val,
            ptr[i] = val.v[i];
}

static FORCEINLINE void __masked_store_64(unsigned char *p, __vec16_i64 val,
static FORCEINLINE void __masked_store_64(void *p, __vec16_i64 val,
                                          __vec16_i1 mask) {
    int64_t *ptr = (int64_t *)p;
    for (int i = 0; i < 16; ++i)
@@ -1010,19 +1034,41 @@ static FORCEINLINE void __masked_store_64(unsigned char *p, __vec16_i64 val,
            ptr[i] = val.v[i];
}
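
// The "blend" variants below have no separate blend-based implementation in
// this reference target; they simply forward to the masked stores above.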
static FORCEINLINE void __masked_store_blend_8(void *p, __vec16_i8 val,
                                               __vec16_i1 mask) {
    __masked_store_8(p, val, mask);
}

static FORCEINLINE void __masked_store_blend_16(void *p, __vec16_i16 val,
                                                __vec16_i1 mask) {
    __masked_store_16(p, val, mask);
}

static FORCEINLINE void __masked_store_blend_32(void *p, __vec16_i32 val,
                                                __vec16_i1 mask) {
    __masked_store_32(p, val, mask);
}

static FORCEINLINE void __masked_store_blend_64(void *p, __vec16_i64 val,
                                                __vec16_i1 mask) {
    __masked_store_64(p, val, mask);
}

///////////////////////////////////////////////////////////////////////////
// gather/scatter

// offsets * offsetScale is in bytes (for all of these)

#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)                     \
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE offsets, uint32_t scale,\
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset,       \
                              uint32_t scale, OTYPE constOffset,           \
                              __vec16_i1 mask) {                           \
    VTYPE ret;                                                             \
    int8_t *base = (int8_t *)b;                                            \
    for (int i = 0; i < 16; ++i)                                           \
        if ((mask.v & (1 << i)) != 0) {                                    \
            STYPE *ptr = (STYPE *)(base + scale * offsets.v[i]);           \
            STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] +     \
                                   constOffset.v[i]);                      \
            ret.v[i] = *ptr;                                               \
        }                                                                  \
    return ret;                                                            \
@@ -1061,12 +1107,14 @@ GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)
// scatter

#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)                    \
static FORCEINLINE void FUNC(unsigned char *b, OTYPE offsets, uint32_t scale,\
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset,        \
                             uint32_t scale, OTYPE constOffset,            \
                             VTYPE val, __vec16_i1 mask) {                 \
    int8_t *base = (int8_t *)b;                                            \
    for (int i = 0; i < 16; ++i)                                           \
        if ((mask.v & (1 << i)) != 0) {                                    \
            STYPE *ptr = (STYPE *)(base + scale * offsets.v[i]);           \
            STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] +     \
                                   constOffset.v[i]);                      \
            *ptr = val.v[i];                                               \
        }                                                                  \
}

@@ -51,8 +51,8 @@
#define FORCEINLINE __attribute__((always_inline)) inline
#endif

//CO#undef FORCEINLINE
//CO#define FORCEINLINE
#undef FORCEINLINE
#define FORCEINLINE

typedef float __vec1_f;
typedef double __vec1_d;
@@ -303,6 +303,13 @@ static FORCEINLINE __vec4_i8 __shl(__vec4_i8 a, __vec4_i8 b) {
                     _mm_extract_epi8(a.v, 3) << _mm_extract_epi8(b.v, 3));
}

static FORCEINLINE __vec4_i8 __shl(__vec4_i8 a, int32_t b) {
    return __vec4_i8(_mm_extract_epi8(a.v, 0) << b,
                     _mm_extract_epi8(a.v, 1) << b,
                     _mm_extract_epi8(a.v, 2) << b,
                     _mm_extract_epi8(a.v, 3) << b);
}

static FORCEINLINE __vec4_i8 __udiv(__vec4_i8 a, __vec4_i8 b) {
    return __vec4_i8((uint8_t)_mm_extract_epi8(a.v, 0) /
                     (uint8_t)_mm_extract_epi8(b.v, 0),
@@ -358,6 +365,13 @@ static FORCEINLINE __vec4_i8 __lshr(__vec4_i8 a, __vec4_i8 b) {
                     (uint8_t)_mm_extract_epi8(b.v, 3));
}

static FORCEINLINE __vec4_i8 __lshr(__vec4_i8 a, int32_t b) {
    return __vec4_i8((uint8_t)_mm_extract_epi8(a.v, 0) >> b,
                     (uint8_t)_mm_extract_epi8(a.v, 1) >> b,
                     (uint8_t)_mm_extract_epi8(a.v, 2) >> b,
                     (uint8_t)_mm_extract_epi8(a.v, 3) >> b);
}

static FORCEINLINE __vec4_i8 __ashr(__vec4_i8 a, __vec4_i8 b) {
    return __vec4_i8((int8_t)_mm_extract_epi8(a.v, 0) >>
                     (int8_t)_mm_extract_epi8(b.v, 0),
@@ -369,6 +383,13 @@ static FORCEINLINE __vec4_i8 __ashr(__vec4_i8 a, __vec4_i8 b) {
                     (int8_t)_mm_extract_epi8(b.v, 3));
}

static FORCEINLINE __vec4_i8 __ashr(__vec4_i8 a, int32_t b) {
    return __vec4_i8((int8_t)_mm_extract_epi8(a.v, 0) >> b,
                     (int8_t)_mm_extract_epi8(a.v, 1) >> b,
                     (int8_t)_mm_extract_epi8(a.v, 2) >> b,
                     (int8_t)_mm_extract_epi8(a.v, 3) >> b);
}

static FORCEINLINE __vec4_i1 __equal(__vec4_i8 a, __vec4_i8 b) {
    __m128i cmp = _mm_cmpeq_epi8(a.v, b.v);
    return __vec4_i1(_mm_extract_epi8(cmp, 0),
@@ -547,6 +568,10 @@ static FORCEINLINE __vec4_i16 __shl(__vec4_i16 a, __vec4_i16 b) {
                      _mm_extract_epi16(a.v, 3) << _mm_extract_epi16(b.v, 3));
}

static FORCEINLINE __vec4_i16 __shl(__vec4_i16 a, int32_t b) {
    return _mm_sll_epi16(a.v, _mm_set_epi32(0, 0, 0, b));
}

static FORCEINLINE __vec4_i16 __udiv(__vec4_i16 a, __vec4_i16 b) {
    return __vec4_i16((uint16_t)_mm_extract_epi16(a.v, 0) /
                      (uint16_t)_mm_extract_epi16(b.v, 0),
@@ -602,6 +627,10 @@ static FORCEINLINE __vec4_i16 __lshr(__vec4_i16 a, __vec4_i16 b) {
                      (uint16_t)_mm_extract_epi16(b.v, 3));
}

static FORCEINLINE __vec4_i16 __lshr(__vec4_i16 a, int32_t b) {
    return _mm_srl_epi16(a.v, _mm_set_epi32(0, 0, 0, b));
}

static FORCEINLINE __vec4_i16 __ashr(__vec4_i16 a, __vec4_i16 b) {
    return __vec4_i16((int16_t)_mm_extract_epi16(a.v, 0) >>
                      (int16_t)_mm_extract_epi16(b.v, 0),
@@ -613,6 +642,10 @@ static FORCEINLINE __vec4_i16 __ashr(__vec4_i16 a, __vec4_i16 b) {
                      (int16_t)_mm_extract_epi16(b.v, 3));
}

static FORCEINLINE __vec4_i16 __ashr(__vec4_i16 a, int32_t b) {
    return _mm_sra_epi16(a.v, _mm_set_epi32(0, 0, 0, b));
}

static FORCEINLINE __vec4_i1 __equal(__vec4_i16 a, __vec4_i16 b) {
    __m128i cmp = _mm_cmpeq_epi16(a.v, b.v);
    return __vec4_i1(_mm_extract_epi16(cmp, 0),
@@ -789,9 +822,6 @@ static FORCEINLINE __vec4_i32 __xor(__vec4_i32 a, __vec4_i32 b) {
}

static FORCEINLINE __vec4_i32 __shl(__vec4_i32 a, __vec4_i32 b) {
    // FIXME: if we can determine at compile time that b has the same value
    // across all elements, then we can use _mm_sll_epi32.

    /* fixme: llvm generates this code for shift left, which is presumably
       more efficient than doing each component individually as below.

@@ -813,57 +843,92 @@ _f___ii: ## @f___ii
        ret

    */
    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) << _mm_extract_epi32(b.v, 0),
                      (uint32_t)_mm_extract_epi32(a.v, 1) << _mm_extract_epi32(b.v, 1),
                      (uint32_t)_mm_extract_epi32(a.v, 2) << _mm_extract_epi32(b.v, 2),
                      (uint32_t)_mm_extract_epi32(a.v, 3) << _mm_extract_epi32(b.v, 3));
    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) <<
                      _mm_extract_epi32(b.v, 0),
                      (uint32_t)_mm_extract_epi32(a.v, 1) <<
                      _mm_extract_epi32(b.v, 1),
                      (uint32_t)_mm_extract_epi32(a.v, 2) <<
                      _mm_extract_epi32(b.v, 2),
                      (uint32_t)_mm_extract_epi32(a.v, 3) <<
                      _mm_extract_epi32(b.v, 3));
}

static FORCEINLINE __vec4_i32 __shl(__vec4_i32 a, int32_t b) {
    return _mm_sll_epi32(a.v, _mm_set_epi32(0, 0, 0, b));
}

static FORCEINLINE __vec4_i32 __udiv(__vec4_i32 a, __vec4_i32 b) {
    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) / (uint32_t)_mm_extract_epi32(b.v, 0),
                      (uint32_t)_mm_extract_epi32(a.v, 1) / (uint32_t)_mm_extract_epi32(b.v, 1),
                      (uint32_t)_mm_extract_epi32(a.v, 2) / (uint32_t)_mm_extract_epi32(b.v, 2),
                      (uint32_t)_mm_extract_epi32(a.v, 3) / (uint32_t)_mm_extract_epi32(b.v, 3));
    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) /
                      (uint32_t)_mm_extract_epi32(b.v, 0),
                      (uint32_t)_mm_extract_epi32(a.v, 1) /
                      (uint32_t)_mm_extract_epi32(b.v, 1),
                      (uint32_t)_mm_extract_epi32(a.v, 2) /
                      (uint32_t)_mm_extract_epi32(b.v, 2),
                      (uint32_t)_mm_extract_epi32(a.v, 3) /
                      (uint32_t)_mm_extract_epi32(b.v, 3));
}

static FORCEINLINE __vec4_i32 __sdiv(__vec4_i32 a, __vec4_i32 b) {
    return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) / (int32_t)_mm_extract_epi32(b.v, 0),
                      (int32_t)_mm_extract_epi32(a.v, 1) / (int32_t)_mm_extract_epi32(b.v, 1),
                      (int32_t)_mm_extract_epi32(a.v, 2) / (int32_t)_mm_extract_epi32(b.v, 2),
                      (int32_t)_mm_extract_epi32(a.v, 3) / (int32_t)_mm_extract_epi32(b.v, 3));
    return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) /
                      (int32_t)_mm_extract_epi32(b.v, 0),
                      (int32_t)_mm_extract_epi32(a.v, 1) /
                      (int32_t)_mm_extract_epi32(b.v, 1),
                      (int32_t)_mm_extract_epi32(a.v, 2) /
                      (int32_t)_mm_extract_epi32(b.v, 2),
                      (int32_t)_mm_extract_epi32(a.v, 3) /
                      (int32_t)_mm_extract_epi32(b.v, 3));
}

static FORCEINLINE __vec4_i32 __urem(__vec4_i32 a, __vec4_i32 b) {
    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) % (uint32_t)_mm_extract_epi32(b.v, 0),
                      (uint32_t)_mm_extract_epi32(a.v, 1) % (uint32_t)_mm_extract_epi32(b.v, 1),
                      (uint32_t)_mm_extract_epi32(a.v, 2) % (uint32_t)_mm_extract_epi32(b.v, 2),
                      (uint32_t)_mm_extract_epi32(a.v, 3) % (uint32_t)_mm_extract_epi32(b.v, 3));
    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) %
                      (uint32_t)_mm_extract_epi32(b.v, 0),
                      (uint32_t)_mm_extract_epi32(a.v, 1) %
                      (uint32_t)_mm_extract_epi32(b.v, 1),
                      (uint32_t)_mm_extract_epi32(a.v, 2) %
                      (uint32_t)_mm_extract_epi32(b.v, 2),
                      (uint32_t)_mm_extract_epi32(a.v, 3) %
                      (uint32_t)_mm_extract_epi32(b.v, 3));
}

static FORCEINLINE __vec4_i32 __srem(__vec4_i32 a, __vec4_i32 b) {
    return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) % (int32_t)_mm_extract_epi32(b.v, 0),
                      (int32_t)_mm_extract_epi32(a.v, 1) % (int32_t)_mm_extract_epi32(b.v, 1),
                      (int32_t)_mm_extract_epi32(a.v, 2) % (int32_t)_mm_extract_epi32(b.v, 2),
                      (int32_t)_mm_extract_epi32(a.v, 3) % (int32_t)_mm_extract_epi32(b.v, 3));
    return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) %
                      (int32_t)_mm_extract_epi32(b.v, 0),
                      (int32_t)_mm_extract_epi32(a.v, 1) %
                      (int32_t)_mm_extract_epi32(b.v, 1),
                      (int32_t)_mm_extract_epi32(a.v, 2) %
                      (int32_t)_mm_extract_epi32(b.v, 2),
                      (int32_t)_mm_extract_epi32(a.v, 3) %
                      (int32_t)_mm_extract_epi32(b.v, 3));
}

static FORCEINLINE __vec4_i32 __lshr(__vec4_i32 a, __vec4_i32 b) {
    // FIXME: if we can determine at compile time that b has the same value
    // across all elements, e.g. using gcc's __builtin_constant_p, then we
    // can use _mm_srl_epi32.
    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) >> _mm_extract_epi32(b.v, 0),
                      (uint32_t)_mm_extract_epi32(a.v, 1) >> _mm_extract_epi32(b.v, 1),
                      (uint32_t)_mm_extract_epi32(a.v, 2) >> _mm_extract_epi32(b.v, 2),
                      (uint32_t)_mm_extract_epi32(a.v, 3) >> _mm_extract_epi32(b.v, 3));
    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) >>
                      _mm_extract_epi32(b.v, 0),
                      (uint32_t)_mm_extract_epi32(a.v, 1) >>
                      _mm_extract_epi32(b.v, 1),
                      (uint32_t)_mm_extract_epi32(a.v, 2) >>
                      _mm_extract_epi32(b.v, 2),
                      (uint32_t)_mm_extract_epi32(a.v, 3) >>
                      _mm_extract_epi32(b.v, 3));
}

static FORCEINLINE __vec4_i32 __lshr(__vec4_i32 a, int32_t b) {
    return _mm_srl_epi32(a.v, _mm_set_epi32(0, 0, 0, b));
}

static FORCEINLINE __vec4_i32 __ashr(__vec4_i32 a, __vec4_i32 b) {
    // FIXME: if we can determine at compile time that b has the same value
    // across all elements, then we can use _mm_sra_epi32.
    return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) >> _mm_extract_epi32(b.v, 0),
                      (int32_t)_mm_extract_epi32(a.v, 1) >> _mm_extract_epi32(b.v, 1),
                      (int32_t)_mm_extract_epi32(a.v, 2) >> _mm_extract_epi32(b.v, 2),
                      (int32_t)_mm_extract_epi32(a.v, 3) >> _mm_extract_epi32(b.v, 3));
    return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) >>
                      _mm_extract_epi32(b.v, 0),
                      (int32_t)_mm_extract_epi32(a.v, 1) >>
                      _mm_extract_epi32(b.v, 1),
                      (int32_t)_mm_extract_epi32(a.v, 2) >>
                      _mm_extract_epi32(b.v, 2),
                      (int32_t)_mm_extract_epi32(a.v, 3) >>
                      _mm_extract_epi32(b.v, 3));
}

static FORCEINLINE __vec4_i32 __ashr(__vec4_i32 a, int32_t b) {
    return _mm_sra_epi32(a.v, _mm_set_epi32(0, 0, 0, b));
}

static FORCEINLINE __vec4_i1 __equal(__vec4_i32 a, __vec4_i32 b) {
@@ -876,10 +941,8 @@ static FORCEINLINE __vec4_i1 __not_equal(__vec4_i32 a, __vec4_i32 b) {
}

static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i32 a, __vec4_i32 b) {
    a.v = _mm_xor_si128(a.v, _mm_set1_epi32(0x80000000));
    b.v = _mm_xor_si128(b.v, _mm_set1_epi32(0x80000000));
    return _mm_or_si128(_mm_cmplt_epi32(a.v, b.v),
                        _mm_cmpeq_epi32(a.v, b.v));
    // a<=b == (min(a,b) == a)
    return _mm_cmpeq_epi32(_mm_min_epu32(a.v, b.v), a.v);
}

static FORCEINLINE __vec4_i1 __signed_less_equal(__vec4_i32 a, __vec4_i32 b) {
@@ -888,10 +951,8 @@ static FORCEINLINE __vec4_i1 __signed_less_equal(__vec4_i32 a, __vec4_i32 b) {
}

static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i32 a, __vec4_i32 b) {
    a.v = _mm_xor_si128(a.v, _mm_set1_epi32(0x80000000));
    b.v = _mm_xor_si128(b.v, _mm_set1_epi32(0x80000000));
    return _mm_or_si128(_mm_cmpgt_epi32(a.v, b.v),
                        _mm_cmpeq_epi32(a.v, b.v));
    // a>=b == (max(a,b) == a)
    return _mm_cmpeq_epi32(_mm_max_epu32(a.v, b.v), a.v);
}

static FORCEINLINE __vec4_i1 __signed_greater_equal(__vec4_i32 a, __vec4_i32 b) {
@@ -1016,6 +1077,12 @@ static FORCEINLINE __vec4_i64 __shl(__vec4_i64 a, __vec4_i64 b) {
                      _mm_extract_epi64(a.v[1], 1) << _mm_extract_epi64(b.v[1], 1));
}

static FORCEINLINE __vec4_i64 __shl(__vec4_i64 a, int32_t b) {
    __m128i amt = _mm_set_epi32(0, 0, 0, b);
    return __vec4_i64(_mm_sll_epi64(a.v[0], amt),
                      _mm_sll_epi64(a.v[1], amt));
}

static FORCEINLINE __vec4_i64 __udiv(__vec4_i64 a, __vec4_i64 b) {
    return __vec4_i64((uint64_t)_mm_extract_epi64(a.v[0], 0) /
                      (uint64_t)_mm_extract_epi64(b.v[0], 0),
@@ -1071,6 +1138,12 @@ static FORCEINLINE __vec4_i64 __lshr(__vec4_i64 a, __vec4_i64 b) {
                      (uint64_t)_mm_extract_epi64(b.v[1], 1));
}

static FORCEINLINE __vec4_i64 __lshr(__vec4_i64 a, int32_t b) {
    __m128i amt = _mm_set_epi32(0, 0, 0, b);
    return __vec4_i64(_mm_srl_epi64(a.v[0], amt),
                      _mm_srl_epi64(a.v[1], amt));
}

static FORCEINLINE __vec4_i64 __ashr(__vec4_i64 a, __vec4_i64 b) {
    return __vec4_i64((int64_t)_mm_extract_epi64(a.v[0], 0) >>
                      (int64_t)_mm_extract_epi64(b.v[0], 0),
@@ -1082,6 +1155,13 @@ static FORCEINLINE __vec4_i64 __ashr(__vec4_i64 a, __vec4_i64 b) {
                      (int64_t)_mm_extract_epi64(b.v[1], 1));
}

static FORCEINLINE __vec4_i64 __ashr(__vec4_i64 a, int32_t b) {
    return __vec4_i64((int64_t)_mm_extract_epi64(a.v[0], 0) >> b,
                      (int64_t)_mm_extract_epi64(a.v[0], 1) >> b,
                      (int64_t)_mm_extract_epi64(a.v[1], 0) >> b,
                      (int64_t)_mm_extract_epi64(a.v[1], 1) >> b);
}

static FORCEINLINE __vec4_i1 __equal(__vec4_i64 a, __vec4_i64 b) {
    __m128i cmp0 = _mm_cmpeq_epi64(a.v[0], b.v[0]);
    __m128i cmp1 = _mm_cmpeq_epi64(a.v[1], b.v[1]);
@@ -2328,7 +2408,7 @@ static FORCEINLINE uint64_t __reduce_max_uint64(__vec4_i64 v) {
///////////////////////////////////////////////////////////////////////////
// masked load/store

static FORCEINLINE __vec4_i8 __masked_load_8(unsigned char *p,
static FORCEINLINE __vec4_i8 __masked_load_8(void *p,
                                             __vec4_i1 mask) {
    int8_t r[4];
    int8_t *ptr = (int8_t *)p;
@@ -2348,7 +2428,7 @@ static FORCEINLINE __vec4_i8 __masked_load_8(unsigned char *p,
    return __vec4_i8(r[0], r[1], r[2], r[3]);
}

static FORCEINLINE __vec4_i16 __masked_load_16(unsigned char *p,
static FORCEINLINE __vec4_i16 __masked_load_16(void *p,
                                               __vec4_i1 mask) {
    int16_t r[4];
    int16_t *ptr = (int16_t *)p;
@@ -2372,7 +2452,7 @@ static FORCEINLINE __vec4_i16 __masked_load_16(unsigned char *p,
    return __vec4_i16(r[0], r[1], r[2], r[3]);
}

static FORCEINLINE __vec4_i32 __masked_load_32(unsigned char *p,
static FORCEINLINE __vec4_i32 __masked_load_32(void *p,
                                               __vec4_i1 mask) {
    __m128i r = _mm_set_epi32(0, 0, 0, 0);
    int32_t *ptr = (int32_t *)p;
@@ -2395,7 +2475,7 @@ static FORCEINLINE __vec4_i32 __masked_load_32(unsigned char *p,
    return r;
}

static FORCEINLINE __vec4_i64 __masked_load_64(unsigned char *p,
static FORCEINLINE __vec4_i64 __masked_load_64(void *p,
                                               __vec4_i1 mask) {
    uint64_t r[4];
    uint64_t *ptr = (uint64_t *)p;
@@ -2418,7 +2498,7 @@ static FORCEINLINE __vec4_i64 __masked_load_64(unsigned char *p,
    return __vec4_i64(r[0], r[1], r[2], r[3]);
}

static FORCEINLINE void __masked_store_8(unsigned char *p, __vec4_i8 val,
static FORCEINLINE void __masked_store_8(void *p, __vec4_i8 val,
                                         __vec4_i1 mask) {
    int8_t *ptr = (int8_t *)p;

@@ -2439,7 +2519,8 @@ static FORCEINLINE void __masked_store_8(unsigned char *p, __vec4_i8 val,
        ptr[3] = _mm_extract_epi8(val.v, 3);
}

static FORCEINLINE void __masked_store_16(unsigned char *p, __vec4_i16 val, __vec4_i1 mask) {
static FORCEINLINE void __masked_store_16(void *p, __vec4_i16 val,
                                          __vec4_i1 mask) {
    int16_t *ptr = (int16_t *)p;

    uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2459,7 +2540,7 @@ static FORCEINLINE void __masked_store_16(unsigned char *p, __vec4_i16 val, __ve
        ptr[3] = _mm_extract_epi16(val.v, 3);
}

static FORCEINLINE void __masked_store_32(unsigned char *p, __vec4_i32 val,
static FORCEINLINE void __masked_store_32(void *p, __vec4_i32 val,
                                          __vec4_i1 mask) {
    int32_t *ptr = (int32_t *)p;
    uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2479,7 +2560,7 @@ static FORCEINLINE void __masked_store_32(unsigned char *p, __vec4_i32 val,
        ptr[3] = _mm_extract_epi32(val.v, 3);
}

static FORCEINLINE void __masked_store_64(unsigned char *p, __vec4_i64 val,
static FORCEINLINE void __masked_store_64(void *p, __vec4_i64 val,
                                          __vec4_i1 mask) {
    int64_t *ptr = (int64_t *)p;
    uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2499,58 +2580,82 @@ static FORCEINLINE void __masked_store_64(unsigned char *p, __vec4_i64 val,
        ptr[3] = _mm_extract_epi64(val.v[1], 1);
}

static FORCEINLINE void __masked_store_blend_8(void *p, __vec4_i8 val,
                                               __vec4_i1 mask) {
    __masked_store_8(p, val, mask);
}

static FORCEINLINE void __masked_store_blend_16(void *p, __vec4_i16 val,
                                                __vec4_i1 mask) {
    __masked_store_16(p, val, mask);
}

static FORCEINLINE void __masked_store_blend_32(void *p, __vec4_i32 val,
                                                __vec4_i1 mask) {
    // FIXME: do a load, blendvps, store here...
    __masked_store_32(p, val, mask);
}

static FORCEINLINE void __masked_store_blend_64(void *p, __vec4_i64 val,
                                                __vec4_i1 mask) {
    // FIXME: do a 2x (load, blendvps, store) here...
    __masked_store_64(p, val, mask);
}

///////////////////////////////////////////////////////////////////////////
// gather/scatter
// offsets * offsetScale is in bytes (for all of these)

template<typename RetVec, typename RetScalar>
static FORCEINLINE RetVec
lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p,
                     __vec4_i32 offsets, uint32_t scale, __vec4_i1 mask) {
lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets,
                     uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
    RetScalar r[4];
#if 1
    // "Fast gather" trick...
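    // Masked-off lanes have their offsets forced to zero below, so every
    // lane does a (harmless) read relative to the base pointer rather than
    // branching on the mask; this assumes the base address is readable.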
    offsets = __select(mask, offsets, __smear_i32(0));
    int offset = scale * _mm_extract_epi32(offsets.v, 0);
    constOffset = __select(mask, constOffset, __smear_i32(0));

    int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0);
    RetScalar *ptr = (RetScalar *)(p + offset);
    r[0] = *ptr;

    offset = scale * _mm_extract_epi32(offsets.v, 1);
    offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1);
    ptr = (RetScalar *)(p + offset);
    r[1] = *ptr;

    offset = scale * _mm_extract_epi32(offsets.v, 2);
    offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2);
    ptr = (RetScalar *)(p + offset);
    r[2] = *ptr;

    offset = scale * _mm_extract_epi32(offsets.v, 3);
    offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3);
    ptr = (RetScalar *)(p + offset);
    r[3] = *ptr;
#else
    uint32_t m = _mm_extract_ps(mask.v, 0);
    if (m != 0) {
        int offset = scale * _mm_extract_epi32(offsets.v, 0);
        int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0);
        RetScalar *ptr = (RetScalar *)(p + offset);
        r[0] = *ptr;
    }

    m = _mm_extract_ps(mask.v, 1);
    if (m != 0) {
        int offset = scale * _mm_extract_epi32(offsets.v, 1);
        int offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1);
        RetScalar *ptr = (RetScalar *)(p + offset);
        r[1] = *ptr;
    }

    m = _mm_extract_ps(mask.v, 2);
    if (m != 0) {
        int offset = scale * _mm_extract_epi32(offsets.v, 2);
        int offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2);
        RetScalar *ptr = (RetScalar *)(p + offset);
        r[2] = *ptr;
    }

    m = _mm_extract_ps(mask.v, 3);
    if (m != 0) {
        int offset = scale * _mm_extract_epi32(offsets.v, 3);
        int offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3);
        RetScalar *ptr = (RetScalar *)(p + offset);
        r[3] = *ptr;
    }
@@ -2558,54 +2663,57 @@ lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p,
    return RetVec(r[0], r[1], r[2], r[3]);
}
||||
|
||||
|
||||
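The "fast gather" path above trades per-lane branching for unconditional loads: the __select calls force the offsets of masked-off lanes to zero, so every lane reads from a valid address (the base pointer) and inactive lanes just produce garbage that the caller discards via the mask. A scalar sketch of the idea, assuming reading base[0] is always safe (all names here are illustrative):

    #include <stdint.h>

    static void fast_gather_sketch(const int32_t *base, const int32_t offsets[4],
                                   const bool mask[4], int32_t out[4]) {
        for (int i = 0; i < 4; ++i) {
            // Redirect inactive lanes to element 0 instead of skipping them.
            int32_t off = mask[i] ? offsets[i] : 0;
            out[i] = base[off];   // masked-off results are garbage by design
        }
    }

The #else branch keeps the branchy per-lane version around for comparison.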
 template<typename RetVec, typename RetScalar>
 static FORCEINLINE RetVec
 lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
-                     uint32_t scale, __vec4_i1 mask) {
+                     uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
     RetScalar r[4];
 #if 1
     // "Fast gather" trick...
     offsets = __select(mask, offsets, __smear_i64(0));
-    int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
+    constOffset = __select(mask, constOffset, __smear_i64(0));
+
+    int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0);
     RetScalar *ptr = (RetScalar *)(p + offset);
     r[0] = *ptr;

-    offset = scale * _mm_extract_epi64(offsets.v[0], 1);
+    offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1);
     ptr = (RetScalar *)(p + offset);
     r[1] = *ptr;

-    offset = scale * _mm_extract_epi64(offsets.v[1], 0);
+    offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0);
     ptr = (RetScalar *)(p + offset);
     r[2] = *ptr;

-    offset = scale * _mm_extract_epi64(offsets.v[1], 1);
+    offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1);
     ptr = (RetScalar *)(p + offset);
     r[3] = *ptr;
 #else
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0);
         RetScalar *ptr = (RetScalar *)(p + offset);
         r[0] = *ptr;
     }

     m = _mm_extract_ps(mask.v, 1);
     if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1);
         RetScalar *ptr = (RetScalar *)(p + offset);
         r[1] = *ptr;
     }

     m = _mm_extract_ps(mask.v, 2);
     if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0);
         RetScalar *ptr = (RetScalar *)(p + offset);
         r[2] = *ptr;
     }

     m = _mm_extract_ps(mask.v, 3);
     if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1);
         RetScalar *ptr = (RetScalar *)(p + offset);
         r[3] = *ptr;
     }
@@ -2616,80 +2724,89 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,

 static FORCEINLINE __vec4_i8
 __gather_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets,
-                           uint32_t scale, __vec4_i1 mask) {
+                           uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, offsets, scale,
-                                mask);
+                                constOffset, mask);
 }

 static FORCEINLINE __vec4_i8
 __gather_base_offsets64_i8(unsigned char *b, __vec4_i64 offsets,
-                           uint32_t scale, __vec4_i1 mask) {
+                           uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, offsets, scale,
-                                mask);
+                                constOffset, mask);
 }

 static FORCEINLINE __vec4_i16
 __gather_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets,
-                            uint32_t scale, __vec4_i1 mask) {
+                            uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, offsets, scale,
-                                mask);
+                                constOffset, mask);
 }

 static FORCEINLINE __vec4_i16
 __gather_base_offsets64_i16(unsigned char *b, __vec4_i64 offsets,
-                            uint32_t scale, __vec4_i1 mask) {
+                            uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, offsets, scale,
-                                mask);
+                                constOffset, mask);
 }

 static FORCEINLINE __vec4_i32
-__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets,
-                            uint32_t scale, __vec4_i1 mask) {
+__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale,
+                            __vec4_i32 constOffset, __vec4_i1 mask) {
     __m128i r = _mm_set_epi32(0, 0, 0, 0);
 #if 1
     // "Fast gather"...
     offsets = __select(mask, offsets, __smear_i32(0));
+    constOffset = __select(mask, constOffset, __smear_i32(0));

-    int offset = scale * _mm_extract_epi32(offsets.v, 0);
+    int offset = scale * _mm_extract_epi32(offsets.v, 0) +
+        _mm_extract_epi32(constOffset.v, 0);
     uint32_t *ptr = (uint32_t *)(p + offset);
     r = _mm_insert_epi32(r, *ptr, 0);

-    offset = scale * _mm_extract_epi32(offsets.v, 1);
+    offset = scale * _mm_extract_epi32(offsets.v, 1) +
+        _mm_extract_epi32(constOffset.v, 1);
     ptr = (uint32_t *)(p + offset);
     r = _mm_insert_epi32(r, *ptr, 1);

-    offset = scale * _mm_extract_epi32(offsets.v, 2);
+    offset = scale * _mm_extract_epi32(offsets.v, 2) +
+        _mm_extract_epi32(constOffset.v, 2);
     ptr = (uint32_t *)(p + offset);
     r = _mm_insert_epi32(r, *ptr, 2);

-    offset = scale * _mm_extract_epi32(offsets.v, 3);
+    offset = scale * _mm_extract_epi32(offsets.v, 3) +
+        _mm_extract_epi32(constOffset.v, 3);
     ptr = (uint32_t *)(p + offset);
     r = _mm_insert_epi32(r, *ptr, 3);
 #else
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0) {
-        int offset = scale * _mm_extract_epi32(offsets.v, 0);
+        int offset = scale * _mm_extract_epi32(offsets.v, 0) +
+            _mm_extract_epi32(constOffset.v, 0);
         uint32_t *ptr = (uint32_t *)(p + offset);
         r = _mm_insert_epi32(r, *ptr, 0);
     }

     m = _mm_extract_ps(mask.v, 1);
     if (m != 0) {
-        int offset = scale * _mm_extract_epi32(offsets.v, 1);
+        int offset = scale * _mm_extract_epi32(offsets.v, 1) +
+            _mm_extract_epi32(constOffset.v, 1);
         uint32_t *ptr = (uint32_t *)(p + offset);
         r = _mm_insert_epi32(r, *ptr, 1);
     }

     m = _mm_extract_ps(mask.v, 2);
     if (m != 0) {
-        int offset = scale * _mm_extract_epi32(offsets.v, 2);
+        int offset = scale * _mm_extract_epi32(offsets.v, 2) +
+            _mm_extract_epi32(constOffset.v, 2);
         uint32_t *ptr = (uint32_t *)(p + offset);
         r = _mm_insert_epi32(r, *ptr, 2);
     }

     m = _mm_extract_ps(mask.v, 3);
     if (m != 0) {
-        int offset = scale * _mm_extract_epi32(offsets.v, 3);
+        int offset = scale * _mm_extract_epi32(offsets.v, 3) +
+            _mm_extract_epi32(constOffset.v, 3);
         uint32_t *ptr = (uint32_t *)(p + offset);
         r = _mm_insert_epi32(r, *ptr, 3);
     }
@@ -2699,23 +2816,23 @@ __gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets,

 static FORCEINLINE __vec4_i32
 __gather_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets,
-                            uint32_t scale, __vec4_i1 mask) {
+                            uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) {
     return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, offsets, scale,
-                                mask);
+                                delta, mask);
 }

 static FORCEINLINE __vec4_i64
 __gather_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
-                            uint32_t scale, __vec4_i1 mask) {
+                            uint32_t scale, __vec4_i32 delta, __vec4_i1 mask) {
     return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, offsets, scale,
-                                mask);
+                                delta, mask);
 }

 static FORCEINLINE __vec4_i64
 __gather_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
-                            uint32_t scale, __vec4_i1 mask) {
+                            uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) {
     return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, offsets, scale,
-                                mask);
+                                delta, mask);
 }

 template<typename RetVec, typename RetScalar>
@@ -2862,217 +2979,108 @@ static FORCEINLINE __vec4_i64 __gather64_i64(__vec4_i64 ptrs, __vec4_i1 mask) {

 // scatter

+#define SCATTER32_64(SUFFIX, TYPE, EXTRACT)                               \
+static FORCEINLINE void                                                   \
+__scatter_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets,  \
+                                   uint32_t scale, __vec4_i32 constOffset, \
+                                   __vec4_##SUFFIX val, __vec4_i1 mask) { \
+    uint32_t m = _mm_extract_ps(mask.v, 0);                               \
+    if (m != 0) {                                                         \
+        TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 0) + \
+                             _mm_extract_epi32(constOffset.v, 0));        \
+        *ptr = EXTRACT(val.v, 0);                                         \
+    }                                                                     \
+    m = _mm_extract_ps(mask.v, 1);                                        \
+    if (m != 0) {                                                         \
+        TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 1) + \
+                             _mm_extract_epi32(constOffset.v, 1));        \
+        *ptr = EXTRACT(val.v, 1);                                         \
+    }                                                                     \
+    m = _mm_extract_ps(mask.v, 2);                                        \
+    if (m != 0) {                                                         \
+        TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 2) + \
+                             _mm_extract_epi32(constOffset.v, 2));        \
+        *ptr = EXTRACT(val.v, 2);                                         \
+    }                                                                     \
+    m = _mm_extract_ps(mask.v, 3);                                        \
+    if (m != 0) {                                                         \
+        TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 3) + \
+                             _mm_extract_epi32(constOffset.v, 3));        \
+        *ptr = EXTRACT(val.v, 3);                                         \
+    }                                                                     \
+}                                                                         \
+static FORCEINLINE void                                                   \
+__scatter_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets,   \
+                                  uint32_t scale, __vec4_i64 constOffset, \
+                                  __vec4_##SUFFIX val, __vec4_i1 mask) {  \
+    uint32_t m = _mm_extract_ps(mask.v, 0);                               \
+    if (m != 0) {                                                         \
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) +     \
+            _mm_extract_epi64(constOffset.v[0], 0);                       \
+        TYPE *ptr = (TYPE *)(p + offset);                                 \
+        *ptr = EXTRACT(val.v, 0);                                         \
+    }                                                                     \
+    m = _mm_extract_ps(mask.v, 1);                                        \
+    if (m != 0) {                                                         \
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) +     \
+            _mm_extract_epi64(constOffset.v[0], 1);                       \
+        TYPE *ptr = (TYPE *)(p + offset);                                 \
+        *ptr = EXTRACT(val.v, 1);                                         \
+    }                                                                     \
+    m = _mm_extract_ps(mask.v, 2);                                        \
+    if (m != 0) {                                                         \
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) +     \
+            _mm_extract_epi64(constOffset.v[1], 0);                       \
+        TYPE *ptr = (TYPE *)(p + offset);                                 \
+        *ptr = EXTRACT(val.v, 2);                                         \
+    }                                                                     \
+    m = _mm_extract_ps(mask.v, 3);                                        \
+    if (m != 0) {                                                         \
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) +     \
+            _mm_extract_epi64(constOffset.v[1], 1);                       \
+        TYPE *ptr = (TYPE *)(p + offset);                                 \
+        *ptr = EXTRACT(val.v, 3);                                         \
+    }                                                                     \
+}
+
+SCATTER32_64(i8, int8_t, _mm_extract_epi8)
+SCATTER32_64(i16, int16_t, _mm_extract_epi16)
+SCATTER32_64(i32, int32_t, _mm_extract_epi32)
+
-static FORCEINLINE void
-__scatter_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets,
-                            uint32_t scale, __vec4_i8 val, __vec4_i1 mask) {
-    uint32_t m = _mm_extract_ps(mask.v, 0);
-    if (m != 0) {
-        int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 0));
-        *ptr = _mm_extract_epi8(val.v, 0);
-    }
-
-    m = _mm_extract_ps(mask.v, 1);
-    if (m != 0) {
-        int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 1));
-        *ptr = _mm_extract_epi8(val.v, 1);
-    }
-
-    m = _mm_extract_ps(mask.v, 2);
-    if (m != 0) {
-        int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 2));
-        *ptr = _mm_extract_epi8(val.v, 2);
-    }
-
-    m = _mm_extract_ps(mask.v, 3);
-    if (m != 0) {
-        int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 3));
-        *ptr = _mm_extract_epi8(val.v, 3);
-    }
-}
-
-static FORCEINLINE void
-__scatter_base_offsets64_i8(unsigned char *p, __vec4_i64 offsets,
-                            uint32_t scale, __vec4_i8 val, __vec4_i1 mask) {
-    uint32_t m = _mm_extract_ps(mask.v, 0);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
-        uint8_t *ptr = (uint8_t *)(p + offset);
-        *ptr = _mm_extract_epi8(val.v, 0);
-    }
-
-    m = _mm_extract_ps(mask.v, 1);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
-        uint8_t *ptr = (uint8_t *)(p + offset);
-        *ptr = _mm_extract_epi8(val.v, 1);
-    }
-
-    m = _mm_extract_ps(mask.v, 2);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
-        uint8_t *ptr = (uint8_t *)(p + offset);
-        *ptr = _mm_extract_epi8(val.v, 2);
-    }
-
-    m = _mm_extract_ps(mask.v, 3);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
-        uint8_t *ptr = (uint8_t *)(p + offset);
-        *ptr = _mm_extract_epi8(val.v, 3);
-    }
-}
-
-static FORCEINLINE void
-__scatter_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets,
-                             uint32_t scale, __vec4_i16 val, __vec4_i1 mask) {
-    uint32_t m = _mm_extract_ps(mask.v, 0);
-    if (m != 0) {
-        int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 0));
-        *ptr = _mm_extract_epi16(val.v, 0);
-    }
-
-    m = _mm_extract_ps(mask.v, 1);
-    if (m != 0) {
-        int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 1));
-        *ptr = _mm_extract_epi16(val.v, 1);
-    }
-
-    m = _mm_extract_ps(mask.v, 2);
-    if (m != 0) {
-        int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 2));
-        *ptr = _mm_extract_epi16(val.v, 2);
-    }
-
-    m = _mm_extract_ps(mask.v, 3);
-    if (m != 0) {
-        int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 3));
-        *ptr = _mm_extract_epi16(val.v, 3);
-    }
-}
-
-static FORCEINLINE void
-__scatter_base_offsets64_i16(unsigned char *p, __vec4_i64 offsets,
-                             uint32_t scale, __vec4_i16 val, __vec4_i1 mask) {
-    uint32_t m = _mm_extract_ps(mask.v, 0);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
-        uint16_t *ptr = (uint16_t *)(p + offset);
-        *ptr = _mm_extract_epi16(val.v, 0);
-    }
-
-    m = _mm_extract_ps(mask.v, 1);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
-        uint16_t *ptr = (uint16_t *)(p + offset);
-        *ptr = _mm_extract_epi16(val.v, 1);
-    }
-
-    m = _mm_extract_ps(mask.v, 2);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
-        uint16_t *ptr = (uint16_t *)(p + offset);
-        *ptr = _mm_extract_epi16(val.v, 2);
-    }
-
-    m = _mm_extract_ps(mask.v, 3);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
-        uint16_t *ptr = (uint16_t *)(p + offset);
-        *ptr = _mm_extract_epi16(val.v, 3);
-    }
-}
-
-static FORCEINLINE void
-__scatter_base_offsets32_i32(unsigned char *b, __vec4_i32 offsets,
-                             uint32_t scale, __vec4_i32 val, __vec4_i1 mask) {
-    uint32_t m = _mm_extract_ps(mask.v, 0);
-    if (m != 0) {
-        int32_t *ptr = (int32_t *)(b + scale *
-                                   _mm_extract_epi32(offsets.v, 0));
-        *ptr = _mm_extract_epi32(val.v, 0);
-    }
-
-    m = _mm_extract_ps(mask.v, 1);
-    if (m != 0) {
-        int32_t *ptr = (int32_t *)(b + scale *
-                                   _mm_extract_epi32(offsets.v, 1));
-        *ptr = _mm_extract_epi32(val.v, 1);
-    }
-
-    m = _mm_extract_ps(mask.v, 2);
-    if (m != 0) {
-        int32_t *ptr = (int32_t *)(b + scale *
-                                   _mm_extract_epi32(offsets.v, 2));
-        *ptr = _mm_extract_epi32(val.v, 2);
-    }
-
-    m = _mm_extract_ps(mask.v, 3);
-    if (m != 0) {
-        int32_t *ptr = (int32_t *)(b + scale *
-                                   _mm_extract_epi32(offsets.v, 3));
-        *ptr = _mm_extract_epi32(val.v, 3);
-    }
-}
-
-static FORCEINLINE void
-__scatter_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets,
-                             uint32_t scale, __vec4_i32 val, __vec4_i1 mask) {
-    uint32_t m = _mm_extract_ps(mask.v, 0);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
-        uint32_t *ptr = (uint32_t *)(p + offset);
-        *ptr = _mm_extract_epi32(val.v, 0);
-    }
-
-    m = _mm_extract_ps(mask.v, 1);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
-        uint32_t *ptr = (uint32_t *)(p + offset);
-        *ptr = _mm_extract_epi32(val.v, 1);
-    }
-
-    m = _mm_extract_ps(mask.v, 2);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
-        uint32_t *ptr = (uint32_t *)(p + offset);
-        *ptr = _mm_extract_epi32(val.v, 2);
-    }
-
-    m = _mm_extract_ps(mask.v, 3);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
-        uint32_t *ptr = (uint32_t *)(p + offset);
-        *ptr = _mm_extract_epi32(val.v, 3);
-    }
-}
-
 static FORCEINLINE void
 __scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
-                             uint32_t scale, __vec4_i64 val, __vec4_i1 mask) {
+                             uint32_t scale, __vec4_i32 constOffset, __vec4_i64 val,
+                             __vec4_i1 mask) {
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0) {
-        int32_t offset = scale * _mm_extract_epi32(offsets.v, 0);
+        int32_t offset = scale * _mm_extract_epi32(offsets.v, 0) +
+            _mm_extract_epi32(constOffset.v, 0);
         uint64_t *ptr = (uint64_t *)(p + offset);
         *ptr = _mm_extract_epi64(val.v[0], 0);
     }

     m = _mm_extract_ps(mask.v, 1);
     if (m != 0) {
-        int32_t offset = scale * _mm_extract_epi32(offsets.v, 1);
+        int32_t offset = scale * _mm_extract_epi32(offsets.v, 1) +
+            _mm_extract_epi32(constOffset.v, 1);
         uint64_t *ptr = (uint64_t *)(p + offset);
         *ptr = _mm_extract_epi64(val.v[0], 1);
     }

     m = _mm_extract_ps(mask.v, 2);
     if (m != 0) {
-        int32_t offset = scale * _mm_extract_epi32(offsets.v, 2);
+        int32_t offset = scale * _mm_extract_epi32(offsets.v, 2) +
+            _mm_extract_epi32(constOffset.v, 2);
         uint64_t *ptr = (uint64_t *)(p + offset);
         *ptr = _mm_extract_epi64(val.v[1], 0);
     }

     m = _mm_extract_ps(mask.v, 3);
     if (m != 0) {
-        int32_t offset = scale * _mm_extract_epi32(offsets.v, 3);
+        int32_t offset = scale * _mm_extract_epi32(offsets.v, 3) +
+            _mm_extract_epi32(constOffset.v, 3);
         uint64_t *ptr = (uint64_t *)(p + offset);
         *ptr = _mm_extract_epi64(val.v[1], 1);
     }
@@ -3080,31 +3088,36 @@ __scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,

 static FORCEINLINE void
 __scatter_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
-                             uint32_t scale, __vec4_i64 val, __vec4_i1 mask) {
+                             uint32_t scale, __vec4_i64 constOffset,
+                             __vec4_i64 val, __vec4_i1 mask) {
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) +
+            _mm_extract_epi64(constOffset.v[0], 0);
         uint64_t *ptr = (uint64_t *)(p + offset);
         *ptr = _mm_extract_epi64(val.v[0], 0);
     }

     m = _mm_extract_ps(mask.v, 1);
     if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) +
+            _mm_extract_epi64(constOffset.v[0], 1);
         uint64_t *ptr = (uint64_t *)(p + offset);
         *ptr = _mm_extract_epi64(val.v[0], 1);
     }

     m = _mm_extract_ps(mask.v, 2);
     if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) +
+            _mm_extract_epi64(constOffset.v[1], 0);
         uint64_t *ptr = (uint64_t *)(p + offset);
         *ptr = _mm_extract_epi64(val.v[1], 0);
     }

     m = _mm_extract_ps(mask.v, 3);
     if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) +
+            _mm_extract_epi64(constOffset.v[1], 1);
         uint64_t *ptr = (uint64_t *)(p + offset);
         *ptr = _mm_extract_epi64(val.v[1], 1);
     }
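The SCATTER32_64 macro above replaces six nearly identical per-type scatter functions with one X-macro that is stamped out for each element type and extract intrinsic. A self-contained miniature of the same factoring (illustrative names, scalar rather than SSE):

    #include <stdint.h>

    #define SCATTER_SKETCH(SUFFIX, TYPE)                                  \
    static void scatter_##SUFFIX(unsigned char *b, const int32_t off[4],  \
                                 const TYPE val[4], const bool mask[4]) { \
        for (int i = 0; i < 4; ++i)                                       \
            if (mask[i])                                                  \
                *(TYPE *)(b + off[i]) = val[i]; /* per-lane masked store */ \
    }

    SCATTER_SKETCH(i8, int8_t)
    SCATTER_SKETCH(i16, int16_t)
    SCATTER_SKETCH(i32, int32_t)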
@@ -104,7 +104,7 @@ static void generateRay(uniform const float raster2camera[4][4],
 }


-static inline bool BBoxIntersect(const uniform float bounds[2][3],
+static bool BBoxIntersect(const uniform float bounds[2][3],
                                  const Ray &ray) {
     uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
     uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
@@ -143,7 +143,7 @@ static inline bool BBoxIntersect(const uniform float bounds[2][3],



-static inline bool TriIntersect(const Triangle &tri, Ray &ray) {
+static bool TriIntersect(const Triangle &tri, Ray &ray) {
     uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
     uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
     uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };

@@ -273,7 +273,7 @@ lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
 #else
     void *result;
 #if (ISPC_POINTER_BYTES == 4)
-    __asm__ __volatile__("lock\ncmpxchgd %2,%1"
+    __asm__ __volatile__("lock\ncmpxchgl %2,%1"
                          : "=a"(result), "=m"(*v)
                          : "q"(newValue), "0"(oldValue)
                          : "memory");
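The fix here is to the mnemonic: cmpxchgd is not a valid x86 instruction, while cmpxchgl is the 32-bit (4-byte operand) form of compare-and-exchange, matching the ISPC_POINTER_BYTES == 4 branch. For reference, a sketch of the same pointer CAS written with GCC's builtin instead of inline assembly (illustrative, not part of the change):

    // Returns the value that was in *v; the swap took place iff
    // that return value equals oldValue.
    static void *lCASPointerSketch(void **v, void *newValue, void *oldValue) {
        return __sync_val_compare_and_swap(v, oldValue, newValue);
    }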
@@ -129,7 +129,7 @@ static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
 }


-static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
+static float Density(float3 Pobj, float3 pMin, float3 pMax,
                      uniform float density[], uniform int nVoxels[3]) {
     if (!Inside(Pobj, pMin, pMax))
         return 0;
54 expr.h
@@ -314,7 +314,6 @@ public:
     std::string identifier;
     const SourcePos identifierPos;

 protected:
     MemberExpr(Expr *expr, const char *identifier, SourcePos pos,
                SourcePos identifierPos, bool derefLValue);

@@ -389,6 +388,10 @@ public:
         with values given by the "vales" parameter. */
     ConstExpr(ConstExpr *old, double *values);

+    /** Create ConstExpr with the same type and values as the given one,
+        but at the given position. */
+    ConstExpr(ConstExpr *old, SourcePos pos);
+
     llvm::Value *GetValue(FunctionEmitContext *ctx) const;
     const Type *GetType() const;
     void Print() const;
@@ -681,11 +684,44 @@ public:
     const Type *GetType() const;
     Expr *TypeCheck();
     Expr *Optimize();
+    llvm::Constant *GetConstant(const Type *type) const;
     void Print() const;
     int EstimateCost() const;
 };


+/** An expression representing a "new" expression, used for dynamically
+    allocating memory.
+ */
+class NewExpr : public Expr {
+public:
+    NewExpr(int typeQual, const Type *type, Expr *initializer, Expr *count,
+            SourcePos tqPos, SourcePos p);
+
+    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
+    const Type *GetType() const;
+    Expr *TypeCheck();
+    Expr *Optimize();
+    void Print() const;
+    int EstimateCost() const;
+
+    /** Type of object to allocate storage for. */
+    const Type *allocType;
+    /** Expression giving the number of elements to allocate, when the
+        "new Foo[expr]" form is used.  This may be NULL, in which case a
+        single element of the given type will be allocated. */
+    Expr *countExpr;
+    /** Optional initializer expression used to initialize the allocated
+        memory. */
+    Expr *initExpr;
+    /** Indicates whether this is a "varying new" or "uniform new"
+        (i.e. whether a separate allocation is performed per program
+        instance, or whether a single allocation is performed for the
+        entire gang of program instances.) */
+    bool isVarying;
+};
+

 /** This function indicates whether it's legal to convert from fromType to
     toType.  If the optional errorMsgBase and source position parameters
@@ -704,4 +740,20 @@ bool CanConvertTypes(const Type *fromType, const Type *toType,
  */
 Expr *TypeConvertExpr(Expr *expr, const Type *toType, const char *errorMsgBase);

+/** Utility routine that emits code to initialize a symbol given an
+    initializer expression.
+
+    @param lvalue    Memory location of storage for the symbol's data
+    @param symName   Name of symbol (used in error messages)
+    @param symType   Type of variable being initialized
+    @param initExpr  Expression for the initializer
+    @param ctx       FunctionEmitContext to use for generating instructions
+    @param pos       Source file position of the variable being initialized
+ */
+void
+InitSymbol(llvm::Value *lvalue, const Type *symType, Expr *initExpr,
+           FunctionEmitContext *ctx, SourcePos pos);
+
+bool PossiblyResolveFunctionOverloads(Expr *expr, const Type *type);
+
 #endif // ISPC_EXPR_H
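The isVarying comment in the NewExpr declaration above is the heart of the feature: a "uniform new" allocates once for the whole gang, while a "varying new" allocates per program instance. A plain-C++ sketch of the two behaviors, assuming a 4-wide gang (illustrative only):

    #include <cstdlib>

    void *uniform_new_sketch(size_t size) {
        return malloc(size);                 // one allocation, shared by the gang
    }

    void varying_new_sketch(size_t size, void *ptrs[4], const bool mask[4]) {
        for (int i = 0; i < 4; ++i)          // one allocation per active instance
            ptrs[i] = mask[i] ? malloc(size) : nullptr;
    }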
3 func.cpp
@@ -334,12 +334,13 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
             if (ctx->GetCurrentBasicBlock())
                 ctx->ReturnInst();
         }
-        else
-            // No check, just emit the code
+        else {
+            // Set up basic blocks for goto targets
+            ctx->InitializeLabelMap(code);
             code->EmitCode(ctx);
+        }
     }

     if (ctx->GetCurrentBasicBlock()) {
         // FIXME: We'd like to issue a warning if we've reached the end of
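Initializing the label map before emission is the usual two-pass trick for goto support: create a branch target for every label first, so a forward goto has a concrete block to jump to when it is emitted. A self-contained sketch of the idea (stub types; the real code builds llvm::BasicBlocks):

    #include <map>
    #include <string>
    #include <vector>

    struct Block { std::string name; };

    static std::map<std::string, Block *>
    initializeLabelMapSketch(const std::vector<std::string> &labels) {
        std::map<std::string, Block *> m;
        for (const std::string &l : labels)
            m[l] = new Block{l};   // targets exist before any goto is emitted
        return m;
    }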
15 ispc.cpp
@@ -185,6 +185,14 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
         t->allOffMaskIsSafe = true;
         t->maskBitCount = 1;
     }
+    else if (!strcasecmp(isa, "generic-1")) {
+        t->isa = Target::GENERIC;
+        t->nativeVectorWidth = 1;
+        t->vectorWidth = 1;
+        t->maskingIsFree = false;
+        t->allOffMaskIsSafe = false;
+        t->maskBitCount = 32;
+    }
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
     else if (!strcasecmp(isa, "avx")) {
         t->isa = Target::AVX;
@@ -210,7 +218,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
         t->isa = Target::AVX2;
         t->nativeVectorWidth = 8;
         t->vectorWidth = 8;
-        t->attributes = "+avx2,+popcnt,+cmov";
+        t->attributes = "+avx2,+popcnt,+cmov,+f16c";
         t->maskingIsFree = false;
         t->allOffMaskIsSafe = false;
         t->maskBitCount = 32;
@@ -219,7 +227,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
         t->isa = Target::AVX2;
         t->nativeVectorWidth = 16;
         t->vectorWidth = 16;
-        t->attributes = "+avx2,+popcnt,+cmov";
+        t->attributes = "+avx2,+popcnt,+cmov,+f16c";
         t->maskingIsFree = false;
         t->allOffMaskIsSafe = false;
         t->maskBitCount = 32;
@@ -270,7 +278,7 @@ Target::SupportedTargetISAs() {
 #ifdef LLVM_3_1svn
         ", avx2, avx2-x2"
 #endif // LLVM_3_1svn
-        ", generic-4, generic-8, generic-16";
+        ", generic-4, generic-8, generic-16, generic-1";
 }


@@ -387,7 +395,6 @@ lGenericTypeLayoutIndeterminate(LLVM_TYPE_CONST llvm::Type *type) {
         return false;
     }

-    type->dump();
     Assert(llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(type));
     return true;
 }
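The "+f16c" attribute added to both AVX2 targets enables the F16C half/float conversion instructions, presumably for ispc's half-precision support. A minimal sketch of what those instructions do, via the corresponding C intrinsics (requires an F16C-capable CPU and compiler flag; illustrative, not part of the change):

    #include <immintrin.h>

    static float half_to_float(unsigned short h) {
        return _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(h)));   // vcvtph2ps
    }

    static unsigned short float_to_half(float f) {
        __m128i h = _mm_cvtps_ph(_mm_set_ss(f), _MM_FROUND_TO_NEAREST_INT); // vcvtps2ph
        return (unsigned short)_mm_cvtsi128_si32(h);
    }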
4 ispc.h
@@ -418,6 +418,7 @@ enum {
     COST_ASSIGN = 1,
     COST_COHERENT_BREAK_CONTINE = 4,
     COST_COMPLEX_ARITH_OP = 4,
+    COST_DELETE = 32,
     COST_DEREF = 4,
     COST_FUNCALL = 4,
     COST_FUNPTR_UNIFORM = 12,
@@ -425,6 +426,7 @@ enum {
     COST_GATHER = 8,
     COST_GOTO = 4,
     COST_LOAD = 2,
+    COST_NEW = 32,
     COST_REGULAR_BREAK_CONTINUE = 2,
     COST_RETURN = 4,
     COST_SELECT = 4,
@@ -437,6 +439,8 @@ enum {
     COST_VARYING_IF = 3,
     COST_UNIFORM_LOOP = 4,
     COST_VARYING_LOOP = 6,
+    COST_UNIFORM_SWITCH = 4,
+    COST_VARYING_SWITCH = 12,
     COST_ASSERT = 8,

     CHECK_MASK_AT_FUNCTION_START_COST = 16,
82 ispc.vcxproj
@@ -18,11 +18,14 @@
     <ClCompile Include="decl.cpp" />
     <ClCompile Include="expr.cpp" />
     <ClCompile Include="func.cpp" />
-    <ClCompile Include="gen-bitcode-avx.cpp" />
-    <ClCompile Include="gen-bitcode-avx-x2.cpp" />
+    <ClCompile Include="gen-bitcode-avx1.cpp" />
+    <ClCompile Include="gen-bitcode-avx1-x2.cpp" />
+    <ClCompile Include="gen-bitcode-avx2.cpp" />
+    <ClCompile Include="gen-bitcode-avx2-x2.cpp" />
     <ClCompile Include="gen-bitcode-c-32.cpp" />
     <ClCompile Include="gen-bitcode-c-64.cpp" />
     <ClCompile Include="gen-bitcode-dispatch.cpp" />
+    <ClCompile Include="gen-bitcode-generic-1.cpp" />
     <ClCompile Include="gen-bitcode-generic-4.cpp" />
     <ClCompile Include="gen-bitcode-generic-8.cpp" />
     <ClCompile Include="gen-bitcode-generic-16.cpp" />
@@ -158,29 +161,68 @@
       </CustomBuild>
   </ItemGroup>
   <ItemGroup>
-    <CustomBuild Include="builtins\target-avx.ll">
+    <CustomBuild Include="builtins\target-avx1.ll">
       <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx.ll | python bitcode2cpp.py builtins\target-avx.ll > gen-bitcode-avx.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll</AdditionalInputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx.ll | python bitcode2cpp.py builtins\target-avx.ll > gen-bitcode-avx.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll</AdditionalInputs>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx.cpp</Message>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx.cpp</Message>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll > gen-bitcode-avx1.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx1.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll > gen-bitcode-avx1.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx1.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx1.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx1.cpp</Message>
     </CustomBuild>
   </ItemGroup>
   <ItemGroup>
-    <CustomBuild Include="builtins\target-avx-x2.ll">
+    <CustomBuild Include="builtins\target-avx1-x2.ll">
       <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx-x2.ll | python bitcode2cpp.py builtins\target-avx-x2.ll > gen-bitcode-avx-x2.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll</AdditionalInputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx-x2.ll | python bitcode2cpp.py builtins\target-avx-x2.ll > gen-bitcode-avx-x2.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll</AdditionalInputs>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll > gen-bitcode-avx1-x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx1-x2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll > gen-bitcode-avx1-x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx1-x2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\targets-avx-x2.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx1-x2.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx1-x2.cpp</Message>
     </CustomBuild>
   </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="builtins\target-avx2.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll > gen-bitcode-avx2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll > gen-bitcode-avx2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx2.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx2.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="builtins\target-avx2-x2.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll > gen-bitcode-avx2-x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx2-x2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll > gen-bitcode-avx2-x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx2-x2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\targets-avx-x2.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx2-x2.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx2-x2.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="builtins\target-generic-1.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll > gen-bitcode-generic-1.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-generic-1.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll > gen-bitcode-generic-1.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-generic-1.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-generic-1.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-generic-1.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
   <ItemGroup>
5 lex.ll
@@ -93,6 +93,8 @@ continue { return TOKEN_CONTINUE; }
 creturn { return TOKEN_CRETURN; }
 default { return TOKEN_DEFAULT; }
 do { return TOKEN_DO; }
+delete { return TOKEN_DELETE; }
+delete\[\] { return TOKEN_DELETE; }
 double { return TOKEN_DOUBLE; }
 else { return TOKEN_ELSE; }
 enum { return TOKEN_ENUM; }
@@ -112,6 +114,7 @@ int16 { return TOKEN_INT16; }
 int32 { return TOKEN_INT; }
 int64 { return TOKEN_INT64; }
 launch { return TOKEN_LAUNCH; }
+new { return TOKEN_NEW; }
 NULL { return TOKEN_NULL; }
 print { return TOKEN_PRINT; }
 reference { Error(*yylloc, "\"reference\" qualifier is no longer supported; "
@@ -156,7 +159,7 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
         yylval->intVal = lParseBinary(yytext+2, *yylloc, &endPtr);
     else {
 #if defined(ISPC_IS_WINDOWS) && !defined(__MINGW32__)
-        yylval->intVal = _strtoi64(yytext, &endPtr, 0);
+        yylval->intVal = _strtoui64(yytext, &endPtr, 0);
 #else
         // FIXME: should use strtouq and then issue an error if we can't
         // fit into 64 bits...
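The switch from _strtoi64 to _strtoui64 matters for large literals: a constant like 0xffffffffffffffff overflows a signed 64-bit parse and clamps to the maximum, while an unsigned parse preserves the full bit pattern (the FIXME notes the same issue for the non-Windows path). A portable demonstration with the standard C equivalents:

    #include <cstdio>
    #include <cstdlib>

    int main() {
        const char *lit = "0xffffffffffffffff";
        long long s = strtoll(lit, nullptr, 0);            // clamps to LLONG_MAX
        unsigned long long u = strtoull(lit, nullptr, 0);  // keeps all 64 bits
        printf("%lld %llu\n", s, u);
        return 0;
    }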
251 llvmutil.cpp
@@ -36,7 +36,9 @@
 */

 #include "llvmutil.h"
+#include "ispc.h"
+#include "type.h"
 #include <llvm/Instructions.h>

 LLVM_TYPE_CONST llvm::Type *LLVMTypes::VoidType = NULL;
 LLVM_TYPE_CONST llvm::PointerType *LLVMTypes::VoidPointerType = NULL;
@@ -465,3 +467,252 @@ LLVMBoolVector(const bool *bvec) {
     }
     return llvm::ConstantVector::get(vals);
 }
+
+
+/** Conservative test to see if two llvm::Values are equal.  There are
+    (potentially many) cases where the two values actually are equal but
+    this will return false.  However, if it does return true, the two
+    vectors definitely are equal.
+
+    @todo This seems to catch all of the cases we currently need it for in
+    practice, but it's be nice to make it a little more robust/general.  In
+    general, though, a little something called the halting problem means we
+    won't get all of them.
+*/
+static bool
+lValuesAreEqual(llvm::Value *v0, llvm::Value *v1,
+                std::vector<llvm::PHINode *> &seenPhi0,
+                std::vector<llvm::PHINode *> &seenPhi1) {
+    // Thanks to the fact that LLVM hashes and returns the same pointer for
+    // constants (of all sorts, even constant expressions), this first test
+    // actually catches a lot of cases.  LLVM's SSA form also helps a lot
+    // with this..
+    if (v0 == v1)
+        return true;
+
+    Assert(seenPhi0.size() == seenPhi1.size());
+    for (unsigned int i = 0; i < seenPhi0.size(); ++i)
+        if (v0 == seenPhi0[i] && v1 == seenPhi1[i])
+            return true;
+
+    llvm::BinaryOperator *bo0 = llvm::dyn_cast<llvm::BinaryOperator>(v0);
+    llvm::BinaryOperator *bo1 = llvm::dyn_cast<llvm::BinaryOperator>(v1);
+    if (bo0 != NULL && bo1 != NULL) {
+        if (bo0->getOpcode() != bo1->getOpcode())
+            return false;
+        return (lValuesAreEqual(bo0->getOperand(0), bo1->getOperand(0),
+                                seenPhi0, seenPhi1) &&
+                lValuesAreEqual(bo0->getOperand(1), bo1->getOperand(1),
+                                seenPhi0, seenPhi1));
+    }
+
+    llvm::PHINode *phi0 = llvm::dyn_cast<llvm::PHINode>(v0);
+    llvm::PHINode *phi1 = llvm::dyn_cast<llvm::PHINode>(v1);
+    if (phi0 != NULL && phi1 != NULL) {
+        if (phi0->getNumIncomingValues() != phi1->getNumIncomingValues())
+            return false;
+
+        seenPhi0.push_back(phi0);
+        seenPhi1.push_back(phi1);
+
+        unsigned int numIncoming = phi0->getNumIncomingValues();
+        // Check all of the incoming values: if all of them are all equal,
+        // then we're good.
+        bool anyFailure = false;
+        for (unsigned int i = 0; i < numIncoming; ++i) {
+            Assert(phi0->getIncomingBlock(i) == phi1->getIncomingBlock(i));
+            if (!lValuesAreEqual(phi0->getIncomingValue(i),
+                                 phi1->getIncomingValue(i), seenPhi0, seenPhi1)) {
+                anyFailure = true;
+                break;
+            }
+        }
+
+        seenPhi0.pop_back();
+        seenPhi1.pop_back();
+
+        return !anyFailure;
+    }
+
+    return false;
+}
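The seenPhi stacks above are the interesting part: when comparing two recursive dataflow graphs, a pair of phi nodes currently on the comparison stack is optimistically treated as equal, which keeps mutually referential phis from recursing forever. A self-contained analogue of the same cycle-handling idea (toy node type, not LLVM):

    #include <cstddef>
    #include <utility>
    #include <vector>

    struct Node {
        int op;                       // operation tag
        std::vector<Node *> operands; // may contain cycles
    };

    static bool equalSketch(Node *a, Node *b,
                            std::vector<std::pair<Node *, Node *> > &stack) {
        if (a == b)
            return true;
        for (size_t i = 0; i < stack.size(); ++i)
            if (stack[i].first == a && stack[i].second == b)
                return true;               // assume equal to break the cycle
        if (a->op != b->op || a->operands.size() != b->operands.size())
            return false;
        stack.push_back(std::make_pair(a, b));
        bool eq = true;
        for (size_t i = 0; i < a->operands.size() && eq; ++i)
            eq = equalSketch(a->operands[i], b->operands[i], stack);
        stack.pop_back();
        return eq;
    }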
+
+/** Given an llvm::Value known to be an integer, return its value as
+    an int64_t.
+*/
+static int64_t
+lGetIntValue(llvm::Value *offset) {
+    llvm::ConstantInt *intOffset = llvm::dyn_cast<llvm::ConstantInt>(offset);
+    Assert(intOffset && (intOffset->getBitWidth() == 32 ||
+                         intOffset->getBitWidth() == 64));
+    return intOffset->getSExtValue();
+}
+
+
+/** This function takes chains of InsertElement instructions along the
+    lines of:
+
+    %v0 = insertelement undef, value_0, i32 index_0
+    %v1 = insertelement %v1, value_1, i32 index_1
+    ...
+    %vn = insertelement %vn-1, value_n-1, i32 index_n-1
+
+    and initializes the provided elements array such that the i'th
+    llvm::Value * in the array is the element that was inserted into the
+    i'th element of the vector.
+*/
+void
+LLVMFlattenInsertChain(llvm::InsertElementInst *ie, int vectorWidth,
+                       llvm::Value **elements) {
+    for (int i = 0; i < vectorWidth; ++i)
+        elements[i] = NULL;
+
+    while (ie != NULL) {
+        int64_t iOffset = lGetIntValue(ie->getOperand(2));
+        Assert(iOffset >= 0 && iOffset < vectorWidth);
+        Assert(elements[iOffset] == NULL);
+
+        elements[iOffset] = ie->getOperand(1);
+
+        llvm::Value *insertBase = ie->getOperand(0);
+        ie = llvm::dyn_cast<llvm::InsertElementInst>(insertBase);
+        if (ie == NULL) {
+            if (llvm::isa<llvm::UndefValue>(insertBase))
+                return;
+
+            llvm::ConstantVector *cv =
+                llvm::dyn_cast<llvm::ConstantVector>(insertBase);
+            Assert(cv != NULL);
+            Assert(iOffset < (int)cv->getNumOperands());
+            elements[iOffset] = cv->getOperand(iOffset);
+        }
+    }
+}
+
+
+/** Tests to see if all of the elements of the vector in the 'v' parameter
+    are equal.  Like lValuesAreEqual(), this is a conservative test and may
+    return false for arrays where the values are actually all equal.  */
+bool
+LLVMVectorValuesAllEqual(llvm::Value *v, int vectorLength,
+                         std::vector<llvm::PHINode *> &seenPhis) {
+    if (vectorLength == 1)
+        return true;
+
+    if (llvm::isa<llvm::ConstantAggregateZero>(v))
+        return true;
+
+    llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(v);
+    if (cv != NULL)
+        return (cv->getSplatValue() != NULL);
+
+#ifdef LLVM_3_1svn
+    llvm::ConstantDataVector *cdv = llvm::dyn_cast<llvm::ConstantDataVector>(v);
+    if (cdv != NULL)
+        return (cdv->getSplatValue() != NULL);
+#endif
+
+    llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(v);
+    if (bop != NULL)
+        return (LLVMVectorValuesAllEqual(bop->getOperand(0), vectorLength,
+                                         seenPhis) &&
+                LLVMVectorValuesAllEqual(bop->getOperand(1), vectorLength,
+                                         seenPhis));
+
+    llvm::CastInst *cast = llvm::dyn_cast<llvm::CastInst>(v);
+    if (cast != NULL)
+        return LLVMVectorValuesAllEqual(cast->getOperand(0), vectorLength,
+                                        seenPhis);
+
+    llvm::InsertElementInst *ie = llvm::dyn_cast<llvm::InsertElementInst>(v);
+    if (ie != NULL) {
+        llvm::Value *elements[ISPC_MAX_NVEC];
+        LLVMFlattenInsertChain(ie, vectorLength, elements);
+
+        // We will ignore any values of elements[] that are NULL; as they
+        // correspond to undefined values--we just want to see if all of
+        // the defined values have the same value.
+        int lastNonNull = 0;
+        while (lastNonNull < vectorLength && elements[lastNonNull] == NULL)
+            ++lastNonNull;
+
+        if (lastNonNull == vectorLength)
+            // all of them are undef!
+            return true;
+
+        for (int i = lastNonNull; i < vectorLength; ++i) {
+            if (elements[i] == NULL)
+                continue;
+
+            std::vector<llvm::PHINode *> seenPhi0;
+            std::vector<llvm::PHINode *> seenPhi1;
+            if (lValuesAreEqual(elements[lastNonNull], elements[i], seenPhi0,
+                                seenPhi1) == false)
+                return false;
+            lastNonNull = i;
+        }
+        return true;
+    }
+
+    llvm::PHINode *phi = llvm::dyn_cast<llvm::PHINode>(v);
+    if (phi) {
+        for (unsigned int i = 0; i < seenPhis.size(); ++i)
+            if (seenPhis[i] == phi)
+                return true;
+
+        seenPhis.push_back(phi);
+
+        unsigned int numIncoming = phi->getNumIncomingValues();
+        // Check all of the incoming values: if all of them are all equal,
+        // then we're good.
+        for (unsigned int i = 0; i < numIncoming; ++i) {
+            if (!LLVMVectorValuesAllEqual(phi->getIncomingValue(i), vectorLength,
+                                          seenPhis)) {
+                seenPhis.pop_back();
+                return false;
+            }
+        }
+
+        seenPhis.pop_back();
+        return true;
+    }
+
+    if (llvm::isa<llvm::UndefValue>(v))
+        // ?
+        return false;
+
+    Assert(!llvm::isa<llvm::Constant>(v));
+
+    if (llvm::isa<llvm::CallInst>(v) || llvm::isa<llvm::LoadInst>(v) ||
+        !llvm::isa<llvm::Instruction>(v))
+        return false;
+
+    llvm::ShuffleVectorInst *shuffle = llvm::dyn_cast<llvm::ShuffleVectorInst>(v);
+    if (shuffle != NULL) {
+        llvm::Value *indices = shuffle->getOperand(2);
+        if (LLVMVectorValuesAllEqual(indices, vectorLength, seenPhis))
+            // The easy case--just a smear of the same element across the
+            // whole vector.
+            return true;
+
+        // TODO: handle more general cases?
+        return false;
+    }
+
+#if 0
+    fprintf(stderr, "all equal: ");
+    v->dump();
+    fprintf(stderr, "\n");
+    llvm::Instruction *inst = llvm::dyn_cast<llvm::Instruction>(v);
+    if (inst) {
+        inst->getParent()->dump();
+        fprintf(stderr, "\n");
+        fprintf(stderr, "\n");
+    }
+#endif
+
+    return false;
+}
23 llvmutil.h
@@ -38,12 +38,23 @@
 #ifndef ISPC_LLVMUTIL_H
 #define ISPC_LLVMUTIL_H 1

-#include "ispc.h"
 #include <llvm/LLVMContext.h>
 #include <llvm/Type.h>
 #include <llvm/DerivedTypes.h>
 #include <llvm/Constants.h>

+namespace llvm {
+    class PHINode;
+    class InsertElementInst;
+}
+
+// llvm::Type *s are no longer const in llvm 3.0
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
+#define LLVM_TYPE_CONST
+#else
+#define LLVM_TYPE_CONST const
+#endif
+

 /** This structure holds pointers to a variety of LLVM types; code
     elsewhere can use them from here, ratherthan needing to make more
@@ -99,6 +110,7 @@ extern llvm::Constant *LLVMTrue, *LLVMFalse;
     of LLVMTypes and the LLVMTrue/LLVMFalse constants.  However, it can't
     be called until the compilation target is known.
 */
+struct Target;
 extern void InitLLVMUtil(llvm::LLVMContext *ctx, Target target);

 /** Returns an LLVM i8 constant of the given value */
@@ -205,4 +217,13 @@ extern llvm::Constant *LLVMMaskAllOn;
 /** LLVM constant value representing an 'all off' SIMD lane mask */
 extern llvm::Constant *LLVMMaskAllOff;

+/** Tests to see if all of the elements of the vector in the 'v' parameter
+    are equal.  Like lValuesAreEqual(), this is a conservative test and may
+    return false for arrays where the values are actually all equal.  */
+extern bool LLVMVectorValuesAllEqual(llvm::Value *v, int vectorLength,
+                                     std::vector<llvm::PHINode *> &seenPhis);
+
+void LLVMFlattenInsertChain(llvm::InsertElementInst *ie, int vectorWidth,
+                            llvm::Value **elements);
+
 #endif // ISPC_LLVMUTIL_H
28 main.cpp
@@ -60,10 +60,27 @@
 #define BUILD_VERSION ""
 #endif // ISPC_IS_WINDOWS

-static void usage(int ret) {
-    printf("This is the Intel(r) SPMD Program Compiler (ispc), build %s (%s)\n\n",
-           BUILD_DATE, BUILD_VERSION);
-    printf("usage: ispc\n");
+static void
+lPrintVersion() {
+    printf("Intel(r) SPMD Program Compiler (ispc), build %s (%s, LLVM %s)\n",
+           BUILD_DATE, BUILD_VERSION,
+#ifdef LLVM_2_9
+           "2.9"
+#elif defined(LLVM_3_0) || defined(LLVM_3_0svn)
+           "3.0"
+#elif defined(LLVM_3_1) || defined(LLVM_3_1svn)
+           "3.1"
+#else
+#error "Unhandled LLVM version"
+#endif
+           );
+}
+
+
+static void
+usage(int ret) {
+    lPrintVersion();
+    printf("\nusage: ispc\n");
     printf("    [--addressing={32,64}]\t\tSelect 32- or 64-bit addressing. (Note that 32-bit\n");
     printf("    \t\taddressing calculations are done by default, even\n");
     printf("    \t\ton 64-bit target architectures.)\n");
@@ -367,8 +384,7 @@ int main(int Argc, char *Argv[]) {
             generatePIC = true;
 #endif // !ISPC_IS_WINDOWS
         else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) {
-            printf("Intel(r) SPMD Program Compiler (ispc) build %s (%s)\n",
-                   BUILD_DATE, BUILD_VERSION);
+            lPrintVersion();
             return 0;
         }
         else if (argv[i][0] == '-') {
87
parse.yy
87
parse.yy
@@ -106,13 +106,14 @@ static void lFinalizeEnumeratorSymbols(std::vector<Symbol *> &enums,
|
||||
const EnumType *enumType);
|
||||
|
||||
static const char *lBuiltinTokens[] = {
|
||||
"assert", "bool", "break", "case", "cbreak", "ccontinue", "cdo", "cfor",
|
||||
"cif", "cwhile", "const", "continue", "creturn", "default", "do", "double",
|
||||
"else", "enum", "export", "extern", "false", "float", "for", "foreach",
|
||||
"foreach_tiled", "goto", "if", "inline", "int", "int8", "int16",
|
||||
"int32", "int64", "launch", "NULL", "print", "return", "signed", "sizeof",
|
||||
"static", "struct", "switch", "sync", "task", "true", "typedef", "uniform",
|
||||
"unsigned", "varying", "void", "while", NULL
|
||||
"assert", "bool", "break", "case", "cbreak", "ccontinue", "cdo",
|
||||
"cfor", "cif", "cwhile", "const", "continue", "creturn", "default",
|
||||
"do", "delete", "double", "else", "enum", "export", "extern", "false",
|
||||
"float", "for", "foreach", "foreach_tiled", "goto", "if", "inline",
|
||||
"int", "int8", "int16", "int32", "int64", "launch", "new", "NULL",
|
||||
"print", "return", "signed", "sizeof", "static", "struct", "switch",
|
||||
"sync", "task", "true", "typedef", "uniform", "unsigned", "varying",
|
||||
"void", "while", NULL
|
||||
};
|
||||
|
||||
static const char *lParamListTokens[] = {
|
||||
@@ -170,7 +171,7 @@ struct ForeachDimension {
|
||||
%token TOKEN_AND_OP TOKEN_OR_OP TOKEN_MUL_ASSIGN TOKEN_DIV_ASSIGN TOKEN_MOD_ASSIGN
|
||||
%token TOKEN_ADD_ASSIGN TOKEN_SUB_ASSIGN TOKEN_LEFT_ASSIGN TOKEN_RIGHT_ASSIGN
|
||||
%token TOKEN_AND_ASSIGN TOKEN_OR_ASSIGN TOKEN_XOR_ASSIGN
|
||||
%token TOKEN_SIZEOF
|
||||
%token TOKEN_SIZEOF TOKEN_NEW TOKEN_DELETE
|
||||
|
||||
%token TOKEN_EXTERN TOKEN_EXPORT TOKEN_STATIC TOKEN_INLINE TOKEN_TASK
|
||||
%token TOKEN_UNIFORM TOKEN_VARYING TOKEN_TYPEDEF TOKEN_SOA
|
||||
@@ -189,7 +190,7 @@ struct ForeachDimension {
|
||||
%type <expr> multiplicative_expression additive_expression shift_expression
|
||||
%type <expr> relational_expression equality_expression and_expression
|
||||
%type <expr> exclusive_or_expression inclusive_or_expression
|
||||
%type <expr> logical_and_expression logical_or_expression
|
||||
%type <expr> logical_and_expression logical_or_expression new_expression
|
||||
%type <expr> conditional_expression assignment_expression expression
|
||||
%type <expr> initializer constant_expression for_test
|
||||
%type <exprList> argument_expression_list initializer_list
|
||||
@@ -197,7 +198,7 @@ struct ForeachDimension {
|
||||
%type <stmt> statement labeled_statement compound_statement for_init_statement
|
||||
%type <stmt> expression_statement selection_statement iteration_statement
|
||||
%type <stmt> jump_statement statement_list declaration_statement print_statement
|
||||
%type <stmt> assert_statement sync_statement
|
||||
%type <stmt> assert_statement sync_statement delete_statement
|
||||
|
||||
%type <declaration> declaration parameter_declaration
|
||||
%type <declarators> init_declarator_list
|
||||
@@ -215,7 +216,7 @@ struct ForeachDimension {
|
||||
%type <enumType> enum_specifier
|
||||
|
||||
%type <type> specifier_qualifier_list struct_or_union_specifier
|
||||
%type <type> type_specifier type_name
|
||||
%type <type> type_specifier type_name rate_qualified_new_type
|
||||
%type <type> short_vec_specifier
|
||||
%type <atomicType> atomic_var_type_specifier
|
||||
|
||||
@@ -225,7 +226,7 @@ struct ForeachDimension {
|
||||
|
||||
%type <stringVal> string_constant
|
||||
%type <constCharPtr> struct_or_union_name enum_identifier goto_identifier
|
||||
%type <intVal> int_constant soa_width_specifier
|
||||
%type <intVal> int_constant soa_width_specifier rate_qualified_new
|
||||
|
||||
%type <foreachDimension> foreach_dimension_specifier
|
||||
%type <foreachDimensionList> foreach_dimension_list
|
||||
@@ -448,8 +449,36 @@ conditional_expression
      { $$ = new SelectExpr($1, $3, $5, Union(@1,@5)); }
    ;

assignment_expression
rate_qualified_new
    : TOKEN_NEW { $$ = 0; }
    | TOKEN_UNIFORM TOKEN_NEW { $$ = TYPEQUAL_UNIFORM; }
    | TOKEN_VARYING TOKEN_NEW { $$ = TYPEQUAL_VARYING; }
    ;

rate_qualified_new_type
    : type_specifier { $$ = $1; }
    | TOKEN_UNIFORM type_specifier { $$ = $2->GetAsUniformType(); }
    | TOKEN_VARYING type_specifier { $$ = $2->GetAsVaryingType(); }
    ;

new_expression
    : conditional_expression
    | rate_qualified_new rate_qualified_new_type
      {
          $$ = new NewExpr($1, $2, NULL, NULL, @1, Union(@1, @2));
      }
    | rate_qualified_new rate_qualified_new_type '(' initializer_list ')'
      {
          $$ = new NewExpr($1, $2, $4, NULL, @1, Union(@1, @2));
      }
    | rate_qualified_new rate_qualified_new_type '[' expression ']'
      {
          $$ = new NewExpr($1, $2, NULL, $4, @1, Union(@1, @4));
      }
    ;

assignment_expression
    : new_expression
    | unary_expression '=' assignment_expression
      { $$ = new AssignExpr(AssignExpr::Assign, $1, $3, Union(@1, @3)); }
    | unary_expression TOKEN_MUL_ASSIGN assignment_expression
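
Note: the three NewExpr productions above cover a bare "new", a parenthesized
initializer list, and a bracketed element count, each optionally rate-qualified
by "uniform" or "varying". A minimal sketch of the source syntax this grammar
appears to accept; the function and identifiers here are illustrative, not
taken from the commit:

// hypothetical ispc code exercising the new productions
uniform float * uniform mkbuf(uniform int count) {
    // "uniform new uniform float [ expr ]": one gang-wide array allocation
    uniform float * uniform buf = uniform new uniform float[count];
    for (uniform int i = 0; i < count; ++i)
        buf[i] = 0.;
    return buf;
}
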
@@ -1240,6 +1269,7 @@ statement
    | print_statement
    | assert_statement
    | sync_statement
    | delete_statement
    | error
      {
          std::vector<std::string> builtinTokens;

@@ -1265,9 +1295,17 @@ labeled_statement
          $$ = new LabeledStmt($1, $3, @1);
      }
    | TOKEN_CASE constant_expression ':' statement
      { UNIMPLEMENTED; }
      {
          int value;
          if ($2 != NULL &&
              lGetConstantInt($2, &value, @2, "Case statement value")) {
              $$ = new CaseStmt(value, $4, Union(@1, @2));
          }
          else
              $$ = NULL;
      }
    | TOKEN_DEFAULT ':' statement
      { UNIMPLEMENTED; }
      { $$ = new DefaultStmt($3, @1); }
    ;

start_scope

@@ -1313,7 +1351,7 @@ selection_statement
    | TOKEN_CIF '(' expression ')' statement TOKEN_ELSE statement
      { $$ = new IfStmt($3, $5, $7, true, @1); }
    | TOKEN_SWITCH '(' expression ')' statement
      { UNIMPLEMENTED; }
      { $$ = new SwitchStmt($3, $5, @1); }
    ;

for_test
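
Note: with CaseStmt, DefaultStmt, and SwitchStmt replacing the UNIMPLEMENTED
stubs, C-style switch statements get a parse path; per lGetConstantInt above,
case labels must reduce to compile-time integer constants. A hedged sketch of
what should now parse (a made-up example, not a test from this commit):

int classify(int x) {
    int r;
    switch (x & 3) {    // case values below are required to be constants
    case 0:
        r = 10;
        break;
    case 1:
        r = 20;
        break;
    default:
        r = 30;
        break;
    }
    return r;
}
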
@@ -1461,23 +1499,30 @@ jump_statement
    ;

sync_statement
    : TOKEN_SYNC
    : TOKEN_SYNC ';'
      { $$ = new ExprStmt(new SyncExpr(@1), @1); }
    ;

delete_statement
    : TOKEN_DELETE expression ';'
      {
          $$ = new DeleteStmt($2, Union(@1, @2));
      }
    ;

print_statement
    : TOKEN_PRINT '(' string_constant ')'
    : TOKEN_PRINT '(' string_constant ')' ';'
      {
          $$ = new PrintStmt(*$3, NULL, @1);
      }
    | TOKEN_PRINT '(' string_constant ',' argument_expression_list ')'
    | TOKEN_PRINT '(' string_constant ',' argument_expression_list ')' ';'
      {
          $$ = new PrintStmt(*$3, $5, @1);
      }
    ;

assert_statement
    : TOKEN_ASSERT '(' string_constant ',' expression ')'
    : TOKEN_ASSERT '(' string_constant ',' expression ')' ';'
      {
          $$ = new AssertStmt(*$3, $5, @1);
      }

@@ -1614,7 +1659,7 @@ lAddFunctionParams(Declarator *decl) {

/** Add a symbol for the built-in mask variable to the symbol table */
static void lAddMaskToSymbolTable(SourcePos pos) {
    const Type *t = g->target.isa == Target::GENERIC ?
    const Type *t = g->target.maskBitCount == 1 ?
        AtomicType::VaryingConstBool : AtomicType::VaryingConstUInt32;
    Symbol *maskSymbol = new Symbol("__mask", pos, t);
    m->symbolTable->AddVariable(maskSymbol);
140
run_tests.py
@@ -12,12 +12,14 @@ import re
import signal
import random
import string
import mutex
import subprocess
import shlex
import platform
import tempfile

# This script is affected by http://bugs.python.org/issue5261 on OSX 10.5 Leopard
# git history has a workaround for that issue.

is_windows = (platform.system() == 'Windows' or
              'CYGWIN_NT' in platform.system())

@@ -36,29 +38,32 @@ parser.add_option("-c", "--compiler", dest="compiler_exe", help="Compiler binary
                  default=None)
parser.add_option('-o', '--no-opt', dest='no_opt', help='Disable optimization',
                  default=False, action="store_true")
parser.add_option('-j', '--jobs', dest='num_jobs', help='Maximum number of jobs to run in parallel',
                  default="1024", type="int")
parser.add_option('-v', '--verbose', dest='verbose', help='Enable verbose output',
                  default=False, action="store_true")
if not is_windows:
    parser.add_option('--valgrind', dest='valgrind', help='Run tests with valgrind',
                      default=False, action="store_true")
    parser.add_option('--wrap-exe', dest='wrapexe',
                      help='Executable to wrap test runs with (e.g. "valgrind")',
                      default="")

(options, args) = parser.parse_args()

if not is_windows and options.valgrind:
    valgrind_cmd = "valgrind "
if not is_windows:
    ispc_exe = "./ispc"
else:
    valgrind_cmd = ""
    ispc_exe = "../Release/ispc.exe"

is_generic_target = options.target.find("generic-") != -1
is_generic_target = (options.target.find("generic-") != -1 and
                     options.target != "generic-1")
if is_generic_target and options.include_file == None:
    if options.target == "generic-4":
        print "No generics #include specified; using examples/intrinsics/sse4.h"
        sys.stderr.write("No generics #include specified; using examples/intrinsics/sse4.h\n")
        options.include_file = "examples/intrinsics/sse4.h"
    elif options.target == "generic-8":
        print "No generics #include specified and no default available for \"generic-8\" target.";
        sys.stderr.write("No generics #include specified and no default available for \"generic-8\" target.\n")
        sys.exit(1)
    elif options.target == "generic-16":
        print "No generics #include specified; using examples/intrinsics/generic-16.h"
        sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-16.h\n")
        options.include_file = "examples/intrinsics/generic-16.h"

if options.compiler_exe == None:
@@ -67,16 +72,33 @@ if options.compiler_exe == None:
    else:
        options.compiler_exe = "g++"

# if no specific test files are specified, run all of the tests in tests/
# and failing_tests/
def fix_windows_paths(files):
    ret = [ ]
    for fn in files:
        ret += [ string.replace(fn, '\\', '/') ]
    return ret


# if no specific test files are specified, run all of the tests in tests/,
# failing_tests/, and tests_errors/
if len(args) == 0:
    files = glob.glob("tests/*ispc") + glob.glob("failing_tests/*ispc") + \
        glob.glob("tests_errors/*ispc")
    files = fix_windows_paths(files)
else:
    files = [ ]
    if is_windows:
        argfiles = [ ]
        for f in args:
            # we have to glob ourselves if this is being run under a DOS
            # shell..
            argfiles += glob.glob(f)
    else:
        argfiles = args

    files = [ ]
    for f in argfiles:
        if os.path.splitext(string.lower(f))[1] != ".ispc":
            print "Ignoring file %s, which doesn't have an .ispc extension." % f
            sys.stdout.write("Ignoring file %s, which doesn't have an .ispc extension.\n" % f)
        else:
            files += [ f ]

@@ -88,18 +110,13 @@ if (options.random):
# counter
total_tests = 0

# We'd like to use the Lock class from the multiprocessing package to
# serialize accesses to finished_tests_counter. Unfortunately, the version of
# python that ships with OSX 10.5 has this bug:
# http://bugs.python.org/issue5261. Therefore, we use the (deprecated but
# still available) mutex class.
#finished_tests_counter_lock = multiprocessing.Lock()
finished_tests_mutex = mutex.mutex()
finished_tests_counter = multiprocessing.Value(c_int)
finished_tests_counter_lock = multiprocessing.Lock()

# utility routine to print an update on the number of tests that have been
# finished. Should be called with the mutex (or lock) held..
# finished. Should be called with the lock held..
def update_progress(fn):
    global total_tests
    finished_tests_counter.value = finished_tests_counter.value + 1
    progress_str = " Done %d / %d [%s]" % (finished_tests_counter.value, total_tests, fn)
    # spaces to clear out detrius from previous printing...

@@ -108,18 +125,18 @@ def update_progress(fn):
    progress_str += '\r'
    sys.stdout.write(progress_str)
    sys.stdout.flush()
    finished_tests_mutex.unlock()

def run_command(cmd):
    if options.verbose:
        print "Running: %s" % cmd
        sys.stdout.write("Running: %s\n" % cmd)
    sp = subprocess.Popen(shlex.split(cmd), stdin=None,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE)
    out = sp.communicate()
    output = ""
    output += out[0]
    output += out[1]
    output += out[0].decode("utf-8")
    output += out[1].decode("utf-8")

    return (sp.returncode, output)

# run the commands in cmd_list

@@ -128,9 +145,9 @@ def run_cmds(compile_cmds, run_cmd, filename, expect_failure):
    (return_code, output) = run_command(cmd)
    compile_failed = (return_code != 0)
    if compile_failed:
        print "Compilation of test %s failed " % filename
        sys.stdout.write("Compilation of test %s failed \n" % filename)
        if output != "":
            print "%s" % output
            sys.stdout.write("%s" % output)
        return (1, 0)

    (return_code, output) = run_command(run_cmd)

@@ -139,11 +156,11 @@ def run_cmds(compile_cmds, run_cmd, filename, expect_failure):
    surprise = ((expect_failure and not run_failed) or
                (not expect_failure and run_failed))
    if surprise == True:
        print "Test %s %s (return code %d) " % \
        sys.stderr.write("Test %s %s (return code %d) \n" % \
            (filename, "unexpectedly passed" if expect_failure else "failed",
             return_code)
             return_code))
        if output != "":
            print "%s" % output
            sys.stdout.write("%s\n" % output)
    if surprise == True:
        return (0, 1)
    else:

@@ -160,7 +177,7 @@ def run_test(filename):
    # is this a test to make sure an error is issued?
    want_error = (filename.find("tests_errors") != -1)
    if want_error == True:
        ispc_cmd = "ispc --werror --nowrap %s --arch=%s --target=%s" % \
        ispc_cmd = ispc_exe + " --werror --nowrap %s --arch=%s --target=%s" % \
            (input_prefix + filename, options.arch, options.target)
        (return_code, output) = run_command(ispc_cmd)
        got_error = (return_code != 0)

@@ -168,18 +185,17 @@ def run_test(filename):
        # figure out the error message we're expecting
        file = open(input_prefix + filename, 'r')
        firstline = file.readline()
        firstline = string.replace(firstline, "//", "")
        firstline = string.lstrip(firstline)
        firstline = string.rstrip(firstline)
        firstline = firstline.replace("//", "")
        firstline = firstline.lstrip()
        firstline = firstline.rstrip()
        file.close()

        if (output.find(firstline) == -1):
            print "OUT %s" % filename
            print "Didnt see expected error message %s from test %s.\nActual output:\n%s" % \
                (firstline, filename, output)
            sys.stderr.write("Didn't see expected error message %s from test %s.\nActual output:\n%s\n" % \
                (firstline, filename, output))
            return (1, 0)
        elif got_error == False:
            print "Unexpectedly no errors issued from test %s" % filename
            sys.stderr.write("Unexpectedly no errors issued from test %s\n" % filename)
            return (1, 0)
        else:
            return (0, 0)
@@ -199,17 +215,17 @@ def run_test(filename):
            continue
        # one of them should have a function with one of the
        # declarations in sig2def
        for pattern, ident in sig2def.items():
        for pattern, ident in list(sig2def.items()):
            if line.find(pattern) != -1:
                match = ident
                break
    file.close()
    if match == -1:
        print "Fatal error: unable to find function signature " + \
            "in test %s" % filename
        sys.stderr.write("Fatal error: unable to find function signature " + \
            "in test %s\n" % filename)
        return (1, 0)
    else:
        is_generic_target = options.target.find("generic-") != -1
        global is_generic_target
        if is_generic_target:
            obj_name = "%s.cpp" % filename

@@ -218,7 +234,7 @@ def run_test(filename):
            obj_name = "%s%s.obj" % (input_prefix, filename)
            exe_name = "%s%s.exe" % (input_prefix, filename)

            cc_cmd = "%s /I. /Zi /nologo /DTEST_SIG=%d %stest_static.cpp %s /Fe%s" % \
            cc_cmd = "%s /I. /I../winstuff /Zi /nologo /DTEST_SIG=%d %stest_static.cpp %s /Fe%s" % \
                (options.compiler_exe, match, input_prefix, obj_name, exe_name)
            if should_fail:
                cc_cmd += " /DEXPECT_FAILURE"

@@ -238,7 +254,7 @@ def run_test(filename):
            if should_fail:
                cc_cmd += " -DEXPECT_FAILURE"

        ispc_cmd = "ispc --woff %s -o %s --arch=%s --target=%s" % \
        ispc_cmd = ispc_exe + " --woff %s -o %s --arch=%s --target=%s" % \
            (input_prefix+filename, obj_name, options.arch, options.target)
        if options.no_opt:
            ispc_cmd += " -O0"

@@ -246,17 +262,17 @@ def run_test(filename):
            ispc_cmd += " --emit-c++ --c++-include-file=%s" % options.include_file

    # compile the ispc code, make the executable, and run it...
    global valgrind_cmd
    (compile_error, run_error) = run_cmds([ispc_cmd, cc_cmd],
                                          valgrind_cmd + " " + exe_name, \
                                          options.wrapexe + " " + exe_name, \
                                          filename, should_fail)

    # clean up after running the test
    try:
        if not run_error:
            os.unlink(exe_name)
            if is_windows:
                os.unlink(filename + ".pdb")
                os.unlink(filename + ".ilk")
                os.unlink("%s%s.pdb" % (input_prefix, filename))
                os.unlink("%s%s.ilk" % (input_prefix, filename))
        os.unlink(obj_name)
    except:
        None

@@ -297,11 +313,8 @@ def run_tasks_from_queue(queue, queue_ret):
        if run_error != 0:
            run_error_files += [ filename ]

        # If not for http://bugs.python.org/issue5261 on OSX, we'd like to do this:
        #with finished_tests_counter_lock:
        #update_progress(filename)
        # but instead we do this...
        finished_tests_mutex.lock(update_progress, filename)
        with finished_tests_counter_lock:
            update_progress(filename)

task_threads = []

@@ -315,12 +328,15 @@ if __name__ == '__main__':

    compile_error_files = [ ]
    run_error_files = [ ]
    nthreads = multiprocessing.cpu_count()
    print "Found %d CPUs. Running %d tests." % (nthreads, total_tests)

    nthreads = min(multiprocessing.cpu_count(), options.num_jobs)
    sys.stdout.write("Running %d jobs in parallel. Running %d tests.\n" % (nthreads, total_tests))

    # put each of the test filenames into a queue
    q = multiprocessing.Queue()
    for fn in files:
        if is_windows:
            fn = fn.replace("\\",'/')
        q.put(fn)
    for x in range(nthreads):
        q.put('STOP')

@@ -340,7 +356,7 @@ if __name__ == '__main__':
    # (i.e. return 0 if all is ok)
    for t in task_threads:
        t.join()
    print
    sys.stdout.write("\n")

    while not qret.empty():
        (c, r) = qret.get()

@@ -349,13 +365,13 @@ if __name__ == '__main__':

    if len(compile_error_files) > 0:
        compile_error_files.sort()
        print "%d / %d tests FAILED compilation:" % (len(compile_error_files), total_tests)
        sys.stdout.write("%d / %d tests FAILED compilation:\n" % (len(compile_error_files), total_tests))
        for f in compile_error_files:
            print "\t%s" % f
            sys.stdout.write("\t%s\n" % f)
    if len(run_error_files) > 0:
        run_error_files.sort()
        print "%d / %d tests FAILED execution:" % (len(run_error_files), total_tests)
        sys.stdout.write("%d / %d tests FAILED execution:\n" % (len(run_error_files), total_tests))
        for f in run_error_files:
            print "\t%s" % f
            sys.stdout.write("\t%s\n" % f)

sys.exit(len(compile_error_files) + len(run_error_files))
676
stdlib.ispc
@@ -787,165 +787,14 @@ packed_store_active(uniform int * uniform a, int vals) {
///////////////////////////////////////////////////////////////////////////
// System information

static inline int num_cores() {
static inline uniform int num_cores() {
    return __num_cores();
}

///////////////////////////////////////////////////////////////////////////
// Atomics and memory barriers

static inline void memory_barrier() {
    __memory_barrier();
static inline uniform int64 clock() {
    return __clock();
}

#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE) \
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
    memory_barrier(); \
    TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \
    memory_barrier(); \
    return ret; \
} \
static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                               uniform TA value) { \
    memory_barrier(); \
    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
                                                            (MASKTYPE)__mask); \
    memory_barrier(); \
    return ret; \
} \
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
    uniform TA * uniform ptrArray[programCount]; \
    ptrArray[programIndex] = ptr; \
    memory_barrier(); \
    TA ret; \
    uniform int mask = lanemask(); \
    for (uniform int i = 0; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        uniform TA * uniform p = ptrArray[i]; \
        uniform TA v = extract(value, i); \
        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v, \
                                                              (MASKTYPE)__mask); \
        ret = insert(ret, i, r); \
    } \
    memory_barrier(); \
    return ret; \
} \

#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB, MASKTYPE) \
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
    uniform TA oneval = reduce_##OPA(value); \
    TA ret; \
    if (lanemask() != 0) { \
        memory_barrier(); \
        ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval, \
                                                     (MASKTYPE)__mask); \
        memory_barrier(); \
    } \
    return ret; \
} \
static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                               uniform TA value) { \
    memory_barrier(); \
    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
                                                            (MASKTYPE)__mask); \
    memory_barrier(); \
    return ret; \
} \
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
                                       TA value) { \
    uniform TA * uniform ptrArray[programCount]; \
    ptrArray[programIndex] = ptr; \
    memory_barrier(); \
    TA ret; \
    uniform int mask = lanemask(); \
    for (uniform int i = 0; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        uniform TA * uniform p = ptrArray[i]; \
        uniform TA v = extract(value, i); \
        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v, \
                                                              (MASKTYPE)__mask); \
        ret = insert(ret, i, r); \
    } \
    memory_barrier(); \
    return ret; \
}

DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,swap,swap,IntMaskType)

// For everything but atomic min and max, we can use the same
// implementations for unsigned as for signed.
DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,UIntMaskType)

DEFINE_ATOMIC_OP(float,float,swap,swap,IntMaskType)

DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,swap,swap,IntMaskType)

// For everything but atomic min and max, we can use the same
// implementations for unsigned as for signed.
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,UIntMaskType)

DEFINE_ATOMIC_OP(double,double,swap,swap,IntMaskType)

#undef DEFINE_ATOMIC_OP

#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE) \
static inline TA atomic_compare_exchange_global( \
    uniform TA * uniform ptr, TA oldval, TA newval) { \
    memory_barrier(); \
    TA ret = __atomic_compare_exchange_##TB##_global(ptr, oldval, newval, \
                                                     (MASKTYPE)__mask); \
    memory_barrier(); \
    return ret; \
} \
static inline uniform TA atomic_compare_exchange_global( \
    uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
    memory_barrier(); \
    uniform TA ret = \
        __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval, \
                                                        (MASKTYPE)__mask); \
    memory_barrier(); \
    return ret; \
}

ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType)
ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType)
ATOMIC_DECL_CMPXCHG(float, float, IntMaskType)
ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType)
ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType)
ATOMIC_DECL_CMPXCHG(double, double, IntMaskType)

#undef ATOMIC_DECL_CMPXCHG

///////////////////////////////////////////////////////////////////////////
// Floating-Point Math
@@ -1329,6 +1178,419 @@ static inline uniform int64 clamp(uniform int64 v, uniform int64 low,
    return min(max(v, low), high);
}

///////////////////////////////////////////////////////////////////////////
// Global atomics and memory barriers

static inline void memory_barrier() {
    __memory_barrier();
}

#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE) \
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
    memory_barrier(); \
    TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \
    memory_barrier(); \
    return ret; \
} \
static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                               uniform TA value) { \
    memory_barrier(); \
    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
    memory_barrier(); \
    return ret; \
} \
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
    uniform TA * uniform ptrArray[programCount]; \
    ptrArray[programIndex] = ptr; \
    memory_barrier(); \
    TA ret; \
    uniform int mask = lanemask(); \
    for (uniform int i = 0; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        uniform TA * uniform p = ptrArray[i]; \
        uniform TA v = extract(value, i); \
        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
        ret = insert(ret, i, r); \
    } \
    memory_barrier(); \
    return ret; \
} \

#define DEFINE_ATOMIC_SWAP(TA,TB) \
static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
    memory_barrier(); \
    uniform int i = 0; \
    TA ret[programCount]; \
    TA memVal; \
    uniform int lastSwap; \
    uniform int mask = lanemask(); \
    /* First, have the first running program instance (if any) perform \
       the swap with memory with its value of "value"; record the \
       value returned. */ \
    for (; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        memVal = __atomic_swap_uniform_##TB##_global(ptr, extract(value, i)); \
        lastSwap = i; \
        break; \
    } \
    /* Now, for all of the remaining running program instances, set the \
       return value of the last instance that did a swap with this \
       instance's value of "value"; this gives the same effect as if the \
       current instance had executed a hardware atomic swap right before \
       the last one that did a swap. */ \
    for (; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        ret[lastSwap] = extract(value, i); \
        lastSwap = i; \
    } \
    /* And the last instance that wanted to swap gets the value we \
       originally got back from memory... */ \
    ret[lastSwap] = memVal; \
    memory_barrier(); \
    return ret[programIndex]; \
} \
static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \
                                            uniform TA value) { \
    memory_barrier(); \
    uniform TA ret = __atomic_swap_uniform_##TB##_global(ptr, value); \
    memory_barrier(); \
    return ret; \
} \
static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
    uniform TA * uniform ptrArray[programCount]; \
    ptrArray[programIndex] = ptr; \
    memory_barrier(); \
    TA ret; \
    uniform int mask = lanemask(); \
    for (uniform int i = 0; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        uniform TA * uniform p = ptrArray[i]; \
        uniform TA v = extract(value, i); \
        uniform TA r = __atomic_swap_uniform_##TB##_global(p, v); \
        ret = insert(ret, i, r); \
    } \
    memory_barrier(); \
    return ret; \
} \

#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB) \
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
    uniform TA oneval = reduce_##OPA(value); \
    TA ret; \
    if (lanemask() != 0) { \
        memory_barrier(); \
        ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval); \
        memory_barrier(); \
    } \
    return ret; \
} \
static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                               uniform TA value) { \
    memory_barrier(); \
    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
    memory_barrier(); \
    return ret; \
} \
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
                                       TA value) { \
    uniform TA * uniform ptrArray[programCount]; \
    ptrArray[programIndex] = ptr; \
    memory_barrier(); \
    TA ret; \
    uniform int mask = lanemask(); \
    for (uniform int i = 0; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        uniform TA * uniform p = ptrArray[i]; \
        uniform TA v = extract(value, i); \
        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
        ret = insert(ret, i, r); \
    } \
    memory_barrier(); \
    return ret; \
}

DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType)
DEFINE_ATOMIC_SWAP(int32,int32)

// For everything but atomic min and max, we can use the same
// implementations for unsigned as for signed.
DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType)
DEFINE_ATOMIC_SWAP(unsigned int32,int32)

DEFINE_ATOMIC_SWAP(float,float)

DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType)
DEFINE_ATOMIC_SWAP(int64,int64)

// For everything but atomic min and max, we can use the same
// implementations for unsigned as for signed.
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType)
DEFINE_ATOMIC_SWAP(unsigned int64,int64)

DEFINE_ATOMIC_SWAP(double,double)

#undef DEFINE_ATOMIC_OP
#undef DEFINE_ATOMIC_MINMAX_OP
#undef DEFINE_ATOMIC_SWAP

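Note: each DEFINE_ATOMIC_OP / DEFINE_ATOMIC_SWAP instantiation above emits
three overloads: a varying value against a uniform pointer, a fully uniform
form, and a varying-pointer form handled lane by lane. A usage sketch under
those signatures; the counter and function are hypothetical:

uniform int32 counter = 0;

void tally(uniform float RET[]) {
    // each executing lane atomically adds its own amount; the return is
    // the memory contents observed just before that lane's update
    int32 before = atomic_add_global(&counter, programIndex + 1);
    RET[programIndex] = before;
}
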
#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE) \
static inline uniform TA atomic_compare_exchange_global( \
    uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
    memory_barrier(); \
    uniform TA ret = \
        __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval); \
    memory_barrier(); \
    return ret; \
} \
static inline TA atomic_compare_exchange_global( \
    uniform TA * uniform ptr, TA oldval, TA newval) { \
    memory_barrier(); \
    TA ret = __atomic_compare_exchange_##TB##_global(ptr, oldval, newval, \
                                                     (MASKTYPE)__mask); \
    memory_barrier(); \
    return ret; \
} \
static inline TA atomic_compare_exchange_global( \
    uniform TA * varying ptr, TA oldval, TA newval) { \
    uniform TA * uniform ptrArray[programCount]; \
    ptrArray[programIndex] = ptr; \
    memory_barrier(); \
    TA ret; \
    uniform int mask = lanemask(); \
    for (uniform int i = 0; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        uniform TA r = \
            __atomic_compare_exchange_uniform_##TB##_global(ptrArray[i], \
                                                            extract(oldval, i), \
                                                            extract(newval, i)); \
        ret = insert(ret, i, r); \
    } \
    memory_barrier(); \
    return ret; \
}

ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType)
ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType)
ATOMIC_DECL_CMPXCHG(float, float, IntMaskType)
ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType)
ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType)
ATOMIC_DECL_CMPXCHG(double, double, IntMaskType)

#undef ATOMIC_DECL_CMPXCHG

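Note: atomic_compare_exchange_global returns the value that was in memory, so
a successful exchange is detected by comparing the return against oldval. A
small sketch; the guard variable and function are illustrative only:

uniform int32 guard = 0;

uniform bool try_acquire() {
    // succeeds iff guard held 0 and was atomically replaced with 1
    return atomic_compare_exchange_global(&guard, 0, 1) == 0;
}
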
///////////////////////////////////////////////////////////////////////////
// local atomics

#define LOCAL_ATOMIC(TYPE,NAME,OPFUNC) \
static inline uniform TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, \
                                                 uniform TYPE value) { \
    uniform TYPE ret = *ptr; \
    *ptr = OPFUNC(*ptr, value); \
    return ret; \
} \
static inline TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, TYPE value) { \
    TYPE ret; \
    uniform int mask = lanemask(); \
    for (uniform int i = 0; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        ret = insert(ret, i, *ptr); \
        *ptr = OPFUNC(*ptr, extract(value, i)); \
    } \
    return ret; \
} \
static inline TYPE atomic_##NAME##_local(uniform TYPE * p, TYPE value) { \
    TYPE ret; \
    uniform TYPE * uniform ptrs[programCount]; \
    ptrs[programIndex] = p; \
    uniform int mask = lanemask(); \
    for (uniform int i = 0; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        ret = insert(ret, i, *ptrs[i]); \
        *ptrs[i] = OPFUNC(*ptrs[i], extract(value, i)); \
    } \
    return ret; \
}

static inline uniform int32 __add(uniform int32 a, uniform int32 b) { return a+b; }
static inline uniform int32 __sub(uniform int32 a, uniform int32 b) { return a-b; }
static inline uniform int32 __and(uniform int32 a, uniform int32 b) { return a & b; }
static inline uniform int32 __or(uniform int32 a, uniform int32 b) { return a | b; }
static inline uniform int32 __xor(uniform int32 a, uniform int32 b) { return a ^ b; }
static inline uniform int32 __swap(uniform int32 a, uniform int32 b) { return b; }

static inline uniform unsigned int32 __add(uniform unsigned int32 a,
                                           uniform unsigned int32 b) { return a+b; }
static inline uniform unsigned int32 __sub(uniform unsigned int32 a,
                                           uniform unsigned int32 b) { return a-b; }
static inline uniform unsigned int32 __and(uniform unsigned int32 a,
                                           uniform unsigned int32 b) { return a & b; }
static inline uniform unsigned int32 __or(uniform unsigned int32 a,
                                          uniform unsigned int32 b) { return a | b; }
static inline uniform unsigned int32 __xor(uniform unsigned int32 a,
                                           uniform unsigned int32 b) { return a ^ b; }
static inline uniform unsigned int32 __swap(uniform unsigned int32 a,
                                            uniform unsigned int32 b) { return b; }


static inline uniform float __add(uniform float a, uniform float b) { return a+b; }
static inline uniform float __sub(uniform float a, uniform float b) { return a-b; }
static inline uniform float __swap(uniform float a, uniform float b) { return b; }

static inline uniform int64 __add(uniform int64 a, uniform int64 b) { return a+b; }
static inline uniform int64 __sub(uniform int64 a, uniform int64 b) { return a-b; }
static inline uniform int64 __and(uniform int64 a, uniform int64 b) { return a & b; }
static inline uniform int64 __or(uniform int64 a, uniform int64 b) { return a | b; }
static inline uniform int64 __xor(uniform int64 a, uniform int64 b) { return a ^ b; }
static inline uniform int64 __swap(uniform int64 a, uniform int64 b) { return b; }

static inline uniform unsigned int64 __add(uniform unsigned int64 a,
                                           uniform unsigned int64 b) { return a+b; }
static inline uniform unsigned int64 __sub(uniform unsigned int64 a,
                                           uniform unsigned int64 b) { return a-b; }
static inline uniform unsigned int64 __and(uniform unsigned int64 a,
                                           uniform unsigned int64 b) { return a & b; }
static inline uniform unsigned int64 __or(uniform unsigned int64 a,
                                          uniform unsigned int64 b) { return a | b; }
static inline uniform unsigned int64 __xor(uniform unsigned int64 a,
                                           uniform unsigned int64 b) { return a ^ b; }
static inline uniform unsigned int64 __swap(uniform unsigned int64 a,
                                            uniform unsigned int64 b) { return b; }

static inline uniform double __add(uniform double a, uniform double b) { return a+b; }
static inline uniform double __sub(uniform double a, uniform double b) { return a-b; }
static inline uniform double __swap(uniform double a, uniform double b) { return b; }

LOCAL_ATOMIC(int32, add, __add)
LOCAL_ATOMIC(int32, subtract, __sub)
LOCAL_ATOMIC(int32, and, __and)
LOCAL_ATOMIC(int32, or, __or)
LOCAL_ATOMIC(int32, xor, __xor)
LOCAL_ATOMIC(int32, min, min)
LOCAL_ATOMIC(int32, max, max)
LOCAL_ATOMIC(int32, swap, __swap)

LOCAL_ATOMIC(unsigned int32, add, __add)
LOCAL_ATOMIC(unsigned int32, subtract, __sub)
LOCAL_ATOMIC(unsigned int32, and, __and)
LOCAL_ATOMIC(unsigned int32, or, __or)
LOCAL_ATOMIC(unsigned int32, xor, __xor)
LOCAL_ATOMIC(unsigned int32, min, min)
LOCAL_ATOMIC(unsigned int32, max, max)
LOCAL_ATOMIC(unsigned int32, swap, __swap)

LOCAL_ATOMIC(float, add, __add)
LOCAL_ATOMIC(float, subtract, __sub)
LOCAL_ATOMIC(float, min, min)
LOCAL_ATOMIC(float, max, max)
LOCAL_ATOMIC(float, swap, __swap)

LOCAL_ATOMIC(int64, add, __add)
LOCAL_ATOMIC(int64, subtract, __sub)
LOCAL_ATOMIC(int64, and, __and)
LOCAL_ATOMIC(int64, or, __or)
LOCAL_ATOMIC(int64, xor, __xor)
LOCAL_ATOMIC(int64, min, min)
LOCAL_ATOMIC(int64, max, max)
LOCAL_ATOMIC(int64, swap, __swap)

LOCAL_ATOMIC(unsigned int64, add, __add)
LOCAL_ATOMIC(unsigned int64, subtract, __sub)
LOCAL_ATOMIC(unsigned int64, and, __and)
LOCAL_ATOMIC(unsigned int64, or, __or)
LOCAL_ATOMIC(unsigned int64, xor, __xor)
LOCAL_ATOMIC(unsigned int64, min, min)
LOCAL_ATOMIC(unsigned int64, max, max)
LOCAL_ATOMIC(unsigned int64, swap, __swap)

LOCAL_ATOMIC(double, add, __add)
LOCAL_ATOMIC(double, subtract, __sub)
LOCAL_ATOMIC(double, min, min)
LOCAL_ATOMIC(double, max, max)
LOCAL_ATOMIC(double, swap, __swap)

// compare exchange
#define LOCAL_CMPXCHG(TYPE) \
static inline uniform TYPE atomic_compare_exchange_local(uniform TYPE * uniform ptr, \
                                                         uniform TYPE cmp, \
                                                         uniform TYPE update) { \
    uniform TYPE old = *ptr; \
    if (old == cmp) \
        *ptr = update; \
    return old; \
} \
static inline TYPE atomic_compare_exchange_local(uniform TYPE * uniform ptr, \
                                                 TYPE cmp, TYPE update) { \
    TYPE ret; \
    uniform int mask = lanemask(); \
    for (uniform int i = 0; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        uniform TYPE old = *ptr; \
        if (old == extract(cmp, i)) \
            *ptr = extract(update, i); \
        ret = insert(ret, i, old); \
    } \
    return ret; \
} \
static inline TYPE atomic_compare_exchange_local(uniform TYPE * varying p, \
                                                 TYPE cmp, TYPE update) { \
    uniform TYPE * uniform ptrs[programCount]; \
    ptrs[programIndex] = p; \
    TYPE ret; \
    uniform int mask = lanemask(); \
    for (uniform int i = 0; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        uniform TYPE old = *ptrs[i]; \
        if (old == extract(cmp, i)) \
            *ptrs[i] = extract(update, i); \
        ret = insert(ret, i, old); \
    } \
    return ret; \
}

LOCAL_CMPXCHG(int32)
LOCAL_CMPXCHG(unsigned int32)
LOCAL_CMPXCHG(float)
LOCAL_CMPXCHG(int64)
LOCAL_CMPXCHG(unsigned int64)
LOCAL_CMPXCHG(double)

#undef LOCAL_ATOMIC
#undef LOCAL_CMPXCHG

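Note: unlike the _global forms, these _local operations never reach for
hardware atomics; they serialize the gang's own lanes with an extract/insert
loop, so they are safe only against collisions among program instances in the
same gang, not across concurrently launched tasks. A hypothetical use:

void histogram(uniform int32 counts[], int bin) {
    // lane-serialized read-modify-write of possibly colliding bins;
    // correct within this gang only
    atomic_add_local(&counts[bin], 1);
}
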
///////////////////////////////////////////////////////////////////////////
// Transcendentals (float precision)

@@ -2764,6 +3026,10 @@ static inline uniform double pow(uniform double a, uniform double b) {
// half-precision floats

static inline uniform float half_to_float(uniform unsigned int16 h) {
    if (__have_native_half) {
        return __half_to_float_uniform(h);
    }
    else {
        if ((h & 0x7FFFu) == 0)
            // Signed zero
            return floatbits(((unsigned int32) h) << 16);

@@ -2819,8 +3085,13 @@ static inline uniform float half_to_float(uniform unsigned int16 h) {
            }
        }
    }
}

static inline float half_to_float(unsigned int16 h) {
    if (__have_native_half) {
        return __half_to_float_varying(h);
    }
    else {
        if ((h & 0x7FFFu) == 0)
            // Signed zero
            return floatbits(((unsigned int32) h) << 16);

@@ -2876,9 +3147,14 @@ static inline float half_to_float(unsigned int16 h) {
            }
        }
    }
}


static inline uniform int16 float_to_half(uniform float f) {
    if (__have_native_half) {
        return __float_to_half_uniform(f);
    }
    else {
        uniform int32 x = intbits(f);
        // Store the return value in an int32 until the very end; this ends up
        // generating better code...

@@ -2942,9 +3218,14 @@ static inline uniform int16 float_to_half(uniform float f) {
        }
        return (int16)ret;
    }
}


static inline int16 float_to_half(float f) {
    if (__have_native_half) {
        return __float_to_half_varying(f);
    }
    else {
        int32 x = intbits(f);
        // Store the return value in an int32 until the very end; this ends up
        // generating better code...

@@ -3008,9 +3289,14 @@ static inline int16 float_to_half(float f) {
        }
        return (int16)ret;
    }
}


static inline uniform float half_to_float_fast(uniform unsigned int16 h) {
    if (__have_native_half) {
        return __half_to_float_uniform(h);
    }
    else {
        uniform unsigned int32 hs = h & (int32)0x8000u;  // Pick off sign bit
        uniform unsigned int32 he = h & (int32)0x7C00u;  // Pick off exponent bits
        uniform unsigned int32 hm = h & (int32)0x03FFu;  // Pick off mantissa bits

@@ -3024,10 +3310,14 @@ static inline uniform float half_to_float_fast(uniform unsigned int16 h) {
        // Mantissa
        uniform unsigned int32 xm = ((unsigned int32) hm) << 13;
        return floatbits(xs | xe | xm);
    }
}

static inline float half_to_float_fast(unsigned int16 h) {
    if (__have_native_half) {
        return __half_to_float_varying(h);
    }
    else {
        unsigned int32 hs = h & (int32)0x8000u;  // Pick off sign bit
        unsigned int32 he = h & (int32)0x7C00u;  // Pick off exponent bits
        unsigned int32 hm = h & (int32)0x03FFu;  // Pick off mantissa bits

@@ -3041,10 +3331,14 @@ static inline float half_to_float_fast(unsigned int16 h) {
        // Mantissa
        unsigned int32 xm = ((unsigned int32) hm) << 13;
        return floatbits(xs | xe | xm);
    }
}

static inline uniform int16 float_to_half_fast(uniform float f) {
    if (__have_native_half) {
        return __float_to_half_uniform(f);
    }
    else {
        uniform int32 x = intbits(f);
        uniform unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
        uniform unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits

@@ -3063,8 +3357,13 @@ static inline uniform int16 float_to_half_fast(uniform float f) {

        return (int16)ret;
    }
}

static inline int16 float_to_half_fast(float f) {
    if (__have_native_half) {
        return __float_to_half_varying(f);
    }
    else {
        int32 x = intbits(f);
        unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
        unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits

@@ -3083,6 +3382,7 @@ static inline int16 float_to_half_fast(float f) {

        return (int16)ret;
    }
}
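
Note: both the full and _fast half conversions now try a native path first via
__have_native_half. Judging from the fallbacks shown, the _fast variants keep
only the sign/exponent/mantissa bit shuffling, presumably trading away the
special-case handling of the full versions. A round-trip sketch; the driver
functions are hypothetical, the conversions are the stdlib's:

void compact(uniform float src[], uniform int16 dst[], uniform int n) {
    foreach (i = 0 ... n)
        dst[i] = float_to_half(src[i]);
}

void expand(uniform int16 src[], uniform float dst[], uniform int n) {
    foreach (i = 0 ... n)
        dst[i] = half_to_float((unsigned int16)src[i]);
}
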
///////////////////////////////////////////////////////////////////////////
// RNG stuff

@@ -3095,16 +3395,15 @@ static inline unsigned int random(RNGState * uniform state)
{
    unsigned int b;

    // FIXME: state->z1, etc..
    b = (((*state).z1 << 6) ^ (*state).z1) >> 13;
    (*state).z1 = (((*state).z1 & 4294967294U) << 18) ^ b;
    b = (((*state).z2 << 2) ^ (*state).z2) >> 27;
    (*state).z2 = (((*state).z2 & 4294967288U) << 2) ^ b;
    b = (((*state).z3 << 13) ^ (*state).z3) >> 21;
    (*state).z3 = (((*state).z3 & 4294967280U) << 7) ^ b;
    b = (((*state).z4 << 3) ^ (*state).z4) >> 12;
    (*state).z4 = (((*state).z4 & 4294967168U) << 13) ^ b;
    return ((*state).z1 ^ (*state).z2 ^ (*state).z3 ^ (*state).z4);
    b = ((state->z1 << 6) ^ state->z1) >> 13;
    state->z1 = ((state->z1 & 4294967294U) << 18) ^ b;
    b = ((state->z2 << 2) ^ state->z2) >> 27;
    state->z2 = ((state->z2 & 4294967288U) << 2) ^ b;
    b = ((state->z3 << 13) ^ state->z3) >> 21;
    state->z3 = ((state->z3 & 4294967280U) << 7) ^ b;
    b = ((state->z4 << 3) ^ state->z4) >> 12;
    state->z4 = ((state->z4 & 4294967168U) << 13) ^ b;
    return (state->z1 ^ state->z2 ^ state->z3 ^ state->z4);
}

static inline float frandom(RNGState * uniform state)

@@ -3120,35 +3419,43 @@ static inline uniform unsigned int __seed4(RNGState * uniform state,
    uniform unsigned int c1 = 0xf0f0f0f0;
    uniform unsigned int c2 = 0x0f0f0f0f;

    (*state).z1 = insert((*state).z1, start + 0, seed);
    (*state).z1 = insert((*state).z1, start + 1, seed ^ c1);
    (*state).z1 = insert((*state).z1, start + 2, (seed << 3) ^ c1);
    (*state).z1 = insert((*state).z1, start + 3, (seed << 2) ^ c2);
    state->z1 = insert(state->z1, start + 0, seed);
    state->z1 = insert(state->z1, start + 1, seed ^ c1);
    state->z1 = insert(state->z1, start + 2, (seed << 3) ^ c1);
    state->z1 = insert(state->z1, start + 3, (seed << 2) ^ c2);

    seed += 131;
    (*state).z2 = insert((*state).z2, start + 0, seed);
    (*state).z2 = insert((*state).z2, start + 1, seed ^ c1);
    (*state).z2 = insert((*state).z2, start + 2, (seed << 3) ^ c1);
    (*state).z2 = insert((*state).z2, start + 3, (seed << 2) ^ c2);
    state->z2 = insert(state->z2, start + 0, seed);
    state->z2 = insert(state->z2, start + 1, seed ^ c1);
    state->z2 = insert(state->z2, start + 2, (seed << 3) ^ c1);
    state->z2 = insert(state->z2, start + 3, (seed << 2) ^ c2);

    seed ^= extract((*state).z2, 2);
    (*state).z3 = insert((*state).z3, start + 0, seed);
    (*state).z3 = insert((*state).z3, start + 1, seed ^ c1);
    (*state).z3 = insert((*state).z3, start + 2, (seed << 3) ^ c1);
    (*state).z3 = insert((*state).z3, start + 3, (seed << 2) ^ c2);
    seed ^= extract(state->z2, 2);
    state->z3 = insert(state->z3, start + 0, seed);
    state->z3 = insert(state->z3, start + 1, seed ^ c1);
    state->z3 = insert(state->z3, start + 2, (seed << 3) ^ c1);
    state->z3 = insert(state->z3, start + 3, (seed << 2) ^ c2);

    seed <<= 4;
    seed += 3;
    seed ^= extract((*state).z1, 3);
    (*state).z4 = insert((*state).z4, start + 0, seed);
    (*state).z4 = insert((*state).z4, start + 1, seed ^ c1);
    (*state).z4 = insert((*state).z4, start + 2, (seed << 3) ^ c1);
    (*state).z4 = insert((*state).z4, start + 3, (seed << 2) ^ c2);
    seed ^= extract(state->z1, 3);
    state->z4 = insert(state->z4, start + 0, seed);
    state->z4 = insert(state->z4, start + 1, seed ^ c1);
    state->z4 = insert(state->z4, start + 2, (seed << 3) ^ c1);
    state->z4 = insert(state->z4, start + 3, (seed << 2) ^ c2);

    return seed;
}

static inline void seed_rng(uniform RNGState * uniform state, uniform unsigned int seed) {
    if (programCount == 1) {
        state->z1 = seed;
        state->z2 = seed ^ 0xbeeff00d;
        state->z3 = ((seed & 0xffff) << 16) | (seed >> 16);
        state->z4 = (((seed & 0xff) << 24) | ((seed & 0xff00) << 8) |
                     ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24);
    }
    else {
        seed = __seed4(state, 0, seed);
        if (programCount == 8)
            __seed4(state, 4, seed ^ 0xbeeff00d);

@@ -3159,6 +3466,7 @@ static inline void seed_rng(uniform RNGState * uniform state, uniform unsigned i
                     ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24));
    }
}

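Note: seed_rng now special-cases a one-wide gang with plain scalar seeding and
otherwise derives decorrelated per-lane streams through __seed4. The usual
calling pattern, sketched under the signatures above (the wrapper function is
hypothetical, and it assumes frandom accepts the same state pointer):

void noise(uniform float buf[], uniform int n, uniform unsigned int seed) {
    uniform RNGState state;        // one gang-wide generator state
    seed_rng(&state, seed);
    foreach (i = 0 ... n)
        buf[i] = frandom(&state);  // varying pseudo-random floats
}
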
static inline void fastmath() {
    __fastmath();
@@ -4,11 +4,15 @@ import sys

t=str(sys.argv[1])

print "char stdlib_" + t + "_code[] = { "
sys.stdout.write("char stdlib_" + t + "_code[] = {\n")

for line in sys.stdin:
    for c in line:
        print ord(c)
        print ", "
width = 16
data = sys.stdin.read()
for i in range(0, len(data), 1):
    sys.stdout.write("0x%0.2X, " % ord(data[i:i+1]))

    if i%width == (width-1):
        sys.stdout.write("\n")

sys.stdout.write("0x00 };\n\n")

print "0 };"

76
stmt.h
@@ -282,6 +282,60 @@ public:
};


/** Statement corresponding to a "case" label in the program. In addition
    to the value associated with the "case", this statement also stores the
    statements following it. */
class CaseStmt : public Stmt {
public:
    CaseStmt(int value, Stmt *stmt, SourcePos pos);

    void EmitCode(FunctionEmitContext *ctx) const;
    void Print(int indent) const;

    Stmt *TypeCheck();
    int EstimateCost() const;

    /** Integer value after the "case" statement */
    const int value;
    Stmt *stmts;
};


/** Statement for a "default" label (as would be found inside a "switch"
    statement). */
class DefaultStmt : public Stmt {
public:
    DefaultStmt(Stmt *stmt, SourcePos pos);

    void EmitCode(FunctionEmitContext *ctx) const;
    void Print(int indent) const;

    Stmt *TypeCheck();
    int EstimateCost() const;

    Stmt *stmts;
};


/** A "switch" statement in the program. */
class SwitchStmt : public Stmt {
public:
    SwitchStmt(Expr *expr, Stmt *stmts, SourcePos pos);

    void EmitCode(FunctionEmitContext *ctx) const;
    void Print(int indent) const;

    Stmt *TypeCheck();
    int EstimateCost() const;

    /** Expression that is used to determine which label to jump to. */
    Expr *expr;
    /** Statement block after the "switch" expression. */
    Stmt *stmts;
};


/** A "goto" in an ispc program. */
class GotoStmt : public Stmt {
public:
    GotoStmt(const char *label, SourcePos gotoPos, SourcePos idPos);

@@ -293,11 +347,14 @@ public:
    Stmt *TypeCheck();
    int EstimateCost() const;

    /** Name of the label to jump to when the goto is executed. */
    std::string label;
    SourcePos identifierPos;
};


/** Statement corresponding to a label (as would be used as a goto target)
    in the program. */
class LabeledStmt : public Stmt {
public:
    LabeledStmt(const char *label, Stmt *stmt, SourcePos p);

@@ -309,7 +366,9 @@ public:
    Stmt *TypeCheck();
    int EstimateCost() const;

    /** Name of the label. */
    std::string name;
    /** Statements following the label. */
    Stmt *stmt;
};

@@ -383,4 +442,21 @@ public:
    Expr *expr;
};


/** Representation of a delete statement in the program. */
class DeleteStmt : public Stmt {
public:
    DeleteStmt(Expr *e, SourcePos p);

    void EmitCode(FunctionEmitContext *ctx) const;
    void Print(int indent) const;

    Stmt *TypeCheck();
    int EstimateCost() const;

    /** Expression that gives the pointer value to be deleted. */
    Expr *expr;
};

#endif // ISPC_STMT_H

@@ -15,7 +15,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
if (programCount == 4)
|
||||
if (programCount == 1)
|
||||
RET[programIndex] = 1;
|
||||
else if (programCount == 4)
|
||||
RET[programIndex] = 5.;
|
||||
else
|
||||
RET[programIndex] = 10.;
|
||||
|
||||
@@ -3,13 +3,13 @@ export uniform int width() { return programCount; }
|
||||
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
uniform float a[programCount];
|
||||
for (unsigned int i = 0; i < programCount; ++i)
|
||||
a[i] = aFOO[i];
|
||||
uniform float a[programCount+4];
|
||||
for (unsigned int i = 0; i < programCount+4; ++i)
|
||||
a[i] = aFOO[min((int)i, programCount)];
|
||||
|
||||
RET[programIndex] = *(a + 2);
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 3;
|
||||
RET[programIndex] = (programCount == 1) ? 2 : 3;
|
||||
}
|
||||
|
||||
@@ -14,4 +14,4 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
}
|
||||
|
||||
|
||||
export void result(uniform float RET[]) { RET[programIndex] = 5; }
|
||||
export void result(uniform float RET[]) { RET[programIndex] = programCount == 1 ? 0 : 5; }
|
||||
|
||||
@@ -14,7 +14,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
if (programCount == 4)
|
||||
if (programCount == 1)
|
||||
RET[programIndex] = 0;
|
||||
else if (programCount == 4)
|
||||
RET[programIndex] = 2;
|
||||
else
|
||||
RET[programIndex] = 4;
|
||||
|
||||
@@ -13,5 +13,5 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 2;
|
||||
RET[programIndex] = programCount == 1 ? 1 : 2;
|
||||
}
|
||||
|
||||
@@ -12,5 +12,5 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = (programCount/2) - 1;
|
||||
RET[programIndex] = programCount == 1 ? 0 : ((programCount/2) - 1);
|
||||
}
|
||||
|
||||
@@ -5,11 +5,11 @@ uniform int32 s = 0xff;
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
float a = aFOO[programIndex];
|
||||
int32 bits = 0xfffffff0;
|
||||
int32 bits = 0xfff0;
|
||||
float b = atomic_xor_global(&s, bits);
|
||||
RET[programIndex] = s;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0xff;
|
||||
RET[programIndex] = (programCount & 1) ? 0xff0f : 0xff;
|
||||
}
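The updated expectation follows from XOR being its own inverse: every program instance XORs the same constant into s, and pairs of applications cancel, so only the parity of programCount matters:

s = 0xff ^ 0xfff0 = 0xff0f    if programCount is odd
s = 0xff                      if programCount is even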

@@ -10,5 +10,5 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
}

export void result(uniform float RET[]) {
    RET[programIndex] = 3000;
    RET[programIndex] = (programCount == 1) ? 2 : 3000;
}

@@ -12,5 +12,5 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
}

export void result(uniform float RET[]) {
    RET[programIndex] = programCount;
    RET[programIndex] = (programCount == 1) ? 0 : programCount;
}

@@ -13,5 +13,5 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1;
    RET[programIndex] = (programCount == 1) ? 0 : 1;
}

17 tests/atomics-swap.ispc Normal file
@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }

uniform int32 s = 1234;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = 0;
    if (programIndex & 1) {
        b = atomic_swap_global(&s, programIndex);
    }
    RET[programIndex] = reduce_add(b) + s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1234 + reduce_add(programIndex & 1 ? programIndex : 0);
}
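However the swaps among the odd lanes are serialized, the accounting telescopes: each swap returns the value the previous one stored, so reduce_add(b) + s always equals the initial 1234 plus the sum of the swapped-in values. With a 4-wide gang, for example, lanes 1 and 3 swap in 1 and 3 in either order, and reduce_add(b) + s = 1234 + 1 + 3 = 1238.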

@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    int a = aFOO[programIndex];
    int br = broadcast(a, (uniform int)b-2);
    int br = (programCount == 1) ? 4 : broadcast(a, (uniform int)b-2);
    RET[programIndex] = br;
}

@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }

export void f_f(uniform float RET[], uniform float aFOO[]) {
    int16 a = aFOO[programIndex];
    int16 b = broadcast(a, 2);
    int16 b = (programCount == 1) ? 3 : broadcast(a, 2);
    RET[programIndex] = b;
}

@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    int8 a = aFOO[programIndex];
    int8 br = broadcast(a, (uniform int)b-2);
    int8 br = (programCount == 1) ? 4 : broadcast(a, (uniform int)b-2);
    RET[programIndex] = br;
}

@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = broadcast(a, 2);
    float b = (programCount == 1) ? 3 : broadcast(a, 2);
    RET[programIndex] = b;
}

10 tests/const-fold-select-1.ispc Normal file
@@ -0,0 +1,10 @@
export uniform int width() { return programCount; }

export void f_f(uniform float RET[], uniform float aFOO[]) {
    RET[programIndex] = (programIndex >= 0) ? 1 : 0;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1;
}

10 tests/const-fold-select-2.ispc Normal file
@@ -0,0 +1,10 @@
export uniform int width() { return programCount; }

export void f_f(uniform float RET[], uniform float aFOO[]) {
    RET[programIndex] = (programCount < 10000) ? 1 : 0;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1;
}

@@ -3,9 +3,9 @@ export uniform int width() { return programCount; }

export void f_f(uniform float RET[], uniform float aFOO[]) {
    double a = programIndex;
    RET[programIndex] = extract(a, 3);
    RET[programIndex] = extract(a, min(programCount-1, 3));
}

export void result(uniform float RET[]) {
    RET[programIndex] = 3;
    RET[programIndex] = (programCount == 1) ? 0 : 3;
}

30 tests/foreach-double-1.ispc Normal file
@@ -0,0 +1,30 @@
export uniform int width() { return programCount; }

uniform double one = 1;

void copy(uniform double dst[], uniform double src[], uniform int count) {
    foreach (i = 0 ... count)
        dst[i] = one * src[i];
}

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int count = 200 + aFOO[1];
    uniform double * uniform src = uniform new uniform double[count];
    for (uniform int i = 0; i < count; ++i)
        src[i] = i;

    uniform double * uniform dst = uniform new uniform double[count];
    copy(dst, src, count);

    uniform int errors = 0;
    for (uniform int i = 0; i < count; ++i)
        if (dst[i] != src[i])
            ++errors;

    RET[programIndex] = errors;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
}

21 tests/half-3.ispc Normal file
@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }

export void f_v(uniform float RET[]) {
    int errors = 0;

    foreach (i = 0 ... 65535) {
        unsigned int16 h = i;
        float f = half_to_float(i);
        h = float_to_half(f);

        int mismatches = (f == f && i != h);
        errors += reduce_add(mismatches);
    }

    RET[programIndex] = errors;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
}
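The f == f guard is the standard IEEE idiom for filtering NaNs, which compare unequal to themselves; NaN encodings are excluded from the mismatch count because float_to_half need not preserve their payload bits. The idiom in isolation (a hypothetical helper, not part of the test):

uniform bool is_nan(uniform float f) {
    return f != f;   // true exactly for NaN under IEEE 754
}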

15 tests/local-atomics-1.ispc Normal file
@@ -0,0 +1,15 @@
export uniform int width() { return programCount; }

uniform unsigned int32 s = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float delta = 1;
    float b = atomic_add_local(&s, delta);
    RET[programIndex] = reduce_add(b);
}

export void result(uniform float RET[]) {
    RET[programIndex] = reduce_add(programIndex);
}
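This file opens a series of tests mirroring the earlier global atomics with the new atomic_*_local variants; presumably these need only be atomic with respect to the program instances of the current gang rather than across concurrently running tasks, so the per-gang results checked here are the same as for the global forms. For this test in particular, the serialized adds return the old values 0, 1, ..., programCount-1 in some order, so reduce_add(b) equals reduce_add(programIndex) regardless of the order.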

17 tests/local-atomics-10.ispc Normal file
@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }

uniform unsigned int32 s = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = 0;
    float delta = 1;
    if (programIndex < 2)
        b = atomic_add_local(&s, delta);
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = programCount == 1 ? 1 : 2;
}

20 tests/local-atomics-11.ispc Normal file
@@ -0,0 +1,20 @@
export uniform int width() { return programCount; }

uniform unsigned int32 s = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = 0;
    if (programIndex & 1)
        b = atomic_add_local(&s, programIndex);
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    uniform int sum = 0;
    for (uniform int i = 0; i < programCount; ++i)
        if (i & 1)
            sum += i;
    RET[programIndex] = sum;
}

20 tests/local-atomics-12.ispc Normal file
@@ -0,0 +1,20 @@
export uniform int width() { return programCount; }

uniform unsigned int32 s = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = 0;
    if (programIndex & 1)
        b = atomic_or_local(&s, (1 << programIndex));
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    uniform int sum = 0;
    for (uniform int i = 0; i < programCount; ++i)
        if (i & 1)
            sum += (1 << i);
    RET[programIndex] = sum;
}

16 tests/local-atomics-13.ispc Normal file
@@ -0,0 +1,16 @@
export uniform int width() { return programCount; }

uniform unsigned int32 s = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = 0;
    if (programIndex & 1)
        b = atomic_or_local(&s, (1 << programIndex));
    RET[programIndex] = popcnt(reduce_max((int32)b));
}

export void result(uniform float RET[]) {
    RET[programIndex] = programCount == 1 ? 0 : ((programCount/2) - 1);
}

20 tests/local-atomics-14.ispc Normal file
@@ -0,0 +1,20 @@
export uniform int width() { return programCount; }

uniform unsigned int64 s = 0xffffffffff000000;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = 0;
    if (programIndex & 1)
        b = atomic_or_local(&s, (1 << programIndex));
    RET[programIndex] = (s>>20);
}

export void result(uniform float RET[]) {
    uniform int sum = 0;
    for (uniform int i = 0; i < programCount; ++i)
        if (i & 1)
            sum += (1 << i);
    RET[programIndex] = ((unsigned int64)(0xffffffffff000000 | sum)) >> 20;
}

15 tests/local-atomics-2.ispc Normal file
@@ -0,0 +1,15 @@
export uniform int width() { return programCount; }

uniform int64 s = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float delta = 1;
    float b = atomic_add_local(&s, delta);
    RET[programIndex] = reduce_add(b);
}

export void result(uniform float RET[]) {
    RET[programIndex] = reduce_add(programIndex);
}

15 tests/local-atomics-3.ispc Normal file
@@ -0,0 +1,15 @@
export uniform int width() { return programCount; }

uniform int32 s = 0xff;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    int32 bits = 0xfff0;
    float b = atomic_xor_local(&s, bits);
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = (programCount & 1) ? 0xff0f : 0xff;
}

14 tests/local-atomics-4.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform int32 s = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = atomic_or_local(&s, (1<<programIndex));
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = (1<<programCount)-1;
}

14 tests/local-atomics-5.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform int32 s = 0xbeef;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = atomic_swap_local(&s, programIndex);
    RET[programIndex] = reduce_max(b);
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0xbeef;
}

14 tests/local-atomics-6.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform int32 s = 2;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = atomic_compare_exchange_local(&s, programIndex, a*1000);
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = (programCount == 1) ? 2 : 3000;
}

14 tests/local-atomics-7.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform int32 s = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    int32 a = aFOO[programIndex];
    float b = atomic_min_local(&s, a);
    RET[programIndex] = reduce_min(b);
}

export void result(uniform float RET[]) {
    RET[programIndex] = reduce_min(programIndex);
}

16 tests/local-atomics-8.ispc Normal file
@@ -0,0 +1,16 @@
export uniform int width() { return programCount; }

uniform int32 s = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    int32 a = aFOO[programIndex];
    int32 b = 0;
    if (programIndex & 1)
        b = atomic_max_local(&s, a);
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = (programCount == 1) ? 0 : programCount;
}

17 tests/local-atomics-9.ispc Normal file
@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }

uniform unsigned int32 s = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = 0;
    int32 delta = 1;
    if (programIndex < 2)
        b = atomic_add_local(&s, delta);
    RET[programIndex] = reduce_add(b);
}

export void result(uniform float RET[]) {
    RET[programIndex] = (programCount == 1) ? 0 : 1;
}

17 tests/local-atomics-swap.ispc Normal file
@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }

uniform int32 s = 1234;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = 0;
    if (programIndex & 1) {
        b = atomic_swap_local(&s, programIndex);
    }
    RET[programIndex] = reduce_add(b) + s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1234 + reduce_add(programIndex & 1 ? programIndex : 0);
}

14 tests/local-atomics-uniform-1.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform unsigned int32 s = 10;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    uniform unsigned int32 b = atomic_add_local(&s, 1);
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 11;
}

14 tests/local-atomics-uniform-2.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform unsigned int32 s = 0b1010;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    uniform unsigned int32 b = atomic_or_local(&s, 1);
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0b1011;
}

14 tests/local-atomics-uniform-3.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform unsigned int32 s = 0b1010;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    uniform unsigned int32 b = atomic_or_local(&s, 1);
    RET[programIndex] = b;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0b1010;
}

14 tests/local-atomics-uniform-4.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform unsigned int32 s = 0xffff;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    uniform unsigned int32 b = atomic_min_local(&s, 1);
    RET[programIndex] = b;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0xffff;
}

14 tests/local-atomics-uniform-5.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform unsigned int32 s = 0xffff;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    uniform unsigned int32 b = atomic_min_local(&s, 1);
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1;
}

14 tests/local-atomics-uniform-6.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform float s = 100.;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    uniform float b = atomic_swap_local(&s, 1.);
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1.;
}

14 tests/local-atomics-uniform-7.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform float s = 100.;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    uniform float b = atomic_swap_local(&s, 1.);
    RET[programIndex] = b;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 100.;
}

14 tests/local-atomics-uniform-8.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform float s = 100.;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    uniform float b = atomic_compare_exchange_local(&s, 1., -100.);
    RET[programIndex] = b;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 100.;
}

14 tests/local-atomics-uniform-9.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform int64 s = 100.;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    uniform int64 b = atomic_compare_exchange_local(&s, 100, -100);
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = -100.;
}

18 tests/local-atomics-varyingptr-1.ispc Normal file
@@ -0,0 +1,18 @@
export uniform int width() { return programCount; }

uniform unsigned int32 s[programCount];

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = 0;
    float delta = 1;
    if (programIndex < 2)
        atomic_add_local(&s[programIndex], delta);
    RET[programIndex] = s[programIndex];
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = RET[1] = 1;
}
Some files were not shown because too many files have changed in this diff.