Merge remote-tracking branch 'matt/master'
This commit is contained in:
28
Makefile
28
Makefile
@@ -3,6 +3,11 @@
|
||||
#
|
||||
|
||||
ARCH_OS = $(shell uname)
|
||||
ifeq ($(ARCH_OS), Darwin)
|
||||
ARCH_OS2 = "OSX"
|
||||
else
|
||||
ARCH_OS2 = $(shell uname -o)
|
||||
endif
|
||||
ARCH_TYPE = $(shell arch)
|
||||
|
||||
ifeq ($(shell llvm-config --version), 3.1svn)
|
||||
@@ -26,7 +31,15 @@ CLANG_LIBS = -lclangFrontend -lclangDriver \
|
||||
-lclangAnalysis -lclangAST -lclangLex -lclangBasic
|
||||
|
||||
ISPC_LIBS=$(shell llvm-config --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
|
||||
-lpthread -ldl
|
||||
-lpthread
|
||||
|
||||
ifeq ($(ARCH_OS),Linux)
|
||||
ISPC_LIBS += -ldl
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH_OS2),Msys)
|
||||
ISPC_LIBS += -lshlwapi -limagehlp -lpsapi
|
||||
endif
|
||||
|
||||
LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
|
||||
LLVM_VERSION=LLVM_$(shell llvm-config --version | sed s/\\./_/)
|
||||
@@ -58,7 +71,8 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
|
||||
type.cpp util.cpp
|
||||
HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
|
||||
opt.h stmt.h sym.h type.h util.h
|
||||
TARGETS=avx avx-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 generic-16
|
||||
TARGETS=avx1 avx1-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 \
|
||||
generic-16
|
||||
BUILTINS_SRC=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) \
|
||||
builtins/dispatch.ll
|
||||
BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC:.ll=.o))) \
|
||||
@@ -129,22 +143,22 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
|
||||
|
||||
objs/builtins-%.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll)
|
||||
@echo Creating C++ source from builtins definition file $<
|
||||
@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) $< | ./bitcode2cpp.py $< > $@
|
||||
@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) $< | python bitcode2cpp.py $< > $@
|
||||
|
||||
objs/builtins-c-32.cpp: builtins/builtins.c
|
||||
@echo Creating C++ source from builtins definition file $<
|
||||
@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py c-32 > $@
|
||||
@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c-32 > $@
|
||||
|
||||
objs/builtins-c-64.cpp: builtins/builtins.c
|
||||
@echo Creating C++ source from builtins definition file $<
|
||||
@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py c-64 > $@
|
||||
@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c-64 > $@
|
||||
|
||||
objs/stdlib_generic_ispc.cpp: stdlib.ispc
|
||||
@echo Creating C++ source from $< for generic
|
||||
@$(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \
|
||||
./stdlib2cpp.py generic > $@
|
||||
python stdlib2cpp.py generic > $@
|
||||
|
||||
objs/stdlib_x86_ispc.cpp: stdlib.ispc
|
||||
@echo Creating C++ source from $< for x86
|
||||
@$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \
|
||||
./stdlib2cpp.py x86 > $@
|
||||
python stdlib2cpp.py x86 > $@
|
||||
|
||||
@@ -47,7 +47,7 @@ remarkable `LLVM Compiler Infrastructure <http://llvm.org>`_ for back-end
|
||||
code generation and optimization and is `hosted on
|
||||
github <http://github.com/ispc/ispc/>`_. It supports Windows, Mac, and
|
||||
Linux, with both x86 and x86-64 targets. It currently supports the SSE2,
|
||||
SSE4, and AVX instruction sets.
|
||||
SSE4, AVX1, and AVX2 instruction sets.
|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
24
ast.cpp
24
ast.cpp
@@ -90,7 +90,11 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
|
||||
DoStmt *dos;
|
||||
ForStmt *fs;
|
||||
ForeachStmt *fes;
|
||||
CaseStmt *cs;
|
||||
DefaultStmt *defs;
|
||||
SwitchStmt *ss;
|
||||
ReturnStmt *rs;
|
||||
LabeledStmt *ls;
|
||||
StmtList *sl;
|
||||
PrintStmt *ps;
|
||||
AssertStmt *as;
|
||||
@@ -130,10 +134,21 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
|
||||
postFunc, data);
|
||||
fes->stmts = (Stmt *)WalkAST(fes->stmts, preFunc, postFunc, data);
|
||||
}
|
||||
else if (dynamic_cast<BreakStmt *>(node) != NULL ||
|
||||
dynamic_cast<ContinueStmt *>(node) != NULL) {
|
||||
// nothing
|
||||
else if ((cs = dynamic_cast<CaseStmt *>(node)) != NULL)
|
||||
cs->stmts = (Stmt *)WalkAST(cs->stmts, preFunc, postFunc, data);
|
||||
else if ((defs = dynamic_cast<DefaultStmt *>(node)) != NULL)
|
||||
defs->stmts = (Stmt *)WalkAST(defs->stmts, preFunc, postFunc, data);
|
||||
else if ((ss = dynamic_cast<SwitchStmt *>(node)) != NULL) {
|
||||
ss->expr = (Expr *)WalkAST(ss->expr, preFunc, postFunc, data);
|
||||
ss->stmts = (Stmt *)WalkAST(ss->stmts, preFunc, postFunc, data);
|
||||
}
|
||||
else if (dynamic_cast<BreakStmt *>(node) != NULL ||
|
||||
dynamic_cast<ContinueStmt *>(node) != NULL ||
|
||||
dynamic_cast<GotoStmt *>(node) != NULL) {
|
||||
// nothing
|
||||
}
|
||||
else if ((ls = dynamic_cast<LabeledStmt *>(node)) != NULL)
|
||||
ls->stmt = (Stmt *)WalkAST(ls->stmt, preFunc, postFunc, data);
|
||||
else if ((rs = dynamic_cast<ReturnStmt *>(node)) != NULL)
|
||||
rs->val = (Expr *)WalkAST(rs->val, preFunc, postFunc, data);
|
||||
else if ((sl = dynamic_cast<StmtList *>(node)) != NULL) {
|
||||
@@ -151,7 +166,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
|
||||
else {
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Handle expressions
|
||||
assert(dynamic_cast<Expr *>(node) != NULL);
|
||||
Assert(dynamic_cast<Expr *>(node) != NULL);
|
||||
UnaryExpr *ue;
|
||||
BinaryExpr *be;
|
||||
AssignExpr *ae;
|
||||
@@ -289,3 +304,4 @@ EstimateCost(ASTNode *root) {
|
||||
WalkAST(root, lCostCallback, NULL, &cost);
|
||||
return cost;
|
||||
}
|
||||
|
||||
|
||||
45
builtins.cpp
45
builtins.cpp
@@ -386,6 +386,7 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__ceil_uniform_float",
|
||||
"__ceil_varying_double",
|
||||
"__ceil_varying_float",
|
||||
"__clock",
|
||||
"__count_trailing_zeros_i32",
|
||||
"__count_trailing_zeros_i64",
|
||||
"__count_leading_zeros_i32",
|
||||
@@ -717,11 +718,13 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
extern int builtins_bitcode_sse4_x2_length;
|
||||
switch (g->target.vectorWidth) {
|
||||
case 4:
|
||||
AddBitcodeToModule(builtins_bitcode_sse4, builtins_bitcode_sse4_length,
|
||||
AddBitcodeToModule(builtins_bitcode_sse4,
|
||||
builtins_bitcode_sse4_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
case 8:
|
||||
AddBitcodeToModule(builtins_bitcode_sse4_x2, builtins_bitcode_sse4_x2_length,
|
||||
AddBitcodeToModule(builtins_bitcode_sse4_x2,
|
||||
builtins_bitcode_sse4_x2_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
default:
|
||||
@@ -729,18 +732,39 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
}
|
||||
break;
|
||||
case Target::AVX:
|
||||
case Target::AVX2:
|
||||
switch (g->target.vectorWidth) {
|
||||
case 8:
|
||||
extern unsigned char builtins_bitcode_avx[];
|
||||
extern int builtins_bitcode_avx_length;
|
||||
AddBitcodeToModule(builtins_bitcode_avx, builtins_bitcode_avx_length,
|
||||
extern unsigned char builtins_bitcode_avx1[];
|
||||
extern int builtins_bitcode_avx1_length;
|
||||
AddBitcodeToModule(builtins_bitcode_avx1,
|
||||
builtins_bitcode_avx1_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
case 16:
|
||||
extern unsigned char builtins_bitcode_avx_x2[];
|
||||
extern int builtins_bitcode_avx_x2_length;
|
||||
AddBitcodeToModule(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
|
||||
extern unsigned char builtins_bitcode_avx1_x2[];
|
||||
extern int builtins_bitcode_avx1_x2_length;
|
||||
AddBitcodeToModule(builtins_bitcode_avx1_x2,
|
||||
builtins_bitcode_avx1_x2_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
default:
|
||||
FATAL("logic error in DefineStdlib");
|
||||
}
|
||||
break;
|
||||
case Target::AVX2:
|
||||
switch (g->target.vectorWidth) {
|
||||
case 8:
|
||||
extern unsigned char builtins_bitcode_avx2[];
|
||||
extern int builtins_bitcode_avx2_length;
|
||||
AddBitcodeToModule(builtins_bitcode_avx2,
|
||||
builtins_bitcode_avx2_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
case 16:
|
||||
extern unsigned char builtins_bitcode_avx2_x2[];
|
||||
extern int builtins_bitcode_avx2_x2_length;
|
||||
AddBitcodeToModule(builtins_bitcode_avx2_x2,
|
||||
builtins_bitcode_avx2_x2_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
default:
|
||||
@@ -798,6 +822,9 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
|
||||
symbolTable);
|
||||
|
||||
lDefineConstantInt("__have_native_half", (g->target.isa == Target::AVX2),
|
||||
module, symbolTable);
|
||||
|
||||
if (includeStdlibISPC) {
|
||||
// If the user wants the standard library to be included, parse the
|
||||
// serialized version of the stdlib.ispc file to get its
|
||||
|
||||
@@ -149,7 +149,7 @@ void __do_print(const char *format, const char *types, int width, int mask,
|
||||
|
||||
|
||||
int __num_cores() {
|
||||
#ifdef _MSC_VER
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
// This is quite a hack. Including all of windows.h to get this definition
|
||||
// pulls in a bunch of stuff that leads to undefined symbols at link time.
|
||||
// So we don't #include <windows.h> but instead have the equivalent declarations
|
||||
|
||||
@@ -48,23 +48,42 @@ declare void @abort() noreturn
|
||||
;; corresponding to one of the Target::ISA enumerant values that gives the
|
||||
;; most capable ISA that the current system can run.
|
||||
;;
|
||||
;; #ifdef _MSC_VER
|
||||
;; extern void __stdcall __cpuid(int info[4], int infoType);
|
||||
;; #else
|
||||
;; Note: clang from LLVM 2.9 should be used if this is updated, for maximum
|
||||
;; backwards compatibility for anyone building ispc with LLVM 2.9.
|
||||
;;
|
||||
;; #include <stdint.h>
|
||||
;; #include <stdlib.h>
|
||||
;;
|
||||
;; static void __cpuid(int info[4], int infoType) {
|
||||
;; __asm__ __volatile__ ("cpuid"
|
||||
;; : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
|
||||
;; : "0" (infoType));
|
||||
;; }
|
||||
;; #endif
|
||||
;;
|
||||
;; /* Save %ebx in case it's the PIC register */
|
||||
;; static void __cpuid_count(int info[4], int level, int count) {
|
||||
;; __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t"
|
||||
;; "cpuid\n\t"
|
||||
;; "xchg{l}\t{%%}ebx, %1\n\t"
|
||||
;; : "=a" (info[0]), "=r" (info[1]), "=c" (info[2]), "=d" (info[3])
|
||||
;; : "0" (level), "2" (count));
|
||||
;; }
|
||||
;;
|
||||
;; int32_t __get_system_isa() {
|
||||
;; int info[4];
|
||||
;; __cpuid(info, 1);
|
||||
;;
|
||||
;; /* NOTE: the values returned below must be the same as the
|
||||
;; corresponding enumerant values in Target::ISA. */
|
||||
;; if ((info[2] & (1 << 28)) != 0)
|
||||
;; return 2; // AVX
|
||||
;; if ((info[2] & (1 << 28)) != 0) {
|
||||
;; // AVX1 for sure. Do we have AVX2?
|
||||
;; // Call cpuid with eax=7, ecx=0
|
||||
;; __cpuid_count(info, 7, 0);
|
||||
;; if ((info[1] & (1 << 5)) != 0)
|
||||
;; return 3; // AVX2
|
||||
;; else
|
||||
;; return 2; // AVX1
|
||||
;; }
|
||||
;; else if ((info[2] & (1 << 19)) != 0)
|
||||
;; return 1; // SSE4
|
||||
;; else if ((info[3] & (1 << 26)) != 0)
|
||||
@@ -76,33 +95,42 @@ declare void @abort() noreturn
|
||||
%0 = type { i32, i32, i32, i32 }
|
||||
|
||||
define i32 @__get_system_isa() nounwind ssp {
|
||||
%1 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
|
||||
%2 = extractvalue %0 %1, 2
|
||||
%3 = extractvalue %0 %1, 3
|
||||
%4 = and i32 %2, 268435456
|
||||
%5 = icmp eq i32 %4, 0
|
||||
br i1 %5, label %6, label %13
|
||||
entry:
|
||||
%0 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
|
||||
%asmresult9.i = extractvalue %0 %0, 2
|
||||
%asmresult10.i = extractvalue %0 %0, 3
|
||||
%and = and i32 %asmresult9.i, 268435456
|
||||
%cmp = icmp eq i32 %and, 0
|
||||
br i1 %cmp, label %if.else7, label %if.then
|
||||
|
||||
; <label>:6 ; preds = %0
|
||||
%7 = and i32 %2, 524288
|
||||
%8 = icmp eq i32 %7, 0
|
||||
br i1 %8, label %9, label %13
|
||||
if.then: ; preds = %entry
|
||||
%1 = tail call %0 asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
|
||||
%asmresult9.i24 = extractvalue %0 %1, 1
|
||||
%and4 = lshr i32 %asmresult9.i24, 5
|
||||
%2 = and i32 %and4, 1
|
||||
%3 = or i32 %2, 2
|
||||
br label %return
|
||||
|
||||
; <label>:9 ; preds = %6
|
||||
%10 = and i32 %3, 67108864
|
||||
%11 = icmp eq i32 %10, 0
|
||||
br i1 %11, label %12, label %13
|
||||
if.else7: ; preds = %entry
|
||||
%and10 = and i32 %asmresult9.i, 524288
|
||||
%cmp11 = icmp eq i32 %and10, 0
|
||||
br i1 %cmp11, label %if.else13, label %return
|
||||
|
||||
; <label>:12 ; preds = %9
|
||||
if.else13: ; preds = %if.else7
|
||||
%and16 = and i32 %asmresult10.i, 67108864
|
||||
%cmp17 = icmp eq i32 %and16, 0
|
||||
br i1 %cmp17, label %if.else19, label %return
|
||||
|
||||
if.else19: ; preds = %if.else13
|
||||
tail call void @abort() noreturn nounwind
|
||||
unreachable
|
||||
|
||||
; <label>:13 ; preds = %9, %6, %0
|
||||
%.0 = phi i32 [ 2, %0 ], [ 1, %6 ], [ 0, %9 ]
|
||||
ret i32 %.0
|
||||
return: ; preds = %if.else13, %if.else7, %if.then
|
||||
%retval.0 = phi i32 [ %3, %if.then ], [ 1, %if.else7 ], [ 0, %if.else13 ]
|
||||
ret i32 %retval.0
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; This function is called by each of the dispatch functions we generate;
|
||||
;; it sets @__system_best_isa if it is unset.
|
||||
|
||||
|
||||
@@ -170,33 +170,6 @@ define <16 x float> @__min_varying_float(<16 x float>,
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops
|
||||
|
||||
@@ -622,12 +595,7 @@ define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %new
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
gen_gather(16, i8)
|
||||
gen_gather(16, i16)
|
||||
gen_gather(16, i32)
|
||||
gen_gather(16, i64)
|
||||
;; scatter
|
||||
|
||||
gen_scatter(16, i8)
|
||||
gen_scatter(16, i16)
|
||||
|
||||
@@ -170,33 +170,6 @@ define <8 x float> @__min_varying_float(<8 x float>,
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops
|
||||
|
||||
@@ -238,7 +211,7 @@ reduce_equal(8)
|
||||
;; horizontal int32 ops
|
||||
|
||||
define <8 x i32> @__add_varying_int32(<8 x i32>,
|
||||
<8 x i32>) nounwind readnone alwaysinline {
|
||||
<8 x i32>) nounwind readnone alwaysinline {
|
||||
%s = add <8 x i32> %0, %1
|
||||
ret <8 x i32> %s
|
||||
}
|
||||
@@ -314,7 +287,7 @@ define double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline
|
||||
;; horizontal int64 ops
|
||||
|
||||
define <8 x i64> @__add_varying_int64(<8 x i64>,
|
||||
<8 x i64>) nounwind readnone alwaysinline {
|
||||
<8 x i64>) nounwind readnone alwaysinline {
|
||||
%s = add <8 x i64> %0, %1
|
||||
ret <8 x i64> %s
|
||||
}
|
||||
@@ -403,9 +376,6 @@ define <8 x i64> @__masked_load_64(i8 *, <8 x i32> %mask) nounwind alwaysinline
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
; FIXME: there is no AVX instruction for these, but we could be clever
|
||||
; by packing the bits down and setting the last 3/4 or half, respectively,
|
||||
; of the mask to zero... Not sure if this would be a win in the end
|
||||
gen_masked_store(8, i8, 8)
|
||||
gen_masked_store(8, i16, 16)
|
||||
|
||||
@@ -520,12 +490,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
gen_gather(8, i8)
|
||||
gen_gather(8, i16)
|
||||
gen_gather(8, i32)
|
||||
gen_gather(8, i64)
|
||||
;; scatter
|
||||
|
||||
gen_scatter(8, i8)
|
||||
gen_scatter(8, i16)
|
||||
|
||||
77
builtins/target-avx1-x2.ll
Normal file
77
builtins/target-avx1-x2.ll
Normal file
@@ -0,0 +1,77 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
include(`target-avx-x2.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather
|
||||
|
||||
gen_gather(16, i8)
|
||||
gen_gather(16, i16)
|
||||
gen_gather(16, i32)
|
||||
gen_gather(16, i64)
|
||||
|
||||
|
||||
75
builtins/target-avx1.ll
Normal file
75
builtins/target-avx1.ll
Normal file
@@ -0,0 +1,75 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
include(`target-avx.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather
|
||||
|
||||
gen_gather(8, i8)
|
||||
gen_gather(8, i16)
|
||||
gen_gather(8, i32)
|
||||
gen_gather(8, i64)
|
||||
129
builtins/target-avx2-x2.ll
Normal file
129
builtins/target-avx2-x2.ll
Normal file
@@ -0,0 +1,129 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
include(`target-avx-x2.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
|
||||
declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly
|
||||
|
||||
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary8to16(m, i32, @llvm.x86.avx2.pmins.d, %0, %1)
|
||||
ret <16 x i32> %m
|
||||
}
|
||||
|
||||
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary8to16(m, i32, @llvm.x86.avx2.pmaxs.d, %0, %1)
|
||||
ret <16 x i32> %m
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
|
||||
declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly
|
||||
|
||||
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary8to16(m, i32, @llvm.x86.avx2.pminu.d, %0, %1)
|
||||
ret <16 x i32> %m
|
||||
}
|
||||
|
||||
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary8to16(m, i32, @llvm.x86.avx2.pmaxu.d, %0, %1)
|
||||
ret <16 x i32> %m
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float/half conversions
|
||||
|
||||
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
|
||||
; 0 is round nearest even
|
||||
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
|
||||
|
||||
define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
|
||||
%r_0 = shufflevector <16 x i16> %v, <16 x i16> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%vr_0 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_0)
|
||||
%r_1 = shufflevector <16 x i16> %v, <16 x i16> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%vr_1 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_1)
|
||||
%r = shufflevector <8 x float> %vr_0, <8 x float> %vr_1,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
ret <16 x float> %r
|
||||
}
|
||||
|
||||
define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
|
||||
%r_0 = shufflevector <16 x float> %v, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%vr_0 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_0, i32 0)
|
||||
%r_1 = shufflevector <16 x float> %v, <16 x float> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%vr_1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_1, i32 0)
|
||||
%r = shufflevector <8 x i16> %vr_0, <8 x i16> %vr_1,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
ret <16 x i16> %r
|
||||
}
|
||||
|
||||
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
|
||||
%v1 = bitcast i16 %v to <1 x i16>
|
||||
%vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
|
||||
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
|
||||
%r = extractelement <8 x float> %rv, i32 0
|
||||
ret float %r
|
||||
}
|
||||
|
||||
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
|
||||
%v1 = bitcast float %v to <1 x float>
|
||||
%vv = shufflevector <1 x float> %v1, <1 x float> undef,
|
||||
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
; round to nearest even
|
||||
%rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
|
||||
%r = extractelement <8 x i16> %rv, i32 0
|
||||
ret i16 %r
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather
|
||||
|
||||
gen_gather(16, i8)
|
||||
gen_gather(16, i16)
|
||||
gen_gather(16, i32)
|
||||
gen_gather(16, i64)
|
||||
|
||||
|
||||
110
builtins/target-avx2.ll
Normal file
110
builtins/target-avx2.ll
Normal file
@@ -0,0 +1,110 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
include(`target-avx.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
|
||||
declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly
|
||||
|
||||
;; Element-wise minimum of two 8-wide i32 vectors (the two unnamed arguments
;; %0 and %1), computed with a single call to the avx2 pmins.d intrinsic
;; declared above (the signed variant).
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
%m = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %0, <8 x i32> %1)
|
||||
ret <8 x i32> %m
|
||||
}
|
||||
|
||||
;; Element-wise maximum of two 8-wide i32 vectors (the two unnamed arguments
;; %0 and %1), computed with a single call to the avx2 pmaxs.d intrinsic
;; declared above (the signed variant).
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
%m = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %0, <8 x i32> %1)
|
||||
ret <8 x i32> %m
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
|
||||
declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly
|
||||
|
||||
;; Element-wise minimum of two 8-wide i32 vectors treated as unsigned values
;; (the two unnamed arguments %0 and %1), computed with a single call to the
;; avx2 pminu.d intrinsic declared above.
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
%m = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %0, <8 x i32> %1)
|
||||
ret <8 x i32> %m
|
||||
}
|
||||
|
||||
;; Element-wise maximum of two 8-wide i32 vectors treated as unsigned values
;; (the two unnamed arguments %0 and %1), computed with a single call to the
;; avx2 pmaxu.d intrinsic declared above.
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
%m = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %0, <8 x i32> %1)
|
||||
ret <8 x i32> %m
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float/half conversions
|
||||
|
||||
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
|
||||
; 0 is round nearest even
|
||||
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
|
||||
|
||||
;; Converts an 8-wide vector of half-precision values (as i16) to 8 floats
;; with one call to the 256-bit vcvtph2ps intrinsic; the vector width of this
;; target matches the intrinsic, so no splitting is needed.
define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
|
||||
%r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
|
||||
ret <8 x float> %r
|
||||
}
|
||||
|
||||
;; Converts an 8-wide vector of floats to 8 half-precision values (as i16)
;; with one call to the 256-bit vcvtps2ph intrinsic.  The i32 0 immediate is
;; the rounding control (round to nearest even, per the note on the
;; declaration above).
define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
|
||||
%r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
|
||||
ret <8 x i16> %r
|
||||
}
|
||||
|
||||
;; Converts a single half-precision value (passed as i16) to a float.
;; The scalar is placed in element 0 of an 8-wide vector (the other seven
;; elements are undef), the 8-wide vcvtph2ps intrinsic is applied, and
;; element 0 of the result is returned.
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
|
||||
%v1 = bitcast i16 %v to <1 x i16>
|
||||
; widen to 8 elements; only element 0 carries a defined value
%vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
|
||||
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
|
||||
%r = extractelement <8 x float> %rv, i32 0
|
||||
ret float %r
|
||||
}
|
||||
|
||||
;; Converts a single float to a half-precision value (returned as i16).
;; The scalar is placed in element 0 of an 8-wide vector (the other seven
;; elements are undef), the 8-wide vcvtps2ph intrinsic is applied with
;; rounding control 0, and element 0 of the result is returned.
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
|
||||
%v1 = bitcast float %v to <1 x float>
|
||||
; widen to 8 elements; only element 0 carries a defined value
%vv = shufflevector <1 x float> %v1, <1 x float> undef,
|
||||
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
; round to nearest even
|
||||
%rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
|
||||
%r = extractelement <8 x i16> %rv, i32 0
|
||||
ret i16 %r
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather
|
||||
|
||||
gen_gather(8, i8)
|
||||
gen_gather(8, i16)
|
||||
gen_gather(8, i32)
|
||||
gen_gather(8, i64)
|
||||
@@ -233,7 +233,7 @@ declare <WIDTH x i32> @__masked_load_32(i8 * nocapture, <WIDTH x i1> %mask) noun
|
||||
declare <WIDTH x i64> @__masked_load_64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
|
||||
|
||||
declare void @__masked_store_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
|
||||
<WIDTH x i1>) nounwind
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
|
||||
@@ -241,8 +241,9 @@ declare void @__masked_store_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
|
||||
declare void @__masked_store_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
|
||||
<WIDTH x i1> %mask) nounwind
|
||||
|
||||
ifelse(LLVM_VERSION, `LLVM_3_1svn',`
|
||||
define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
|
||||
<WIDTH x i1>) nounwind {
|
||||
<WIDTH x i1>) nounwind alwaysinline {
|
||||
%v = load <WIDTH x i8> * %0
|
||||
%v1 = select <WIDTH x i1> %2, <WIDTH x i8> %1, <WIDTH x i8> %v
|
||||
store <WIDTH x i8> %v1, <WIDTH x i8> * %0
|
||||
@@ -250,7 +251,7 @@ define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
|
||||
<WIDTH x i1>) nounwind {
|
||||
<WIDTH x i1>) nounwind alwaysinline {
|
||||
%v = load <WIDTH x i16> * %0
|
||||
%v1 = select <WIDTH x i1> %2, <WIDTH x i16> %1, <WIDTH x i16> %v
|
||||
store <WIDTH x i16> %v1, <WIDTH x i16> * %0
|
||||
@@ -258,7 +259,7 @@ define void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
|
||||
<WIDTH x i1>) nounwind {
|
||||
<WIDTH x i1>) nounwind alwaysinline {
|
||||
%v = load <WIDTH x i32> * %0
|
||||
%v1 = select <WIDTH x i1> %2, <WIDTH x i32> %1, <WIDTH x i32> %v
|
||||
store <WIDTH x i32> %v1, <WIDTH x i32> * %0
|
||||
@@ -266,30 +267,40 @@ define void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_64(<WIDTH x i64>* nocapture,
|
||||
<WIDTH x i64>, <WIDTH x i1>) nounwind {
|
||||
<WIDTH x i64>, <WIDTH x i1>) nounwind alwaysinline {
|
||||
%v = load <WIDTH x i64> * %0
|
||||
%v1 = select <WIDTH x i1> %2, <WIDTH x i64> %1, <WIDTH x i64> %v
|
||||
store <WIDTH x i64> %v1, <WIDTH x i64> * %0
|
||||
ret void
|
||||
}
|
||||
',`
|
||||
declare void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__masked_store_blend_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
|
||||
<WIDTH x i1> %mask) nounwind
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
define(`gather_scatter', `
|
||||
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, <WIDTH x i32>,
|
||||
i32, <WIDTH x i1>) nounwind readonly
|
||||
i32, <WIDTH x i32>, <WIDTH x i1>) nounwind readonly
|
||||
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, <WIDTH x i64>,
|
||||
i32, <WIDTH x i1>) nounwind readonly
|
||||
i32, <WIDTH x i64>, <WIDTH x i1>) nounwind readonly
|
||||
declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32>,
|
||||
<WIDTH x i1>) nounwind readonly
|
||||
declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64>,
|
||||
<WIDTH x i1>) nounwind readonly
|
||||
|
||||
declare void @__scatter_base_offsets32_$1(i8* nocapture, <WIDTH x i32>,
|
||||
i32, <WIDTH x $1>, <WIDTH x i1>) nounwind
|
||||
i32, <WIDTH x i32>, <WIDTH x $1>, <WIDTH x i1>) nounwind
|
||||
declare void @__scatter_base_offsets64_$1(i8* nocapture, <WIDTH x i64>,
|
||||
i32, <WIDTH x $1>, <WIDTH x i1>) nounwind
|
||||
i32, <WIDTH x i64>, <WIDTH x $1>, <WIDTH x i1>) nounwind
|
||||
declare void @__scatter32_$1(<WIDTH x i32>, <WIDTH x $1>,
|
||||
<WIDTH x i1>) nounwind
|
||||
declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,
|
||||
|
||||
@@ -47,6 +47,14 @@ int64minmax()
|
||||
|
||||
include(`target-sse2-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
|
||||
@@ -44,6 +44,14 @@ int64minmax()
|
||||
|
||||
include(`target-sse2-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding
|
||||
;;
|
||||
|
||||
@@ -47,6 +47,14 @@ int64minmax()
|
||||
|
||||
include(`target-sse4-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
|
||||
@@ -44,6 +44,14 @@ int64minmax()
|
||||
|
||||
include(`target-sse4-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
|
||||
216
builtins/util.m4
216
builtins/util.m4
@@ -760,14 +760,12 @@ define(`global_atomic_uniform', `
|
||||
ifelse(LLVM_VERSION, `LLVM_2_9',`
|
||||
declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)
|
||||
|
||||
define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val,
|
||||
<$1 x MASK> %mask) nounwind alwaysinline {
|
||||
define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline {
|
||||
%r = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %val)
|
||||
ret $3 %r
|
||||
}
|
||||
', `
|
||||
define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val,
|
||||
<$1 x MASK> %mask) nounwind alwaysinline {
|
||||
define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline {
|
||||
%r = atomicrmw $2 $3 * %ptr, $3 %val seq_cst
|
||||
ret $3 %r
|
||||
}
|
||||
@@ -786,26 +784,7 @@ declare i32 @llvm.atomic.swap.i32.p0i32(i32 * %ptr, i32 %val)
|
||||
declare i64 @llvm.atomic.swap.i64.p0i64(i64 * %ptr, i64 %val)')
|
||||
|
||||
define(`global_swap', `
|
||||
|
||||
define <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val,
|
||||
<$1 x MASK> %mask) nounwind alwaysinline {
|
||||
%rptr = alloca <$1 x $2>
|
||||
%rptr32 = bitcast <$1 x $2> * %rptr to $2 *
|
||||
|
||||
per_lane($1, <$1 x MASK> %mask, `
|
||||
%val_LANE_ID = extractelement <$1 x $2> %val, i32 LANE
|
||||
ifelse(LLVM_VERSION, `LLVM_2_9',`
|
||||
%r_LANE_ID = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val_LANE_ID)', `
|
||||
%r_LANE_ID = atomicrmw xchg $2 * %ptr, $2 %val_LANE_ID seq_cst')
|
||||
%rp_LANE_ID = getelementptr $2 * %rptr32, i32 LANE
|
||||
store $2 %r_LANE_ID, $2 * %rp_LANE_ID')
|
||||
|
||||
%r = load <$1 x $2> * %rptr
|
||||
ret <$1 x $2> %r
|
||||
}
|
||||
|
||||
define $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val,
|
||||
<$1 x MASK> %mask) nounwind alwaysinline {
|
||||
define $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val) nounwind alwaysinline {
|
||||
ifelse(LLVM_VERSION, `LLVM_2_9',`
|
||||
%r = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val)', `
|
||||
%r = atomicrmw xchg $2 * %ptr, $2 %val seq_cst')
|
||||
@@ -845,7 +824,7 @@ ifelse(LLVM_VERSION, `LLVM_2_9',`
|
||||
}
|
||||
|
||||
define $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
|
||||
$2 %val, <$1 x MASK> %mask) nounwind alwaysinline {
|
||||
$2 %val) nounwind alwaysinline {
|
||||
ifelse(LLVM_VERSION, `LLVM_2_9',`
|
||||
%r = call $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val)', `
|
||||
%r = cmpxchg $2 * %ptr, $2 %cmp, $2 %val seq_cst')
|
||||
@@ -1586,17 +1565,15 @@ declare void @__pseudo_masked_store_64(<WIDTH x i64> * nocapture, <WIDTH x i64>,
|
||||
; these represent gathers from a common base pointer with offsets. The
|
||||
; offset_scale factor scales the offsets before they are added to the base
|
||||
; pointer--it should have the value 1, 2, 4, or 8. (It can always just be 1.)
|
||||
; The 2, 4, 8 cases are used to match LLVM patterns that use the free 2/4/8 scaling
|
||||
; available in x86 addressing calculations...
|
||||
; Then, the offset_delta value (guaranteed to be a compile-time constant value),
|
||||
; is added to the final address. The 2, 4, 8 scales are used to match LLVM patterns
|
||||
; that use the free 2/4/8 scaling available in x86 addressing calculations, and
|
||||
; offset_delta feeds into the free offset calculation.
|
||||
;
|
||||
; varying int8 __pseudo_gather_base_offsets{32,64}_8(uniform int8 *base,
|
||||
; int{32,64} offsets, int32 offset_scale, mask)
|
||||
; varying int16 __pseudo_gather_base_offsets{32,64}_16(uniform int16 *base,
|
||||
; int{32,64} offsets, int32 offset_scale, mask)
|
||||
; varying int32 __pseudo_gather_base_offsets{32,64}_32(uniform int32 *base,
|
||||
; int{32,64} offsets, int32 offset_scale, mask)
|
||||
; varying int64 __pseudo_gather_base_offsets{32,64}_64(uniform int64 *base,
|
||||
; int{32,64} offsets, int32 offset_scale, mask)
|
||||
; varying int{8,16,32,64}
|
||||
; __pseudo_gather_base_offsets{32,64}_{8,16,32,64}(uniform int8 *base,
|
||||
; int{32,64} offsets, uniform int32 offset_scale,
|
||||
; int{32,64} offset_delta, mask)
|
||||
;
|
||||
; Then, the GSImprovementsPass optimization finds these and either
|
||||
; converts them to native gather functions or converts them to vector
|
||||
@@ -1612,22 +1589,22 @@ declare <WIDTH x i16> @__pseudo_gather64_16(<WIDTH x i64>, <WIDTH x MASK>) nounw
|
||||
declare <WIDTH x i32> @__pseudo_gather64_32(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
|
||||
declare <WIDTH x i64> @__pseudo_gather64_64(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
|
||||
|
||||
declare <WIDTH x i8> @__pseudo_gather_base_offsets32_8(i8 *, <WIDTH x i32>, i32,
|
||||
<WIDTH x MASK>) nounwind readonly
|
||||
declare <WIDTH x i16> @__pseudo_gather_base_offsets32_16(i8 *, <WIDTH x i32>, i32,
|
||||
declare <WIDTH x i8> @__pseudo_gather_base_offsets32_8(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
|
||||
<WIDTH x MASK>) nounwind readonly
|
||||
declare <WIDTH x i16> @__pseudo_gather_base_offsets32_16(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
|
||||
<WIDTH x MASK>) nounwind readonly
|
||||
declare <WIDTH x i32> @__pseudo_gather_base_offsets32_32(i8 *, <WIDTH x i32>, i32,
|
||||
declare <WIDTH x i32> @__pseudo_gather_base_offsets32_32(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
|
||||
<WIDTH x MASK>) nounwind readonly
|
||||
declare <WIDTH x i64> @__pseudo_gather_base_offsets32_64(i8 *, <WIDTH x i32>, i32,
|
||||
declare <WIDTH x i64> @__pseudo_gather_base_offsets32_64(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
|
||||
<WIDTH x MASK>) nounwind readonly
|
||||
|
||||
declare <WIDTH x i8> @__pseudo_gather_base_offsets64_8(i8 *, <WIDTH x i64>, i32,
|
||||
declare <WIDTH x i8> @__pseudo_gather_base_offsets64_8(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
|
||||
<WIDTH x MASK>) nounwind readonly
|
||||
declare <WIDTH x i16> @__pseudo_gather_base_offsets64_16(i8 *, <WIDTH x i64>, i32,
|
||||
declare <WIDTH x i16> @__pseudo_gather_base_offsets64_16(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
|
||||
<WIDTH x MASK>) nounwind readonly
|
||||
declare <WIDTH x i32> @__pseudo_gather_base_offsets64_32(i8 *, <WIDTH x i64>, i32,
|
||||
declare <WIDTH x i32> @__pseudo_gather_base_offsets64_32(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
|
||||
<WIDTH x MASK>) nounwind readonly
|
||||
declare <WIDTH x i64> @__pseudo_gather_base_offsets64_64(i8 *, <WIDTH x i64>, i32,
|
||||
declare <WIDTH x i64> @__pseudo_gather_base_offsets64_64(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
|
||||
<WIDTH x MASK>) nounwind readonly
|
||||
|
||||
; Similarly to the pseudo-gathers defined above, we also declare undefined
|
||||
@@ -1642,13 +1619,9 @@ declare <WIDTH x i64> @__pseudo_gather_base_offsets64_64(i8 *, <WIDTH x i64>, i3
|
||||
; transforms them to scatters like:
|
||||
;
|
||||
; void __pseudo_scatter_base_offsets{32,64}_8(uniform int8 *base,
|
||||
; varying int32 offsets, int32 offset_scale, varying int8 values, mask)
|
||||
; void __pseudo_scatter_base_offsets{32,64}_16(uniform int16 *base,
|
||||
; varying int32 offsets, int32 offset_scale, varying int16 values, mask)
|
||||
; void __pseudo_scatter_base_offsets{32,64}_32(uniform int32 *base,
|
||||
; varying int32 offsets, int32 offset_scale, varying int32 values, mask)
|
||||
; void __pseudo_scatter_base_offsets{32,64}_64(uniform int64 *base,
|
||||
; varying int32 offsets, int32 offset_scale, varying int64 values, mask)
|
||||
; varying int{32,64} offsets, uniform int32 offset_scale,
|
||||
; varying int{32,64} offset_delta, varying int8 values, mask)
|
||||
; (and similarly for 16/32/64 bit values)
|
||||
;
|
||||
; And the GSImprovementsPass in turn converts these to actual native
|
||||
; scatters or masked stores.
|
||||
@@ -1663,22 +1636,22 @@ declare void @__pseudo_scatter64_16(<WIDTH x i64>, <WIDTH x i16>, <WIDTH x MASK>
|
||||
declare void @__pseudo_scatter64_32(<WIDTH x i64>, <WIDTH x i32>, <WIDTH x MASK>) nounwind
|
||||
declare void @__pseudo_scatter64_64(<WIDTH x i64>, <WIDTH x i64>, <WIDTH x MASK>) nounwind
|
||||
|
||||
declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <WIDTH x i32>, i32,
|
||||
declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
|
||||
<WIDTH x i8>, <WIDTH x MASK>) nounwind
|
||||
declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <WIDTH x i32>, i32,
|
||||
declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
|
||||
<WIDTH x i16>, <WIDTH x MASK>) nounwind
|
||||
declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <WIDTH x i32>, i32,
|
||||
declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
|
||||
<WIDTH x i32>, <WIDTH x MASK>) nounwind
|
||||
declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <WIDTH x i32>, i32,
|
||||
declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
|
||||
<WIDTH x i64>, <WIDTH x MASK>) nounwind
|
||||
|
||||
declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <WIDTH x i64>, i32,
|
||||
declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
|
||||
<WIDTH x i8>, <WIDTH x MASK>) nounwind
|
||||
declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <WIDTH x i64>, i32,
|
||||
declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
|
||||
<WIDTH x i16>, <WIDTH x MASK>) nounwind
|
||||
declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <WIDTH x i64>, i32,
|
||||
declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
|
||||
<WIDTH x i32>, <WIDTH x MASK>) nounwind
|
||||
declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <WIDTH x i64>, i32,
|
||||
declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
|
||||
<WIDTH x i64>, <WIDTH x MASK>) nounwind
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
@@ -1832,6 +1805,22 @@ ok:
|
||||
ret void
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; read hw clock
|
||||
|
||||
;; Returns a 64-bit timestamp read with the x86 rdtsc instruction.
;; Takes no arguments.  The two 32-bit outputs of rdtsc (ax and dx in the
;; constraint string) are combined as (dx << 32) | ax.
define i64 @__clock() nounwind uwtable ssp {
|
||||
entry:
|
||||
; Zero eax and execute cpuid before reading the counter; the asm is marked
; as clobbering rax, rbx, rcx and rdx.
; NOTE(review): cpuid is presumably used here as a serializing instruction
; so that earlier instructions retire before rdtsc -- confirm.
tail call void asm sideeffect "xorl %eax,%eax \0A cpuid", "~{rax},~{rbx},~{rcx},~{rdx},~{dirflag},~{fpsr},~{flags}"() nounwind
|
||||
%0 = tail call { i32, i32 } asm sideeffect "rdtsc", "={ax},={dx},~{dirflag},~{fpsr},~{flags}"() nounwind
|
||||
; field 0 is the ax output (low 32 bits), field 1 is the dx output (high 32 bits)
%asmresult = extractvalue { i32, i32 } %0, 0
|
||||
%asmresult1 = extractvalue { i32, i32 } %0, 1
|
||||
%conv = zext i32 %asmresult1 to i64
|
||||
%shl = shl nuw i64 %conv, 32
|
||||
%conv2 = zext i32 %asmresult to i64
|
||||
%or = or i64 %shl, %conv2
|
||||
ret i64 %or
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; stdlib transcendentals
|
||||
;;
|
||||
@@ -1997,38 +1986,18 @@ global_atomic_uniform(WIDTH, umax, i64, uint64)
|
||||
global_swap(WIDTH, i32, int32)
|
||||
global_swap(WIDTH, i64, int64)
|
||||
|
||||
define <WIDTH x float> @__atomic_swap_float_global(float * %ptr, <WIDTH x float> %val,
|
||||
<WIDTH x MASK> %mask) nounwind alwaysinline {
|
||||
%iptr = bitcast float * %ptr to i32 *
|
||||
%ival = bitcast <WIDTH x float> %val to <WIDTH x i32>
|
||||
%iret = call <WIDTH x i32> @__atomic_swap_int32_global(i32 * %iptr, <WIDTH x i32> %ival, <WIDTH x MASK> %mask)
|
||||
%ret = bitcast <WIDTH x i32> %iret to <WIDTH x float>
|
||||
ret <WIDTH x float> %ret
|
||||
}
|
||||
|
||||
define <WIDTH x double> @__atomic_swap_double_global(double * %ptr, <WIDTH x double> %val,
|
||||
<WIDTH x MASK> %mask) nounwind alwaysinline {
|
||||
%iptr = bitcast double * %ptr to i64 *
|
||||
%ival = bitcast <WIDTH x double> %val to <WIDTH x i64>
|
||||
%iret = call <WIDTH x i64> @__atomic_swap_int64_global(i64 * %iptr, <WIDTH x i64> %ival, <WIDTH x MASK> %mask)
|
||||
%ret = bitcast <WIDTH x i64> %iret to <WIDTH x double>
|
||||
ret <WIDTH x double> %ret
|
||||
}
|
||||
|
||||
define float @__atomic_swap_uniform_float_global(float * %ptr, float %val,
|
||||
<WIDTH x MASK> %mask) nounwind alwaysinline {
|
||||
define float @__atomic_swap_uniform_float_global(float * %ptr, float %val) nounwind alwaysinline {
|
||||
%iptr = bitcast float * %ptr to i32 *
|
||||
%ival = bitcast float %val to i32
|
||||
%iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival, <WIDTH x MASK> %mask)
|
||||
%iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival)
|
||||
%ret = bitcast i32 %iret to float
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define double @__atomic_swap_uniform_double_global(double * %ptr, double %val,
|
||||
<WIDTH x MASK> %mask) nounwind alwaysinline {
|
||||
define double @__atomic_swap_uniform_double_global(double * %ptr, double %val) nounwind alwaysinline {
|
||||
%iptr = bitcast double * %ptr to i64 *
|
||||
%ival = bitcast double %val to i64
|
||||
%iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival, <WIDTH x MASK> %mask)
|
||||
%iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival)
|
||||
%ret = bitcast i64 %iret to double
|
||||
ret double %ret
|
||||
}
|
||||
@@ -2058,24 +2027,23 @@ define <WIDTH x double> @__atomic_compare_exchange_double_global(double * %ptr,
|
||||
ret <WIDTH x double> %ret
|
||||
}
|
||||
|
||||
define float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp, float %val,
|
||||
<WIDTH x MASK> %mask) nounwind alwaysinline {
|
||||
define float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp,
|
||||
float %val) nounwind alwaysinline {
|
||||
%iptr = bitcast float * %ptr to i32 *
|
||||
%icmp = bitcast float %cmp to i32
|
||||
%ival = bitcast float %val to i32
|
||||
%iret = call i32 @__atomic_compare_exchange_uniform_int32_global(i32 * %iptr, i32 %icmp,
|
||||
i32 %ival, <WIDTH x MASK> %mask)
|
||||
i32 %ival)
|
||||
%ret = bitcast i32 %iret to float
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp,
|
||||
double %val, <WIDTH x MASK> %mask) nounwind alwaysinline {
|
||||
double %val) nounwind alwaysinline {
|
||||
%iptr = bitcast double * %ptr to i64 *
|
||||
%icmp = bitcast double %cmp to i64
|
||||
%ival = bitcast double %val to i64
|
||||
%iret = call i64 @__atomic_compare_exchange_uniform_int64_global(i64 * %iptr, i64 %icmp,
|
||||
i64 %ival, <WIDTH x MASK> %mask)
|
||||
%iret = call i64 @__atomic_compare_exchange_uniform_int64_global(i64 * %iptr, i64 %icmp, i64 %ival)
|
||||
%ret = bitcast i64 %iret to double
|
||||
ret double %ret
|
||||
}
|
||||
@@ -2727,7 +2695,8 @@ define(`gen_gather', `
|
||||
;; Define the utility function to do the gather operation for a single element
|
||||
;; of the type
|
||||
define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
|
||||
<$1 x $2> %ret, i32 %lane) nounwind readonly alwaysinline {
|
||||
<$1 x i32> %offset_delta, <$1 x $2> %ret,
|
||||
i32 %lane) nounwind readonly alwaysinline {
|
||||
; compute address for this one from the base
|
||||
%offset32 = extractelement <$1 x i32> %offsets, i32 %lane
|
||||
; the order and details of the next 4 lines are important--they match LLVMs
|
||||
@@ -2737,15 +2706,20 @@ define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_
|
||||
%offset = mul i64 %offset64, %scale64
|
||||
%ptroffset = getelementptr i8 * %ptr, i64 %offset
|
||||
|
||||
%delta = extractelement <$1 x i32> %offset_delta, i32 %lane
|
||||
%delta64 = sext i32 %delta to i64
|
||||
%finalptr = getelementptr i8 * %ptroffset, i64 %delta64
|
||||
|
||||
; load value and insert into returned value
|
||||
%ptrcast = bitcast i8 * %ptroffset to $2 *
|
||||
%ptrcast = bitcast i8 * %finalptr to $2 *
|
||||
%val = load $2 *%ptrcast
|
||||
%updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane
|
||||
ret <$1 x $2> %updatedret
|
||||
}
|
||||
|
||||
define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
|
||||
<$1 x $2> %ret, i32 %lane) nounwind readonly alwaysinline {
|
||||
<$1 x i64> %offset_delta, <$1 x $2> %ret,
|
||||
i32 %lane) nounwind readonly alwaysinline {
|
||||
; compute address for this one from the base
|
||||
%offset64 = extractelement <$1 x i64> %offsets, i32 %lane
|
||||
; the order and details of the next 4 lines are important--they match LLVMs
|
||||
@@ -2754,8 +2728,11 @@ define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_
|
||||
%offset = mul i64 %offset64, %offset_scale64
|
||||
%ptroffset = getelementptr i8 * %ptr, i64 %offset
|
||||
|
||||
%delta64 = extractelement <$1 x i64> %offset_delta, i32 %lane
|
||||
%finalptr = getelementptr i8 * %ptroffset, i64 %delta64
|
||||
|
||||
; load value and insert into returned value
|
||||
%ptrcast = bitcast i8 * %ptroffset to $2 *
|
||||
%ptrcast = bitcast i8 * %finalptr to $2 *
|
||||
%val = load $2 *%ptrcast
|
||||
%updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane
|
||||
ret <$1 x $2> %updatedret
|
||||
@@ -2763,6 +2740,7 @@ define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_
|
||||
|
||||
|
||||
define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
|
||||
<$1 x i32> %offset_delta,
|
||||
<$1 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
; We can be clever and avoid the per-lane stuff for gathers if we are willing
|
||||
; to require that the 0th element of the array being gathered from is always
|
||||
@@ -2775,16 +2753,25 @@ define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32
|
||||
<$1 x i32> %vecmask)
|
||||
%newOffsets = load <$1 x i32> * %offsetsPtr
|
||||
|
||||
; Build a copy of the deltas in which every element whose mask is off is
; zero, the same way the offsets were blended just above.  The per-element
; helper must be given this blended %newDelta, not the raw %offset_delta:
; otherwise elements that are masked off still add their original delta to
; the address and can read outside the array.  This also matches the 64-bit
; variant of this function, which passes %newDelta.
%deltaPtr = alloca <$1 x i32>
|
||||
store <$1 x i32> zeroinitializer, <$1 x i32> * %deltaPtr
|
||||
call void @__masked_store_blend_32(<$1 x i32> * %deltaPtr, <$1 x i32> %offset_delta,
|
||||
<$1 x i32> %vecmask)
|
||||
%newDelta = load <$1 x i32> * %deltaPtr
|
||||
|
||||
%ret0 = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %newOffsets,
|
||||
i32 %offset_scale, <$1 x $2> undef, i32 0)
|
||||
i32 %offset_scale, <$1 x i32> %newDelta,
|
||||
<$1 x $2> undef, i32 0)
|
||||
forloop(lane, 1, eval($1-1),
|
||||
`patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr,
|
||||
<$1 x i32> %newOffsets, i32 %offset_scale, <$1 x $2> %retPREV, i32 LANE)
|
||||
<$1 x i32> %newOffsets, i32 %offset_scale, <$1 x i32> %newDelta,
|
||||
<$1 x $2> %retPREV, i32 LANE)
|
||||
', `LANE', lane), `PREV', eval(lane-1))')
|
||||
ret <$1 x $2> %ret`'eval($1-1)
|
||||
}
|
||||
|
||||
define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
|
||||
<$1 x i64> %offset_delta,
|
||||
<$1 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
; We can be clever and avoid the per-lane stuff for gathers if we are willing
|
||||
; to require that the 0th element of the array being gathered from is always
|
||||
@@ -2797,11 +2784,19 @@ define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets, i32
|
||||
<$1 x i32> %vecmask)
|
||||
%newOffsets = load <$1 x i64> * %offsetsPtr
|
||||
|
||||
%deltaPtr = alloca <$1 x i64>
|
||||
store <$1 x i64> zeroinitializer, <$1 x i64> * %deltaPtr
|
||||
call void @__masked_store_blend_64(<$1 x i64> * %deltaPtr, <$1 x i64> %offset_delta,
|
||||
<$1 x i32> %vecmask)
|
||||
%newDelta = load <$1 x i64> * %deltaPtr
|
||||
|
||||
%ret0 = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %newOffsets,
|
||||
i32 %offset_scale, <$1 x $2> undef, i32 0)
|
||||
i32 %offset_scale, <$1 x i64> %newDelta,
|
||||
<$1 x $2> undef, i32 0)
|
||||
forloop(lane, 1, eval($1-1),
|
||||
`patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr,
|
||||
<$1 x i64> %newOffsets, i32 %offset_scale, <$1 x $2> %retPREV, i32 LANE)
|
||||
<$1 x i64> %newOffsets, i32 %offset_scale, <$1 x i64> %newDelta,
|
||||
<$1 x $2> %retPREV, i32 LANE)
|
||||
', `LANE', lane), `PREV', eval(lane-1))')
|
||||
ret <$1 x $2> %ret`'eval($1-1)
|
||||
}
|
||||
@@ -2852,7 +2847,8 @@ define(`gen_scatter', `
|
||||
;; Define the function that describes the work to do to scatter a single
|
||||
;; value
|
||||
define void @__scatter_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
|
||||
<$1 x $2> %values, i32 %lane) nounwind alwaysinline {
|
||||
<$1 x i32> %offset_delta, <$1 x $2> %values,
|
||||
i32 %lane) nounwind alwaysinline {
|
||||
%offset32 = extractelement <$1 x i32> %offsets, i32 %lane
|
||||
; the order and details of the next 4 lines are important--they match LLVMs
|
||||
; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
|
||||
@@ -2861,42 +2857,52 @@ define void @__scatter_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scal
|
||||
%offset = mul i64 %offset64, %scale64
|
||||
%ptroffset = getelementptr i8 * %ptr, i64 %offset
|
||||
|
||||
%ptrcast = bitcast i8 * %ptroffset to $2 *
|
||||
%delta = extractelement <$1 x i32> %offset_delta, i32 %lane
|
||||
%delta64 = sext i32 %delta to i64
|
||||
%finalptr = getelementptr i8 * %ptroffset, i64 %delta64
|
||||
|
||||
%ptrcast = bitcast i8 * %finalptr to $2 *
|
||||
%storeval = extractelement <$1 x $2> %values, i32 %lane
|
||||
store $2 %storeval, $2 * %ptrcast
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__scatter_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
|
||||
<$1 x $2> %values, i32 %lane) nounwind alwaysinline {
|
||||
<$1 x i64> %offset_delta, <$1 x $2> %values,
|
||||
i32 %lane) nounwind alwaysinline {
|
||||
%offset64 = extractelement <$1 x i64> %offsets, i32 %lane
|
||||
; the order and details of the next 4 lines are important--they match LLVMs
|
||||
; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
|
||||
%scale64 = sext i32 %offset_scale to i64
|
||||
%offset = mul i64 %offset64, %scale64
|
||||
%ptroffset = getelementptr i8 * %ptr, i64 %offset
|
||||
%ptrcast = bitcast i8 * %ptroffset to $2 *
|
||||
|
||||
%delta64 = extractelement <$1 x i64> %offset_delta, i32 %lane
|
||||
%finalptr = getelementptr i8 * %ptroffset, i64 %delta64
|
||||
|
||||
%ptrcast = bitcast i8 * %finalptr to $2 *
|
||||
%storeval = extractelement <$1 x $2> %values, i32 %lane
|
||||
store $2 %storeval, $2 * %ptrcast
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__scatter_base_offsets32_$2(i8* %base, <$1 x i32> %offsets, i32 %offset_scale,
|
||||
<$1 x $2> %values, <$1 x i32> %mask) nounwind alwaysinline {
|
||||
<$1 x i32> %offset_delta, <$1 x $2> %values,
|
||||
<$1 x i32> %mask) nounwind alwaysinline {
|
||||
;; And use the `per_lane' macro to do all of the per-lane work for scatter...
|
||||
per_lane($1, <$1 x i32> %mask, `
|
||||
call void @__scatter_elt32_$2(i8 * %base, <$1 x i32> %offsets, i32 %offset_scale,
|
||||
<$1 x $2> %values, i32 LANE)')
|
||||
<$1 x i32> %offset_delta, <$1 x $2> %values, i32 LANE)')
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__scatter_base_offsets64_$2(i8* %base, <$1 x i64> %offsets, i32 %offset_scale,
|
||||
<$1 x $2> %values, <$1 x i32> %mask) nounwind alwaysinline {
|
||||
<$1 x i64> %offset_delta, <$1 x $2> %values,
|
||||
<$1 x i32> %mask) nounwind alwaysinline {
|
||||
;; And use the `per_lane' macro to do all of the per-lane work for scatter...
|
||||
per_lane($1, <$1 x i32> %mask, `
|
||||
call void @__scatter_elt64_$2(i8 * %base, <$1 x i64> %offsets, i32 %offset_scale,
|
||||
<$1 x $2> %values, i32 LANE)')
|
||||
<$1 x i64> %offset_delta, <$1 x $2> %values, i32 LANE)')
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
88
cbackend.cpp
88
cbackend.cpp
@@ -16,6 +16,16 @@
|
||||
#warning "The C++ backend isn't supported when building with LLVM 2.9"
|
||||
#else
|
||||
|
||||
#ifndef _MSC_VER
|
||||
#include <inttypes.h>
|
||||
#endif
|
||||
|
||||
#ifndef PRIx64
|
||||
#define PRIx64 "llx"
|
||||
#endif
|
||||
|
||||
#include "llvmutil.h"
|
||||
|
||||
#include "llvm/CallingConv.h"
|
||||
#include "llvm/Constants.h"
|
||||
#include "llvm/DerivedTypes.h"
|
||||
@@ -224,6 +234,7 @@ namespace {
|
||||
unsigned NextAnonValueNumber;
|
||||
|
||||
std::string includeName;
|
||||
int vectorWidth;
|
||||
|
||||
/// UnnamedStructIDs - This contains a unique ID for each struct that is
|
||||
/// either anonymous or has no name.
|
||||
@@ -232,11 +243,13 @@ namespace {
|
||||
|
||||
public:
|
||||
static char ID;
|
||||
explicit CWriter(formatted_raw_ostream &o, const char *incname)
|
||||
explicit CWriter(formatted_raw_ostream &o, const char *incname,
|
||||
int vecwidth)
|
||||
: FunctionPass(ID), Out(o), IL(0), Mang(0), LI(0),
|
||||
TheModule(0), TAsm(0), MRI(0), MOFI(0), TCtx(0), TD(0),
|
||||
OpaqueCounter(0), NextAnonValueNumber(0),
|
||||
includeName(incname ? incname : "generic_defs.h") {
|
||||
includeName(incname ? incname : "generic_defs.h"),
|
||||
vectorWidth(vecwidth) {
|
||||
initializeLoopInfoPass(*PassRegistry::getPassRegistry());
|
||||
FPCounter = 0;
|
||||
}
|
||||
@@ -376,7 +389,7 @@ namespace {
|
||||
if (I.getType() == Type::getVoidTy(I.getContext()) || !I.hasOneUse() ||
|
||||
isa<TerminatorInst>(I) || isa<CallInst>(I) || isa<PHINode>(I) ||
|
||||
isa<LoadInst>(I) || isa<VAArgInst>(I) || isa<InsertElementInst>(I) ||
|
||||
isa<InsertValueInst>(I) || isa<ExtractValueInst>(I))
|
||||
isa<InsertValueInst>(I) || isa<ExtractValueInst>(I) || isa<SelectInst>(I))
|
||||
// Don't inline a load across a store or other bad things!
|
||||
return false;
|
||||
|
||||
@@ -765,6 +778,16 @@ raw_ostream &CWriter::printType(raw_ostream &Out, Type *Ty,
|
||||
Out << " return ret;\n";
|
||||
Out << " }\n ";
|
||||
|
||||
// if it's an array of i8s, also provide a version that takes a const
|
||||
// char *
|
||||
if (ATy->getElementType() == LLVMTypes::Int8Type) {
|
||||
Out << " static " << NameSoFar << " init(const char *p) {\n";
|
||||
Out << " " << NameSoFar << " ret;\n";
|
||||
Out << " strncpy((char *)ret.array, p, " << NumElements << ");\n";
|
||||
Out << " return ret;\n";
|
||||
Out << " }\n";
|
||||
}
|
||||
|
||||
printType(Out, ATy->getElementType(), false,
|
||||
"array[" + utostr(NumElements) + "]");
|
||||
return Out << ";\n} ";
|
||||
@@ -834,7 +857,8 @@ void CWriter::printConstantArray(ConstantArray *CPA, bool Static) {
|
||||
}
|
||||
Out << '\"';
|
||||
} else {
|
||||
Out << '{';
|
||||
if (Static)
|
||||
Out << '{';
|
||||
if (CPA->getNumOperands()) {
|
||||
Out << ' ';
|
||||
printConstant(cast<Constant>(CPA->getOperand(0)), Static);
|
||||
@@ -843,7 +867,8 @@ void CWriter::printConstantArray(ConstantArray *CPA, bool Static) {
|
||||
printConstant(cast<Constant>(CPA->getOperand(i)), Static);
|
||||
}
|
||||
}
|
||||
Out << " }";
|
||||
if (Static)
|
||||
Out << " }";
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1280,7 +1305,7 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
|
||||
char Buffer[100];
|
||||
|
||||
uint64_t ll = DoubleToBits(V);
|
||||
sprintf(Buffer, "0x%llx", static_cast<long long>(ll));
|
||||
sprintf(Buffer, "0x%"PRIx64, static_cast<long long>(ll));
|
||||
|
||||
std::string Num(&Buffer[0], &Buffer[6]);
|
||||
unsigned long Val = strtoul(Num.c_str(), 0, 16);
|
||||
@@ -1313,7 +1338,8 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
|
||||
break;
|
||||
}
|
||||
|
||||
case Type::ArrayTyID:
|
||||
case Type::ArrayTyID: {
|
||||
ArrayType *AT = cast<ArrayType>(CPV->getType());
|
||||
if (Static)
|
||||
// arrays are wrapped in structs...
|
||||
Out << "{ ";
|
||||
@@ -1326,7 +1352,6 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
|
||||
printConstantArray(CA, Static);
|
||||
} else {
|
||||
assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV));
|
||||
ArrayType *AT = cast<ArrayType>(CPV->getType());
|
||||
if (AT->getNumElements()) {
|
||||
Out << ' ';
|
||||
Constant *CZ = Constant::getNullValue(AT->getElementType());
|
||||
@@ -1342,7 +1367,7 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
|
||||
else
|
||||
Out << ")";
|
||||
break;
|
||||
|
||||
}
|
||||
case Type::VectorTyID:
|
||||
printType(Out, CPV->getType());
|
||||
Out << "(";
|
||||
@@ -1741,17 +1766,6 @@ void CWriter::writeOperandWithCast(Value* Operand, const ICmpInst &Cmp) {
|
||||
//
|
||||
static void generateCompilerSpecificCode(formatted_raw_ostream& Out,
|
||||
const TargetData *TD) {
|
||||
// Alloca, ...
|
||||
Out << "#include <stdlib.h>\n"
|
||||
<< "#include <stdint.h>\n"
|
||||
<< "/* get a declaration for alloca */\n"
|
||||
<< "#ifdef _MSC_VER\n"
|
||||
<< "#include <malloc.h>\n"
|
||||
<< "#define alloca _alloca\n"
|
||||
<< "#else\n"
|
||||
<< "#include <alloca.h>\n"
|
||||
<< "#endif\n\n";
|
||||
|
||||
// We output GCC specific attributes to preserve 'linkonce'ness on globals.
|
||||
// If we aren't being compiled with GCC, just drop these attributes.
|
||||
Out << "#ifndef __GNUC__ /* Can only support \"linkonce\" vars with GCC */\n"
|
||||
@@ -1976,7 +1990,6 @@ bool CWriter::doInitialization(Module &M) {
|
||||
Out << " DO NOT EDIT THIS FILE DIRECTLY\n";
|
||||
Out << " *******************************************************************/\n\n";
|
||||
|
||||
// get declaration for alloca
|
||||
Out << "/* Provide Declarations */\n";
|
||||
Out << "#include <stdarg.h>\n"; // Varargs support
|
||||
Out << "#include <setjmp.h>\n"; // Unwind support
|
||||
@@ -1987,6 +2000,15 @@ bool CWriter::doInitialization(Module &M) {
|
||||
Out << " #define NOMINMAX\n";
|
||||
Out << " #include <windows.h>\n";
|
||||
Out << "#endif // _MSC_VER\n";
|
||||
Out << "#include <stdlib.h>\n";
|
||||
Out << "#include <stdint.h>\n";
|
||||
Out << "/* get a declaration for alloca */\n";
|
||||
Out << "#ifdef _MSC_VER\n";
|
||||
Out << " #include <malloc.h>\n";
|
||||
Out << " #define alloca _alloca\n";
|
||||
Out << "#else\n";
|
||||
Out << " #include <alloca.h>\n";
|
||||
Out << "#endif\n\n";
|
||||
|
||||
Out << "#include \"" << includeName << "\"\n";
|
||||
|
||||
@@ -2198,7 +2220,7 @@ bool CWriter::doInitialization(Module &M) {
|
||||
// FIXME common linkage should avoid this problem.
|
||||
if (!I->getInitializer()->isNullValue()) {
|
||||
Out << " = " ;
|
||||
writeOperand(I->getInitializer(), true);
|
||||
writeOperand(I->getInitializer(), false);
|
||||
} else if (I->hasWeakLinkage()) {
|
||||
// We have to specify an initializer, but it doesn't have to be
|
||||
// complete. If the value is an aggregate, print out { 0 }, and let
|
||||
@@ -2213,7 +2235,7 @@ bool CWriter::doInitialization(Module &M) {
|
||||
Out << "{ { 0 } }";
|
||||
} else {
|
||||
// Just print it out normally.
|
||||
writeOperand(I->getInitializer(), true);
|
||||
writeOperand(I->getInitializer(), false);
|
||||
}
|
||||
}
|
||||
Out << ";\n";
|
||||
@@ -2887,7 +2909,21 @@ void CWriter::visitBinaryOperator(Instruction &I) {
|
||||
Out << "(";
|
||||
writeOperand(I.getOperand(0));
|
||||
Out << ", ";
|
||||
writeOperand(I.getOperand(1));
|
||||
if ((I.getOpcode() == Instruction::Shl ||
|
||||
I.getOpcode() == Instruction::LShr ||
|
||||
I.getOpcode() == Instruction::AShr)) {
|
||||
std::vector<PHINode *> phis;
|
||||
if (LLVMVectorValuesAllEqual(I.getOperand(1),
|
||||
vectorWidth, phis)) {
|
||||
Out << "__extract_element(";
|
||||
writeOperand(I.getOperand(1));
|
||||
Out << ", 0) ";
|
||||
}
|
||||
else
|
||||
writeOperand(I.getOperand(1));
|
||||
}
|
||||
else
|
||||
writeOperand(I.getOperand(1));
|
||||
Out << ")";
|
||||
return;
|
||||
}
|
||||
@@ -3628,7 +3664,7 @@ std::string CWriter::InterpretASMConstraint(InlineAsm::ConstraintInfo& c) {
|
||||
#endif
|
||||
|
||||
std::string E;
|
||||
if (const Target *Match = TargetRegistry::lookupTarget(Triple, E))
|
||||
if (const llvm::Target *Match = TargetRegistry::lookupTarget(Triple, E))
|
||||
TargetAsm = Match->createMCAsmInfo(Triple);
|
||||
else
|
||||
return c.Codes[0];
|
||||
@@ -4330,7 +4366,7 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth,
|
||||
pm.add(new BitcastCleanupPass);
|
||||
pm.add(createDeadCodeEliminationPass()); // clean up after smear pass
|
||||
//CO pm.add(createPrintModulePass(&fos));
|
||||
pm.add(new CWriter(fos, includeName));
|
||||
pm.add(new CWriter(fos, includeName, vectorWidth));
|
||||
pm.add(createGCInfoDeleter());
|
||||
//CO pm.add(createVerifierPass());
|
||||
|
||||
|
||||
596
ctx.cpp
596
ctx.cpp
@@ -74,18 +74,35 @@ struct CFInfo {
|
||||
llvm::Value *savedContinueLanesPtr,
|
||||
llvm::Value *savedMask, llvm::Value *savedLoopMask);
|
||||
|
||||
static CFInfo *GetSwitch(bool isUniform, llvm::BasicBlock *breakTarget,
|
||||
llvm::BasicBlock *continueTarget,
|
||||
llvm::Value *savedBreakLanesPtr,
|
||||
llvm::Value *savedContinueLanesPtr,
|
||||
llvm::Value *savedMask, llvm::Value *savedLoopMask,
|
||||
llvm::Value *switchExpr,
|
||||
llvm::BasicBlock *bbDefault,
|
||||
const std::vector<std::pair<int, llvm::BasicBlock *> > *bbCases,
|
||||
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *bbNext,
|
||||
bool scUniform);
|
||||
|
||||
bool IsIf() { return type == If; }
|
||||
bool IsLoop() { return type == Loop; }
|
||||
bool IsForeach() { return type == Foreach; }
|
||||
bool IsVaryingType() { return !isUniform; }
|
||||
bool IsSwitch() { return type == Switch; }
|
||||
bool IsVarying() { return !isUniform; }
|
||||
bool IsUniform() { return isUniform; }
|
||||
|
||||
enum CFType { If, Loop, Foreach };
|
||||
enum CFType { If, Loop, Foreach, Switch };
|
||||
CFType type;
|
||||
bool isUniform;
|
||||
llvm::BasicBlock *savedBreakTarget, *savedContinueTarget;
|
||||
llvm::Value *savedBreakLanesPtr, *savedContinueLanesPtr;
|
||||
llvm::Value *savedMask, *savedLoopMask;
|
||||
llvm::Value *savedSwitchExpr;
|
||||
llvm::BasicBlock *savedDefaultBlock;
|
||||
const std::vector<std::pair<int, llvm::BasicBlock *> > *savedCaseBlocks;
|
||||
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *savedNextBlocks;
|
||||
bool savedSwitchConditionWasUniform;
|
||||
|
||||
private:
|
||||
CFInfo(CFType t, bool uniformIf, llvm::Value *sm) {
|
||||
@@ -95,11 +112,18 @@ private:
|
||||
savedBreakTarget = savedContinueTarget = NULL;
|
||||
savedBreakLanesPtr = savedContinueLanesPtr = NULL;
|
||||
savedMask = savedLoopMask = sm;
|
||||
savedSwitchExpr = NULL;
|
||||
savedDefaultBlock = NULL;
|
||||
savedCaseBlocks = NULL;
|
||||
savedNextBlocks = NULL;
|
||||
}
|
||||
CFInfo(CFType t, bool iu, llvm::BasicBlock *bt, llvm::BasicBlock *ct,
|
||||
llvm::Value *sb, llvm::Value *sc, llvm::Value *sm,
|
||||
llvm::Value *lm) {
|
||||
Assert(t == Loop);
|
||||
llvm::Value *lm, llvm::Value *sse = NULL, llvm::BasicBlock *bbd = NULL,
|
||||
const std::vector<std::pair<int, llvm::BasicBlock *> > *bbc = NULL,
|
||||
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *bbn = NULL,
|
||||
bool scu = false) {
|
||||
Assert(t == Loop || t == Switch);
|
||||
type = t;
|
||||
isUniform = iu;
|
||||
savedBreakTarget = bt;
|
||||
@@ -108,6 +132,11 @@ private:
|
||||
savedContinueLanesPtr = sc;
|
||||
savedMask = sm;
|
||||
savedLoopMask = lm;
|
||||
savedSwitchExpr = sse;
|
||||
savedDefaultBlock = bbd;
|
||||
savedCaseBlocks = bbc;
|
||||
savedNextBlocks = bbn;
|
||||
savedSwitchConditionWasUniform = scu;
|
||||
}
|
||||
CFInfo(CFType t, llvm::BasicBlock *bt, llvm::BasicBlock *ct,
|
||||
llvm::Value *sb, llvm::Value *sc, llvm::Value *sm,
|
||||
@@ -121,6 +150,10 @@ private:
|
||||
savedContinueLanesPtr = sc;
|
||||
savedMask = sm;
|
||||
savedLoopMask = lm;
|
||||
savedSwitchExpr = NULL;
|
||||
savedDefaultBlock = NULL;
|
||||
savedCaseBlocks = NULL;
|
||||
savedNextBlocks = NULL;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -154,12 +187,30 @@ CFInfo::GetForeach(llvm::BasicBlock *breakTarget,
|
||||
savedMask, savedForeachMask);
|
||||
}
|
||||
|
||||
|
||||
CFInfo *
|
||||
CFInfo::GetSwitch(bool isUniform, llvm::BasicBlock *breakTarget,
|
||||
llvm::BasicBlock *continueTarget,
|
||||
llvm::Value *savedBreakLanesPtr,
|
||||
llvm::Value *savedContinueLanesPtr, llvm::Value *savedMask,
|
||||
llvm::Value *savedLoopMask, llvm::Value *savedSwitchExpr,
|
||||
llvm::BasicBlock *savedDefaultBlock,
|
||||
const std::vector<std::pair<int, llvm::BasicBlock *> > *savedCases,
|
||||
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *savedNext,
|
||||
bool savedSwitchConditionUniform) {
|
||||
return new CFInfo(Switch, isUniform, breakTarget, continueTarget,
|
||||
savedBreakLanesPtr, savedContinueLanesPtr,
|
||||
savedMask, savedLoopMask, savedSwitchExpr, savedDefaultBlock,
|
||||
savedCases, savedNext, savedSwitchConditionUniform);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
|
||||
llvm::Function *llvmFunction,
|
||||
llvm::Function *lf,
|
||||
SourcePos firstStmtPos) {
|
||||
function = func;
|
||||
llvmFunction = lf;
|
||||
|
||||
/* Create a new basic block to store all of the allocas */
|
||||
allocaBlock = llvm::BasicBlock::Create(*g->ctx, "allocas", llvmFunction, 0);
|
||||
@@ -181,6 +232,11 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
|
||||
breakLanesPtr = continueLanesPtr = NULL;
|
||||
breakTarget = continueTarget = NULL;
|
||||
|
||||
switchExpr = NULL;
|
||||
caseBlocks = NULL;
|
||||
defaultBlock = NULL;
|
||||
nextBlocks = NULL;
|
||||
|
||||
returnedLanesPtr = AllocaInst(LLVMTypes::MaskType, "returned_lanes_memory");
|
||||
StoreInst(LLVMMaskAllOff, returnedLanesPtr);
|
||||
|
||||
@@ -421,51 +477,61 @@ FunctionEmitContext::StartVaryingIf(llvm::Value *oldMask) {
|
||||
|
||||
void
|
||||
FunctionEmitContext::EndIf() {
|
||||
CFInfo *ci = popCFState();
|
||||
// Make sure we match up with a Start{Uniform,Varying}If().
|
||||
Assert(controlFlowInfo.size() > 0 && controlFlowInfo.back()->IsIf());
|
||||
CFInfo *ci = controlFlowInfo.back();
|
||||
controlFlowInfo.pop_back();
|
||||
Assert(ci->IsIf());
|
||||
|
||||
// 'uniform' ifs don't change the mask so we only need to restore the
|
||||
// mask going into the if for 'varying' if statements
|
||||
if (!ci->IsUniform() && bblock != NULL) {
|
||||
// We can't just restore the mask as it was going into the 'if'
|
||||
// statement. First we have to take into account any program
|
||||
// instances that have executed 'return' statements; the restored
|
||||
// mask must be off for those lanes.
|
||||
restoreMaskGivenReturns(ci->savedMask);
|
||||
if (ci->IsUniform() || bblock == NULL)
|
||||
return;
|
||||
|
||||
// If the 'if' statement is inside a loop with a 'varying'
|
||||
// consdition, we also need to account for any break or continue
|
||||
// statements that executed inside the 'if' statmeent; we also must
|
||||
// leave the lane masks for the program instances that ran those
|
||||
// off after we restore the mask after the 'if'. The code below
|
||||
// ends up being optimized out in the case that there were no break
|
||||
// or continue statements (and breakLanesPtr and continueLanesPtr
|
||||
// have their initial 'all off' values), so we don't need to check
|
||||
// for that here.
|
||||
if (continueLanesPtr != NULL) {
|
||||
// We want to compute:
|
||||
// newMask = (oldMask & ~(breakLanes | continueLanes))
|
||||
llvm::Value *oldMask = GetInternalMask();
|
||||
llvm::Value *continueLanes = LoadInst(continueLanesPtr,
|
||||
"continue_lanes");
|
||||
llvm::Value *bcLanes = continueLanes;
|
||||
// We can't just restore the mask as it was going into the 'if'
|
||||
// statement. First we have to take into account any program
|
||||
// instances that have executed 'return' statements; the restored
|
||||
// mask must be off for those lanes.
|
||||
restoreMaskGivenReturns(ci->savedMask);
|
||||
|
||||
if (breakLanesPtr != NULL) {
|
||||
// breakLanesPtr will be NULL if we're inside a 'foreach' loop
|
||||
llvm::Value *breakLanes = LoadInst(breakLanesPtr, "break_lanes");
|
||||
bcLanes = BinaryOperator(llvm::Instruction::Or, breakLanes,
|
||||
continueLanes, "break|continue_lanes");
|
||||
}
|
||||
// If the 'if' statement is inside a loop with a 'varying'
|
||||
// condition, we also need to account for any break or continue
|
||||
// statements that executed inside the 'if' statement; we also must
|
||||
// leave the lane masks for the program instances that ran those
|
||||
// off after we restore the mask after the 'if'. The code below
|
||||
// ends up being optimized out in the case that there were no break
|
||||
// or continue statements (and breakLanesPtr and continueLanesPtr
|
||||
// have their initial 'all off' values), so we don't need to check
|
||||
// for that here.
|
||||
//
|
||||
// There are three general cases to deal with here:
|
||||
// - Loops: both break and continue are allowed, and thus the corresponding
|
||||
// lane mask pointers are non-NULL
|
||||
// - Foreach: only continueLanesPtr may be non-NULL
|
||||
// - Switch: only breakLanesPtr may be non-NULL
|
||||
if (continueLanesPtr != NULL || breakLanesPtr != NULL) {
|
||||
// We want to compute:
|
||||
// newMask = (oldMask & ~(breakLanes | continueLanes)),
|
||||
// treating breakLanes or continueLanes as "all off" if the
|
||||
// corresponding pointer is NULL.
|
||||
llvm::Value *bcLanes = NULL;
|
||||
|
||||
llvm::Value *notBreakOrContinue =
|
||||
NotOperator(bcLanes, "!(break|continue)_lanes");
|
||||
llvm::Value *newMask =
|
||||
BinaryOperator(llvm::Instruction::And, oldMask,
|
||||
notBreakOrContinue, "new_mask");
|
||||
SetInternalMask(newMask);
|
||||
if (continueLanesPtr != NULL)
|
||||
bcLanes = LoadInst(continueLanesPtr, "continue_lanes");
|
||||
else
|
||||
bcLanes = LLVMMaskAllOff;
|
||||
|
||||
if (breakLanesPtr != NULL) {
|
||||
llvm::Value *breakLanes = LoadInst(breakLanesPtr, "break_lanes");
|
||||
bcLanes = BinaryOperator(llvm::Instruction::Or, bcLanes,
|
||||
breakLanes, "|break_lanes");
|
||||
}
|
||||
|
||||
llvm::Value *notBreakOrContinue =
|
||||
NotOperator(bcLanes, "!(break|continue)_lanes");
|
||||
llvm::Value *oldMask = GetInternalMask();
|
||||
llvm::Value *newMask =
|
||||
BinaryOperator(llvm::Instruction::And, oldMask,
|
||||
notBreakOrContinue, "new_mask");
|
||||
SetInternalMask(newMask);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -501,17 +567,8 @@ FunctionEmitContext::StartLoop(llvm::BasicBlock *bt, llvm::BasicBlock *ct,
|
||||
|
||||
void
|
||||
FunctionEmitContext::EndLoop() {
|
||||
Assert(controlFlowInfo.size() && controlFlowInfo.back()->IsLoop());
|
||||
CFInfo *ci = controlFlowInfo.back();
|
||||
controlFlowInfo.pop_back();
|
||||
|
||||
// Restore the break/continue state information to what it was before
|
||||
// we went into this loop.
|
||||
breakTarget = ci->savedBreakTarget;
|
||||
continueTarget = ci->savedContinueTarget;
|
||||
breakLanesPtr = ci->savedBreakLanesPtr;
|
||||
continueLanesPtr = ci->savedContinueLanesPtr;
|
||||
loopMask = ci->savedLoopMask;
|
||||
CFInfo *ci = popCFState();
|
||||
Assert(ci->IsLoop());
|
||||
|
||||
if (!ci->IsUniform())
|
||||
// If the loop had a 'uniform' test, then it didn't make any
|
||||
@@ -524,7 +581,7 @@ FunctionEmitContext::EndLoop() {
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::StartForeach(llvm::BasicBlock *ct) {
|
||||
FunctionEmitContext::StartForeach() {
|
||||
// Store the current values of various loop-related state so that we
|
||||
// can restore it when we exit this loop.
|
||||
llvm::Value *oldMask = GetInternalMask();
|
||||
@@ -536,7 +593,7 @@ FunctionEmitContext::StartForeach(llvm::BasicBlock *ct) {
|
||||
|
||||
continueLanesPtr = AllocaInst(LLVMTypes::MaskType, "foreach_continue_lanes");
|
||||
StoreInst(LLVMMaskAllOff, continueLanesPtr);
|
||||
continueTarget = ct;
|
||||
continueTarget = NULL; // should be set by SetContinueTarget()
|
||||
|
||||
loopMask = NULL;
|
||||
}
|
||||
@@ -544,17 +601,8 @@ FunctionEmitContext::StartForeach(llvm::BasicBlock *ct) {
|
||||
|
||||
void
|
||||
FunctionEmitContext::EndForeach() {
|
||||
Assert(controlFlowInfo.size() && controlFlowInfo.back()->IsForeach());
|
||||
CFInfo *ci = controlFlowInfo.back();
|
||||
controlFlowInfo.pop_back();
|
||||
|
||||
// Restore the break/continue state information to what it was before
|
||||
// we went into this loop.
|
||||
breakTarget = ci->savedBreakTarget;
|
||||
continueTarget = ci->savedContinueTarget;
|
||||
breakLanesPtr = ci->savedBreakLanesPtr;
|
||||
continueLanesPtr = ci->savedContinueLanesPtr;
|
||||
loopMask = ci->savedLoopMask;
|
||||
CFInfo *ci = popCFState();
|
||||
Assert(ci->IsForeach());
|
||||
}
|
||||
|
||||
|
||||
@@ -575,28 +623,64 @@ FunctionEmitContext::restoreMaskGivenReturns(llvm::Value *oldMask) {
|
||||
}
|
||||
|
||||
|
||||
/** Returns "true" if the first enclosing non-if control flow expression is
|
||||
a "switch" statement.
|
||||
*/
|
||||
bool
|
||||
FunctionEmitContext::inSwitchStatement() const {
|
||||
// Go backwards through controlFlowInfo, since we add new nested scopes
|
||||
// to the back.
|
||||
int i = controlFlowInfo.size() - 1;
|
||||
while (i >= 0 && controlFlowInfo[i]->IsIf())
|
||||
--i;
|
||||
// Got to the first non-if (or end of CF info)
|
||||
if (i == -1)
|
||||
return false;
|
||||
return controlFlowInfo[i]->IsSwitch();
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::Break(bool doCoherenceCheck) {
|
||||
Assert(controlFlowInfo.size() > 0);
|
||||
if (breakTarget == NULL) {
|
||||
Error(currentPos, "\"break\" statement is illegal outside of "
|
||||
"for/while/do loops.");
|
||||
"for/while/do loops and \"switch\" statements.");
|
||||
return;
|
||||
}
|
||||
|
||||
if (bblock == NULL)
|
||||
return;
|
||||
|
||||
if (inSwitchStatement() == true &&
|
||||
switchConditionWasUniform == true &&
|
||||
ifsInCFAllUniform(CFInfo::Switch)) {
|
||||
// We know that all program instances are executing the break, so
|
||||
// just jump to the block immediately after the switch.
|
||||
Assert(breakTarget != NULL);
|
||||
BranchInst(breakTarget);
|
||||
bblock = NULL;
|
||||
return;
|
||||
}
|
||||
|
||||
// If all of the enclosing 'if' tests in the loop have uniform control
|
||||
// flow or if we can tell that the mask is all on, then we can just
|
||||
// jump to the break location.
|
||||
if (ifsInLoopAllUniform() || GetInternalMask() == LLVMMaskAllOn) {
|
||||
if (inSwitchStatement() == false &&
|
||||
(ifsInCFAllUniform(CFInfo::Loop) ||
|
||||
GetInternalMask() == LLVMMaskAllOn)) {
|
||||
BranchInst(breakTarget);
|
||||
if (ifsInLoopAllUniform() && doCoherenceCheck)
|
||||
Warning(currentPos, "Coherent break statement not necessary in fully uniform "
|
||||
"control flow.");
|
||||
if (ifsInCFAllUniform(CFInfo::Loop) && doCoherenceCheck)
|
||||
Warning(currentPos, "Coherent break statement not necessary in "
|
||||
"fully uniform control flow.");
|
||||
// Set bblock to NULL since the jump has terminated the basic block
|
||||
bblock = NULL;
|
||||
}
|
||||
else {
|
||||
// Otherwise we need to update the mask of the lanes that have
|
||||
// executed a 'break' statement:
|
||||
// Varying switch, uniform switch where the 'break' is under
|
||||
// varying control flow, or a loop with varying 'if's above the
|
||||
// break. In these cases, we need to update the mask of the lanes
|
||||
// that have executed a 'break' statement:
|
||||
// breakLanes = breakLanes | mask
|
||||
Assert(breakLanesPtr != NULL);
|
||||
llvm::Value *mask = GetInternalMask();
|
||||
@@ -612,16 +696,20 @@ FunctionEmitContext::Break(bool doCoherenceCheck) {
|
||||
// an 'if' statement and restore the mask then.
|
||||
SetInternalMask(LLVMMaskAllOff);
|
||||
|
||||
if (doCoherenceCheck)
|
||||
// If the user has indicated that this is a 'coherent' break
|
||||
// statement, then check to see if the mask is all off. If so,
|
||||
// we have to conservatively jump to the continueTarget, not
|
||||
// the breakTarget, since part of the reason the mask is all
|
||||
// off may be due to 'continue' statements that executed in the
|
||||
// current loop iteration.
|
||||
// FIXME: if the loop only has break statements and no
|
||||
// continues, we can jump to breakTarget in that case.
|
||||
jumpIfAllLoopLanesAreDone(continueTarget);
|
||||
if (doCoherenceCheck) {
|
||||
if (continueTarget != NULL)
|
||||
// If the user has indicated that this is a 'coherent'
|
||||
// break statement, then check to see if the mask is all
|
||||
// off. If so, we have to conservatively jump to the
|
||||
// continueTarget, not the breakTarget, since part of the
|
||||
// reason the mask is all off may be due to 'continue'
|
||||
// statements that executed in the current loop iteration.
|
||||
jumpIfAllLoopLanesAreDone(continueTarget);
|
||||
else if (breakTarget != NULL)
|
||||
// Similarly handle these for switch statements, where we
|
||||
// only have a break target.
|
||||
jumpIfAllLoopLanesAreDone(breakTarget);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -634,12 +722,12 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (ifsInLoopAllUniform() || GetInternalMask() == LLVMMaskAllOn) {
|
||||
if (ifsInCFAllUniform(CFInfo::Loop) || GetInternalMask() == LLVMMaskAllOn) {
|
||||
// Similarly to 'break' statements, we can immediately jump to the
|
||||
// continue target if we're only in 'uniform' control flow within
|
||||
// loop or if we can tell that the mask is all on.
|
||||
AddInstrumentationPoint("continue: uniform CF, jumped");
|
||||
if (ifsInLoopAllUniform() && doCoherenceCheck)
|
||||
if (ifsInCFAllUniform(CFInfo::Loop) && doCoherenceCheck)
|
||||
Warning(currentPos, "Coherent continue statement not necessary in "
|
||||
"fully uniform control flow.");
|
||||
BranchInst(continueTarget);
|
||||
@@ -652,8 +740,9 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
|
||||
llvm::Value *mask = GetInternalMask();
|
||||
llvm::Value *continueMask =
|
||||
LoadInst(continueLanesPtr, "continue_mask");
|
||||
llvm::Value *newMask = BinaryOperator(llvm::Instruction::Or,
|
||||
mask, continueMask, "mask|continueMask");
|
||||
llvm::Value *newMask =
|
||||
BinaryOperator(llvm::Instruction::Or, mask, continueMask,
|
||||
"mask|continueMask");
|
||||
StoreInst(newMask, continueLanesPtr);
|
||||
|
||||
// And set the current mask to be all off in case there are any
|
||||
@@ -670,22 +759,23 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
|
||||
|
||||
|
||||
/** This function checks to see if all of the 'if' statements (if any)
|
||||
between the current scope and the first enclosing loop have 'uniform'
|
||||
tests.
|
||||
between the current scope and the first enclosing loop/switch of given
|
||||
control flow type have 'uniform' tests.
|
||||
*/
|
||||
bool
|
||||
FunctionEmitContext::ifsInLoopAllUniform() const {
|
||||
FunctionEmitContext::ifsInCFAllUniform(int type) const {
|
||||
Assert(controlFlowInfo.size() > 0);
|
||||
// Go backwards through controlFlowInfo, since we add new nested scopes
|
||||
// to the back. Stop once we come to the first enclosing loop.
|
||||
// to the back. Stop once we come to the first enclosing control flow
|
||||
// structure of the desired type.
|
||||
int i = controlFlowInfo.size() - 1;
|
||||
while (i >= 0 && controlFlowInfo[i]->type != CFInfo::Loop) {
|
||||
while (i >= 0 && controlFlowInfo[i]->type != type) {
|
||||
if (controlFlowInfo[i]->isUniform == false)
|
||||
// Found a scope due to an 'if' statement with a varying test
|
||||
return false;
|
||||
--i;
|
||||
}
|
||||
Assert(i >= 0); // else we didn't find a loop!
|
||||
Assert(i >= 0); // else we didn't find the expected control flow type!
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -758,11 +848,249 @@ FunctionEmitContext::RestoreContinuedLanes() {
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::StartSwitch(bool cfIsUniform, llvm::BasicBlock *bbBreak) {
|
||||
llvm::Value *oldMask = GetInternalMask();
|
||||
controlFlowInfo.push_back(CFInfo::GetSwitch(cfIsUniform, breakTarget,
|
||||
continueTarget, breakLanesPtr,
|
||||
continueLanesPtr, oldMask,
|
||||
loopMask, switchExpr, defaultBlock,
|
||||
caseBlocks, nextBlocks,
|
||||
switchConditionWasUniform));
|
||||
|
||||
breakLanesPtr = AllocaInst(LLVMTypes::MaskType, "break_lanes_memory");
|
||||
StoreInst(LLVMMaskAllOff, breakLanesPtr);
|
||||
breakTarget = bbBreak;
|
||||
|
||||
continueLanesPtr = NULL;
|
||||
continueTarget = NULL;
|
||||
loopMask = NULL;
|
||||
|
||||
// These will be set by the SwitchInst() method
|
||||
switchExpr = NULL;
|
||||
defaultBlock = NULL;
|
||||
caseBlocks = NULL;
|
||||
nextBlocks = NULL;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::EndSwitch() {
|
||||
Assert(bblock != NULL);
|
||||
|
||||
CFInfo *ci = popCFState();
|
||||
if (ci->IsVarying() && bblock != NULL)
|
||||
restoreMaskGivenReturns(ci->savedMask);
|
||||
}
|
||||
|
||||
|
||||
/** Emit code to check for an "all off" mask before the code for a
|
||||
case or default label in a "switch" statement.
|
||||
*/
|
||||
void
|
||||
FunctionEmitContext::addSwitchMaskCheck(llvm::Value *mask) {
|
||||
llvm::Value *allOff = None(mask);
|
||||
llvm::BasicBlock *bbSome = CreateBasicBlock("case_default_on");
|
||||
|
||||
// Find the basic block for the case or default label immediately after
|
||||
// the current one in the switch statement--that's where we want to
|
||||
// jump if the mask is all off at this label.
|
||||
Assert(nextBlocks->find(bblock) != nextBlocks->end());
|
||||
llvm::BasicBlock *bbNext = nextBlocks->find(bblock)->second;
|
||||
|
||||
// Jump to the next one of the mask is all off; otherwise jump to the
|
||||
// newly created block that will hold the actual code for this label.
|
||||
BranchInst(bbNext, bbSome, allOff);
|
||||
SetCurrentBasicBlock(bbSome);
|
||||
}
|
||||
|
||||
|
||||
/** Returns the execution mask at entry to the first enclosing "switch"
|
||||
statement. */
|
||||
llvm::Value *
|
||||
FunctionEmitContext::getMaskAtSwitchEntry() {
|
||||
Assert(controlFlowInfo.size() > 0);
|
||||
int i = controlFlowInfo.size() - 1;
|
||||
while (i >= 0 && controlFlowInfo[i]->type != CFInfo::Switch)
|
||||
--i;
|
||||
Assert(i != -1);
|
||||
return controlFlowInfo[i]->savedMask;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::EmitDefaultLabel(bool checkMask, SourcePos pos) {
|
||||
if (inSwitchStatement() == false) {
|
||||
Error(pos, "\"default\" label illegal outside of \"switch\" "
|
||||
"statement.");
|
||||
return;
|
||||
}
|
||||
|
||||
// If there's a default label in the switch, a basic block for it
|
||||
// should have been provided in the previous call to SwitchInst().
|
||||
Assert(defaultBlock != NULL);
|
||||
|
||||
if (bblock != NULL)
|
||||
// The previous case in the switch fell through, or we're in a
|
||||
// varying switch; terminate the current block with a jump to the
|
||||
// block for the code for the default label.
|
||||
BranchInst(defaultBlock);
|
||||
SetCurrentBasicBlock(defaultBlock);
|
||||
|
||||
if (switchConditionWasUniform)
|
||||
// Nothing more to do for this case; return back to the caller,
|
||||
// which will then emit the code for the default case.
|
||||
return;
|
||||
|
||||
// For a varying switch, we need to update the execution mask.
|
||||
//
|
||||
// First, compute the mask that corresponds to which program instances
|
||||
// should execute the "default" code; this corresponds to the set of
|
||||
// program instances that don't match any of the case statements.
|
||||
// Therefore, we generate code that compares the value of the switch
|
||||
// expression to the value associated with each of the "case"
|
||||
// statements such that the surviving lanes didn't match any of them.
|
||||
llvm::Value *matchesDefault = getMaskAtSwitchEntry();
|
||||
for (int i = 0; i < (int)caseBlocks->size(); ++i) {
|
||||
int value = (*caseBlocks)[i].first;
|
||||
llvm::Value *valueVec = (switchExpr->getType() == LLVMTypes::Int32VectorType) ?
|
||||
LLVMInt32Vector(value) : LLVMInt64Vector(value);
|
||||
// TODO: for AVX2 at least, the following generates better code
|
||||
// than doing ICMP_NE and skipping the NotOperator() below; file a
|
||||
// LLVM bug?
|
||||
llvm::Value *matchesCaseValue =
|
||||
CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, switchExpr,
|
||||
valueVec, "cmp_case_value");
|
||||
matchesCaseValue = I1VecToBoolVec(matchesCaseValue);
|
||||
|
||||
llvm::Value *notMatchesCaseValue = NotOperator(matchesCaseValue);
|
||||
matchesDefault = BinaryOperator(llvm::Instruction::And, matchesDefault,
|
||||
notMatchesCaseValue, "default&~case_match");
|
||||
}
|
||||
|
||||
// The mask may have some lanes on, which corresponds to the previous
|
||||
// label falling through; compute the updated mask by ANDing with the
|
||||
// current mask.
|
||||
llvm::Value *oldMask = GetInternalMask();
|
||||
llvm::Value *newMask = BinaryOperator(llvm::Instruction::Or, oldMask,
|
||||
matchesDefault, "old_mask|matches_default");
|
||||
SetInternalMask(newMask);
|
||||
|
||||
if (checkMask)
|
||||
addSwitchMaskCheck(newMask);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::EmitCaseLabel(int value, bool checkMask, SourcePos pos) {
|
||||
if (inSwitchStatement() == false) {
|
||||
Error(pos, "\"case\" label illegal outside of \"switch\" statement.");
|
||||
return;
|
||||
}
|
||||
|
||||
// Find the basic block for this case statement.
|
||||
llvm::BasicBlock *bbCase = NULL;
|
||||
Assert(caseBlocks != NULL);
|
||||
for (int i = 0; i < (int)caseBlocks->size(); ++i)
|
||||
if ((*caseBlocks)[i].first == value) {
|
||||
bbCase = (*caseBlocks)[i].second;
|
||||
break;
|
||||
}
|
||||
Assert(bbCase != NULL);
|
||||
|
||||
if (bblock != NULL)
|
||||
// fall through from the previous case
|
||||
BranchInst(bbCase);
|
||||
SetCurrentBasicBlock(bbCase);
|
||||
|
||||
if (switchConditionWasUniform)
|
||||
return;
|
||||
|
||||
// update the mask: first, get a mask that indicates which program
|
||||
// instances have a value for the switch expression that matches this
|
||||
// case statement.
|
||||
llvm::Value *valueVec = (switchExpr->getType() == LLVMTypes::Int32VectorType) ?
|
||||
LLVMInt32Vector(value) : LLVMInt64Vector(value);
|
||||
llvm::Value *matchesCaseValue =
|
||||
CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, switchExpr,
|
||||
valueVec, "cmp_case_value");
|
||||
matchesCaseValue = I1VecToBoolVec(matchesCaseValue);
|
||||
|
||||
// If a lane was off going into the switch, we don't care if has a
|
||||
// value in the switch expression that happens to match this case.
|
||||
llvm::Value *entryMask = getMaskAtSwitchEntry();
|
||||
matchesCaseValue = BinaryOperator(llvm::Instruction::And, entryMask,
|
||||
matchesCaseValue, "entry_mask&case_match");
|
||||
|
||||
// Take the surviving lanes and turn on the mask for them.
|
||||
llvm::Value *oldMask = GetInternalMask();
|
||||
llvm::Value *newMask = BinaryOperator(llvm::Instruction::Or, oldMask,
|
||||
matchesCaseValue, "mask|case_match");
|
||||
SetInternalMask(newMask);
|
||||
|
||||
if (checkMask)
|
||||
addSwitchMaskCheck(newMask);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::SwitchInst(llvm::Value *expr, llvm::BasicBlock *bbDefault,
|
||||
const std::vector<std::pair<int, llvm::BasicBlock *> > &bbCases,
|
||||
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> &bbNext) {
|
||||
// The calling code should have called StartSwitch() before calling
|
||||
// SwitchInst().
|
||||
Assert(controlFlowInfo.size() &&
|
||||
controlFlowInfo.back()->IsSwitch());
|
||||
|
||||
switchExpr = expr;
|
||||
defaultBlock = bbDefault;
|
||||
caseBlocks = new std::vector<std::pair<int, llvm::BasicBlock *> >(bbCases);
|
||||
nextBlocks = new std::map<llvm::BasicBlock *, llvm::BasicBlock *>(bbNext);
|
||||
switchConditionWasUniform =
|
||||
(llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(expr->getType()) == false);
|
||||
|
||||
if (switchConditionWasUniform == true) {
|
||||
// For a uniform switch condition, just wire things up to the LLVM
|
||||
// switch instruction.
|
||||
llvm::SwitchInst *s = llvm::SwitchInst::Create(expr, bbDefault,
|
||||
bbCases.size(), bblock);
|
||||
for (int i = 0; i < (int)bbCases.size(); ++i) {
|
||||
if (expr->getType() == LLVMTypes::Int32Type)
|
||||
s->addCase(LLVMInt32(bbCases[i].first), bbCases[i].second);
|
||||
else {
|
||||
Assert(expr->getType() == LLVMTypes::Int64Type);
|
||||
s->addCase(LLVMInt64(bbCases[i].first), bbCases[i].second);
|
||||
}
|
||||
}
|
||||
|
||||
AddDebugPos(s);
|
||||
// switch is a terminator
|
||||
bblock = NULL;
|
||||
}
|
||||
else {
|
||||
// For a varying switch, we first turn off all lanes of the mask
|
||||
SetInternalMask(LLVMMaskAllOff);
|
||||
|
||||
if (nextBlocks->size() > 0) {
|
||||
// If there are any labels inside the switch, jump to the first
|
||||
// one; any code before the first label won't be executed by
|
||||
// anyone.
|
||||
std::map<llvm::BasicBlock *, llvm::BasicBlock *>::const_iterator iter;
|
||||
iter = nextBlocks->find(NULL);
|
||||
Assert(iter != nextBlocks->end());
|
||||
llvm::BasicBlock *bbFirst = iter->second;
|
||||
BranchInst(bbFirst);
|
||||
bblock = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
FunctionEmitContext::VaryingCFDepth() const {
|
||||
int sum = 0;
|
||||
for (unsigned int i = 0; i < controlFlowInfo.size(); ++i)
|
||||
if (controlFlowInfo[i]->IsVaryingType())
|
||||
if (controlFlowInfo[i]->IsVarying())
|
||||
++sum;
|
||||
return sum;
|
||||
}
|
||||
@@ -777,6 +1105,41 @@ FunctionEmitContext::InForeachLoop() const {
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
FunctionEmitContext::initLabelBBlocks(ASTNode *node, void *data) {
|
||||
LabeledStmt *ls = dynamic_cast<LabeledStmt *>(node);
|
||||
if (ls == NULL)
|
||||
return true;
|
||||
|
||||
FunctionEmitContext *ctx = (FunctionEmitContext *)data;
|
||||
|
||||
if (ctx->labelMap.find(ls->name) != ctx->labelMap.end())
|
||||
Error(ls->pos, "Multiple labels named \"%s\" in function.",
|
||||
ls->name.c_str());
|
||||
else {
|
||||
llvm::BasicBlock *bb = ctx->CreateBasicBlock(ls->name.c_str());
|
||||
ctx->labelMap[ls->name] = bb;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::InitializeLabelMap(Stmt *code) {
|
||||
labelMap.erase(labelMap.begin(), labelMap.end());
|
||||
WalkAST(code, initLabelBBlocks, NULL, this);
|
||||
}
|
||||
|
||||
|
||||
/** Returns the basic block recorded for the given label name, or NULL if
    no label with that name is in the label map. */
llvm::BasicBlock *
FunctionEmitContext::GetLabeledBasicBlock(const std::string &label) {
    std::map<std::string, llvm::BasicBlock *>::iterator found =
        labelMap.find(label);
    if (found == labelMap.end())
        return NULL;
    return found->second;
}
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::CurrentLanesReturned(Expr *expr, bool doCoherenceCheck) {
|
||||
const Type *returnType = function->GetReturnType();
|
||||
@@ -869,6 +1232,14 @@ FunctionEmitContext::All(llvm::Value *mask) {
|
||||
}
|
||||
|
||||
|
||||
/** Emits a comparison of the lane-mask value of the given mask against
    zero; the result is true when the lane mask equals zero. */
llvm::Value *
FunctionEmitContext::None(llvm::Value *mask) {
    llvm::Value *laneBits = LaneMask(mask);
    llvm::Value *noBits = LLVMInt32(0);
    return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, laneBits,
                   noBits, "none_mm_cmp");
}
|
||||
|
||||
|
||||
llvm::Value *
|
||||
FunctionEmitContext::LaneMask(llvm::Value *v) {
|
||||
// Call the target-dependent movmsk function to turn the vector mask
|
||||
@@ -920,8 +1291,7 @@ FunctionEmitContext::GetStringPtr(const std::string &str) {
|
||||
|
||||
llvm::BasicBlock *
|
||||
FunctionEmitContext::CreateBasicBlock(const char *name) {
|
||||
llvm::Function *function = bblock->getParent();
|
||||
return llvm::BasicBlock::Create(*g->ctx, name, function);
|
||||
return llvm::BasicBlock::Create(*g->ctx, name, llvmFunction);
|
||||
}
|
||||
|
||||
|
||||
@@ -2597,3 +2967,37 @@ FunctionEmitContext::addVaryingOffsetsIfNeeded(llvm::Value *ptr,
|
||||
|
||||
return BinaryOperator(llvm::Instruction::Add, ptr, offset);
|
||||
}
|
||||
|
||||
|
||||
/** Pops the innermost entry from the control flow stack and restores the
    emit-context state that was saved in it.

    Loop, foreach, and switch entries restore the break/continue targets,
    the break/continue lane pointers, and the loop mask.  Switch entries
    additionally restore the enclosing switch's state.  'if' entries have
    nothing to restore.

    @return The popped CFInfo, so that the caller can still consult it
            (e.g. its saved mask).
 */
CFInfo *
FunctionEmitContext::popCFState() {
    Assert(controlFlowInfo.size() > 0);
    CFInfo *ci = controlFlowInfo.back();
    controlFlowInfo.pop_back();

    const bool isSwitch = ci->IsSwitch();
    if (isSwitch || ci->IsLoop() || ci->IsForeach()) {
        breakTarget = ci->savedBreakTarget;
        continueTarget = ci->savedContinueTarget;
        breakLanesPtr = ci->savedBreakLanesPtr;
        continueLanesPtr = ci->savedContinueLanesPtr;
        loopMask = ci->savedLoopMask;

        if (isSwitch) {
            // The case/next-block containers of the switch being popped
            // were heap-allocated by SwitchInst() (or are still NULL from
            // StartSwitch() if it was never called).  Nothing refers to
            // them once the enclosing state is restored, so release them
            // here rather than leaking them.
            delete caseBlocks;
            delete nextBlocks;

            switchExpr = ci->savedSwitchExpr;
            defaultBlock = ci->savedDefaultBlock;
            caseBlocks = ci->savedCaseBlocks;
            nextBlocks = ci->savedNextBlocks;
            switchConditionWasUniform = ci->savedSwitchConditionWasUniform;
        }
    }
    else {
        Assert(ci->IsIf());
        // nothing to do
    }

    return ci;
}
|
||||
|
||||
124
ctx.h
124
ctx.h
@@ -39,6 +39,7 @@
|
||||
#define ISPC_CTX_H 1
|
||||
|
||||
#include "ispc.h"
|
||||
#include <map>
|
||||
#include <llvm/InstrTypes.h>
|
||||
#include <llvm/Instructions.h>
|
||||
#include <llvm/Analysis/DIBuilder.h>
|
||||
@@ -160,10 +161,8 @@ public:
|
||||
void EndLoop();
|
||||
|
||||
/** Indicates that code generation for a 'foreach' or 'foreach_tiled'
|
||||
loop is about to start. The provided basic block pointer indicates
|
||||
where control flow should go if a 'continue' statement is executed
|
||||
in the loop. */
|
||||
void StartForeach(llvm::BasicBlock *continueTarget);
|
||||
loop is about to start. */
|
||||
void StartForeach();
|
||||
void EndForeach();
|
||||
|
||||
/** Emit code for a 'break' statement in a loop. If doCoherenceCheck
|
||||
@@ -186,12 +185,62 @@ public:
|
||||
previous iteration. */
|
||||
void RestoreContinuedLanes();
|
||||
|
||||
/** Indicates that code generation for a "switch" statement is about to
|
||||
start. isUniform indicates whether the "switch" value is uniform,
|
||||
and bbAfterSwitch gives the basic block immediately following the
|
||||
"switch" statement. (For example, if the switch condition is
|
||||
uniform, we jump here upon executing a "break" statement.) */
|
||||
void StartSwitch(bool isUniform, llvm::BasicBlock *bbAfterSwitch);
|
||||
/** Indicates the end of code generation for a "switch" statement. */
|
||||
void EndSwitch();
|
||||
|
||||
/** Emits code for a "switch" statement in the program.
|
||||
@param expr Gives the value of the expression after the "switch"
|
||||
@param defaultBlock Basic block to execute for the "default" case. This
|
||||
should be NULL if there is no "default" label inside
|
||||
the switch.
|
||||
@param caseBlocks vector that stores the mapping from label values
|
||||
after "case" statements to basic blocks corresponding
|
||||
to the "case" labels.
|
||||
@param nextBlocks For each basic block for a "case" or "default"
|
||||
label, this gives the basic block for the
|
||||
immediately-following "case" or "default" label (or
|
||||
the basic block after the "switch" statement for the
|
||||
last label.)
|
||||
*/
|
||||
void SwitchInst(llvm::Value *expr, llvm::BasicBlock *defaultBlock,
|
||||
const std::vector<std::pair<int, llvm::BasicBlock *> > &caseBlocks,
|
||||
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> &nextBlocks);
|
||||
|
||||
/** Generates code for a "default" label after a "switch" statement.
|
||||
The checkMask parameter indicates whether additional code should be
|
||||
generated to check to see if the execution mask is all off after
|
||||
the default label (in which case a jump to the following label will
|
||||
be issued.) */
|
||||
void EmitDefaultLabel(bool checkMask, SourcePos pos);
|
||||
|
||||
/** Generates code for a "case" label after a "switch" statement. See
|
||||
the documentation for EmitDefaultLabel() for discussion of the
|
||||
checkMask parameter. */
|
||||
void EmitCaseLabel(int value, bool checkMask, SourcePos pos);
|
||||
|
||||
/** Returns the current number of nested levels of 'varying' control
|
||||
flow */
|
||||
int VaryingCFDepth() const;
|
||||
|
||||
bool InForeachLoop() const;
|
||||
|
||||
void SetContinueTarget(llvm::BasicBlock *bb) { continueTarget = bb; }
|
||||
|
||||
/** Step through the code and find label statements; create a basic
|
||||
block for each one, so that subsequent calls to
|
||||
GetLabeledBasicBlock() return the corresponding basic block. */
|
||||
void InitializeLabelMap(Stmt *code);
|
||||
|
||||
/** If there is a label in the function with the given name, return the
|
||||
new basic block that it starts. */
|
||||
llvm::BasicBlock *GetLabeledBasicBlock(const std::string &label);
|
||||
|
||||
/** Called to generate code for 'return' statement; value is the
|
||||
expression in the return statement (if non-NULL), and
|
||||
doCoherenceCheck indicates whether instructions should be generated
|
||||
@@ -211,6 +260,10 @@ public:
|
||||
i1 value that indicates if all of the mask lanes are on. */
|
||||
llvm::Value *All(llvm::Value *mask);
|
||||
|
||||
/** Given a boolean mask value of type LLVMTypes::MaskType, return an
|
||||
i1 value that indicates if all of the mask lanes are off. */
|
||||
llvm::Value *None(llvm::Value *mask);
|
||||
|
||||
/** Given a boolean mask value of type LLVMTypes::MaskType, return an
|
||||
i32 value wherein the i'th bit is on if and only if the i'th lane
|
||||
of the mask is on. */
|
||||
@@ -446,6 +499,9 @@ private:
|
||||
/** Pointer to the Function for which we're currently generating code. */
|
||||
Function *function;
|
||||
|
||||
/** LLVM function representation for the current function. */
|
||||
llvm::Function *llvmFunction;
|
||||
|
||||
/** The basic block into which we add any alloca instructions that need
|
||||
to go at the very start of the function. */
|
||||
llvm::BasicBlock *allocaBlock;
|
||||
@@ -479,10 +535,10 @@ private:
|
||||
the loop. */
|
||||
llvm::Value *loopMask;
|
||||
|
||||
/** If currently in a loop body, this is a pointer to memory to store a
|
||||
mask value that represents which of the lanes have executed a
|
||||
'break' statement. If we're not in a loop body, this should be
|
||||
NULL. */
|
||||
/** If currently in a loop body or switch statement, this is a pointer
|
||||
to memory to store a mask value that represents which of the lanes
|
||||
have executed a 'break' statement. If we're not in a loop body or
|
||||
switch, this should be NULL. */
|
||||
llvm::Value *breakLanesPtr;
|
||||
|
||||
/** Similar to breakLanesPtr, if we're inside a loop, this is a pointer
|
||||
@@ -490,16 +546,49 @@ private:
|
||||
'continue' statement. */
|
||||
llvm::Value *continueLanesPtr;
|
||||
|
||||
/** If we're inside a loop, this gives the basic block immediately
|
||||
after the current loop, which we will jump to if all of the lanes
|
||||
have executed a break statement or are otherwise done with the
|
||||
loop. */
|
||||
/** If we're inside a loop or switch statement, this gives the basic
|
||||
block immediately after the current loop or switch, which we will
|
||||
jump to if all of the lanes have executed a break statement or are
|
||||
otherwise done with it. */
|
||||
llvm::BasicBlock *breakTarget;
|
||||
|
||||
/** If we're inside a loop, this gives the block to jump to if all of
|
||||
the running lanes have executed a 'continue' statement. */
|
||||
llvm::BasicBlock *continueTarget;
|
||||
|
||||
/** @name Switch statement state
|
||||
|
||||
These variables store various state that's active when we're
|
||||
generating code for a switch statement. They should all be NULL
|
||||
outside of a switch.
|
||||
@{
|
||||
*/
|
||||
|
||||
/** The value of the expression used to determine which case in the
|
||||
statements after the switch to execute. */
|
||||
llvm::Value *switchExpr;
|
||||
|
||||
/** Map from case label numbers to the basic block that will hold code
|
||||
for that case. */
|
||||
const std::vector<std::pair<int, llvm::BasicBlock *> > *caseBlocks;
|
||||
|
||||
/** The basic block of code to run for the "default" label in the
|
||||
switch statement. */
|
||||
llvm::BasicBlock *defaultBlock;
|
||||
|
||||
/** For each basic block for the code for cases (and the default label,
|
||||
if present), this map gives the basic block for the immediately
|
||||
following case/default label. */
|
||||
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *nextBlocks;
|
||||
|
||||
/** Records whether the switch condition was uniform; this is a
|
||||
distinct notion from whether the switch represents uniform or
|
||||
varying control flow; we may have varying control flow from a
|
||||
uniform switch condition if there is a 'break' inside the switch
|
||||
that's under varying control flow. */
|
||||
bool switchConditionWasUniform;
|
||||
/** @} */
|
||||
|
||||
/** A pointer to memory that records which of the program instances
|
||||
have executed a 'return' statement (and are thus really truly done
|
||||
running any more instructions in this function.
|
||||
@@ -537,9 +626,13 @@ private:
|
||||
tasks launched from the current function. */
|
||||
llvm::Value *launchGroupHandlePtr;
|
||||
|
||||
std::map<std::string, llvm::BasicBlock *> labelMap;
|
||||
|
||||
static bool initLabelBBlocks(ASTNode *node, void *data);
|
||||
|
||||
llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
|
||||
static void addGSMetadata(llvm::Value *inst, SourcePos pos);
|
||||
bool ifsInLoopAllUniform() const;
|
||||
bool ifsInCFAllUniform(int cfType) const;
|
||||
void jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target);
|
||||
llvm::Value *emitGatherCallback(llvm::Value *lvalue, llvm::Value *retPtr);
|
||||
|
||||
@@ -547,6 +640,11 @@ private:
|
||||
const Type *ptrType);
|
||||
|
||||
void restoreMaskGivenReturns(llvm::Value *oldMask);
|
||||
void addSwitchMaskCheck(llvm::Value *mask);
|
||||
bool inSwitchStatement() const;
|
||||
llvm::Value *getMaskAtSwitchEntry();
|
||||
|
||||
CFInfo *popCFState();
|
||||
|
||||
void scatter(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
|
||||
llvm::Value *mask);
|
||||
|
||||
208
decl.cpp
208
decl.cpp
@@ -46,6 +46,18 @@
|
||||
#include <stdio.h>
|
||||
#include <set>
|
||||
|
||||
static void
|
||||
lPrintTypeQualifiers(int typeQualifiers) {
|
||||
if (typeQualifiers & TYPEQUAL_INLINE) printf("inline ");
|
||||
if (typeQualifiers & TYPEQUAL_CONST) printf("const ");
|
||||
if (typeQualifiers & TYPEQUAL_UNIFORM) printf("uniform ");
|
||||
if (typeQualifiers & TYPEQUAL_VARYING) printf("varying ");
|
||||
if (typeQualifiers & TYPEQUAL_TASK) printf("task ");
|
||||
if (typeQualifiers & TYPEQUAL_SIGNED) printf("signed ");
|
||||
if (typeQualifiers & TYPEQUAL_UNSIGNED) printf("unsigned ");
|
||||
}
|
||||
|
||||
|
||||
/** Given a Type and a set of type qualifiers, apply the type qualifiers to
|
||||
the type, returning the type that is the result.
|
||||
*/
|
||||
@@ -54,6 +66,16 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
|
||||
if (type == NULL)
|
||||
return NULL;
|
||||
|
||||
if ((typeQualifiers & TYPEQUAL_CONST) != 0)
|
||||
type = type->GetAsConstType();
|
||||
|
||||
if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0)
|
||||
type = type->GetAsUniformType();
|
||||
else if ((typeQualifiers & TYPEQUAL_VARYING) != 0)
|
||||
type = type->GetAsVaryingType();
|
||||
else
|
||||
type = type->GetAsUnboundVariabilityType();
|
||||
|
||||
if ((typeQualifiers & TYPEQUAL_UNSIGNED) != 0) {
|
||||
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0)
|
||||
Error(pos, "Illegal to apply both \"signed\" and \"unsigned\" "
|
||||
@@ -64,29 +86,13 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
|
||||
type = unsignedType;
|
||||
else
|
||||
Error(pos, "\"unsigned\" qualifier is illegal with \"%s\" type.",
|
||||
type->GetString().c_str());
|
||||
|
||||
type->ResolveUnboundVariability(Type::Varying)->GetString().c_str());
|
||||
}
|
||||
|
||||
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0 && type->IsIntType() == false)
|
||||
Error(pos, "\"signed\" qualifier is illegal with non-integer type "
|
||||
"\"%s\".", type->GetString().c_str());
|
||||
|
||||
if ((typeQualifiers & TYPEQUAL_CONST) != 0)
|
||||
type = type->GetAsConstType();
|
||||
|
||||
if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0)
|
||||
type = type->GetAsUniformType();
|
||||
else if ((typeQualifiers & TYPEQUAL_VARYING) != 0)
|
||||
type = type->GetAsVaryingType();
|
||||
else {
|
||||
// otherwise, structs are uniform by default and everything
|
||||
// else is varying by default
|
||||
if (dynamic_cast<const StructType *>(type->GetBaseType()) != NULL)
|
||||
type = type->GetAsUniformType();
|
||||
else
|
||||
type = type->GetAsVaryingType();
|
||||
}
|
||||
"\"%s\".",
|
||||
type->ResolveUnboundVariability(Type::Varying)->GetString().c_str());
|
||||
|
||||
return type;
|
||||
}
|
||||
@@ -138,21 +144,14 @@ lGetStorageClassName(StorageClass storageClass) {
|
||||
|
||||
void
|
||||
DeclSpecs::Print() const {
|
||||
printf("%s ", lGetStorageClassName(storageClass));
|
||||
printf("Declspecs: [%s ", lGetStorageClassName(storageClass));
|
||||
|
||||
if (soaWidth > 0) printf("soa<%d> ", soaWidth);
|
||||
|
||||
if (typeQualifiers & TYPEQUAL_INLINE) printf("inline ");
|
||||
if (typeQualifiers & TYPEQUAL_CONST) printf("const ");
|
||||
if (typeQualifiers & TYPEQUAL_UNIFORM) printf("uniform ");
|
||||
if (typeQualifiers & TYPEQUAL_VARYING) printf("varying ");
|
||||
if (typeQualifiers & TYPEQUAL_TASK) printf("task ");
|
||||
if (typeQualifiers & TYPEQUAL_SIGNED) printf("signed ");
|
||||
if (typeQualifiers & TYPEQUAL_UNSIGNED) printf("unsigned ");
|
||||
|
||||
printf("%s", baseType->GetString().c_str());
|
||||
lPrintTypeQualifiers(typeQualifiers);
|
||||
printf("base type: %s", baseType->GetString().c_str());
|
||||
|
||||
if (vectorSize > 0) printf("<%d>", vectorSize);
|
||||
printf("]");
|
||||
}
|
||||
|
||||
|
||||
@@ -192,19 +191,46 @@ Declarator::GetSymbol() const {
|
||||
|
||||
|
||||
void
|
||||
Declarator::Print() const {
|
||||
Declarator::Print(int indent) const {
|
||||
printf("%*cdeclarator: [", indent, ' ');
|
||||
pos.Print();
|
||||
|
||||
lPrintTypeQualifiers(typeQualifiers);
|
||||
Symbol *sym = GetSymbol();
|
||||
if (sym != NULL)
|
||||
printf("%s", sym->name.c_str());
|
||||
else
|
||||
printf("(null symbol)");
|
||||
|
||||
printf(", array size = %d", arraySize);
|
||||
|
||||
printf(", kind = ");
|
||||
switch (kind) {
|
||||
case DK_BASE: printf("base"); break;
|
||||
case DK_POINTER: printf("pointer"); break;
|
||||
case DK_REFERENCE: printf("reference"); break;
|
||||
case DK_ARRAY: printf("array"); break;
|
||||
case DK_FUNCTION: printf("function"); break;
|
||||
default: FATAL("Unhandled declarator kind");
|
||||
}
|
||||
|
||||
if (initExpr != NULL) {
|
||||
printf(" = (");
|
||||
initExpr->Print();
|
||||
printf(")");
|
||||
}
|
||||
pos.Print();
|
||||
|
||||
if (functionParams.size() > 0) {
|
||||
for (unsigned int i = 0; i < functionParams.size(); ++i) {
|
||||
printf("\n%*cfunc param %d:\n", indent, ' ', i);
|
||||
functionParams[i]->Print(indent+4);
|
||||
}
|
||||
}
|
||||
|
||||
if (child != NULL)
|
||||
child->Print(indent + 4);
|
||||
|
||||
printf("]\n");
|
||||
}
|
||||
|
||||
|
||||
@@ -235,11 +261,13 @@ Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
|
||||
Assert(d != NULL);
|
||||
|
||||
for (unsigned int i = 0; i < d->functionParams.size(); ++i) {
|
||||
Declaration *pdecl = d->functionParams[i];
|
||||
Assert(pdecl->declarators.size() == 1);
|
||||
funArgs->push_back(pdecl->declarators[0]->GetSymbol());
|
||||
Symbol *sym = d->GetSymbolForFunctionParameter(i);
|
||||
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
|
||||
funArgs->push_back(sym);
|
||||
}
|
||||
|
||||
funSym->type = funSym->type->ResolveUnboundVariability(Type::Varying);
|
||||
|
||||
return funSym;
|
||||
}
|
||||
|
||||
@@ -258,6 +286,12 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
|
||||
if (kind != DK_FUNCTION && isTask)
|
||||
Error(pos, "\"task\" qualifier illegal in variable declaration.");
|
||||
|
||||
Type::Variability variability = Type::Unbound;
|
||||
if (hasUniformQual)
|
||||
variability = Type::Uniform;
|
||||
else if (hasVaryingQual)
|
||||
variability = Type::Varying;
|
||||
|
||||
const Type *type = base;
|
||||
switch (kind) {
|
||||
case DK_BASE:
|
||||
@@ -268,7 +302,7 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
|
||||
return type;
|
||||
|
||||
case DK_POINTER:
|
||||
type = new PointerType(type, hasUniformQual, isConst);
|
||||
type = new PointerType(type, variability, isConst);
|
||||
if (child != NULL)
|
||||
return child->GetType(type, ds);
|
||||
else
|
||||
@@ -316,25 +350,7 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
|
||||
for (unsigned int i = 0; i < functionParams.size(); ++i) {
|
||||
Declaration *d = functionParams[i];
|
||||
|
||||
char buf[32];
|
||||
Symbol *sym;
|
||||
if (d->declarators.size() == 0) {
|
||||
// function declaration like foo(float), w/o a name for
|
||||
// the parameter
|
||||
sprintf(buf, "__anon_parameter_%d", i);
|
||||
sym = new Symbol(buf, pos);
|
||||
sym->type = d->declSpecs->GetBaseType(pos);
|
||||
}
|
||||
else {
|
||||
sym = d->declarators[0]->GetSymbol();
|
||||
if (sym == NULL) {
|
||||
// Handle more complex anonymous declarations like
|
||||
// float (float **).
|
||||
sprintf(buf, "__anon_parameter_%d", i);
|
||||
sym = new Symbol(buf, d->declarators[0]->pos);
|
||||
sym->type = d->declarators[0]->GetType(d->declSpecs);
|
||||
}
|
||||
}
|
||||
Symbol *sym = GetSymbolForFunctionParameter(i);
|
||||
|
||||
if (d->declSpecs->storageClass != SC_NONE)
|
||||
Error(sym->pos, "Storage class \"%s\" is illegal in "
|
||||
@@ -397,7 +413,7 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
|
||||
Error(pos, "No return type provided in function declaration.");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
bool isExported = ds && (ds->storageClass == SC_EXPORT);
|
||||
bool isExternC = ds && (ds->storageClass == SC_EXTERN_C);
|
||||
bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
|
||||
@@ -418,9 +434,10 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
Type *functionType =
|
||||
new FunctionType(returnType, args, pos, argNames, argDefaults,
|
||||
const Type *functionType =
|
||||
new FunctionType(returnType, args, argNames, argDefaults,
|
||||
argPos, isTask, isExported, isExternC);
|
||||
functionType = functionType->ResolveUnboundVariability(Type::Varying);
|
||||
return child->GetType(functionType, ds);
|
||||
}
|
||||
default:
|
||||
@@ -461,6 +478,35 @@ Declarator::GetType(DeclSpecs *ds) const {
|
||||
}
|
||||
|
||||
|
||||
/** Returns the symbol for the paramNum'th parameter of this function
    declarator.

    If the parameter's declaration has a declarator with a symbol, that
    symbol is returned.  Otherwise the parameter is anonymous and a new
    symbol named "__anon_parameter_<paramNum>" is created, with its type
    taken from the parameter's declaration.

    @param paramNum  Zero-based index into functionParams; must be in
                     range.
 */
Symbol *
Declarator::GetSymbolForFunctionParameter(int paramNum) const {
    // Check both bounds: a negative index would otherwise be converted to
    // a huge unsigned value by the vector subscript below.
    Assert(paramNum >= 0 && paramNum < (int)functionParams.size());
    Declaration *d = functionParams[paramNum];

    const bool hasDeclarator = (d->declarators.size() != 0);
    if (hasDeclarator) {
        Assert(d->declarators.size() == 1);
        Symbol *named = d->declarators[0]->GetSymbol();
        if (named != NULL)
            return named;
    }

    // Anonymous parameter.  The prefix is 17 characters and an int prints
    // in at most 11, so the name always fits in the buffer.
    char buf[32];
    sprintf(buf, "__anon_parameter_%d", paramNum);

    Symbol *sym;
    if (!hasDeclarator) {
        // function declaration like foo(float), w/o a name for
        // the parameter
        sym = new Symbol(buf, pos);
        sym->type = d->declSpecs->GetBaseType(pos);
    }
    else {
        // Handle more complex anonymous declarations like
        // float (float **).
        sym = new Symbol(buf, d->declarators[0]->pos);
        sym->type = d->declarators[0]->GetType(d->declSpecs);
    }
    return sym;
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Declaration
|
||||
|
||||
@@ -489,19 +535,15 @@ Declaration::GetVariableDeclarations() const {
|
||||
std::vector<VariableDeclaration> vars;
|
||||
|
||||
for (unsigned int i = 0; i < declarators.size(); ++i) {
|
||||
if (declarators[i] == NULL)
|
||||
continue;
|
||||
Declarator *decl = declarators[i];
|
||||
if (decl == NULL)
|
||||
// Ignore earlier errors
|
||||
continue;
|
||||
|
||||
Symbol *sym = decl->GetSymbol();
|
||||
if (dynamic_cast<const FunctionType *>(sym->type) != NULL) {
|
||||
// function declaration
|
||||
m->symbolTable->AddFunction(sym);
|
||||
}
|
||||
else {
|
||||
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
|
||||
|
||||
if (dynamic_cast<const FunctionType *>(sym->type) == NULL) {
|
||||
m->symbolTable->AddVariable(sym);
|
||||
vars.push_back(VariableDeclaration(sym, decl->initExpr));
|
||||
}
|
||||
@@ -511,16 +553,36 @@ Declaration::GetVariableDeclarations() const {
|
||||
|
||||
|
||||
void
|
||||
Declaration::Print() const {
|
||||
printf("Declaration: specs [");
|
||||
declSpecs->Print();
|
||||
printf("], declarators [");
|
||||
for (unsigned int i = 0 ; i < declarators.size(); ++i) {
|
||||
declarators[i]->Print();
|
||||
printf("%s", (i == declarators.size() - 1) ? "]" : ", ");
|
||||
Declaration::DeclareFunctions() {
// Walks the declarators of this declaration and hands every one whose
// type is a FunctionType to m->AddFunctionDeclaration(); declarators of
// any other type are skipped.  Typedef declarations must not reach this
// method (asserted below).
|
||||
Assert(declSpecs->storageClass != SC_TYPEDEF);
|
||||
|
||||
for (unsigned int i = 0; i < declarators.size(); ++i) {
|
||||
Declarator *decl = declarators[i];
|
||||
if (decl == NULL)
|
||||
// Ignore earlier errors
|
||||
continue;
|
||||
|
||||
// NOTE(review): sym is dereferenced without a NULL check; confirm that
// GetSymbol() cannot return NULL for declarators that reach this point.
Symbol *sym = decl->GetSymbol();
|
||||
// Any still-unbound variability is resolved to varying before the type
// is tested.
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
|
||||
|
||||
// Not a function: nothing to declare here.
if (dynamic_cast<const FunctionType *>(sym->type) == NULL)
|
||||
continue;
|
||||
|
||||
// The inline flag is taken from the declaration-wide type qualifiers.
bool isInline = (declSpecs->typeQualifiers & TYPEQUAL_INLINE);
|
||||
m->AddFunctionDeclaration(sym, isInline);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** Prints the declaration to stdout: a header line with the declaration
    specifiers, followed by each declarator printed at indent+4. */
void
|
||||
Declaration::Print(int indent) const {
|
||||
// "%*c" pads the single ' ' to a field of width indent, so at least one
// space is always emitted, even for indent == 0.
printf("%*cDeclaration: specs [", indent, ' ');
|
||||
declSpecs->Print();
|
||||
printf("], declarators:\n");
|
||||
// NOTE(review): NULL entries are not skipped here, unlike the loop in
// DeclareFunctions(); confirm Print() is never reached with NULL
// declarators.
for (unsigned int i = 0 ; i < declarators.size(); ++i)
|
||||
declarators[i]->Print(indent+4);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void
|
||||
@@ -539,7 +601,7 @@ GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
|
||||
DeclSpecs ds(type);
|
||||
if (type->IsUniformType())
|
||||
ds.typeQualifiers |= TYPEQUAL_UNIFORM;
|
||||
else
|
||||
else if (type->IsVaryingType())
|
||||
ds.typeQualifiers |= TYPEQUAL_VARYING;
|
||||
|
||||
for (unsigned int j = 0; j < sd[i]->declarators->size(); ++j) {
|
||||
|
||||
10
decl.h
10
decl.h
@@ -153,10 +153,12 @@ public:
|
||||
declarator and symbols for its arguments in *args. */
|
||||
Symbol *GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *args);
|
||||
|
||||
Symbol *GetSymbolForFunctionParameter(int paramNum) const;
|
||||
|
||||
/** Returns the symbol associated with the declarator. */
|
||||
Symbol *GetSymbol() const;
|
||||
|
||||
void Print() const;
|
||||
void Print(int indent) const;
|
||||
|
||||
/** Position of the declarator in the source program. */
|
||||
const SourcePos pos;
|
||||
@@ -199,7 +201,7 @@ public:
|
||||
Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist = NULL);
|
||||
Declaration(DeclSpecs *ds, Declarator *d);
|
||||
|
||||
void Print() const;
|
||||
void Print(int indent) const;
|
||||
|
||||
/** This method walks through all of the Declarators in a declaration
|
||||
and returns a fully-initialized Symbol and (possibly) and
|
||||
@@ -208,6 +210,10 @@ public:
|
||||
Declarator representation.) */
|
||||
std::vector<VariableDeclaration> GetVariableDeclarations() const;
|
||||
|
||||
/** For any function declarations in the Declaration, add the
|
||||
declaration to the module. */
|
||||
void DeclareFunctions();
|
||||
|
||||
DeclSpecs *declSpecs;
|
||||
std::vector<Declarator *> declarators;
|
||||
};
|
||||
|
||||
@@ -1,3 +1,43 @@
|
||||
=== v1.1.3 === (20 January 2012)
|
||||
|
||||
With this release, the language now supports "switch" statements, with the
|
||||
same semantics and syntax as in C.
|
||||
|
||||
This release includes fixes for two important performance related issues:
|
||||
the quality of code generated for "foreach" statements has been
|
||||
substantially improved (https://github.com/ispc/ispc/issues/151), and a
|
||||
performance regression with code for "gathers" that was introduced in
|
||||
v1.1.2 has been fixed in this release.
|
||||
|
||||
A number of other small bugs were fixed in this release as well, including
|
||||
one where invalid memory would sometimes be incorrectly accessed
|
||||
(https://github.com/ispc/ispc/issues/160).
|
||||
|
||||
Thanks to Jean-Luc Duprat for a number of patches that improve support for
|
||||
building on various platforms, and to Pierre-Antoine Lacaze for patches so
|
||||
that ispc builds under MinGW.
|
||||
|
||||
=== v1.1.2 === (9 January 2012)
|
||||
|
||||
The major new feature in this release is support for "generic" C++
|
||||
vectorized output; in other words, ispc can emit C++ code that corresponds
|
||||
to the vectorized computation that the ispc program represents. See the
|
||||
examples/intrinsics directory in the ispc distribution for two example
|
||||
implementations of the set of functions that must be provided to map the
|
||||
vector calls generated by ispc to target specific functions.
|
||||
|
||||
ispc now has partial support for 'goto' statements; specifically, goto is
|
||||
allowed if any enclosing control flow statements (if/for/while/do) have
|
||||
'uniform' test expressions, but not if they have 'varying' tests.
|
||||
|
||||
A number of improvements have been made to the code generated for gathers
|
||||
and scatters--one of them (better matching x86's "free" scale by 2/4/8 for
|
||||
addressing calculations) improved the performance of the noise example by
|
||||
14%.
|
||||
|
||||
Many small bugs have been fixed in this release as well, including issue
|
||||
numbers 138, 129, 135, 127, 149, and 142.
|
||||
|
||||
=== v1.1.1 === (15 December 2011)
|
||||
|
||||
This release doesn't include any significant new functionality, but does
|
||||
|
||||
@@ -2,11 +2,11 @@
|
||||
|
||||
for i in ispc perfguide faq; do
|
||||
rst2html.py --template=template.txt --link-stylesheet \
|
||||
--stylesheet-path=css/style.css $i.txt > $i.html
|
||||
--stylesheet-path=css/style.css $i.rst > $i.html
|
||||
done
|
||||
|
||||
rst2html.py --template=template-perf.txt --link-stylesheet \
|
||||
--stylesheet-path=css/style.css perf.txt > perf.html
|
||||
--stylesheet-path=css/style.css perf.rst > perf.html
|
||||
|
||||
#rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex
|
||||
#pdflatex ispc.tex
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
=============================================================
|
||||
Intel® SPMD Program Compiler Frequently Asked Questions (FAQ)
|
||||
=============================================================
|
||||
=====================================
|
||||
Frequently Asked Questions About ispc
|
||||
=====================================
|
||||
|
||||
This document includes a number of frequently (and not frequently) asked
|
||||
questions about ispc, the Intel® SPMD Program Compiler. The source to this
|
||||
document is in the file ``docs/faq.txt`` in the ``ispc`` source
|
||||
document is in the file ``docs/faq.rst`` in the ``ispc`` source
|
||||
distribution.
|
||||
|
||||
* Understanding ispc's Output
|
||||
@@ -99,7 +99,9 @@ Contents:
|
||||
+ `Control Flow`_
|
||||
|
||||
* `Conditional Statements: "if"`_
|
||||
* `Conditional Statements: "switch"`_
|
||||
* `Basic Iteration Statements: "for", "while", and "do"`_
|
||||
* `Unstructured Control Flow: "goto"`_
|
||||
* `"Coherent" Control Flow Statements: "cif" and Friends`_
|
||||
* `Parallel Iteration Statements: "foreach" and "foreach_tiled"`_
|
||||
* `Parallel Iteration with "programIndex" and "programCount"`_
|
||||
@@ -1140,7 +1142,7 @@ in C:
|
||||
|
||||
* Expression syntax and basic types
|
||||
* Syntax for variable declarations
|
||||
* Control flow structures: if, for, while, do
|
||||
* Control flow structures: ``if``, ``for``, ``while``, ``do``, and ``switch``.
|
||||
* Pointers, including function pointers, ``void *``, and C's array/pointer
|
||||
duality (arrays are converted to pointers when passed to functions, etc.)
|
||||
* Structs and arrays
|
||||
@@ -1184,7 +1186,7 @@ but are likely to be supported in future releases:
|
||||
``int64`` types
|
||||
* Character constants
|
||||
* String constants and arrays of characters as strings
|
||||
* ``switch`` and ``goto`` statements
|
||||
* ``goto`` statements are partially supported (see `Unstructured Control Flow: "goto"`_)
|
||||
* ``union`` types
|
||||
* Bitfield members of ``struct`` types
|
||||
* Variable numbers of arguments to functions
|
||||
@@ -1245,6 +1247,18 @@ Here are three ways of specifying the integer value "15":
|
||||
int fifteen_hex = 0xf;
|
||||
int fifteen_binary = 0b1111;
|
||||
|
||||
A number of suffixes can be provided with integer numeric constants.
|
||||
First, "u" denotes that the constant is unsigned, and "ll" denotes a 64-bit
|
||||
integer constant (while "l" denotes a 32-bit integer constant). It is also
|
||||
possible to denote units of 1024, 1024*1024, or 1024*1024*1024 with the
|
||||
SI-inspired suffixes "k", "M", and "G" respectively:
|
||||
|
||||
::
|
||||
|
||||
int two_kb = 2k; // 2048
|
||||
int two_megs = 2M; // 2 * 1024 * 1024
|
||||
int one_gig = 1G; // 1024 * 1024 * 1024
|
||||
|
||||
Floating-point constants can be specified in one of three ways. First,
|
||||
they may be a sequence of zero or more digits from 0 to 9, followed by a
|
||||
period, followed by zero or more digits from 0 to 9. (There must be at
|
||||
@@ -1980,6 +1994,31 @@ executes if the condition is false.
|
||||
else
|
||||
x *= 2.;
|
||||
|
||||
Conditional Statements: "switch"
|
||||
--------------------------------
|
||||
|
||||
The ``switch`` conditional statement is also available, again with the same
|
||||
behavior as in C; the expression used in the ``switch`` must be of integer
|
||||
type (but it can be uniform or varying). As in C, if there is no ``break``
|
||||
statement at the end of the code for a given case, execution "falls
|
||||
through" to the following case. These features are demonstrated in the
|
||||
code below.
|
||||
|
||||
::
|
||||
|
||||
int x = ...;
|
||||
switch (x) {
|
||||
case 0:
|
||||
case 1:
|
||||
foo(x);
|
||||
/* fall through */
|
||||
case 5:
|
||||
x = 0;
|
||||
break;
|
||||
default:
|
||||
x *= x;
|
||||
}
|
||||
|
||||
Basic Iteration Statements: "for", "while", and "do"
|
||||
----------------------------------------------------
|
||||
|
||||
@@ -2005,6 +2044,37 @@ one of them executes a ``continue`` statement, other program instances
|
||||
executing code in the loop body that didn't execute the ``continue`` will
|
||||
be unaffected by it.
|
||||
|
||||
Unstructured Control Flow: "goto"
|
||||
---------------------------------
|
||||
|
||||
``goto`` statements are allowed in ``ispc`` programs under limited
|
||||
circumstances; specifically, only when the compiler can determine that if
|
||||
any program instance executes a ``goto`` statement, then all of the program
|
||||
instances will be running at that statement, such that all will follow the
|
||||
``goto``.
|
||||
|
||||
Put another way: it's illegal for there to be "varying" control flow
|
||||
statements in scopes that enclose a ``goto`` statement. An error is issued
|
||||
if a ``goto`` is used in this situation.
|
||||
|
||||
The syntax for adding labels to ``ispc`` programs and jumping to them with
|
||||
``goto`` is the same as in C. The following code shows a ``goto`` based
|
||||
equivalent of a ``for`` loop where the induction variable ``i`` goes from
|
||||
zero to ten.
|
||||
|
||||
::
|
||||
|
||||
uniform int i = 0;
|
||||
check:
|
||||
if (i > 10)
|
||||
goto done;
|
||||
// loop body
|
||||
++i;
|
||||
goto check;
|
||||
done:
|
||||
// ...
|
||||
|
||||
|
||||
"Coherent" Control Flow Statements: "cif" and Friends
|
||||
-----------------------------------------------------
|
||||
|
||||
@@ -3374,12 +3444,27 @@ pointer types.
|
||||
System Information
|
||||
------------------
|
||||
|
||||
A routine is available to find the number of CPU cores available in the
|
||||
system:
|
||||
The value of a high-precision hardware clock counter is returned by the
|
||||
``clock()`` routine; its value increments by one each processor cycle.
|
||||
Thus, taking the difference between the values returned by ``clock()`` at
|
||||
different points in program execution gives the number of cycles between
|
||||
those points in the program.
|
||||
|
||||
::
|
||||
|
||||
int num_cores()
|
||||
uniform int64 clock()
|
||||
|
||||
Note that ``clock()`` flushes the processor pipeline. It has an overhead
|
||||
of a hundred or so cycles, so for very fine-grained measurements, it may be
|
||||
worthwhile to measure the cost of calling ``clock()`` and subtracting that
|
||||
value from reported results.
|
||||
|
||||
A routine is also available to find the number of CPU cores available in
|
||||
the system:
|
||||
|
||||
::
|
||||
|
||||
uniform int num_cores()
|
||||
|
||||
This value can be useful for adapting the granularity of parallel task
|
||||
decomposition depending on the number of processors in the system.
|
||||
@@ -45,8 +45,7 @@
|
||||
developers mailing list</a></li>
|
||||
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
|
||||
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
|
||||
<li><a href="doxygen/index.html">Doxygen documentation of
|
||||
<tt>ispc</tt> source code</a></li>
|
||||
<li><a href="doxygen/index.html">Doxygen</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -45,8 +45,7 @@
|
||||
developers mailing list</a></li>
|
||||
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
|
||||
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
|
||||
<li><a href="doxygen/index.html">Doxygen documentation of
|
||||
<tt>ispc</tt> source code</a></li>
|
||||
<li><a href="doxygen/index.html">Doxygen</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler"
|
||||
# This could be handy for archiving the generated documentation or
|
||||
# if some version control system is used.
|
||||
|
||||
PROJECT_NUMBER = 1.1.1
|
||||
PROJECT_NUMBER = 1.1.3
|
||||
|
||||
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
|
||||
# base path where the generated documentation will be put.
|
||||
|
||||
@@ -82,7 +82,7 @@ static inline void vnormalize(vec &v) {
|
||||
}
|
||||
|
||||
|
||||
static inline void
|
||||
static void
|
||||
ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
|
||||
float d = -dot(plane.p, plane.n);
|
||||
float v = dot(ray.dir, plane.n);
|
||||
@@ -124,7 +124,7 @@ ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
|
||||
}
|
||||
|
||||
|
||||
static inline void
|
||||
static void
|
||||
orthoBasis(vec basis[3], vec n) {
|
||||
basis[2] = n;
|
||||
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
|
||||
@@ -147,7 +147,7 @@ orthoBasis(vec basis[3], vec n) {
|
||||
}
|
||||
|
||||
|
||||
static inline float
|
||||
static float
|
||||
ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
|
||||
RNGState &rngstate) {
|
||||
float eps = 0.0001f;
|
||||
|
||||
@@ -14,13 +14,13 @@ dirs:
|
||||
clean:
|
||||
/bin/rm -rf objs *~ ao
|
||||
|
||||
ao: dirs objs/ao.o objs/instrument.o objs/ao_ispc.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/instrument.o -lm -lpthread
|
||||
ao: objs/ao.o objs/instrument.o objs/ao_ispc.o ../tasksys.cpp
|
||||
$(CXX) $(CXXFLAGS) -o $@ $^ -lm -lpthread
|
||||
|
||||
objs/%.o: %.cpp
|
||||
objs/%.o: %.cpp dirs
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/ao.o: objs/ao_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc dirs
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_instrumented_ispc.h
|
||||
|
||||
@@ -251,6 +251,14 @@ static FORCEINLINE TYPE __select(bool cond, TYPE a, TYPE b) { \
|
||||
return cond ? a : b; \
|
||||
}
|
||||
|
||||
// Defines NAME(TYPE a, int32_t b): a shift of all 16 elements of a by the
// same scalar count b.  Each element is cast to CAST before "OP b" is
// applied, so CAST decides whether the shift acts on a signed or an
// unsigned value.
#define SHIFT_UNIFORM(TYPE, CAST, NAME, OP) \
|
||||
static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \
|
||||
TYPE ret; \
|
||||
for (int i = 0; i < 16; ++i) \
|
||||
ret.v[i] = (CAST)(a.v[i]) OP b; \
|
||||
return ret; \
|
||||
}
|
||||
|
||||
#define SMEAR(VTYPE, NAME, STYPE) \
|
||||
static FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \
|
||||
VTYPE ret; \
|
||||
@@ -307,6 +315,12 @@ static FORCEINLINE uint32_t __movmsk(__vec16_i1 mask) {
|
||||
return mask.v;
|
||||
}
|
||||
|
||||
// Compares two masks bit by bit: a result bit is set where the
// corresponding bits of a.v and b.v are both set or both clear (XNOR).
static FORCEINLINE __vec16_i1 __equal(__vec16_i1 a, __vec16_i1 b) {
|
||||
__vec16_i1 r;
|
||||
r.v = (a.v & b.v) | (~a.v & ~b.v);
|
||||
return r;
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_i1 __and(__vec16_i1 a, __vec16_i1 b) {
|
||||
__vec16_i1 r;
|
||||
r.v = a.v & b.v;
|
||||
@@ -380,6 +394,10 @@ BINARY_OP_CAST(__vec16_i8, int8_t, __srem, %)
|
||||
BINARY_OP_CAST(__vec16_i8, uint8_t, __lshr, >>)
|
||||
BINARY_OP_CAST(__vec16_i8, int8_t, __ashr, >>)
|
||||
|
||||
SHIFT_UNIFORM(__vec16_i8, uint8_t, __lshr, >>)
|
||||
SHIFT_UNIFORM(__vec16_i8, int8_t, __ashr, >>)
|
||||
SHIFT_UNIFORM(__vec16_i8, int8_t, __shl, <<)
|
||||
|
||||
CMP_OP(__vec16_i8, int8_t, __equal, ==)
|
||||
CMP_OP(__vec16_i8, int8_t, __not_equal, !=)
|
||||
CMP_OP(__vec16_i8, uint8_t, __unsigned_less_equal, <=)
|
||||
@@ -419,6 +437,10 @@ BINARY_OP_CAST(__vec16_i16, int16_t, __srem, %)
|
||||
BINARY_OP_CAST(__vec16_i16, uint16_t, __lshr, >>)
|
||||
BINARY_OP_CAST(__vec16_i16, int16_t, __ashr, >>)
|
||||
|
||||
SHIFT_UNIFORM(__vec16_i16, uint16_t, __lshr, >>)
|
||||
SHIFT_UNIFORM(__vec16_i16, int16_t, __ashr, >>)
|
||||
SHIFT_UNIFORM(__vec16_i16, int16_t, __shl, <<)
|
||||
|
||||
CMP_OP(__vec16_i16, int16_t, __equal, ==)
|
||||
CMP_OP(__vec16_i16, int16_t, __not_equal, !=)
|
||||
CMP_OP(__vec16_i16, uint16_t, __unsigned_less_equal, <=)
|
||||
@@ -458,6 +480,10 @@ BINARY_OP_CAST(__vec16_i32, int32_t, __srem, %)
|
||||
BINARY_OP_CAST(__vec16_i32, uint32_t, __lshr, >>)
|
||||
BINARY_OP_CAST(__vec16_i32, int32_t, __ashr, >>)
|
||||
|
||||
SHIFT_UNIFORM(__vec16_i32, uint32_t, __lshr, >>)
|
||||
SHIFT_UNIFORM(__vec16_i32, int32_t, __ashr, >>)
|
||||
SHIFT_UNIFORM(__vec16_i32, int32_t, __shl, <<)
|
||||
|
||||
CMP_OP(__vec16_i32, int32_t, __equal, ==)
|
||||
CMP_OP(__vec16_i32, int32_t, __not_equal, !=)
|
||||
CMP_OP(__vec16_i32, uint32_t, __unsigned_less_equal, <=)
|
||||
@@ -497,6 +523,10 @@ BINARY_OP_CAST(__vec16_i64, int64_t, __srem, %)
|
||||
BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>)
|
||||
BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>)
|
||||
|
||||
SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>)
|
||||
SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>)
|
||||
SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<)
|
||||
|
||||
CMP_OP(__vec16_i64, int64_t, __equal, ==)
|
||||
CMP_OP(__vec16_i64, int64_t, __not_equal, !=)
|
||||
CMP_OP(__vec16_i64, uint64_t, __unsigned_less_equal, <=)
|
||||
@@ -932,7 +962,7 @@ REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >)
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// masked load/store
|
||||
|
||||
static FORCEINLINE __vec16_i8 __masked_load_8(unsigned char *p,
|
||||
static FORCEINLINE __vec16_i8 __masked_load_8(void *p,
|
||||
__vec16_i1 mask) {
|
||||
__vec16_i8 ret;
|
||||
int8_t *ptr = (int8_t *)p;
|
||||
@@ -942,7 +972,7 @@ static FORCEINLINE __vec16_i8 __masked_load_8(unsigned char *p,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_i16 __masked_load_16(unsigned char *p,
|
||||
static FORCEINLINE __vec16_i16 __masked_load_16(void *p,
|
||||
__vec16_i1 mask) {
|
||||
__vec16_i16 ret;
|
||||
int16_t *ptr = (int16_t *)p;
|
||||
@@ -952,7 +982,7 @@ static FORCEINLINE __vec16_i16 __masked_load_16(unsigned char *p,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_i32 __masked_load_32(unsigned char *p,
|
||||
static FORCEINLINE __vec16_i32 __masked_load_32(void *p,
|
||||
__vec16_i1 mask) {
|
||||
__vec16_i32 ret;
|
||||
int32_t *ptr = (int32_t *)p;
|
||||
@@ -962,7 +992,7 @@ static FORCEINLINE __vec16_i32 __masked_load_32(unsigned char *p,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_i64 __masked_load_64(unsigned char *p,
|
||||
static FORCEINLINE __vec16_i64 __masked_load_64(void *p,
|
||||
__vec16_i1 mask) {
|
||||
__vec16_i64 ret;
|
||||
int64_t *ptr = (int64_t *)p;
|
||||
@@ -972,7 +1002,7 @@ static FORCEINLINE __vec16_i64 __masked_load_64(unsigned char *p,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static FORCEINLINE void __masked_store_8(unsigned char *p, __vec16_i8 val,
|
||||
static FORCEINLINE void __masked_store_8(void *p, __vec16_i8 val,
|
||||
__vec16_i1 mask) {
|
||||
int8_t *ptr = (int8_t *)p;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
@@ -980,7 +1010,7 @@ static FORCEINLINE void __masked_store_8(unsigned char *p, __vec16_i8 val,
|
||||
ptr[i] = val.v[i];
|
||||
}
|
||||
|
||||
static FORCEINLINE void __masked_store_16(unsigned char *p, __vec16_i16 val,
|
||||
static FORCEINLINE void __masked_store_16(void *p, __vec16_i16 val,
|
||||
__vec16_i1 mask) {
|
||||
int16_t *ptr = (int16_t *)p;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
@@ -988,7 +1018,7 @@ static FORCEINLINE void __masked_store_16(unsigned char *p, __vec16_i16 val,
|
||||
ptr[i] = val.v[i];
|
||||
}
|
||||
|
||||
static FORCEINLINE void __masked_store_32(unsigned char *p, __vec16_i32 val,
|
||||
static FORCEINLINE void __masked_store_32(void *p, __vec16_i32 val,
|
||||
__vec16_i1 mask) {
|
||||
int32_t *ptr = (int32_t *)p;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
@@ -996,7 +1026,7 @@ static FORCEINLINE void __masked_store_32(unsigned char *p, __vec16_i32 val,
|
||||
ptr[i] = val.v[i];
|
||||
}
|
||||
|
||||
static FORCEINLINE void __masked_store_64(unsigned char *p, __vec16_i64 val,
|
||||
static FORCEINLINE void __masked_store_64(void *p, __vec16_i64 val,
|
||||
__vec16_i1 mask) {
|
||||
int64_t *ptr = (int64_t *)p;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
@@ -1004,19 +1034,41 @@ static FORCEINLINE void __masked_store_64(unsigned char *p, __vec16_i64 val,
|
||||
ptr[i] = val.v[i];
|
||||
}
|
||||
|
||||
// Blend variant of the 8-bit masked store; in this implementation it just
// forwards its arguments unchanged to __masked_store_8().
static FORCEINLINE void __masked_store_blend_8(void *p, __vec16_i8 val,
|
||||
__vec16_i1 mask) {
|
||||
__masked_store_8(p, val, mask);
|
||||
}
|
||||
|
||||
// Blend variant of the 16-bit masked store; in this implementation it just
// forwards its arguments unchanged to __masked_store_16().
static FORCEINLINE void __masked_store_blend_16(void *p, __vec16_i16 val,
|
||||
__vec16_i1 mask) {
|
||||
__masked_store_16(p, val, mask);
|
||||
}
|
||||
|
||||
// Blend variant of the 32-bit masked store; in this implementation it just
// forwards its arguments unchanged to __masked_store_32().
static FORCEINLINE void __masked_store_blend_32(void *p, __vec16_i32 val,
|
||||
__vec16_i1 mask) {
|
||||
__masked_store_32(p, val, mask);
|
||||
}
|
||||
|
||||
// Blend variant of the 64-bit masked store; in this implementation it just
// forwards its arguments unchanged to __masked_store_64().
static FORCEINLINE void __masked_store_blend_64(void *p, __vec16_i64 val,
|
||||
__vec16_i1 mask) {
|
||||
__masked_store_64(p, val, mask);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// gather/scatter
|
||||
|
||||
// offsets * offsetScale is in bytes (for all of these)
|
||||
|
||||
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
|
||||
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE offsets, uint32_t scale,\
|
||||
__vec16_i1 mask) { \
|
||||
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
uint32_t scale, OTYPE constOffset, \
|
||||
__vec16_i1 mask) { \
|
||||
VTYPE ret; \
|
||||
int8_t *base = (int8_t *)b; \
|
||||
for (int i = 0; i < 16; ++i) \
|
||||
if ((mask.v & (1 << i)) != 0) { \
|
||||
STYPE *ptr = (STYPE *)(base + scale * offsets.v[i]); \
|
||||
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
|
||||
constOffset.v[i]); \
|
||||
ret.v[i] = *ptr; \
|
||||
} \
|
||||
return ret; \
|
||||
@@ -1054,13 +1106,15 @@ GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)
|
||||
|
||||
// scatter
|
||||
|
||||
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
|
||||
static FORCEINLINE void FUNC(unsigned char *b, OTYPE offsets, uint32_t scale,\
|
||||
#define SCATTER_BASE_VARYINGOFFSET(VTYPE, STYPE, OTYPE, FUNC) \
|
||||
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
uint32_t scale, OTYPE constOffset, \
|
||||
VTYPE val, __vec16_i1 mask) { \
|
||||
int8_t *base = (int8_t *)b; \
|
||||
for (int i = 0; i < 16; ++i) \
|
||||
if ((mask.v & (1 << i)) != 0) { \
|
||||
STYPE *ptr = (STYPE *)(base + scale * offsets.v[i]); \
|
||||
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
|
||||
constOffset.v[i]); \
|
||||
*ptr = val.v[i]; \
|
||||
} \
|
||||
}
|
||||
|
||||
@@ -51,8 +51,8 @@
|
||||
#define FORCEINLINE __attribute__((always_inline)) inline
|
||||
#endif
|
||||
|
||||
//CO#undef FORCEINLINE
|
||||
//CO#define FORCEINLINE
|
||||
#undef FORCEINLINE
|
||||
#define FORCEINLINE
|
||||
|
||||
typedef float __vec1_f;
|
||||
typedef double __vec1_d;
|
||||
@@ -228,6 +228,10 @@ static FORCEINLINE uint32_t __movmsk(__vec4_i1 mask) {
|
||||
return _mm_movemask_ps(mask.v);
|
||||
}
|
||||
|
||||
// Mask equality: both masks' float registers are reinterpreted as integer
// registers and compared as four 32-bit lanes with _mm_cmpeq_epi32.
static FORCEINLINE __vec4_i1 __equal(__vec4_i1 a, __vec4_i1 b) {
|
||||
return _mm_cmpeq_epi32(_mm_castps_si128(a.v), _mm_castps_si128(b.v));
|
||||
}
|
||||
|
||||
// Bitwise AND of the two mask registers, computed with _mm_and_ps.
static FORCEINLINE __vec4_i1 __and(__vec4_i1 a, __vec4_i1 b) {
|
||||
return _mm_and_ps(a.v, b.v);
|
||||
}
|
||||
@@ -299,6 +303,13 @@ static FORCEINLINE __vec4_i8 __shl(__vec4_i8 a, __vec4_i8 b) {
|
||||
_mm_extract_epi8(a.v, 3) << _mm_extract_epi8(b.v, 3));
|
||||
}
|
||||
|
||||
// Uniform left shift: bytes 0..3 of a.v are extracted one at a time, each
// is shifted left by the same scalar count b, and the four results are
// packed into a new __vec4_i8.
static FORCEINLINE __vec4_i8 __shl(__vec4_i8 a, int32_t b) {
|
||||
return __vec4_i8(_mm_extract_epi8(a.v, 0) << b,
|
||||
_mm_extract_epi8(a.v, 1) << b,
|
||||
_mm_extract_epi8(a.v, 2) << b,
|
||||
_mm_extract_epi8(a.v, 3) << b);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i8 __udiv(__vec4_i8 a, __vec4_i8 b) {
|
||||
return __vec4_i8((uint8_t)_mm_extract_epi8(a.v, 0) /
|
||||
(uint8_t)_mm_extract_epi8(b.v, 0),
|
||||
@@ -354,6 +365,13 @@ static FORCEINLINE __vec4_i8 __lshr(__vec4_i8 a, __vec4_i8 b) {
|
||||
(uint8_t)_mm_extract_epi8(b.v, 3));
|
||||
}
|
||||
|
||||
// Uniform logical right shift: each of bytes 0..3 of a.v is cast to
// uint8_t, so the shift by the scalar count b acts on an unsigned value.
static FORCEINLINE __vec4_i8 __lshr(__vec4_i8 a, int32_t b) {
|
||||
return __vec4_i8((uint8_t)_mm_extract_epi8(a.v, 0) >> b,
|
||||
(uint8_t)_mm_extract_epi8(a.v, 1) >> b,
|
||||
(uint8_t)_mm_extract_epi8(a.v, 2) >> b,
|
||||
(uint8_t)_mm_extract_epi8(a.v, 3) >> b);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i8 __ashr(__vec4_i8 a, __vec4_i8 b) {
|
||||
return __vec4_i8((int8_t)_mm_extract_epi8(a.v, 0) >>
|
||||
(int8_t)_mm_extract_epi8(b.v, 0),
|
||||
@@ -365,6 +383,13 @@ static FORCEINLINE __vec4_i8 __ashr(__vec4_i8 a, __vec4_i8 b) {
|
||||
(int8_t)_mm_extract_epi8(b.v, 3));
|
||||
}
|
||||
|
||||
// Uniform arithmetic right shift: each of bytes 0..3 of a.v is cast to
// int8_t, so the shift by the scalar count b acts on the signed value.
static FORCEINLINE __vec4_i8 __ashr(__vec4_i8 a, int32_t b) {
|
||||
return __vec4_i8((int8_t)_mm_extract_epi8(a.v, 0) >> b,
|
||||
(int8_t)_mm_extract_epi8(a.v, 1) >> b,
|
||||
(int8_t)_mm_extract_epi8(a.v, 2) >> b,
|
||||
(int8_t)_mm_extract_epi8(a.v, 3) >> b);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i1 __equal(__vec4_i8 a, __vec4_i8 b) {
|
||||
__m128i cmp = _mm_cmpeq_epi8(a.v, b.v);
|
||||
return __vec4_i1(_mm_extract_epi8(cmp, 0),
|
||||
@@ -543,6 +568,10 @@ static FORCEINLINE __vec4_i16 __shl(__vec4_i16 a, __vec4_i16 b) {
|
||||
_mm_extract_epi16(a.v, 3) << _mm_extract_epi16(b.v, 3));
|
||||
}
|
||||
|
||||
// Uniform left shift of the 16-bit lanes: the scalar count b is placed in
// the lowest 32-bit element of the count vector given to _mm_sll_epi16.
static FORCEINLINE __vec4_i16 __shl(__vec4_i16 a, int32_t b) {
|
||||
return _mm_sll_epi16(a.v, _mm_set_epi32(0, 0, 0, b));
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i16 __udiv(__vec4_i16 a, __vec4_i16 b) {
|
||||
return __vec4_i16((uint16_t)_mm_extract_epi16(a.v, 0) /
|
||||
(uint16_t)_mm_extract_epi16(b.v, 0),
|
||||
@@ -598,6 +627,10 @@ static FORCEINLINE __vec4_i16 __lshr(__vec4_i16 a, __vec4_i16 b) {
|
||||
(uint16_t)_mm_extract_epi16(b.v, 3));
|
||||
}
|
||||
|
||||
// Uniform logical right shift of the 16-bit lanes: the scalar count b is
// placed in the lowest 32-bit element of the count vector given to
// _mm_srl_epi16.
static FORCEINLINE __vec4_i16 __lshr(__vec4_i16 a, int32_t b) {
|
||||
return _mm_srl_epi16(a.v, _mm_set_epi32(0, 0, 0, b));
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i16 __ashr(__vec4_i16 a, __vec4_i16 b) {
|
||||
return __vec4_i16((int16_t)_mm_extract_epi16(a.v, 0) >>
|
||||
(int16_t)_mm_extract_epi16(b.v, 0),
|
||||
@@ -609,6 +642,10 @@ static FORCEINLINE __vec4_i16 __ashr(__vec4_i16 a, __vec4_i16 b) {
|
||||
(int16_t)_mm_extract_epi16(b.v, 3));
|
||||
}
|
||||
|
||||
// Uniform arithmetic right shift of the 16-bit lanes: the scalar count b
// is placed in the lowest 32-bit element of the count vector given to
// _mm_sra_epi16.
static FORCEINLINE __vec4_i16 __ashr(__vec4_i16 a, int32_t b) {
|
||||
return _mm_sra_epi16(a.v, _mm_set_epi32(0, 0, 0, b));
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i1 __equal(__vec4_i16 a, __vec4_i16 b) {
|
||||
__m128i cmp = _mm_cmpeq_epi16(a.v, b.v);
|
||||
return __vec4_i1(_mm_extract_epi16(cmp, 0),
|
||||
@@ -785,9 +822,6 @@ static FORCEINLINE __vec4_i32 __xor(__vec4_i32 a, __vec4_i32 b) {
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i32 __shl(__vec4_i32 a, __vec4_i32 b) {
|
||||
// FIXME: if we can determine at compile time that b has the same value
|
||||
// across all elements, then we can use _mm_sll_epi32.
|
||||
|
||||
/* fixme: llvm generates this code for shift left, which is presumably
|
||||
more efficient than doing each component individually as below.
|
||||
|
||||
@@ -809,57 +843,92 @@ _f___ii: ## @f___ii
|
||||
ret
|
||||
|
||||
*/
|
||||
return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) << _mm_extract_epi32(b.v, 0),
|
||||
(uint32_t)_mm_extract_epi32(a.v, 1) << _mm_extract_epi32(b.v, 1),
|
||||
(uint32_t)_mm_extract_epi32(a.v, 2) << _mm_extract_epi32(b.v, 2),
|
||||
(uint32_t)_mm_extract_epi32(a.v, 3) << _mm_extract_epi32(b.v, 3));
|
||||
return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) <<
|
||||
_mm_extract_epi32(b.v, 0),
|
||||
(uint32_t)_mm_extract_epi32(a.v, 1) <<
|
||||
_mm_extract_epi32(b.v, 1),
|
||||
(uint32_t)_mm_extract_epi32(a.v, 2) <<
|
||||
_mm_extract_epi32(b.v, 2),
|
||||
(uint32_t)_mm_extract_epi32(a.v, 3) <<
|
||||
_mm_extract_epi32(b.v, 3));
|
||||
}
|
||||
|
||||
// Uniform left shift of the 32-bit lanes: the scalar count b is placed in
// the lowest 32-bit element of the count vector given to _mm_sll_epi32.
static FORCEINLINE __vec4_i32 __shl(__vec4_i32 a, int32_t b) {
|
||||
return _mm_sll_epi32(a.v, _mm_set_epi32(0, 0, 0, b));
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i32 __udiv(__vec4_i32 a, __vec4_i32 b) {
|
||||
return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) / (uint32_t)_mm_extract_epi32(b.v, 0),
|
||||
(uint32_t)_mm_extract_epi32(a.v, 1) / (uint32_t)_mm_extract_epi32(b.v, 1),
|
||||
(uint32_t)_mm_extract_epi32(a.v, 2) / (uint32_t)_mm_extract_epi32(b.v, 2),
|
||||
(uint32_t)_mm_extract_epi32(a.v, 3) / (uint32_t)_mm_extract_epi32(b.v, 3));
|
||||
return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) /
|
||||
(uint32_t)_mm_extract_epi32(b.v, 0),
|
||||
(uint32_t)_mm_extract_epi32(a.v, 1) /
|
||||
(uint32_t)_mm_extract_epi32(b.v, 1),
|
||||
(uint32_t)_mm_extract_epi32(a.v, 2) /
|
||||
(uint32_t)_mm_extract_epi32(b.v, 2),
|
||||
(uint32_t)_mm_extract_epi32(a.v, 3) /
|
||||
(uint32_t)_mm_extract_epi32(b.v, 3));
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i32 __sdiv(__vec4_i32 a, __vec4_i32 b) {
|
||||
return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) / (int32_t)_mm_extract_epi32(b.v, 0),
|
||||
(int32_t)_mm_extract_epi32(a.v, 1) / (int32_t)_mm_extract_epi32(b.v, 1),
|
||||
(int32_t)_mm_extract_epi32(a.v, 2) / (int32_t)_mm_extract_epi32(b.v, 2),
|
||||
(int32_t)_mm_extract_epi32(a.v, 3) / (int32_t)_mm_extract_epi32(b.v, 3));
|
||||
return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) /
|
||||
(int32_t)_mm_extract_epi32(b.v, 0),
|
||||
(int32_t)_mm_extract_epi32(a.v, 1) /
|
||||
(int32_t)_mm_extract_epi32(b.v, 1),
|
||||
(int32_t)_mm_extract_epi32(a.v, 2) /
|
||||
(int32_t)_mm_extract_epi32(b.v, 2),
|
||||
(int32_t)_mm_extract_epi32(a.v, 3) /
|
||||
(int32_t)_mm_extract_epi32(b.v, 3));
|
||||
}
|
||||
|
||||
// Element-wise unsigned 32-bit remainder: lane i of the result is
// (uint32_t)a[i] % (uint32_t)b[i].
//
// Each lane is extracted, reduced as a scalar, and the four remainders are
// repacked into a vector.  The function takes no mask, so every lane of b is
// used as a divisor; a zero in any lane is a scalar modulo by zero.
static FORCEINLINE __vec4_i32 __urem(__vec4_i32 a, __vec4_i32 b) {
    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) %
                      (uint32_t)_mm_extract_epi32(b.v, 0),
                      (uint32_t)_mm_extract_epi32(a.v, 1) %
                      (uint32_t)_mm_extract_epi32(b.v, 1),
                      (uint32_t)_mm_extract_epi32(a.v, 2) %
                      (uint32_t)_mm_extract_epi32(b.v, 2),
                      (uint32_t)_mm_extract_epi32(a.v, 3) %
                      (uint32_t)_mm_extract_epi32(b.v, 3));
}
|
||||
|
||||
// Element-wise signed 32-bit remainder: lane i of the result is
// (int32_t)a[i] % (int32_t)b[i].
//
// Each lane is extracted, reduced as a scalar, and the four remainders are
// repacked into a vector.  The function takes no mask, so every lane of b is
// used as a divisor; a zero in any lane is a scalar modulo by zero.
static FORCEINLINE __vec4_i32 __srem(__vec4_i32 a, __vec4_i32 b) {
    return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) %
                      (int32_t)_mm_extract_epi32(b.v, 0),
                      (int32_t)_mm_extract_epi32(a.v, 1) %
                      (int32_t)_mm_extract_epi32(b.v, 1),
                      (int32_t)_mm_extract_epi32(a.v, 2) %
                      (int32_t)_mm_extract_epi32(b.v, 2),
                      (int32_t)_mm_extract_epi32(a.v, 3) %
                      (int32_t)_mm_extract_epi32(b.v, 3));
}
|
||||
|
||||
// Element-wise logical (zero-filling) right shift with a per-lane count:
// lane i of the result is (uint32_t)a[i] >> b[i].
//
// Each lane is shifted as a scalar, so the usual scalar rule applies: a count
// that is negative or not less than 32 in any lane is undefined.
static FORCEINLINE __vec4_i32 __lshr(__vec4_i32 a, __vec4_i32 b) {
    // FIXME: if we can determine at compile time that b has the same value
    // across all elements, e.g. using gcc's __builtin_constant_p, then we
    // can use _mm_srl_epi32.
    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) >>
                      _mm_extract_epi32(b.v, 0),
                      (uint32_t)_mm_extract_epi32(a.v, 1) >>
                      _mm_extract_epi32(b.v, 1),
                      (uint32_t)_mm_extract_epi32(a.v, 2) >>
                      _mm_extract_epi32(b.v, 2),
                      (uint32_t)_mm_extract_epi32(a.v, 3) >>
                      _mm_extract_epi32(b.v, 3));
}
|
||||
|
||||
// Logical (zero-filling) right shift of all four 32-bit lanes of a by the
// single scalar count b.
static FORCEINLINE __vec4_i32 __lshr(__vec4_i32 a, int32_t b) {
    // The uniform-count shift intrinsic takes its count as a vector, so b is
    // placed in the lowest 32-bit lane with the remaining lanes zeroed.
    const __m128i count = _mm_set_epi32(0, 0, 0, b);
    return _mm_srl_epi32(a.v, count);
}
|
||||
|
||||
// Element-wise arithmetic right shift with a per-lane count: lane i of the
// result is (int32_t)a[i] >> b[i].
//
// Each lane is shifted as a scalar, so the usual scalar rule applies: a count
// that is negative or not less than 32 in any lane is undefined.
static FORCEINLINE __vec4_i32 __ashr(__vec4_i32 a, __vec4_i32 b) {
    // FIXME: if we can determine at compile time that b has the same value
    // across all elements, then we can use _mm_sra_epi32.
    return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) >>
                      _mm_extract_epi32(b.v, 0),
                      (int32_t)_mm_extract_epi32(a.v, 1) >>
                      _mm_extract_epi32(b.v, 1),
                      (int32_t)_mm_extract_epi32(a.v, 2) >>
                      _mm_extract_epi32(b.v, 2),
                      (int32_t)_mm_extract_epi32(a.v, 3) >>
                      _mm_extract_epi32(b.v, 3));
}
|
||||
|
||||
// Arithmetic right shift of all four 32-bit lanes of a by the single scalar
// count b.
static FORCEINLINE __vec4_i32 __ashr(__vec4_i32 a, int32_t b) {
    // The uniform-count shift intrinsic takes its count as a vector, so b is
    // placed in the lowest 32-bit lane with the remaining lanes zeroed.
    const __m128i count = _mm_set_epi32(0, 0, 0, b);
    return _mm_sra_epi32(a.v, count);
}
|
||||
|
||||
static FORCEINLINE __vec4_i1 __equal(__vec4_i32 a, __vec4_i32 b) {
|
||||
@@ -1012,6 +1081,12 @@ static FORCEINLINE __vec4_i64 __shl(__vec4_i64 a, __vec4_i64 b) {
|
||||
_mm_extract_epi64(a.v[1], 1) << _mm_extract_epi64(b.v[1], 1));
|
||||
}
|
||||
|
||||
// Left-shifts all four 64-bit lanes of a by the single scalar count b.
// The vector is held as two 128-bit halves (a.v[0] and a.v[1]); both halves
// are shifted by the same count.
static FORCEINLINE __vec4_i64 __shl(__vec4_i64 a, int32_t b) {
    const __m128i count = _mm_set_epi32(0, 0, 0, b);
    const __m128i half0 = _mm_sll_epi64(a.v[0], count);
    const __m128i half1 = _mm_sll_epi64(a.v[1], count);
    return __vec4_i64(half0, half1);
}
|
||||
|
||||
static FORCEINLINE __vec4_i64 __udiv(__vec4_i64 a, __vec4_i64 b) {
|
||||
return __vec4_i64((uint64_t)_mm_extract_epi64(a.v[0], 0) /
|
||||
(uint64_t)_mm_extract_epi64(b.v[0], 0),
|
||||
@@ -1067,6 +1142,12 @@ static FORCEINLINE __vec4_i64 __lshr(__vec4_i64 a, __vec4_i64 b) {
|
||||
(uint64_t)_mm_extract_epi64(b.v[1], 1));
|
||||
}
|
||||
|
||||
// Logical (zero-filling) right shift of all four 64-bit lanes of a by the
// single scalar count b.  The vector is held as two 128-bit halves (a.v[0]
// and a.v[1]); both halves are shifted by the same count.
static FORCEINLINE __vec4_i64 __lshr(__vec4_i64 a, int32_t b) {
    const __m128i count = _mm_set_epi32(0, 0, 0, b);
    const __m128i half0 = _mm_srl_epi64(a.v[0], count);
    const __m128i half1 = _mm_srl_epi64(a.v[1], count);
    return __vec4_i64(half0, half1);
}
|
||||
|
||||
static FORCEINLINE __vec4_i64 __ashr(__vec4_i64 a, __vec4_i64 b) {
|
||||
return __vec4_i64((int64_t)_mm_extract_epi64(a.v[0], 0) >>
|
||||
(int64_t)_mm_extract_epi64(b.v[0], 0),
|
||||
@@ -1078,6 +1159,13 @@ static FORCEINLINE __vec4_i64 __ashr(__vec4_i64 a, __vec4_i64 b) {
|
||||
(int64_t)_mm_extract_epi64(b.v[1], 1));
|
||||
}
|
||||
|
||||
// Arithmetic right shift of all four 64-bit lanes of a by the single scalar
// count b.  Unlike the logical 64-bit shifts, this is done one lane at a
// time with the scalar >> operator on int64_t values.
static FORCEINLINE __vec4_i64 __ashr(__vec4_i64 a, int32_t b) {
    // Pull the four lanes out of the two 128-bit halves.
    const int64_t lane0 = (int64_t)_mm_extract_epi64(a.v[0], 0);
    const int64_t lane1 = (int64_t)_mm_extract_epi64(a.v[0], 1);
    const int64_t lane2 = (int64_t)_mm_extract_epi64(a.v[1], 0);
    const int64_t lane3 = (int64_t)_mm_extract_epi64(a.v[1], 1);

    return __vec4_i64(lane0 >> b, lane1 >> b, lane2 >> b, lane3 >> b);
}
|
||||
|
||||
static FORCEINLINE __vec4_i1 __equal(__vec4_i64 a, __vec4_i64 b) {
|
||||
__m128i cmp0 = _mm_cmpeq_epi64(a.v[0], b.v[0]);
|
||||
__m128i cmp1 = _mm_cmpeq_epi64(a.v[1], b.v[1]);
|
||||
@@ -2324,7 +2412,7 @@ static FORCEINLINE uint64_t __reduce_max_uint64(__vec4_i64 v) {
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// masked load/store
|
||||
|
||||
static FORCEINLINE __vec4_i8 __masked_load_8(unsigned char *p,
|
||||
static FORCEINLINE __vec4_i8 __masked_load_8(void *p,
|
||||
__vec4_i1 mask) {
|
||||
int8_t r[4];
|
||||
int8_t *ptr = (int8_t *)p;
|
||||
@@ -2344,7 +2432,7 @@ static FORCEINLINE __vec4_i8 __masked_load_8(unsigned char *p,
|
||||
return __vec4_i8(r[0], r[1], r[2], r[3]);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i16 __masked_load_16(unsigned char *p,
|
||||
static FORCEINLINE __vec4_i16 __masked_load_16(void *p,
|
||||
__vec4_i1 mask) {
|
||||
int16_t r[4];
|
||||
int16_t *ptr = (int16_t *)p;
|
||||
@@ -2368,7 +2456,7 @@ static FORCEINLINE __vec4_i16 __masked_load_16(unsigned char *p,
|
||||
return __vec4_i16(r[0], r[1], r[2], r[3]);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i32 __masked_load_32(unsigned char *p,
|
||||
static FORCEINLINE __vec4_i32 __masked_load_32(void *p,
|
||||
__vec4_i1 mask) {
|
||||
__m128i r = _mm_set_epi32(0, 0, 0, 0);
|
||||
int32_t *ptr = (int32_t *)p;
|
||||
@@ -2391,7 +2479,7 @@ static FORCEINLINE __vec4_i32 __masked_load_32(unsigned char *p,
|
||||
return r;
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i64 __masked_load_64(unsigned char *p,
|
||||
static FORCEINLINE __vec4_i64 __masked_load_64(void *p,
|
||||
__vec4_i1 mask) {
|
||||
uint64_t r[4];
|
||||
uint64_t *ptr = (uint64_t *)p;
|
||||
@@ -2414,7 +2502,7 @@ static FORCEINLINE __vec4_i64 __masked_load_64(unsigned char *p,
|
||||
return __vec4_i64(r[0], r[1], r[2], r[3]);
|
||||
}
|
||||
|
||||
static FORCEINLINE void __masked_store_8(unsigned char *p, __vec4_i8 val,
|
||||
static FORCEINLINE void __masked_store_8(void *p, __vec4_i8 val,
|
||||
__vec4_i1 mask) {
|
||||
int8_t *ptr = (int8_t *)p;
|
||||
|
||||
@@ -2435,7 +2523,8 @@ static FORCEINLINE void __masked_store_8(unsigned char *p, __vec4_i8 val,
|
||||
ptr[3] = _mm_extract_epi8(val.v, 3);
|
||||
}
|
||||
|
||||
static FORCEINLINE void __masked_store_16(unsigned char *p, __vec4_i16 val, __vec4_i1 mask) {
|
||||
static FORCEINLINE void __masked_store_16(void *p, __vec4_i16 val,
|
||||
__vec4_i1 mask) {
|
||||
int16_t *ptr = (int16_t *)p;
|
||||
|
||||
uint32_t m = _mm_extract_ps(mask.v, 0);
|
||||
@@ -2455,7 +2544,7 @@ static FORCEINLINE void __masked_store_16(unsigned char *p, __vec4_i16 val, __ve
|
||||
ptr[3] = _mm_extract_epi16(val.v, 3);
|
||||
}
|
||||
|
||||
static FORCEINLINE void __masked_store_32(unsigned char *p, __vec4_i32 val,
|
||||
static FORCEINLINE void __masked_store_32(void *p, __vec4_i32 val,
|
||||
__vec4_i1 mask) {
|
||||
int32_t *ptr = (int32_t *)p;
|
||||
uint32_t m = _mm_extract_ps(mask.v, 0);
|
||||
@@ -2475,7 +2564,7 @@ static FORCEINLINE void __masked_store_32(unsigned char *p, __vec4_i32 val,
|
||||
ptr[3] = _mm_extract_epi32(val.v, 3);
|
||||
}
|
||||
|
||||
static FORCEINLINE void __masked_store_64(unsigned char *p, __vec4_i64 val,
|
||||
static FORCEINLINE void __masked_store_64(void *p, __vec4_i64 val,
|
||||
__vec4_i1 mask) {
|
||||
int64_t *ptr = (int64_t *)p;
|
||||
uint32_t m = _mm_extract_ps(mask.v, 0);
|
||||
@@ -2495,58 +2584,82 @@ static FORCEINLINE void __masked_store_64(unsigned char *p, __vec4_i64 val,
|
||||
ptr[3] = _mm_extract_epi64(val.v[1], 1);
|
||||
}
|
||||
|
||||
// "Blend" flavour of the 8-bit masked store.  No separate blend
// implementation exists here: the call is forwarded unchanged to
// __masked_store_8.
static FORCEINLINE void
__masked_store_blend_8(void *p, __vec4_i8 val, __vec4_i1 mask) {
    __masked_store_8(p, val, mask);
}
|
||||
|
||||
// "Blend" flavour of the 16-bit masked store.  No separate blend
// implementation exists here: the call is forwarded unchanged to
// __masked_store_16.
static FORCEINLINE void
__masked_store_blend_16(void *p, __vec4_i16 val, __vec4_i1 mask) {
    __masked_store_16(p, val, mask);
}
|
||||
|
||||
// "Blend" flavour of the 32-bit masked store.  Currently just forwards to
// the ordinary __masked_store_32.
static FORCEINLINE void
__masked_store_blend_32(void *p, __vec4_i32 val, __vec4_i1 mask) {
    // FIXME: do a load, blendvps, store here...
    __masked_store_32(p, val, mask);
}
|
||||
|
||||
// "Blend" flavour of the 64-bit masked store.  Currently just forwards to
// the ordinary __masked_store_64.
static FORCEINLINE void
__masked_store_blend_64(void *p, __vec4_i64 val, __vec4_i1 mask) {
    // FIXME: do a 2x (load, blendvps, store) here...
    __masked_store_64(p, val, mask);
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// gather/scatter
|
||||
// offsets * offsetScale is in bytes (for all of these)
|
||||
|
||||
template<typename RetVec, typename RetScalar>
|
||||
static FORCEINLINE RetVec
|
||||
lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p,
|
||||
__vec4_i32 offsets, uint32_t scale, __vec4_i1 mask) {
|
||||
lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets,
|
||||
uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
|
||||
RetScalar r[4];
|
||||
#if 1
|
||||
// "Fast gather" trick...
|
||||
offsets = __select(mask, offsets, __smear_i32(0));
|
||||
int offset = scale * _mm_extract_epi32(offsets.v, 0);
|
||||
constOffset = __select(mask, constOffset, __smear_i32(0));
|
||||
|
||||
int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0);
|
||||
RetScalar *ptr = (RetScalar *)(p + offset);
|
||||
r[0] = *ptr;
|
||||
|
||||
offset = scale * _mm_extract_epi32(offsets.v, 1);
|
||||
offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1);
|
||||
ptr = (RetScalar *)(p + offset);
|
||||
r[1] = *ptr;
|
||||
|
||||
offset = scale * _mm_extract_epi32(offsets.v, 2);
|
||||
offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2);
|
||||
ptr = (RetScalar *)(p + offset);
|
||||
r[2] = *ptr;
|
||||
|
||||
offset = scale * _mm_extract_epi32(offsets.v, 3);
|
||||
offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3);
|
||||
ptr = (RetScalar *)(p + offset);
|
||||
r[3] = *ptr;
|
||||
#else
|
||||
uint32_t m = _mm_extract_ps(mask.v, 0);
|
||||
if (m != 0) {
|
||||
int offset = scale * _mm_extract_epi32(offsets.v, 0);
|
||||
int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0);
|
||||
RetScalar *ptr = (RetScalar *)(p + offset);
|
||||
r[0] = *ptr;
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 1);
|
||||
if (m != 0) {
|
||||
int offset = scale * _mm_extract_epi32(offsets.v, 1);
|
||||
int offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1);
|
||||
RetScalar *ptr = (RetScalar *)(p + offset);
|
||||
r[1] = *ptr;
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 2);
|
||||
if (m != 0) {
|
||||
int offset = scale * _mm_extract_epi32(offsets.v, 2);
|
||||
int offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2);
|
||||
RetScalar *ptr = (RetScalar *)(p + offset);
|
||||
r[2] = *ptr;
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 3);
|
||||
if (m != 0) {
|
||||
int offset = scale * _mm_extract_epi32(offsets.v, 3);
|
||||
int offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3);
|
||||
RetScalar *ptr = (RetScalar *)(p + offset);
|
||||
r[3] = *ptr;
|
||||
}
|
||||
@@ -2554,54 +2667,57 @@ lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p,
|
||||
return RetVec(r[0], r[1], r[2], r[3]);
|
||||
}
|
||||
|
||||
|
||||
template<typename RetVec, typename RetScalar>
|
||||
static FORCEINLINE RetVec
|
||||
lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
|
||||
uint32_t scale, __vec4_i1 mask) {
|
||||
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
|
||||
RetScalar r[4];
|
||||
#if 1
|
||||
// "Fast gather" trick...
|
||||
offsets = __select(mask, offsets, __smear_i64(0));
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
|
||||
constOffset = __select(mask, constOffset, __smear_i64(0));
|
||||
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0);
|
||||
RetScalar *ptr = (RetScalar *)(p + offset);
|
||||
r[0] = *ptr;
|
||||
|
||||
offset = scale * _mm_extract_epi64(offsets.v[0], 1);
|
||||
offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1);
|
||||
ptr = (RetScalar *)(p + offset);
|
||||
r[1] = *ptr;
|
||||
|
||||
offset = scale * _mm_extract_epi64(offsets.v[1], 0);
|
||||
offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0);
|
||||
ptr = (RetScalar *)(p + offset);
|
||||
r[2] = *ptr;
|
||||
|
||||
offset = scale * _mm_extract_epi64(offsets.v[1], 1);
|
||||
offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1);
|
||||
ptr = (RetScalar *)(p + offset);
|
||||
r[3] = *ptr;
|
||||
#else
|
||||
uint32_t m = _mm_extract_ps(mask.v, 0);
|
||||
if (m != 0) {
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0);
|
||||
RetScalar *ptr = (RetScalar *)(p + offset);
|
||||
r[0] = *ptr;
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 1);
|
||||
if (m != 0) {
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1);
|
||||
RetScalar *ptr = (RetScalar *)(p + offset);
|
||||
r[1] = *ptr;
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 2);
|
||||
if (m != 0) {
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0);
|
||||
RetScalar *ptr = (RetScalar *)(p + offset);
|
||||
r[2] = *ptr;
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 3);
|
||||
if (m != 0) {
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1);
|
||||
RetScalar *ptr = (RetScalar *)(p + offset);
|
||||
r[3] = *ptr;
|
||||
}
|
||||
@@ -2612,80 +2728,89 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
|
||||
|
||||
static FORCEINLINE __vec4_i8
|
||||
__gather_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets,
|
||||
uint32_t scale, __vec4_i1 mask) {
|
||||
uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
|
||||
return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, offsets, scale,
|
||||
mask);
|
||||
constOffset, mask);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i8
|
||||
__gather_base_offsets64_i8(unsigned char *b, __vec4_i64 offsets,
|
||||
uint32_t scale, __vec4_i1 mask) {
|
||||
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
|
||||
return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, offsets, scale,
|
||||
mask);
|
||||
constOffset, mask);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i16
|
||||
__gather_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets,
|
||||
uint32_t scale, __vec4_i1 mask) {
|
||||
uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
|
||||
return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, offsets, scale,
|
||||
mask);
|
||||
constOffset, mask);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i16
|
||||
__gather_base_offsets64_i16(unsigned char *b, __vec4_i64 offsets,
|
||||
uint32_t scale, __vec4_i1 mask) {
|
||||
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
|
||||
return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, offsets, scale,
|
||||
mask);
|
||||
constOffset, mask);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i32
|
||||
__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets,
|
||||
uint32_t scale, __vec4_i1 mask) {
|
||||
__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale,
|
||||
__vec4_i32 constOffset, __vec4_i1 mask) {
|
||||
__m128i r = _mm_set_epi32(0, 0, 0, 0);
|
||||
#if 1
|
||||
// "Fast gather"...
|
||||
offsets = __select(mask, offsets, __smear_i32(0));
|
||||
constOffset = __select(mask, constOffset, __smear_i32(0));
|
||||
|
||||
int offset = scale * _mm_extract_epi32(offsets.v, 0);
|
||||
int offset = scale * _mm_extract_epi32(offsets.v, 0) +
|
||||
_mm_extract_epi32(constOffset.v, 0);
|
||||
uint32_t *ptr = (uint32_t *)(p + offset);
|
||||
r = _mm_insert_epi32(r, *ptr, 0);
|
||||
|
||||
offset = scale * _mm_extract_epi32(offsets.v, 1);
|
||||
offset = scale * _mm_extract_epi32(offsets.v, 1) +
|
||||
_mm_extract_epi32(constOffset.v, 1);
|
||||
ptr = (uint32_t *)(p + offset);
|
||||
r = _mm_insert_epi32(r, *ptr, 1);
|
||||
|
||||
offset = scale * _mm_extract_epi32(offsets.v, 2);
|
||||
offset = scale * _mm_extract_epi32(offsets.v, 2) +
|
||||
_mm_extract_epi32(constOffset.v, 2);
|
||||
ptr = (uint32_t *)(p + offset);
|
||||
r = _mm_insert_epi32(r, *ptr, 2);
|
||||
|
||||
offset = scale * _mm_extract_epi32(offsets.v, 3);
|
||||
offset = scale * _mm_extract_epi32(offsets.v, 3) +
|
||||
_mm_extract_epi32(constOffset.v, 3);
|
||||
ptr = (uint32_t *)(p + offset);
|
||||
r = _mm_insert_epi32(r, *ptr, 3);
|
||||
#else
|
||||
uint32_t m = _mm_extract_ps(mask.v, 0);
|
||||
if (m != 0) {
|
||||
int offset = scale * _mm_extract_epi32(offsets.v, 0);
|
||||
int offset = scale * _mm_extract_epi32(offsets.v, 0) +
|
||||
_mm_extract_epi32(constOffset.v, 0);
|
||||
uint32_t *ptr = (uint32_t *)(p + offset);
|
||||
r = _mm_insert_epi32(r, *ptr, 0);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 1);
|
||||
if (m != 0) {
|
||||
int offset = scale * _mm_extract_epi32(offsets.v, 1);
|
||||
int offset = scale * _mm_extract_epi32(offsets.v, 1) +
|
||||
_mm_extract_epi32(constOffset.v, 1);
|
||||
uint32_t *ptr = (uint32_t *)(p + offset);
|
||||
r = _mm_insert_epi32(r, *ptr, 1);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 2);
|
||||
if (m != 0) {
|
||||
int offset = scale * _mm_extract_epi32(offsets.v, 2);
|
||||
int offset = scale * _mm_extract_epi32(offsets.v, 2) +
|
||||
_mm_extract_epi32(constOffset.v, 2);
|
||||
uint32_t *ptr = (uint32_t *)(p + offset);
|
||||
r = _mm_insert_epi32(r, *ptr, 2);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 3);
|
||||
if (m != 0) {
|
||||
int offset = scale * _mm_extract_epi32(offsets.v, 3);
|
||||
int offset = scale * _mm_extract_epi32(offsets.v, 3) +
|
||||
_mm_extract_epi32(constOffset.v, 3);
|
||||
uint32_t *ptr = (uint32_t *)(p + offset);
|
||||
r = _mm_insert_epi32(r, *ptr, 3);
|
||||
}
|
||||
@@ -2695,23 +2820,23 @@ __gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets,
|
||||
|
||||
static FORCEINLINE __vec4_i32
|
||||
__gather_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets,
|
||||
uint32_t scale, __vec4_i1 mask) {
|
||||
uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) {
|
||||
return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, offsets, scale,
|
||||
mask);
|
||||
delta, mask);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i64
|
||||
__gather_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
|
||||
uint32_t scale, __vec4_i1 mask) {
|
||||
uint32_t scale, __vec4_i32 delta, __vec4_i1 mask) {
|
||||
return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, offsets, scale,
|
||||
mask);
|
||||
delta, mask);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i64
|
||||
__gather_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
|
||||
uint32_t scale, __vec4_i1 mask) {
|
||||
uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) {
|
||||
return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, offsets, scale,
|
||||
mask);
|
||||
delta, mask);
|
||||
}
|
||||
|
||||
template<typename RetVec, typename RetScalar>
|
||||
@@ -2858,217 +2983,108 @@ static FORCEINLINE __vec4_i64 __gather64_i64(__vec4_i64 ptrs, __vec4_i1 mask) {
|
||||
|
||||
// scatter
|
||||
|
||||
static FORCEINLINE void
|
||||
__scatter_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets,
|
||||
uint32_t scale, __vec4_i8 val, __vec4_i1 mask) {
|
||||
uint32_t m = _mm_extract_ps(mask.v, 0);
|
||||
if (m != 0) {
|
||||
int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 0));
|
||||
*ptr = _mm_extract_epi8(val.v, 0);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 1);
|
||||
if (m != 0) {
|
||||
int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 1));
|
||||
*ptr = _mm_extract_epi8(val.v, 1);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 2);
|
||||
if (m != 0) {
|
||||
int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 2));
|
||||
*ptr = _mm_extract_epi8(val.v, 2);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 3);
|
||||
if (m != 0) {
|
||||
int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 3));
|
||||
*ptr = _mm_extract_epi8(val.v, 3);
|
||||
}
|
||||
// SCATTER32_64(SUFFIX, TYPE, EXTRACT) defines a pair of masked scatter
// routines, __scatter_base_offsets32_<SUFFIX> (32-bit lane offsets) and
// __scatter_base_offsets64_<SUFFIX> (64-bit lane offsets).
//
//   SUFFIX  - element suffix; selects the value vector type __vec4_<SUFFIX>
//             and is appended to the generated function names.
//   TYPE    - scalar type written to memory for each lane.
//   EXTRACT - intrinsic called as EXTRACT(val.v, i) to read lane i of val.
//
// For each lane i whose mask element is non-zero, the generated functions
// store lane i of val at byte address
//     base + scale * offsets[i] + constOffset[i].
//
// In the 32-bit variant the byte offset is first narrowed to a signed
// int32_t before it is added to the base pointer.  scale is unsigned, so
// the product is computed as unsigned int; adding that directly to a 64-bit
// pointer would zero-extend it and turn a negative lane offset into a
// roughly +4GB displacement.  __scatter_base_offsets32_i64 and the 32-bit
// gather helper in this file already go through a signed 32-bit offset.
#define SCATTER32_64(SUFFIX, TYPE, EXTRACT)                                   \
static FORCEINLINE void                                                       \
__scatter_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets,      \
                                   uint32_t scale, __vec4_i32 constOffset,    \
                                   __vec4_##SUFFIX val, __vec4_i1 mask) {     \
    uint32_t m = _mm_extract_ps(mask.v, 0);                                   \
    if (m != 0) {                                                             \
        int32_t offset = scale * _mm_extract_epi32(offsets.v, 0) +            \
            _mm_extract_epi32(constOffset.v, 0);                              \
        TYPE *ptr = (TYPE *)(b + offset);                                     \
        *ptr = EXTRACT(val.v, 0);                                             \
    }                                                                         \
    m = _mm_extract_ps(mask.v, 1);                                            \
    if (m != 0) {                                                             \
        int32_t offset = scale * _mm_extract_epi32(offsets.v, 1) +            \
            _mm_extract_epi32(constOffset.v, 1);                              \
        TYPE *ptr = (TYPE *)(b + offset);                                     \
        *ptr = EXTRACT(val.v, 1);                                             \
    }                                                                         \
    m = _mm_extract_ps(mask.v, 2);                                            \
    if (m != 0) {                                                             \
        int32_t offset = scale * _mm_extract_epi32(offsets.v, 2) +            \
            _mm_extract_epi32(constOffset.v, 2);                              \
        TYPE *ptr = (TYPE *)(b + offset);                                     \
        *ptr = EXTRACT(val.v, 2);                                             \
    }                                                                         \
    m = _mm_extract_ps(mask.v, 3);                                            \
    if (m != 0) {                                                             \
        int32_t offset = scale * _mm_extract_epi32(offsets.v, 3) +            \
            _mm_extract_epi32(constOffset.v, 3);                              \
        TYPE *ptr = (TYPE *)(b + offset);                                     \
        *ptr = EXTRACT(val.v, 3);                                             \
    }                                                                         \
}                                                                             \
static FORCEINLINE void                                                       \
__scatter_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets,       \
                                  uint32_t scale, __vec4_i64 constOffset,     \
                                  __vec4_##SUFFIX val, __vec4_i1 mask) {      \
    uint32_t m = _mm_extract_ps(mask.v, 0);                                   \
    if (m != 0) {                                                             \
        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) +         \
            _mm_extract_epi64(constOffset.v[0], 0);                           \
        TYPE *ptr = (TYPE *)(p + offset);                                     \
        *ptr = EXTRACT(val.v, 0);                                             \
    }                                                                         \
    m = _mm_extract_ps(mask.v, 1);                                            \
    if (m != 0) {                                                             \
        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) +         \
            _mm_extract_epi64(constOffset.v[0], 1);                           \
        TYPE *ptr = (TYPE *)(p + offset);                                     \
        *ptr = EXTRACT(val.v, 1);                                             \
    }                                                                         \
    m = _mm_extract_ps(mask.v, 2);                                            \
    if (m != 0) {                                                             \
        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) +         \
            _mm_extract_epi64(constOffset.v[1], 0);                           \
        TYPE *ptr = (TYPE *)(p + offset);                                     \
        *ptr = EXTRACT(val.v, 2);                                             \
    }                                                                         \
    m = _mm_extract_ps(mask.v, 3);                                            \
    if (m != 0) {                                                             \
        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) +         \
            _mm_extract_epi64(constOffset.v[1], 1);                           \
        TYPE *ptr = (TYPE *)(p + offset);                                     \
        *ptr = EXTRACT(val.v, 3);                                             \
    }                                                                         \
}
|
||||
|
||||
static FORCEINLINE void
|
||||
__scatter_base_offsets64_i8(unsigned char *p, __vec4_i64 offsets,
|
||||
uint32_t scale, __vec4_i8 val, __vec4_i1 mask) {
|
||||
uint32_t m = _mm_extract_ps(mask.v, 0);
|
||||
if (m != 0) {
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
|
||||
uint8_t *ptr = (uint8_t *)(p + offset);
|
||||
*ptr = _mm_extract_epi8(val.v, 0);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 1);
|
||||
if (m != 0) {
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
|
||||
uint8_t *ptr = (uint8_t *)(p + offset);
|
||||
*ptr = _mm_extract_epi8(val.v, 1);
|
||||
}
|
||||
SCATTER32_64(i8, int8_t, _mm_extract_epi8)
|
||||
SCATTER32_64(i16, int16_t, _mm_extract_epi16)
|
||||
SCATTER32_64(i32, int32_t, _mm_extract_epi32)
|
||||
|
||||
m = _mm_extract_ps(mask.v, 2);
|
||||
if (m != 0) {
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
|
||||
uint8_t *ptr = (uint8_t *)(p + offset);
|
||||
*ptr = _mm_extract_epi8(val.v, 2);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 3);
|
||||
if (m != 0) {
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
|
||||
uint8_t *ptr = (uint8_t *)(p + offset);
|
||||
*ptr = _mm_extract_epi8(val.v, 3);
|
||||
}
|
||||
}
|
||||
|
||||
static FORCEINLINE void
|
||||
__scatter_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets,
|
||||
uint32_t scale, __vec4_i16 val, __vec4_i1 mask) {
|
||||
uint32_t m = _mm_extract_ps(mask.v, 0);
|
||||
if (m != 0) {
|
||||
int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 0));
|
||||
*ptr = _mm_extract_epi16(val.v, 0);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 1);
|
||||
if (m != 0) {
|
||||
int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 1));
|
||||
*ptr = _mm_extract_epi16(val.v, 1);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 2);
|
||||
if (m != 0) {
|
||||
int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 2));
|
||||
*ptr = _mm_extract_epi16(val.v, 2);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 3);
|
||||
if (m != 0) {
|
||||
int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 3));
|
||||
*ptr = _mm_extract_epi16(val.v, 3);
|
||||
}
|
||||
}
|
||||
|
||||
static FORCEINLINE void
|
||||
__scatter_base_offsets64_i16(unsigned char *p, __vec4_i64 offsets,
|
||||
uint32_t scale, __vec4_i16 val, __vec4_i1 mask) {
|
||||
uint32_t m = _mm_extract_ps(mask.v, 0);
|
||||
if (m != 0) {
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
|
||||
uint16_t *ptr = (uint16_t *)(p + offset);
|
||||
*ptr = _mm_extract_epi16(val.v, 0);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 1);
|
||||
if (m != 0) {
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
|
||||
uint16_t *ptr = (uint16_t *)(p + offset);
|
||||
*ptr = _mm_extract_epi16(val.v, 1);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 2);
|
||||
if (m != 0) {
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
|
||||
uint16_t *ptr = (uint16_t *)(p + offset);
|
||||
*ptr = _mm_extract_epi16(val.v, 2);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 3);
|
||||
if (m != 0) {
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
|
||||
uint16_t *ptr = (uint16_t *)(p + offset);
|
||||
*ptr = _mm_extract_epi16(val.v, 3);
|
||||
}
|
||||
}
|
||||
|
||||
static FORCEINLINE void
|
||||
__scatter_base_offsets32_i32(unsigned char *b, __vec4_i32 offsets,
|
||||
uint32_t scale, __vec4_i32 val, __vec4_i1 mask) {
|
||||
uint32_t m = _mm_extract_ps(mask.v, 0);
|
||||
if (m != 0) {
|
||||
int32_t *ptr = (int32_t *)(b + scale *
|
||||
_mm_extract_epi32(offsets.v, 0));
|
||||
*ptr = _mm_extract_epi32(val.v, 0);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 1);
|
||||
if (m != 0) {
|
||||
int32_t *ptr = (int32_t *)(b + scale *
|
||||
_mm_extract_epi32(offsets.v, 1));
|
||||
*ptr = _mm_extract_epi32(val.v, 1);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 2);
|
||||
if (m != 0) {
|
||||
int32_t *ptr = (int32_t *)(b + scale *
|
||||
_mm_extract_epi32(offsets.v, 2));
|
||||
*ptr = _mm_extract_epi32(val.v, 2);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 3);
|
||||
if (m != 0) {
|
||||
int32_t *ptr = (int32_t *)(b + scale *
|
||||
_mm_extract_epi32(offsets.v, 3));
|
||||
*ptr = _mm_extract_epi32(val.v, 3);
|
||||
}
|
||||
}
|
||||
|
||||
static FORCEINLINE void
|
||||
__scatter_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets,
|
||||
uint32_t scale, __vec4_i32 val, __vec4_i1 mask) {
|
||||
uint32_t m = _mm_extract_ps(mask.v, 0);
|
||||
if (m != 0) {
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
|
||||
uint32_t *ptr = (uint32_t *)(p + offset);
|
||||
*ptr = _mm_extract_epi32(val.v, 0);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 1);
|
||||
if (m != 0) {
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
|
||||
uint32_t *ptr = (uint32_t *)(p + offset);
|
||||
*ptr = _mm_extract_epi32(val.v, 1);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 2);
|
||||
if (m != 0) {
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
|
||||
uint32_t *ptr = (uint32_t *)(p + offset);
|
||||
*ptr = _mm_extract_epi32(val.v, 2);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 3);
|
||||
if (m != 0) {
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
|
||||
uint32_t *ptr = (uint32_t *)(p + offset);
|
||||
*ptr = _mm_extract_epi32(val.v, 3);
|
||||
}
|
||||
}
|
||||
|
||||
static FORCEINLINE void
|
||||
__scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
|
||||
uint32_t scale, __vec4_i64 val, __vec4_i1 mask) {
|
||||
uint32_t scale, __vec4_i32 constOffset, __vec4_i64 val,
|
||||
__vec4_i1 mask) {
|
||||
uint32_t m = _mm_extract_ps(mask.v, 0);
|
||||
if (m != 0) {
|
||||
int32_t offset = scale * _mm_extract_epi32(offsets.v, 0);
|
||||
int32_t offset = scale * _mm_extract_epi32(offsets.v, 0) +
|
||||
_mm_extract_epi32(constOffset.v, 0);
|
||||
uint64_t *ptr = (uint64_t *)(p + offset);
|
||||
*ptr = _mm_extract_epi64(val.v[0], 0);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 1);
|
||||
if (m != 0) {
|
||||
int32_t offset = scale * _mm_extract_epi32(offsets.v, 1);
|
||||
int32_t offset = scale * _mm_extract_epi32(offsets.v, 1) +
|
||||
_mm_extract_epi32(constOffset.v, 1);
|
||||
uint64_t *ptr = (uint64_t *)(p + offset);
|
||||
*ptr = _mm_extract_epi64(val.v[0], 1);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 2);
|
||||
if (m != 0) {
|
||||
int32_t offset = scale * _mm_extract_epi32(offsets.v, 2);
|
||||
int32_t offset = scale * _mm_extract_epi32(offsets.v, 2) +
|
||||
_mm_extract_epi32(constOffset.v, 2);
|
||||
uint64_t *ptr = (uint64_t *)(p + offset);
|
||||
*ptr = _mm_extract_epi64(val.v[1], 0);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 3);
|
||||
if (m != 0) {
|
||||
int32_t offset = scale * _mm_extract_epi32(offsets.v, 3);
|
||||
int32_t offset = scale * _mm_extract_epi32(offsets.v, 3) +
|
||||
_mm_extract_epi32(constOffset.v, 3);
|
||||
uint64_t *ptr = (uint64_t *)(p + offset);
|
||||
*ptr = _mm_extract_epi64(val.v[1], 1);
|
||||
}
|
||||
@@ -3076,31 +3092,36 @@ __scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
|
||||
|
||||
static FORCEINLINE void
|
||||
__scatter_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
|
||||
uint32_t scale, __vec4_i64 val, __vec4_i1 mask) {
|
||||
uint32_t scale, __vec4_i64 constOffset,
|
||||
__vec4_i64 val, __vec4_i1 mask) {
|
||||
uint32_t m = _mm_extract_ps(mask.v, 0);
|
||||
if (m != 0) {
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) +
|
||||
_mm_extract_epi64(constOffset.v[0], 0);
|
||||
uint64_t *ptr = (uint64_t *)(p + offset);
|
||||
*ptr = _mm_extract_epi64(val.v[0], 0);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 1);
|
||||
if (m != 0) {
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) +
|
||||
_mm_extract_epi64(constOffset.v[0], 1);
|
||||
uint64_t *ptr = (uint64_t *)(p + offset);
|
||||
*ptr = _mm_extract_epi64(val.v[0], 1);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 2);
|
||||
if (m != 0) {
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) +
|
||||
_mm_extract_epi64(constOffset.v[1], 0);
|
||||
uint64_t *ptr = (uint64_t *)(p + offset);
|
||||
*ptr = _mm_extract_epi64(val.v[1], 0);
|
||||
}
|
||||
|
||||
m = _mm_extract_ps(mask.v, 3);
|
||||
if (m != 0) {
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
|
||||
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) +
|
||||
_mm_extract_epi64(constOffset.v[1], 1);
|
||||
uint64_t *ptr = (uint64_t *)(p + offset);
|
||||
*ptr = _mm_extract_epi64(val.v[1], 1);
|
||||
}
|
||||
|
||||
@@ -104,8 +104,8 @@ static void generateRay(uniform const float raster2camera[4][4],
|
||||
}
|
||||
|
||||
|
||||
static inline bool BBoxIntersect(const uniform float bounds[2][3],
|
||||
const Ray &ray) {
|
||||
static bool BBoxIntersect(const uniform float bounds[2][3],
|
||||
const Ray &ray) {
|
||||
uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
|
||||
uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
|
||||
float t0 = ray.mint, t1 = ray.maxt;
|
||||
@@ -143,7 +143,7 @@ static inline bool BBoxIntersect(const uniform float bounds[2][3],
|
||||
|
||||
|
||||
|
||||
static inline bool TriIntersect(const Triangle &tri, Ray &ray) {
|
||||
static bool TriIntersect(const Triangle &tri, Ray &ray) {
|
||||
uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
|
||||
uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
|
||||
uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
|
||||
|
||||
@@ -129,8 +129,8 @@ static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
|
||||
}
|
||||
|
||||
|
||||
static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
|
||||
uniform float density[], uniform int nVoxels[3]) {
|
||||
static float Density(float3 Pobj, float3 pMin, float3 pMax,
|
||||
uniform float density[], uniform int nVoxels[3]) {
|
||||
if (!Inside(Pobj, pMin, pMax))
|
||||
return 0;
|
||||
// Compute voxel coordinates and offsets for _Pobj_
|
||||
|
||||
80
expr.cpp
80
expr.cpp
@@ -36,12 +36,22 @@
|
||||
*/
|
||||
|
||||
#include "expr.h"
|
||||
#include "ast.h"
|
||||
#include "type.h"
|
||||
#include "sym.h"
|
||||
#include "ctx.h"
|
||||
#include "module.h"
|
||||
#include "util.h"
|
||||
#include "llvmutil.h"
|
||||
#ifndef _MSC_VER
|
||||
#include <inttypes.h>
|
||||
#endif
|
||||
#ifndef PRId64
|
||||
#define PRId64 "lld"
|
||||
#endif
|
||||
#ifndef PRIu64
|
||||
#define PRIu64 "llu"
|
||||
#endif
|
||||
|
||||
#include <list>
|
||||
#include <set>
|
||||
@@ -224,7 +234,7 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr,
|
||||
eltType = eltType->GetAsConstType();
|
||||
if (Type::Equal(toPointerType,
|
||||
new PointerType(eltType,
|
||||
toPointerType->IsUniformType(),
|
||||
toPointerType->GetVariability(),
|
||||
toPointerType->IsConstType())))
|
||||
goto typecast_ok;
|
||||
else {
|
||||
@@ -466,7 +476,7 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr,
|
||||
|
||||
typecast_ok:
|
||||
if (expr != NULL)
|
||||
*expr = new TypeCastExpr(toType, *expr, false, pos);
|
||||
*expr = new TypeCastExpr(toType, *expr, pos);
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -638,6 +648,9 @@ lLLVMConstantValue(const Type *type, llvm::LLVMContext *ctx, double value) {
|
||||
|
||||
static llvm::Value *
|
||||
lMaskForSymbol(Symbol *baseSym, FunctionEmitContext *ctx) {
|
||||
if (baseSym == NULL)
|
||||
return ctx->GetFullMask();
|
||||
|
||||
if (dynamic_cast<const PointerType *>(baseSym->type) != NULL ||
|
||||
dynamic_cast<const ReferenceType *>(baseSym->type) != NULL)
|
||||
// FIXME: for pointers, we really only want to do this for
|
||||
@@ -658,10 +671,11 @@ lMaskForSymbol(Symbol *baseSym, FunctionEmitContext *ctx) {
|
||||
static void
|
||||
lStoreAssignResult(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
|
||||
FunctionEmitContext *ctx, Symbol *baseSym) {
|
||||
Assert(baseSym != NULL &&
|
||||
Assert(baseSym == NULL ||
|
||||
baseSym->varyingCFDepth <= ctx->VaryingCFDepth());
|
||||
if (!g->opt.disableMaskedStoreToStore &&
|
||||
!g->opt.disableMaskAllOnOptimizations &&
|
||||
baseSym != NULL &&
|
||||
baseSym->varyingCFDepth == ctx->VaryingCFDepth() &&
|
||||
baseSym->storageClass != SC_STATIC &&
|
||||
dynamic_cast<const ReferenceType *>(baseSym->type) == NULL &&
|
||||
@@ -2016,14 +2030,13 @@ AssignExpr::GetValue(FunctionEmitContext *ctx) const {
|
||||
ctx->SetDebugPos(pos);
|
||||
|
||||
Symbol *baseSym = lvalue->GetBaseSymbol();
|
||||
// Should be caught during type-checking...
|
||||
assert(baseSym != NULL);
|
||||
|
||||
switch (op) {
|
||||
case Assign: {
|
||||
llvm::Value *lv = lvalue->GetLValue(ctx);
|
||||
if (lv == NULL) {
|
||||
Assert(m->errorCount > 0);
|
||||
Error(lvalue->pos, "Left hand side of assignment expression can't "
|
||||
"be assigned to.");
|
||||
return NULL;
|
||||
}
|
||||
const Type *lvalueType = lvalue->GetLValueType();
|
||||
@@ -2146,13 +2159,13 @@ AssignExpr::TypeCheck() {
|
||||
}
|
||||
}
|
||||
|
||||
if (lvalue->GetBaseSymbol() == NULL) {
|
||||
Error(lvalue->pos, "Left hand side of assignment statement can't be "
|
||||
"assigned to.");
|
||||
const Type *lhsType = lvalue->GetType();
|
||||
if (lhsType->IsConstType()) {
|
||||
Error(lvalue->pos, "Can't assign to type \"%s\" on left-hand side of "
|
||||
"expression.", lhsType->GetString().c_str());
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const Type *lhsType = lvalue->GetType();
|
||||
if (dynamic_cast<const PointerType *>(lhsType) != NULL) {
|
||||
if (op == AddAssign || op == SubAssign) {
|
||||
if (PointerType::IsVoidPointer(lhsType)) {
|
||||
@@ -2186,12 +2199,6 @@ AssignExpr::TypeCheck() {
|
||||
if (rvalue == NULL)
|
||||
return NULL;
|
||||
|
||||
if (lhsType->IsConstType()) {
|
||||
Error(pos, "Can't assign to type \"%s\" on left-hand side of "
|
||||
"expression.", lhsType->GetString().c_str());
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Make sure we're not assigning to a struct that has a constant member
|
||||
const StructType *st = dynamic_cast<const StructType *>(lhsType);
|
||||
if (st != NULL && lCheckForConstStructMember(pos, st, st))
|
||||
@@ -2709,7 +2716,7 @@ FunctionCallExpr::TypeCheck() {
|
||||
!(argCouldBeNULL[i] == true &&
|
||||
dynamic_cast<const PointerType *>(paramType) != NULL)) {
|
||||
Error(args->exprs[i]->pos, "Can't convert argument of "
|
||||
"type \"%s\" to type \"%s\" for funcion call "
|
||||
"type \"%s\" to type \"%s\" for function call "
|
||||
"argument.", argTypes[i]->GetString().c_str(),
|
||||
paramType->GetString().c_str());
|
||||
return NULL;
|
||||
@@ -3525,6 +3532,12 @@ VectorMemberExpr::getElementType() const {
|
||||
MemberExpr *
|
||||
MemberExpr::create(Expr *e, const char *id, SourcePos p, SourcePos idpos,
|
||||
bool derefLValue) {
|
||||
// FIXME: we need to call TypeCheck() here so that we can call
|
||||
// e->GetType() in the following. But really we just shouldn't try to
|
||||
// resolve this now but just have a generic MemberExpr type that
|
||||
// handles all cases so that this is unnecessary.
|
||||
e = ::TypeCheck(e);
|
||||
|
||||
const Type *exprType;
|
||||
if (e == NULL || (exprType = e->GetType()) == NULL)
|
||||
return NULL;
|
||||
@@ -4536,18 +4549,10 @@ ConstExpr::Print() const {
|
||||
printf("%f", floatVal[i]);
|
||||
break;
|
||||
case AtomicType::TYPE_INT64:
|
||||
#ifdef ISPC_IS_LINUX
|
||||
printf("%ld", int64Val[i]);
|
||||
#else
|
||||
printf("%lld", int64Val[i]);
|
||||
#endif
|
||||
printf("%"PRId64, int64Val[i]);
|
||||
break;
|
||||
case AtomicType::TYPE_UINT64:
|
||||
#ifdef ISPC_IS_LINUX
|
||||
printf("%lu", uint64Val[i]);
|
||||
#else
|
||||
printf("%llu", uint64Val[i]);
|
||||
#endif
|
||||
printf("%"PRIu64, uint64Val[i]);
|
||||
break;
|
||||
case AtomicType::TYPE_DOUBLE:
|
||||
printf("%f", doubleVal[i]);
|
||||
@@ -4566,11 +4571,10 @@ ConstExpr::Print() const {
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// TypeCastExpr
|
||||
|
||||
TypeCastExpr::TypeCastExpr(const Type *t, Expr *e, bool pu, SourcePos p)
|
||||
TypeCastExpr::TypeCastExpr(const Type *t, Expr *e, SourcePos p)
|
||||
: Expr(p) {
|
||||
type = t;
|
||||
expr = e;
|
||||
preserveUniformity = pu;
|
||||
}
|
||||
|
||||
|
||||
@@ -5213,7 +5217,7 @@ TypeCastExpr::GetValue(FunctionEmitContext *ctx) const {
|
||||
if (Type::EqualIgnoringConst(arrayAsPtr->GetType(), toPointerType) == false) {
|
||||
Assert(Type::EqualIgnoringConst(arrayAsPtr->GetType()->GetAsVaryingType(),
|
||||
toPointerType) == true);
|
||||
arrayAsPtr = new TypeCastExpr(toPointerType, arrayAsPtr, false, pos);
|
||||
arrayAsPtr = new TypeCastExpr(toPointerType, arrayAsPtr, pos);
|
||||
arrayAsPtr = ::TypeCheck(arrayAsPtr);
|
||||
Assert(arrayAsPtr != NULL);
|
||||
arrayAsPtr = ::Optimize(arrayAsPtr);
|
||||
@@ -5364,6 +5368,7 @@ TypeCastExpr::GetValue(FunctionEmitContext *ctx) const {
|
||||
|
||||
const Type *
|
||||
TypeCastExpr::GetType() const {
|
||||
Assert(type->HasUnboundVariability() == false);
|
||||
return type;
|
||||
}
|
||||
|
||||
@@ -5373,7 +5378,7 @@ lDeconstifyType(const Type *t) {
|
||||
const PointerType *pt = dynamic_cast<const PointerType *>(t);
|
||||
if (pt != NULL)
|
||||
return new PointerType(lDeconstifyType(pt->GetBaseType()),
|
||||
pt->IsUniformType(), false);
|
||||
pt->GetVariability(), false);
|
||||
else
|
||||
return t->GetAsNonConstType();
|
||||
}
|
||||
@@ -5384,16 +5389,16 @@ TypeCastExpr::TypeCheck() {
|
||||
if (expr == NULL)
|
||||
return NULL;
|
||||
|
||||
const Type *toType = GetType(), *fromType = expr->GetType();
|
||||
const Type *toType = type, *fromType = expr->GetType();
|
||||
if (toType == NULL || fromType == NULL)
|
||||
return NULL;
|
||||
|
||||
if (preserveUniformity == true && fromType->IsUniformType() &&
|
||||
toType->IsVaryingType()) {
|
||||
if (toType->HasUnboundVariability() && fromType->IsUniformType()) {
|
||||
TypeCastExpr *tce = new TypeCastExpr(toType->GetAsUniformType(),
|
||||
expr, false, pos);
|
||||
expr, pos);
|
||||
return ::TypeCheck(tce);
|
||||
}
|
||||
type = toType = type->ResolveUnboundVariability(Type::Varying);
|
||||
|
||||
fromType = lDeconstifyType(fromType);
|
||||
toType = lDeconstifyType(toType);
|
||||
@@ -5862,6 +5867,8 @@ SizeOfExpr::SizeOfExpr(Expr *e, SourcePos p)
|
||||
|
||||
SizeOfExpr::SizeOfExpr(const Type *t, SourcePos p)
|
||||
: Expr(p), expr(NULL), type(t) {
|
||||
if (type->HasUnboundVariability())
|
||||
type = type->ResolveUnboundVariability(Type::Varying);
|
||||
}
|
||||
|
||||
|
||||
@@ -6026,7 +6033,8 @@ FunctionSymbolExpr::GetType() const {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return matchingFunc ? new PointerType(matchingFunc->type, true, true) : NULL;
|
||||
return matchingFunc ?
|
||||
new PointerType(matchingFunc->type, Type::Uniform, true) : NULL;
|
||||
}
|
||||
|
||||
|
||||
|
||||
5
expr.h
5
expr.h
@@ -314,7 +314,6 @@ public:
|
||||
std::string identifier;
|
||||
const SourcePos identifierPos;
|
||||
|
||||
protected:
|
||||
MemberExpr(Expr *expr, const char *identifier, SourcePos pos,
|
||||
SourcePos identifierPos, bool derefLValue);
|
||||
|
||||
@@ -493,8 +492,7 @@ private:
|
||||
probably-different type. */
|
||||
class TypeCastExpr : public Expr {
|
||||
public:
|
||||
TypeCastExpr(const Type *t, Expr *e, bool preserveUniformity,
|
||||
SourcePos p);
|
||||
TypeCastExpr(const Type *t, Expr *e, SourcePos p);
|
||||
|
||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||
const Type *GetType() const;
|
||||
@@ -507,7 +505,6 @@ public:
|
||||
|
||||
const Type *type;
|
||||
Expr *expr;
|
||||
bool preserveUniformity;
|
||||
};
|
||||
|
||||
|
||||
|
||||
16
func.cpp
16
func.cpp
@@ -290,8 +290,10 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
|
||||
llvm::BasicBlock *bbAllOn = ctx->CreateBasicBlock("all_on");
|
||||
llvm::BasicBlock *bbNotAll = ctx->CreateBasicBlock("not_all_on");
|
||||
|
||||
ctx->BranchInst(bbAllOn, bbNotAll, allOn);
|
||||
// Set up basic blocks for goto targets
|
||||
ctx->InitializeLabelMap(code);
|
||||
|
||||
ctx->BranchInst(bbAllOn, bbNotAll, allOn);
|
||||
// all on: we've determined dynamically that the mask is all
|
||||
// on. Set the current mask to "all on" explicitly so that
|
||||
// codegen for this path can be improved with this knowledge in
|
||||
@@ -322,14 +324,22 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
|
||||
// above
|
||||
ctx->SetCurrentBasicBlock(bbSomeOn);
|
||||
ctx->SetFunctionMask(mask);
|
||||
|
||||
// Set up basic blocks for goto targets again; we want to have
|
||||
// one set of them for gotos in the 'all on' case, and a
|
||||
// distinct set for the 'mixed mask' case.
|
||||
ctx->InitializeLabelMap(code);
|
||||
|
||||
code->EmitCode(ctx);
|
||||
if (ctx->GetCurrentBasicBlock())
|
||||
ctx->ReturnInst();
|
||||
|
||||
}
|
||||
else
|
||||
else {
|
||||
// Set up basic blocks for goto targets
|
||||
ctx->InitializeLabelMap(code);
|
||||
// No check, just emit the code
|
||||
code->EmitCode(ctx);
|
||||
}
|
||||
}
|
||||
|
||||
if (ctx->GetCurrentBasicBlock()) {
|
||||
|
||||
44
ispc.cpp
44
ispc.cpp
@@ -210,7 +210,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
t->isa = Target::AVX2;
|
||||
t->nativeVectorWidth = 8;
|
||||
t->vectorWidth = 8;
|
||||
t->attributes = "+avx2,+popcnt,+cmov";
|
||||
t->attributes = "+avx2,+popcnt,+cmov,+f16c";
|
||||
t->maskingIsFree = false;
|
||||
t->allOffMaskIsSafe = false;
|
||||
t->maskBitCount = 32;
|
||||
@@ -219,7 +219,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
t->isa = Target::AVX2;
|
||||
t->nativeVectorWidth = 16;
|
||||
t->vectorWidth = 16;
|
||||
t->attributes = "+avx2,+popcnt,+cmov";
|
||||
t->attributes = "+avx2,+popcnt,+cmov,+f16c";
|
||||
t->maskingIsFree = false;
|
||||
t->allOffMaskIsSafe = false;
|
||||
t->maskBitCount = 32;
|
||||
@@ -358,10 +358,45 @@ Target::GetISAString() const {
|
||||
}
|
||||
|
||||
|
||||
static bool
|
||||
lGenericTypeLayoutIndeterminate(LLVM_TYPE_CONST llvm::Type *type) {
|
||||
if (type->isPrimitiveType() || type->isIntegerTy())
|
||||
return false;
|
||||
|
||||
if (type == LLVMTypes::BoolVectorType ||
|
||||
type == LLVMTypes::MaskType ||
|
||||
type == LLVMTypes::Int1VectorType)
|
||||
return true;
|
||||
|
||||
LLVM_TYPE_CONST llvm::ArrayType *at =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(type);
|
||||
if (at != NULL)
|
||||
return lGenericTypeLayoutIndeterminate(at->getElementType());
|
||||
|
||||
LLVM_TYPE_CONST llvm::PointerType *pt =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(type);
|
||||
if (pt != NULL)
|
||||
return false;
|
||||
|
||||
LLVM_TYPE_CONST llvm::StructType *st =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::StructType>(type);
|
||||
if (st != NULL) {
|
||||
for (int i = 0; i < (int)st->getNumElements(); ++i)
|
||||
if (lGenericTypeLayoutIndeterminate(st->getElementType(i)))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
Assert(llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(type));
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
llvm::Value *
|
||||
Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type,
|
||||
llvm::BasicBlock *insertAtEnd) {
|
||||
if (isa == Target::GENERIC && type->isPrimitiveType() == false) {
|
||||
if (isa == Target::GENERIC &&
|
||||
lGenericTypeLayoutIndeterminate(type)) {
|
||||
llvm::Value *index[1] = { LLVMInt32(1) };
|
||||
LLVM_TYPE_CONST llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
|
||||
llvm::Value *voidPtr = llvm::ConstantPointerNull::get(ptrType);
|
||||
@@ -396,7 +431,8 @@ Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type,
|
||||
llvm::Value *
|
||||
Target::StructOffset(LLVM_TYPE_CONST llvm::Type *type, int element,
|
||||
llvm::BasicBlock *insertAtEnd) {
|
||||
if (isa == Target::GENERIC && type->isPrimitiveType() == false) {
|
||||
if (isa == Target::GENERIC &&
|
||||
lGenericTypeLayoutIndeterminate(type) == true) {
|
||||
llvm::Value *indices[2] = { LLVMInt32(0), LLVMInt32(element) };
|
||||
LLVM_TYPE_CONST llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
|
||||
llvm::Value *voidPtr = llvm::ConstantPointerNull::get(ptrType);
|
||||
|
||||
5
ispc.h
5
ispc.h
@@ -98,6 +98,8 @@ namespace llvm {
|
||||
#endif
|
||||
|
||||
class ArrayType;
|
||||
class AST;
|
||||
class ASTNode;
|
||||
class AtomicType;
|
||||
class FunctionEmitContext;
|
||||
class Expr;
|
||||
@@ -421,6 +423,7 @@ enum {
|
||||
COST_FUNPTR_UNIFORM = 12,
|
||||
COST_FUNPTR_VARYING = 24,
|
||||
COST_GATHER = 8,
|
||||
COST_GOTO = 4,
|
||||
COST_LOAD = 2,
|
||||
COST_REGULAR_BREAK_CONTINUE = 2,
|
||||
COST_RETURN = 4,
|
||||
@@ -434,6 +437,8 @@ enum {
|
||||
COST_VARYING_IF = 3,
|
||||
COST_UNIFORM_LOOP = 4,
|
||||
COST_VARYING_LOOP = 6,
|
||||
COST_UNIFORM_SWITCH = 4,
|
||||
COST_VARYING_SWITCH = 12,
|
||||
COST_ASSERT = 8,
|
||||
|
||||
CHECK_MASK_AT_FUNCTION_START_COST = 16,
|
||||
|
||||
68
ispc.vcxproj
68
ispc.vcxproj
@@ -18,8 +18,10 @@
|
||||
<ClCompile Include="decl.cpp" />
|
||||
<ClCompile Include="expr.cpp" />
|
||||
<ClCompile Include="func.cpp" />
|
||||
<ClCompile Include="gen-bitcode-avx.cpp" />
|
||||
<ClCompile Include="gen-bitcode-avx-x2.cpp" />
|
||||
<ClCompile Include="gen-bitcode-avx1.cpp" />
|
||||
<ClCompile Include="gen-bitcode-avx1-x2.cpp" />
|
||||
<ClCompile Include="gen-bitcode-avx2.cpp" />
|
||||
<ClCompile Include="gen-bitcode-avx2-x2.cpp" />
|
||||
<ClCompile Include="gen-bitcode-c-32.cpp" />
|
||||
<ClCompile Include="gen-bitcode-c-64.cpp" />
|
||||
<ClCompile Include="gen-bitcode-dispatch.cpp" />
|
||||
@@ -158,29 +160,55 @@
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="builtins\target-avx.ll">
|
||||
<CustomBuild Include="builtins\target-avx1.ll">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx.ll | python bitcode2cpp.py builtins\target-avx.ll > gen-bitcode-avx.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx.ll | python bitcode2cpp.py builtins\target-avx.ll > gen-bitcode-avx.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx.cpp</Message>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll > gen-bitcode-avx1.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx1.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll > gen-bitcode-avx1.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx1.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx1.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx1.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="builtins\target-avx-x2.ll">
|
||||
<CustomBuild Include="builtins\target-avx1-x2.ll">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx-x2.ll | python bitcode2cpp.py builtins\target-avx-x2.ll > gen-bitcode-avx-x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx-x2.ll | python bitcode2cpp.py builtins\target-avx-x2.ll > gen-bitcode-avx-x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll > gen-bitcode-avx1-x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx1-x2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll > gen-bitcode-avx1-x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx1-x2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx1-x2.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx1-x2.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="builtins\target-avx2.ll">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll > gen-bitcode-avx2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll > gen-bitcode-avx2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx2.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx2.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="builtins\target-avx2-x2.ll">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll > gen-bitcode-avx2-x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx2-x2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll > gen-bitcode-avx2-x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx2-x2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx2-x2.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx2-x2.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
|
||||
54
lex.ll
54
lex.ll
@@ -42,7 +42,7 @@
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
|
||||
static uint64_t lParseBinary(const char *ptr, SourcePos pos);
|
||||
static uint64_t lParseBinary(const char *ptr, SourcePos pos, char **endPtr);
|
||||
static void lCComment(SourcePos *);
|
||||
static void lCppComment(SourcePos *);
|
||||
static void lHandleCppHash(SourcePos *);
|
||||
@@ -67,7 +67,7 @@ inline int isatty(int) { return 0; }
|
||||
%option nounistd
|
||||
|
||||
WHITESPACE [ \t\r]+
|
||||
INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))
|
||||
INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[kMG]?
|
||||
FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?)
|
||||
HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?)
|
||||
|
||||
@@ -151,30 +151,44 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
|
||||
{INT_NUMBER}+(u|U|l|L)*? {
|
||||
int ls = 0, us = 0;
|
||||
|
||||
char *endPtr = NULL;
|
||||
if (yytext[0] == '0' && yytext[1] == 'b')
|
||||
yylval->intVal = lParseBinary(yytext+2, *yylloc);
|
||||
yylval->intVal = lParseBinary(yytext+2, *yylloc, &endPtr);
|
||||
else {
|
||||
char *endPtr = NULL;
|
||||
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#if defined(ISPC_IS_WINDOWS) && !defined(__MINGW32__)
|
||||
yylval->intVal = _strtoi64(yytext, &endPtr, 0);
|
||||
#else
|
||||
// FIXME: should use strtouq and then issue an error if we can't
|
||||
// fit into 64 bits...
|
||||
yylval->intVal = strtoull(yytext, &endPtr, 0);
|
||||
#endif
|
||||
for (; *endPtr; endPtr++) {
|
||||
if (*endPtr == 'l' || *endPtr == 'L')
|
||||
ls++;
|
||||
else if (*endPtr == 'u' || *endPtr == 'U')
|
||||
us++;
|
||||
}
|
||||
if (ls >= 2)
|
||||
return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
|
||||
else if (ls == 1)
|
||||
return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
|
||||
}
|
||||
|
||||
bool kilo = false, mega = false, giga = false;
|
||||
for (; *endPtr; endPtr++) {
|
||||
if (*endPtr == 'k')
|
||||
kilo = true;
|
||||
else if (*endPtr == 'M')
|
||||
mega = true;
|
||||
else if (*endPtr == 'G')
|
||||
giga = true;
|
||||
else if (*endPtr == 'l' || *endPtr == 'L')
|
||||
ls++;
|
||||
else if (*endPtr == 'u' || *endPtr == 'U')
|
||||
us++;
|
||||
}
|
||||
if (kilo)
|
||||
yylval->intVal *= 1024;
|
||||
if (mega)
|
||||
yylval->intVal *= 1024*1024;
|
||||
if (giga)
|
||||
yylval->intVal *= 1024*1024*1024;
|
||||
|
||||
if (ls >= 2)
|
||||
return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
|
||||
else if (ls == 1)
|
||||
return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
|
||||
|
||||
// See if we can fit this into a 32-bit integer...
|
||||
if ((yylval->intVal & 0xffffffff) == yylval->intVal)
|
||||
return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
|
||||
@@ -268,14 +282,11 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
|
||||
/** Return the integer version of a binary constant from a string.
|
||||
*/
|
||||
static uint64_t
|
||||
lParseBinary(const char *ptr, SourcePos pos) {
|
||||
lParseBinary(const char *ptr, SourcePos pos, char **endPtr) {
|
||||
uint64_t val = 0;
|
||||
bool warned = false;
|
||||
|
||||
while (*ptr != '\0') {
|
||||
/* if this hits, the regexp for 0b... constants is broken */
|
||||
Assert(*ptr == '0' || *ptr == '1');
|
||||
|
||||
while (*ptr == '0' || *ptr == '1') {
|
||||
if ((val & (((int64_t)1)<<63)) && warned == false) {
|
||||
// We're about to shift out a set bit
|
||||
Warning(pos, "Can't represent binary constant with a 64-bit integer type");
|
||||
@@ -285,6 +296,7 @@ lParseBinary(const char *ptr, SourcePos pos) {
|
||||
val = (val << 1) | (*ptr == '0' ? 0 : 1);
|
||||
++ptr;
|
||||
}
|
||||
*endPtr = (char *)ptr;
|
||||
return val;
|
||||
}
|
||||
|
||||
|
||||
240
llvmutil.cpp
240
llvmutil.cpp
@@ -36,7 +36,9 @@
|
||||
*/
|
||||
|
||||
#include "llvmutil.h"
|
||||
#include "ispc.h"
|
||||
#include "type.h"
|
||||
#include <llvm/Instructions.h>
|
||||
|
||||
LLVM_TYPE_CONST llvm::Type *LLVMTypes::VoidType = NULL;
|
||||
LLVM_TYPE_CONST llvm::PointerType *LLVMTypes::VoidPointerType = NULL;
|
||||
@@ -109,7 +111,7 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
|
||||
LLVMTypes::MaskType = LLVMTypes::BoolVectorType =
|
||||
llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.vectorWidth);
|
||||
else {
|
||||
assert(target.maskBitCount == 32);
|
||||
Assert(target.maskBitCount == 32);
|
||||
LLVMTypes::MaskType = LLVMTypes::BoolVectorType =
|
||||
llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.vectorWidth);
|
||||
}
|
||||
@@ -465,3 +467,239 @@ LLVMBoolVector(const bool *bvec) {
|
||||
}
|
||||
return llvm::ConstantVector::get(vals);
|
||||
}
|
||||
|
||||
|
||||
/** Conservative test to see if two llvm::Values are equal. There are
|
||||
(potentially many) cases where the two values actually are equal but
|
||||
this will return false. However, if it does return true, the two
|
||||
vectors definitely are equal.
|
||||
|
||||
@todo This seems to catch all of the cases we currently need it for in
|
||||
practice, but it's be nice to make it a little more robust/general. In
|
||||
general, though, a little something called the halting problem means we
|
||||
won't get all of them.
|
||||
*/
|
||||
static bool
|
||||
lValuesAreEqual(llvm::Value *v0, llvm::Value *v1,
|
||||
std::vector<llvm::PHINode *> &seenPhi0,
|
||||
std::vector<llvm::PHINode *> &seenPhi1) {
|
||||
// Thanks to the fact that LLVM hashes and returns the same pointer for
|
||||
// constants (of all sorts, even constant expressions), this first test
|
||||
// actually catches a lot of cases. LLVM's SSA form also helps a lot
|
||||
// with this..
|
||||
if (v0 == v1)
|
||||
return true;
|
||||
|
||||
Assert(seenPhi0.size() == seenPhi1.size());
|
||||
for (unsigned int i = 0; i < seenPhi0.size(); ++i)
|
||||
if (v0 == seenPhi0[i] && v1 == seenPhi1[i])
|
||||
return true;
|
||||
|
||||
llvm::BinaryOperator *bo0 = llvm::dyn_cast<llvm::BinaryOperator>(v0);
|
||||
llvm::BinaryOperator *bo1 = llvm::dyn_cast<llvm::BinaryOperator>(v1);
|
||||
if (bo0 != NULL && bo1 != NULL) {
|
||||
if (bo0->getOpcode() != bo1->getOpcode())
|
||||
return false;
|
||||
return (lValuesAreEqual(bo0->getOperand(0), bo1->getOperand(0),
|
||||
seenPhi0, seenPhi1) &&
|
||||
lValuesAreEqual(bo0->getOperand(1), bo1->getOperand(1),
|
||||
seenPhi0, seenPhi1));
|
||||
}
|
||||
|
||||
llvm::PHINode *phi0 = llvm::dyn_cast<llvm::PHINode>(v0);
|
||||
llvm::PHINode *phi1 = llvm::dyn_cast<llvm::PHINode>(v1);
|
||||
if (phi0 != NULL && phi1 != NULL) {
|
||||
if (phi0->getNumIncomingValues() != phi1->getNumIncomingValues())
|
||||
return false;
|
||||
|
||||
seenPhi0.push_back(phi0);
|
||||
seenPhi1.push_back(phi1);
|
||||
|
||||
unsigned int numIncoming = phi0->getNumIncomingValues();
|
||||
// Check all of the incoming values: if all of them are all equal,
|
||||
// then we're good.
|
||||
bool anyFailure = false;
|
||||
for (unsigned int i = 0; i < numIncoming; ++i) {
|
||||
Assert(phi0->getIncomingBlock(i) == phi1->getIncomingBlock(i));
|
||||
if (!lValuesAreEqual(phi0->getIncomingValue(i),
|
||||
phi1->getIncomingValue(i), seenPhi0, seenPhi1)) {
|
||||
anyFailure = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
seenPhi0.pop_back();
|
||||
seenPhi1.pop_back();
|
||||
|
||||
return !anyFailure;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/** Given an llvm::Value known to be an integer, return its value as
|
||||
an int64_t.
|
||||
*/
|
||||
static int64_t
|
||||
lGetIntValue(llvm::Value *offset) {
|
||||
llvm::ConstantInt *intOffset = llvm::dyn_cast<llvm::ConstantInt>(offset);
|
||||
Assert(intOffset && (intOffset->getBitWidth() == 32 ||
|
||||
intOffset->getBitWidth() == 64));
|
||||
return intOffset->getSExtValue();
|
||||
}
|
||||
|
||||
|
||||
/** This function takes chains of InsertElement instructions along the
|
||||
lines of:
|
||||
|
||||
%v0 = insertelement undef, value_0, i32 index_0
|
||||
%v1 = insertelement %v1, value_1, i32 index_1
|
||||
...
|
||||
%vn = insertelement %vn-1, value_n-1, i32 index_n-1
|
||||
|
||||
and initializes the provided elements array such that the i'th
|
||||
llvm::Value * in the array is the element that was inserted into the
|
||||
i'th element of the vector.
|
||||
*/
|
||||
void
|
||||
LLVMFlattenInsertChain(llvm::InsertElementInst *ie, int vectorWidth,
|
||||
llvm::Value **elements) {
|
||||
for (int i = 0; i < vectorWidth; ++i)
|
||||
elements[i] = NULL;
|
||||
|
||||
while (ie != NULL) {
|
||||
int64_t iOffset = lGetIntValue(ie->getOperand(2));
|
||||
Assert(iOffset >= 0 && iOffset < vectorWidth);
|
||||
Assert(elements[iOffset] == NULL);
|
||||
|
||||
elements[iOffset] = ie->getOperand(1);
|
||||
|
||||
llvm::Value *insertBase = ie->getOperand(0);
|
||||
ie = llvm::dyn_cast<llvm::InsertElementInst>(insertBase);
|
||||
if (ie == NULL) {
|
||||
if (llvm::isa<llvm::UndefValue>(insertBase))
|
||||
return;
|
||||
|
||||
llvm::ConstantVector *cv =
|
||||
llvm::dyn_cast<llvm::ConstantVector>(insertBase);
|
||||
Assert(cv != NULL);
|
||||
Assert(iOffset < (int)cv->getNumOperands());
|
||||
elements[iOffset] = cv->getOperand(iOffset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** Tests to see if all of the elements of the vector in the 'v' parameter
|
||||
are equal. Like lValuesAreEqual(), this is a conservative test and may
|
||||
return false for arrays where the values are actually all equal. */
|
||||
bool
|
||||
LLVMVectorValuesAllEqual(llvm::Value *v, int vectorLength,
|
||||
std::vector<llvm::PHINode *> &seenPhis) {
|
||||
if (llvm::isa<llvm::ConstantAggregateZero>(v))
|
||||
return true;
|
||||
|
||||
llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(v);
|
||||
if (cv != NULL)
|
||||
return (cv->getSplatValue() != NULL);
|
||||
|
||||
llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(v);
|
||||
if (bop != NULL)
|
||||
return (LLVMVectorValuesAllEqual(bop->getOperand(0), vectorLength,
|
||||
seenPhis) &&
|
||||
LLVMVectorValuesAllEqual(bop->getOperand(1), vectorLength,
|
||||
seenPhis));
|
||||
|
||||
llvm::CastInst *cast = llvm::dyn_cast<llvm::CastInst>(v);
|
||||
if (cast != NULL)
|
||||
return LLVMVectorValuesAllEqual(cast->getOperand(0), vectorLength,
|
||||
seenPhis);
|
||||
|
||||
llvm::InsertElementInst *ie = llvm::dyn_cast<llvm::InsertElementInst>(v);
|
||||
if (ie != NULL) {
|
||||
llvm::Value *elements[ISPC_MAX_NVEC];
|
||||
LLVMFlattenInsertChain(ie, vectorLength, elements);
|
||||
|
||||
// We will ignore any values of elements[] that are NULL; as they
|
||||
// correspond to undefined values--we just want to see if all of
|
||||
// the defined values have the same value.
|
||||
int lastNonNull = 0;
|
||||
while (lastNonNull < vectorLength && elements[lastNonNull] == NULL)
|
||||
++lastNonNull;
|
||||
|
||||
if (lastNonNull == vectorLength)
|
||||
// all of them are undef!
|
||||
return true;
|
||||
|
||||
for (int i = lastNonNull; i < vectorLength; ++i) {
|
||||
if (elements[i] == NULL)
|
||||
continue;
|
||||
|
||||
std::vector<llvm::PHINode *> seenPhi0;
|
||||
std::vector<llvm::PHINode *> seenPhi1;
|
||||
if (lValuesAreEqual(elements[lastNonNull], elements[i], seenPhi0,
|
||||
seenPhi1) == false)
|
||||
return false;
|
||||
lastNonNull = i;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
llvm::PHINode *phi = llvm::dyn_cast<llvm::PHINode>(v);
|
||||
if (phi) {
|
||||
for (unsigned int i = 0; i < seenPhis.size(); ++i)
|
||||
if (seenPhis[i] == phi)
|
||||
return true;
|
||||
|
||||
seenPhis.push_back(phi);
|
||||
|
||||
unsigned int numIncoming = phi->getNumIncomingValues();
|
||||
// Check all of the incoming values: if all of them are all equal,
|
||||
// then we're good.
|
||||
for (unsigned int i = 0; i < numIncoming; ++i) {
|
||||
if (!LLVMVectorValuesAllEqual(phi->getIncomingValue(i), vectorLength,
|
||||
seenPhis)) {
|
||||
seenPhis.pop_back();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
seenPhis.pop_back();
|
||||
return true;
|
||||
}
|
||||
|
||||
Assert(!llvm::isa<llvm::Constant>(v));
|
||||
|
||||
if (llvm::isa<llvm::CallInst>(v) || llvm::isa<llvm::LoadInst>(v) ||
|
||||
!llvm::isa<llvm::Instruction>(v))
|
||||
return false;
|
||||
|
||||
llvm::ShuffleVectorInst *shuffle = llvm::dyn_cast<llvm::ShuffleVectorInst>(v);
|
||||
if (shuffle != NULL) {
|
||||
llvm::Value *indices = shuffle->getOperand(2);
|
||||
if (LLVMVectorValuesAllEqual(indices, vectorLength, seenPhis))
|
||||
// The easy case--just a smear of the same element across the
|
||||
// whole vector.
|
||||
return true;
|
||||
|
||||
// TODO: handle more general cases?
|
||||
return false;
|
||||
}
|
||||
|
||||
#if 0
|
||||
fprintf(stderr, "all equal: ");
|
||||
v->dump();
|
||||
fprintf(stderr, "\n");
|
||||
llvm::Instruction *inst = llvm::dyn_cast<llvm::Instruction>(v);
|
||||
if (inst) {
|
||||
inst->getParent()->dump();
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
23
llvmutil.h
23
llvmutil.h
@@ -38,12 +38,23 @@
|
||||
#ifndef ISPC_LLVMUTIL_H
|
||||
#define ISPC_LLVMUTIL_H 1
|
||||
|
||||
#include "ispc.h"
|
||||
#include <llvm/LLVMContext.h>
|
||||
#include <llvm/Type.h>
|
||||
#include <llvm/DerivedTypes.h>
|
||||
#include <llvm/Constants.h>
|
||||
|
||||
namespace llvm {
|
||||
class PHINode;
|
||||
class InsertElementInst;
|
||||
}
|
||||
|
||||
// llvm::Type *s are no longer const in llvm 3.0
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
||||
#define LLVM_TYPE_CONST
|
||||
#else
|
||||
#define LLVM_TYPE_CONST const
|
||||
#endif
|
||||
|
||||
|
||||
/** This structure holds pointers to a variety of LLVM types; code
|
||||
elsewhere can use them from here, rather than needing to make more
|
||||
@@ -99,6 +110,7 @@ extern llvm::Constant *LLVMTrue, *LLVMFalse;
|
||||
of LLVMTypes and the LLVMTrue/LLVMFalse constants. However, it can't
|
||||
be called until the compilation target is known.
|
||||
*/
|
||||
struct Target;
|
||||
extern void InitLLVMUtil(llvm::LLVMContext *ctx, Target target);
|
||||
|
||||
/** Returns an LLVM i8 constant of the given value */
|
||||
@@ -205,4 +217,13 @@ extern llvm::Constant *LLVMMaskAllOn;
|
||||
/** LLVM constant value representing an 'all off' SIMD lane mask */
|
||||
extern llvm::Constant *LLVMMaskAllOff;
|
||||
|
||||
/** Tests to see if all of the elements of the vector in the 'v' parameter
|
||||
are equal. Like lValuesAreEqual(), this is a conservative test and may
|
||||
return false for arrays where the values are actually all equal. */
|
||||
extern bool LLVMVectorValuesAllEqual(llvm::Value *v, int vectorLength,
|
||||
std::vector<llvm::PHINode *> &seenPhis);
|
||||
|
||||
void LLVMFlattenInsertChain(llvm::InsertElementInst *ie, int vectorWidth,
|
||||
llvm::Value **elements);
|
||||
|
||||
#endif // ISPC_LLVMUTIL_H
|
||||
|
||||
33
main.cpp
33
main.cpp
@@ -38,6 +38,7 @@
|
||||
#include "ispc.h"
|
||||
#include "module.h"
|
||||
#include "util.h"
|
||||
#include "type.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <llvm/Support/PrettyStackTrace.h>
|
||||
@@ -53,14 +54,33 @@
|
||||
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#define strcasecmp stricmp
|
||||
#ifndef BUILD_DATE
|
||||
#define BUILD_DATE __DATE__
|
||||
#endif
|
||||
#define BUILD_VERSION ""
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
|
||||
static void usage(int ret) {
|
||||
printf("This is the Intel(r) SPMD Program Compiler (ispc), build %s (%s)\n\n",
|
||||
BUILD_DATE, BUILD_VERSION);
|
||||
printf("usage: ispc\n");
|
||||
static void
|
||||
lPrintVersion() {
|
||||
printf("Intel(r) SPMD Program Compiler (ispc), build %s (%s, LLVM %s)\n",
|
||||
BUILD_DATE, BUILD_VERSION,
|
||||
#ifdef LLVM_2_9
|
||||
"2.9"
|
||||
#elif defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
"3.0"
|
||||
#elif defined(LLVM_3_1) || defined(LLVM_3_1svn)
|
||||
"3.1"
|
||||
#else
|
||||
#error "Unhandled LLVM version"
|
||||
#endif
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
usage(int ret) {
|
||||
lPrintVersion();
|
||||
printf("\nusage: ispc\n");
|
||||
printf(" [--addressing={32,64}]\t\tSelect 32- or 64-bit addressing. (Note that 32-bit\n");
|
||||
printf(" \t\taddressing calculations are done by default, even\n");
|
||||
printf(" \t\ton 64-bit target architectures.)\n");
|
||||
@@ -188,6 +208,8 @@ int main(int Argc, char *Argv[]) {
|
||||
LLVMInitializeX86TargetMC();
|
||||
#endif
|
||||
|
||||
AtomicType::Init();
|
||||
|
||||
char *file = NULL;
|
||||
const char *headerFileName = NULL;
|
||||
const char *outFileName = NULL;
|
||||
@@ -362,8 +384,7 @@ int main(int Argc, char *Argv[]) {
|
||||
generatePIC = true;
|
||||
#endif // !ISPC_IS_WINDOWS
|
||||
else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) {
|
||||
printf("Intel(r) SPMD Program Compiler (ispc) build %s (%s)\n",
|
||||
BUILD_DATE, BUILD_VERSION);
|
||||
lPrintVersion();
|
||||
return 0;
|
||||
}
|
||||
else if (argv[i][0] == '-') {
|
||||
|
||||
93
parse.yy
93
parse.yy
@@ -224,7 +224,7 @@ struct ForeachDimension {
|
||||
%type <declSpecs> declaration_specifiers
|
||||
|
||||
%type <stringVal> string_constant
|
||||
%type <constCharPtr> struct_or_union_name enum_identifier
|
||||
%type <constCharPtr> struct_or_union_name enum_identifier goto_identifier
|
||||
%type <intVal> int_constant soa_width_specifier
|
||||
|
||||
%type <foreachDimension> foreach_dimension_specifier
|
||||
@@ -362,13 +362,7 @@ cast_expression
|
||||
: unary_expression
|
||||
| '(' type_name ')' cast_expression
|
||||
{
|
||||
// Pass true here to try to preserve uniformity
|
||||
// so that things like:
|
||||
// uniform int y = ...;
|
||||
// uniform float x = 1. / (float)y;
|
||||
// don't issue an error due to (float)y being inadvertently
|
||||
// and undesirably-to-the-user "varying"...
|
||||
$$ = new TypeCastExpr($2, $4, true, Union(@1,@4));
|
||||
$$ = new TypeCastExpr($2, $4, Union(@1,@4));
|
||||
}
|
||||
;
|
||||
|
||||
@@ -500,6 +494,7 @@ declaration_statement
|
||||
$$ = NULL;
|
||||
}
|
||||
else {
|
||||
$1->DeclareFunctions();
|
||||
std::vector<VariableDeclaration> vars = $1->GetVariableDeclarations();
|
||||
$$ = new DeclStmt(vars, @1);
|
||||
}
|
||||
@@ -638,13 +633,13 @@ type_specifier
|
||||
|
||||
atomic_var_type_specifier
|
||||
: TOKEN_VOID { $$ = AtomicType::Void; }
|
||||
| TOKEN_BOOL { $$ = AtomicType::VaryingBool; }
|
||||
| TOKEN_INT8 { $$ = AtomicType::VaryingInt8; }
|
||||
| TOKEN_INT16 { $$ = AtomicType::VaryingInt16; }
|
||||
| TOKEN_INT { $$ = AtomicType::VaryingInt32; }
|
||||
| TOKEN_FLOAT { $$ = AtomicType::VaryingFloat; }
|
||||
| TOKEN_DOUBLE { $$ = AtomicType::VaryingDouble; }
|
||||
| TOKEN_INT64 { $$ = AtomicType::VaryingInt64; }
|
||||
| TOKEN_BOOL { $$ = AtomicType::UnboundBool; }
|
||||
| TOKEN_INT8 { $$ = AtomicType::UnboundInt8; }
|
||||
| TOKEN_INT16 { $$ = AtomicType::UnboundInt16; }
|
||||
| TOKEN_INT { $$ = AtomicType::UnboundInt32; }
|
||||
| TOKEN_FLOAT { $$ = AtomicType::UnboundFloat; }
|
||||
| TOKEN_DOUBLE { $$ = AtomicType::UnboundDouble; }
|
||||
| TOKEN_INT64 { $$ = AtomicType::UnboundInt64; }
|
||||
;
|
||||
|
||||
short_vec_specifier
|
||||
@@ -670,7 +665,7 @@ struct_or_union_specifier
|
||||
GetStructTypesNamesPositions(*$4, &elementTypes, &elementNames,
|
||||
&elementPositions);
|
||||
StructType *st = new StructType($2, elementTypes, elementNames,
|
||||
elementPositions, false, true, @2);
|
||||
elementPositions, false, Type::Unbound, @2);
|
||||
m->symbolTable->AddType($2, st, @2);
|
||||
$$ = st;
|
||||
}
|
||||
@@ -681,8 +676,9 @@ struct_or_union_specifier
|
||||
std::vector<SourcePos> elementPositions;
|
||||
GetStructTypesNamesPositions(*$3, &elementTypes, &elementNames,
|
||||
&elementPositions);
|
||||
// FIXME: should be unbound
|
||||
$$ = new StructType("", elementTypes, elementNames, elementPositions,
|
||||
false, true, @1);
|
||||
false, Type::Unbound, @1);
|
||||
}
|
||||
| struct_or_union '{' '}'
|
||||
{
|
||||
@@ -748,7 +744,7 @@ specifier_qualifier_list
|
||||
else if ($1 == TYPEQUAL_SIGNED) {
|
||||
if ($2->IsIntType() == false) {
|
||||
Error(@1, "Can't apply \"signed\" qualifier to \"%s\" type.",
|
||||
$2->GetString().c_str());
|
||||
$2->ResolveUnboundVariability(Type::Varying)->GetString().c_str());
|
||||
$$ = $2;
|
||||
}
|
||||
}
|
||||
@@ -758,7 +754,7 @@ specifier_qualifier_list
|
||||
$$ = t;
|
||||
else {
|
||||
Error(@1, "Can't apply \"unsigned\" qualifier to \"%s\" type. Ignoring.",
|
||||
$2->GetString().c_str());
|
||||
$2->ResolveUnboundVariability(Type::Varying)->GetString().c_str());
|
||||
$$ = $2;
|
||||
}
|
||||
}
|
||||
@@ -775,8 +771,11 @@ specifier_qualifier_list
|
||||
else
|
||||
FATAL("Unhandled type qualifier in parser.");
|
||||
}
|
||||
else
|
||||
else {
|
||||
if (m->errorCount == 0)
|
||||
Error(@1, "Lost type qualifier in parser.");
|
||||
$$ = NULL;
|
||||
}
|
||||
}
|
||||
;
|
||||
|
||||
@@ -1112,8 +1111,7 @@ type_name
|
||||
abstract_declarator
|
||||
: pointer
|
||||
{
|
||||
Declarator *d = new Declarator(DK_POINTER, @1);
|
||||
$$ = d;
|
||||
$$ = $1;
|
||||
}
|
||||
| direct_abstract_declarator
|
||||
| pointer direct_abstract_declarator
|
||||
@@ -1262,10 +1260,22 @@ statement
|
||||
;
|
||||
|
||||
labeled_statement
|
||||
: TOKEN_CASE constant_expression ':' statement
|
||||
{ UNIMPLEMENTED; }
|
||||
: goto_identifier ':' statement
|
||||
{
|
||||
$$ = new LabeledStmt($1, $3, @1);
|
||||
}
|
||||
| TOKEN_CASE constant_expression ':' statement
|
||||
{
|
||||
int value;
|
||||
if ($2 != NULL &&
|
||||
lGetConstantInt($2, &value, @2, "Case statement value")) {
|
||||
$$ = new CaseStmt(value, $4, Union(@1, @2));
|
||||
}
|
||||
else
|
||||
$$ = NULL;
|
||||
}
|
||||
| TOKEN_DEFAULT ':' statement
|
||||
{ UNIMPLEMENTED; }
|
||||
{ $$ = new DefaultStmt($3, @1); }
|
||||
;
|
||||
|
||||
start_scope
|
||||
@@ -1311,7 +1321,7 @@ selection_statement
|
||||
| TOKEN_CIF '(' expression ')' statement TOKEN_ELSE statement
|
||||
{ $$ = new IfStmt($3, $5, $7, true, @1); }
|
||||
| TOKEN_SWITCH '(' expression ')' statement
|
||||
{ UNIMPLEMENTED; }
|
||||
{ $$ = new SwitchStmt($3, $5, @1); }
|
||||
;
|
||||
|
||||
for_test
|
||||
@@ -1433,9 +1443,13 @@ iteration_statement
|
||||
}
|
||||
;
|
||||
|
||||
goto_identifier
|
||||
: TOKEN_IDENTIFIER { $$ = yylval.stringVal->c_str(); }
|
||||
;
|
||||
|
||||
jump_statement
|
||||
: TOKEN_GOTO TOKEN_IDENTIFIER ';'
|
||||
{ UNIMPLEMENTED; }
|
||||
: TOKEN_GOTO goto_identifier ';'
|
||||
{ $$ = new GotoStmt($2, @1, @2); }
|
||||
| TOKEN_CONTINUE ';'
|
||||
{ $$ = new ContinueStmt(false, @1); }
|
||||
| TOKEN_BREAK ';'
|
||||
@@ -1551,19 +1565,21 @@ lAddDeclaration(DeclSpecs *ds, Declarator *decl) {
|
||||
const Type *t = decl->GetType(ds);
|
||||
if (t == NULL)
|
||||
return;
|
||||
|
||||
Symbol *sym = decl->GetSymbol();
|
||||
Assert(sym != NULL);
|
||||
const FunctionType *ft = dynamic_cast<const FunctionType *>(t);
|
||||
if (ft != NULL) {
|
||||
Symbol *funSym = decl->GetSymbol();
|
||||
Assert(funSym != NULL);
|
||||
funSym->type = ft;
|
||||
funSym->storageClass = ds->storageClass;
|
||||
|
||||
sym->type = ft;
|
||||
sym->storageClass = ds->storageClass;
|
||||
bool isInline = (ds->typeQualifiers & TYPEQUAL_INLINE);
|
||||
m->AddFunctionDeclaration(funSym, isInline);
|
||||
m->AddFunctionDeclaration(sym, isInline);
|
||||
}
|
||||
else {
|
||||
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
|
||||
bool isConst = (ds->typeQualifiers & TYPEQUAL_CONST) != 0;
|
||||
m->AddGlobalVariable(sym, decl->initExpr, isConst);
|
||||
}
|
||||
else
|
||||
m->AddGlobalVariable(decl->GetSymbol(), decl->initExpr,
|
||||
(ds->typeQualifiers & TYPEQUAL_CONST) != 0);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1589,6 +1605,7 @@ lAddFunctionParams(Declarator *decl) {
|
||||
continue;
|
||||
Assert(pdecl->declarators.size() == 1);
|
||||
Symbol *sym = pdecl->declarators[0]->GetSymbol();
|
||||
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
|
||||
#ifndef NDEBUG
|
||||
bool ok = m->symbolTable->AddVariable(sym);
|
||||
if (ok == false)
|
||||
@@ -1754,7 +1771,7 @@ lFinalizeEnumeratorSymbols(std::vector<Symbol *> &enums,
|
||||
the actual enum type here and optimize it, which will have
|
||||
us end up with a ConstExpr with the desired EnumType... */
|
||||
Expr *castExpr = new TypeCastExpr(enumType, enums[i]->constValue,
|
||||
false, enums[i]->pos);
|
||||
enums[i]->pos);
|
||||
castExpr = Optimize(castExpr);
|
||||
enums[i]->constValue = dynamic_cast<ConstExpr *>(castExpr);
|
||||
Assert(enums[i]->constValue != NULL);
|
||||
|
||||
126
run_tests.py
126
run_tests.py
@@ -15,6 +15,7 @@ import string
|
||||
import subprocess
|
||||
import shlex
|
||||
import platform
|
||||
import tempfile
|
||||
|
||||
# This script is affected by http://bugs.python.org/issue5261 on OSX 10.5 Leopard
|
||||
# git history has a workaround for that issue.
|
||||
@@ -79,7 +80,12 @@ if len(args) == 0:
|
||||
files = glob.glob("tests/*ispc") + glob.glob("failing_tests/*ispc") + \
|
||||
glob.glob("tests_errors/*ispc")
|
||||
else:
|
||||
files = args
|
||||
files = [ ]
|
||||
for f in args:
|
||||
if os.path.splitext(string.lower(f))[1] != ".ispc":
|
||||
print "Ignoring file %s, which doesn't have an .ispc extension." % f
|
||||
else:
|
||||
files += [ f ]
|
||||
|
||||
# randomly shuffle the tests if asked to do so
|
||||
if (options.random):
|
||||
@@ -146,16 +152,22 @@ def run_cmds(compile_cmds, run_cmd, filename, expect_failure):
|
||||
|
||||
|
||||
def run_test(filename):
|
||||
global is_windows
|
||||
if is_windows:
|
||||
input_prefix = "../"
|
||||
else:
|
||||
input_prefix = ""
|
||||
|
||||
# is this a test to make sure an error is issued?
|
||||
want_error = (filename.find("tests_errors") != -1)
|
||||
if want_error == True:
|
||||
ispc_cmd = ispc_exe + " --werror --nowrap %s --arch=%s --target=%s" % \
|
||||
(filename, options.arch, options.target)
|
||||
(input_prefix + filename, options.arch, options.target)
|
||||
(return_code, output) = run_command(ispc_cmd)
|
||||
got_error = (return_code != 0)
|
||||
|
||||
# figure out the error message we're expecting
|
||||
file = open(filename, 'r')
|
||||
file = open(input_prefix + filename, 'r')
|
||||
firstline = file.readline()
|
||||
firstline = firstline.replace("//", "")
|
||||
firstline = firstline.lstrip()
|
||||
@@ -179,7 +191,7 @@ def run_test(filename):
|
||||
# function that this test has.
|
||||
sig2def = { "f_v(" : 0, "f_f(" : 1, "f_fu(" : 2, "f_fi(" : 3,
|
||||
"f_du(" : 4, "f_duf(" : 5, "f_di(" : 6 }
|
||||
file = open(filename, 'r')
|
||||
file = open(input_prefix + filename, 'r')
|
||||
match = -1
|
||||
for line in file:
|
||||
# look for lines with 'export'...
|
||||
@@ -201,14 +213,13 @@ def run_test(filename):
|
||||
if is_generic_target:
|
||||
obj_name = "%s.cpp" % filename
|
||||
|
||||
global is_windows
|
||||
if is_windows:
|
||||
if not is_generic_target:
|
||||
obj_name = "%s.obj" % filename
|
||||
exe_name = "%s.exe" % filename
|
||||
obj_name = "%s%s.obj" % (input_prefix, filename)
|
||||
exe_name = "%s%s.exe" % (input_prefix, filename)
|
||||
|
||||
cc_cmd = "%s /I. /Iwinstuff /Zi /nologo /DTEST_SIG=%d test_static.cpp %s /Fe%s" % \
|
||||
(options.compiler_exe, match, obj_name, exe_name)
|
||||
cc_cmd = "%s /I. /Iwinstuff /Zi /nologo /DTEST_SIG=%d %stest_static.cpp %s /Fe%s" % \
|
||||
(options.compiler_exe, match, input_prefix, obj_name, exe_name)
|
||||
if should_fail:
|
||||
cc_cmd += " /DEXPECT_FAILURE"
|
||||
else:
|
||||
@@ -220,7 +231,7 @@ def run_test(filename):
|
||||
gcc_arch = '-m32'
|
||||
else:
|
||||
gcc_arch = '-m64'
|
||||
cc_cmd = "%s -msse4.2 -I. %s test_static.cpp -DTEST_SIG=%d %s -o %s" % \
|
||||
cc_cmd = "%s -O2 -msse4.2 -I. %s test_static.cpp -DTEST_SIG=%d %s -o %s" % \
|
||||
(options.compiler_exe, gcc_arch, match, obj_name, exe_name)
|
||||
if platform.system() == 'Darwin':
|
||||
cc_cmd += ' -Wl,-no_pie'
|
||||
@@ -228,7 +239,7 @@ def run_test(filename):
|
||||
cc_cmd += " -DEXPECT_FAILURE"
|
||||
|
||||
ispc_cmd = ispc_exe + " --woff %s -o %s --arch=%s --target=%s" % \
|
||||
(filename, obj_name, options.arch, options.target)
|
||||
(input_prefix+filename, obj_name, options.arch, options.target)
|
||||
if options.no_opt:
|
||||
ispc_cmd += " -O0"
|
||||
if is_generic_target:
|
||||
@@ -257,12 +268,28 @@ def run_test(filename):
|
||||
# this function will be running in parallel across all of the CPU cores of
|
||||
# the system.
|
||||
def run_tasks_from_queue(queue, queue_ret):
|
||||
if is_windows:
|
||||
tmpdir = "tmp%d" % os.getpid()
|
||||
os.mkdir(tmpdir)
|
||||
os.chdir(tmpdir)
|
||||
else:
|
||||
olddir = ""
|
||||
|
||||
compile_error_files = [ ]
|
||||
run_error_files = [ ]
|
||||
while True:
|
||||
filename = queue.get()
|
||||
if (filename == 'STOP'):
|
||||
queue_ret.put((compile_error_files, run_error_files))
|
||||
if is_windows:
|
||||
try:
|
||||
os.remove("test_static.obj")
|
||||
os.remove("/vc100.pdb")
|
||||
os.chdir("..")
|
||||
os.rmdir(tmpdir)
|
||||
except:
|
||||
None
|
||||
|
||||
sys.exit(0)
|
||||
|
||||
(compile_error, run_error) = run_test(filename)
|
||||
@@ -286,61 +313,38 @@ if __name__ == '__main__':
|
||||
|
||||
compile_error_files = [ ]
|
||||
run_error_files = [ ]
|
||||
if is_windows:
|
||||
# cl.exe gets itself all confused if we have multiple instances of
|
||||
# it running concurrently and operating on the same .cpp file
|
||||
# (test_static.cpp), even if we are generating a differently-named
|
||||
# exe in the end. So run serially. :-(
|
||||
nthreads = 1
|
||||
num_done = 0
|
||||
sys.stdout.write("Running %d tests.\n" % (total_tests))
|
||||
for fn in files:
|
||||
fn = fn.replace("\\",'/')
|
||||
(compile_error, run_error) = run_test(fn)
|
||||
if compile_error != 0:
|
||||
compile_error_files += [ fn ]
|
||||
if run_error != 0:
|
||||
run_error_files += [ fn ]
|
||||
num_done += 1
|
||||
progress_str = " Done %d / %d [%s]\n" % (num_done, total_tests, fn)
|
||||
# spaces to clear out detritus from previous printing...
|
||||
for x in range(30):
|
||||
progress_str += ' '
|
||||
progress_str += '\r'
|
||||
sys.stdout.write(progress_str)
|
||||
sys.stdout.flush()
|
||||
else:
|
||||
nthreads = multiprocessing.cpu_count()
|
||||
sys.stdout.write("Found %d CPUs. Running %d tests.\n" % (nthreads, total_tests))
|
||||
|
||||
# put each of the test filenames into a queue
|
||||
q = multiprocessing.Queue()
|
||||
for fn in files:
|
||||
q.put(fn)
|
||||
for x in range(nthreads):
|
||||
q.put('STOP')
|
||||
qret = multiprocessing.Queue()
|
||||
nthreads = multiprocessing.cpu_count()
|
||||
print "Found %d CPUs. Running %d tests." % (nthreads, total_tests)
|
||||
|
||||
# need to catch sigint so that we can terminate all of the tasks if
|
||||
# we're interrupted
|
||||
signal.signal(signal.SIGINT, sigint)
|
||||
# put each of the test filenames into a queue
|
||||
q = multiprocessing.Queue()
|
||||
for fn in files:
|
||||
q.put(fn)
|
||||
for x in range(nthreads):
|
||||
q.put('STOP')
|
||||
qret = multiprocessing.Queue()
|
||||
|
||||
# launch jobs to run tests
|
||||
for x in range(nthreads):
|
||||
t = multiprocessing.Process(target=run_tasks_from_queue, args=(q,qret))
|
||||
task_threads.append(t)
|
||||
t.start()
|
||||
# need to catch sigint so that we can terminate all of the tasks if
|
||||
# we're interrupted
|
||||
signal.signal(signal.SIGINT, sigint)
|
||||
|
||||
# wait for them to all finish and then return the number that failed
|
||||
# (i.e. return 0 if all is ok)
|
||||
for t in task_threads:
|
||||
t.join()
|
||||
sys.stdout.write("\n")
|
||||
# launch jobs to run tests
|
||||
for x in range(nthreads):
|
||||
t = multiprocessing.Process(target=run_tasks_from_queue, args=(q,qret))
|
||||
task_threads.append(t)
|
||||
t.start()
|
||||
|
||||
while not qret.empty():
|
||||
(c, r) = qret.get()
|
||||
compile_error_files += c
|
||||
run_error_files += r
|
||||
# wait for them to all finish and then return the number that failed
|
||||
# (i.e. return 0 if all is ok)
|
||||
for t in task_threads:
|
||||
t.join()
|
||||
print
|
||||
|
||||
while not qret.empty():
|
||||
(c, r) = qret.get()
|
||||
compile_error_files += c
|
||||
run_error_files += r
|
||||
|
||||
if len(compile_error_files) > 0:
|
||||
compile_error_files.sort()
|
||||
|
||||
705
stdlib.ispc
705
stdlib.ispc
@@ -787,10 +787,14 @@ packed_store_active(uniform int * uniform a, int vals) {
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// System information
|
||||
|
||||
static inline int num_cores() {
|
||||
static inline uniform int num_cores() {
|
||||
return __num_cores();
|
||||
}
|
||||
|
||||
static inline uniform int64 clock() {
|
||||
return __clock();
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Atomics and memory barriers
|
||||
|
||||
@@ -808,8 +812,7 @@ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
|
||||
static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
|
||||
uniform TA value) { \
|
||||
memory_barrier(); \
|
||||
uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
|
||||
(MASKTYPE)__mask); \
|
||||
uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
} \
|
||||
@@ -824,22 +827,80 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
|
||||
continue; \
|
||||
uniform TA * uniform p = ptrArray[i]; \
|
||||
uniform TA v = extract(value, i); \
|
||||
uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v, \
|
||||
(MASKTYPE)__mask); \
|
||||
uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
|
||||
ret = insert(ret, i, r); \
|
||||
} \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
} \
|
||||
|
||||
#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB, MASKTYPE) \
|
||||
// Defines the three atomic_swap_global() overloads for type TA, built on
// the __atomic_swap_uniform_<TB>_global() builtin:
//   - uniform pointer, varying value: one hardware swap is issued and the
//     results for the other running program instances are derived from it;
//   - uniform pointer, uniform value: a single swap;
//   - varying pointer, varying value: one swap per running program instance.
// Each overload brackets its work with memory_barrier() calls.
#define DEFINE_ATOMIC_SWAP(TA,TB) \
static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
    memory_barrier(); \
    uniform int i = 0; \
    TA ret[programCount]; \
    TA memVal; \
    /* Start at zero so that the final store to ret[lastSwap] below uses \
       a valid index even if no program instance is running. */ \
    uniform int lastSwap = 0; \
    uniform int mask = lanemask(); \
    /* First, have the first running program instance (if any) perform \
       the swap with memory with its value of "value"; record the \
       value returned. */ \
    for (; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        memVal = __atomic_swap_uniform_##TB##_global(ptr, extract(value, i)); \
        lastSwap = i; \
        break; \
    } \
    /* Now, for all of the remaining running program instances, set the \
       return value of the last instance that did a swap with this \
       instance's value of "value"; this gives the same effect as if the \
       current instance had executed a hardware atomic swap right before \
       the last one that did a swap.  (The break above skips the ++i, so \
       this loop revisits the instance that did the real swap first; \
       whatever it stores there is overwritten later.) */ \
    for (; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        ret[lastSwap] = extract(value, i); \
        lastSwap = i; \
    } \
    /* And the last instance that wanted to swap gets the value we \
       originally got back from memory... */ \
    ret[lastSwap] = memVal; \
    memory_barrier(); \
    return ret[programIndex]; \
} \
static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \
                                            uniform TA value) { \
    memory_barrier(); \
    uniform TA ret = __atomic_swap_uniform_##TB##_global(ptr, value); \
    memory_barrier(); \
    return ret; \
} \
static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
    uniform TA * uniform ptrArray[programCount]; \
    ptrArray[programIndex] = ptr; \
    memory_barrier(); \
    TA ret; \
    uniform int mask = lanemask(); \
    /* Issue one swap for each running program instance, each with that \
       instance's own pointer and value. */ \
    for (uniform int i = 0; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        uniform TA * uniform p = ptrArray[i]; \
        uniform TA v = extract(value, i); \
        uniform TA r = __atomic_swap_uniform_##TB##_global(p, v); \
        ret = insert(ret, i, r); \
    } \
    memory_barrier(); \
    return ret; \
} \
|
||||
|
||||
#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB) \
|
||||
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
|
||||
uniform TA oneval = reduce_##OPA(value); \
|
||||
TA ret; \
|
||||
if (lanemask() != 0) { \
|
||||
memory_barrier(); \
|
||||
ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval, \
|
||||
(MASKTYPE)__mask); \
|
||||
ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval); \
|
||||
memory_barrier(); \
|
||||
} \
|
||||
return ret; \
|
||||
@@ -847,8 +908,7 @@ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
|
||||
static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
|
||||
uniform TA value) { \
|
||||
memory_barrier(); \
|
||||
uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
|
||||
(MASKTYPE)__mask); \
|
||||
uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
} \
|
||||
@@ -864,8 +924,7 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
|
||||
continue; \
|
||||
uniform TA * uniform p = ptrArray[i]; \
|
||||
uniform TA v = extract(value, i); \
|
||||
uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v, \
|
||||
(MASKTYPE)__mask); \
|
||||
uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
|
||||
ret = insert(ret, i, r); \
|
||||
} \
|
||||
memory_barrier(); \
|
||||
@@ -874,49 +933,51 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
|
||||
|
||||
DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType)
|
||||
DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
|
||||
DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType)
|
||||
DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType)
|
||||
DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType)
|
||||
DEFINE_ATOMIC_OP(int32,int32,swap,swap,IntMaskType)
|
||||
DEFINE_ATOMIC_SWAP(int32,int32)
|
||||
|
||||
// For everything but atomic min and max, we can use the same
|
||||
// implementations for unsigned as for signed.
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,UIntMaskType)
|
||||
DEFINE_ATOMIC_SWAP(unsigned int32,int32)
|
||||
|
||||
DEFINE_ATOMIC_OP(float,float,swap,swap,IntMaskType)
|
||||
DEFINE_ATOMIC_SWAP(float,float)
|
||||
|
||||
DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType)
|
||||
DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
|
||||
DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType)
|
||||
DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType)
|
||||
DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType)
|
||||
DEFINE_ATOMIC_OP(int64,int64,swap,swap,IntMaskType)
|
||||
DEFINE_ATOMIC_SWAP(int64,int64)
|
||||
|
||||
// For everything but atomic min and max, we can use the same
|
||||
// implementations for unsigned as for signed.
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,UIntMaskType)
|
||||
DEFINE_ATOMIC_SWAP(unsigned int64,int64)
|
||||
|
||||
DEFINE_ATOMIC_OP(double,double,swap,swap,IntMaskType)
|
||||
DEFINE_ATOMIC_SWAP(double,double)
|
||||
|
||||
#undef DEFINE_ATOMIC_OP
|
||||
#undef DEFINE_ATOMIC_MINMAX_OP
|
||||
#undef DEFINE_ATOMIC_SWAP
|
||||
|
||||
#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE) \
|
||||
static inline TA atomic_compare_exchange_global( \
|
||||
@@ -931,8 +992,7 @@ static inline uniform TA atomic_compare_exchange_global( \
|
||||
uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
|
||||
memory_barrier(); \
|
||||
uniform TA ret = \
|
||||
__atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval, \
|
||||
(MASKTYPE)__mask); \
|
||||
__atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval); \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
}
|
||||
@@ -2764,114 +2824,124 @@ static inline uniform double pow(uniform double a, uniform double b) {
|
||||
// half-precision floats
|
||||
|
||||
static inline uniform float half_to_float(uniform unsigned int16 h) {
|
||||
if ((h & 0x7FFFu) == 0)
|
||||
// Signed zero
|
||||
return floatbits(((unsigned int32) h) << 16);
|
||||
if (__have_native_half) {
|
||||
return __half_to_float_uniform(h);
|
||||
}
|
||||
else {
|
||||
// Though these are int16 quantities, we get much better code
|
||||
// with them stored as int32s...
|
||||
uniform unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit
|
||||
uniform unsigned int32 he = h & (int32)0x7C00u; // Pick off exponent bits
|
||||
uniform unsigned int32 hm = h & (int32)0x03FFu; // Pick off mantissa bits
|
||||
if (he == 0) {
|
||||
// Denormal will convert to normalized
|
||||
uniform int e = -1;
|
||||
// The following loop figures out how much extra to adjust the exponent
|
||||
// Shift until leading bit overflows into exponent bit
|
||||
do {
|
||||
e++;
|
||||
hm <<= 1;
|
||||
} while((hm & 0x0400u) == 0);
|
||||
|
||||
// Sign bit
|
||||
uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
|
||||
// Exponent: unbias the halfp, then bias the single
|
||||
uniform int32 xes = ((int32)(he >> 10)) - 15 + 127 - e;
|
||||
// Exponent
|
||||
uniform unsigned int32 xe = (unsigned int32) (xes << 23);
|
||||
// Mantissa
|
||||
uniform unsigned int32 xm = ((unsigned int32) (hm & 0x03FFu)) << 13;
|
||||
return floatbits(xs | xe | xm);
|
||||
}
|
||||
if ((h & 0x7FFFu) == 0)
|
||||
// Signed zero
|
||||
return floatbits(((unsigned int32) h) << 16);
|
||||
else {
|
||||
if (he == 0x7C00u) {
|
||||
// Inf or NaN (all the exponent bits are set)
|
||||
if (hm == 0)
|
||||
// Zero mantissa -> signed inf
|
||||
return floatbits((((unsigned int32) hs) << 16) |
|
||||
((unsigned int32) 0x7F800000u));
|
||||
else
|
||||
// NaN
|
||||
return floatbits(0xFFC00000u);
|
||||
}
|
||||
else {
|
||||
// Normalized number
|
||||
// sign
|
||||
// Though these are int16 quantities, we get much better code
|
||||
// with them stored as int32s...
|
||||
uniform unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit
|
||||
uniform unsigned int32 he = h & (int32)0x7C00u; // Pick off exponent bits
|
||||
uniform unsigned int32 hm = h & (int32)0x03FFu; // Pick off mantissa bits
|
||||
if (he == 0) {
|
||||
// Denormal will convert to normalized
|
||||
uniform int e = -1;
|
||||
// The following loop figures out how much extra to adjust the exponent
|
||||
// Shift until leading bit overflows into exponent bit
|
||||
do {
|
||||
e++;
|
||||
hm <<= 1;
|
||||
} while((hm & 0x0400u) == 0);
|
||||
|
||||
// Sign bit
|
||||
uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
|
||||
// Exponent: unbias the halfp, then bias the single
|
||||
uniform int32 xes = ((int32) (he >> 10)) - 15 + 127;
|
||||
uniform int32 xes = ((int32)(he >> 10)) - 15 + 127 - e;
|
||||
// Exponent
|
||||
uniform unsigned int32 xe = (unsigned int32) (xes << 23);
|
||||
uniform unsigned int32 xe = (unsigned int32) (xes << 23);
|
||||
// Mantissa
|
||||
uniform unsigned int32 xm = ((unsigned int32) hm) << 13;
|
||||
uniform unsigned int32 xm = ((unsigned int32) (hm & 0x03FFu)) << 13;
|
||||
return floatbits(xs | xe | xm);
|
||||
}
|
||||
else {
|
||||
if (he == 0x7C00u) {
|
||||
// Inf or NaN (all the exponent bits are set)
|
||||
if (hm == 0)
|
||||
// Zero mantissa -> signed inf
|
||||
return floatbits((((unsigned int32) hs) << 16) |
|
||||
((unsigned int32) 0x7F800000u));
|
||||
else
|
||||
// NaN
|
||||
return floatbits(0xFFC00000u);
|
||||
}
|
||||
else {
|
||||
// Normalized number
|
||||
// sign
|
||||
uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
|
||||
// Exponent: unbias the halfp, then bias the single
|
||||
uniform int32 xes = ((int32) (he >> 10)) - 15 + 127;
|
||||
// Exponent
|
||||
uniform unsigned int32 xe = (unsigned int32) (xes << 23);
|
||||
// Mantissa
|
||||
uniform unsigned int32 xm = ((unsigned int32) hm) << 13;
|
||||
return floatbits(xs | xe | xm);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline float half_to_float(unsigned int16 h) {
|
||||
if ((h & 0x7FFFu) == 0)
|
||||
// Signed zero
|
||||
return floatbits(((unsigned int32) h) << 16);
|
||||
if (__have_native_half) {
|
||||
return __half_to_float_varying(h);
|
||||
}
|
||||
else {
|
||||
// Though these are int16 quantities, we get much better code
|
||||
// with them stored as int32s...
|
||||
unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit
|
||||
unsigned int32 he = h & (int32)0x7C00u; // Pick off exponent bits
|
||||
unsigned int32 hm = h & (int32)0x03FFu; // Pick off mantissa bits
|
||||
cif (he == 0) {
|
||||
// Denormal will convert to normalized
|
||||
int e = -1;
|
||||
// The following loop figures out how much extra to adjust the exponent
|
||||
// Shift until leading bit overflows into exponent bit
|
||||
do {
|
||||
e++;
|
||||
hm <<= 1;
|
||||
} while((hm & 0x0400u) == 0);
|
||||
|
||||
// Sign bit
|
||||
unsigned int32 xs = ((unsigned int32) hs) << 16;
|
||||
// Exponent: unbias the halfp, then bias the single
|
||||
int32 xes = ((int32)(he >> 10)) - 15 + 127 - e;
|
||||
// Exponent
|
||||
unsigned int32 xe = (unsigned int32) (xes << 23);
|
||||
// Mantissa
|
||||
unsigned int32 xm = ((unsigned int32) (hm & 0x03FFu)) << 13;
|
||||
return floatbits(xs | xe | xm);
|
||||
}
|
||||
if ((h & 0x7FFFu) == 0)
|
||||
// Signed zero
|
||||
return floatbits(((unsigned int32) h) << 16);
|
||||
else {
|
||||
if (he == 0x7C00u) {
|
||||
// Inf or NaN (all the exponent bits are set)
|
||||
if (hm == 0)
|
||||
// Zero mantissa -> signed inf
|
||||
return floatbits((((unsigned int32) hs) << 16) |
|
||||
((unsigned int32) 0x7F800000u));
|
||||
else
|
||||
// NaN
|
||||
return floatbits(0xFFC00000u);
|
||||
}
|
||||
else {
|
||||
// Normalized number
|
||||
// sign
|
||||
// Though these are int16 quantities, we get much better code
|
||||
// with them stored as int32s...
|
||||
unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit
|
||||
unsigned int32 he = h & (int32)0x7C00u; // Pick off exponent bits
|
||||
unsigned int32 hm = h & (int32)0x03FFu; // Pick off mantissa bits
|
||||
cif (he == 0) {
|
||||
// Denormal will convert to normalized
|
||||
int e = -1;
|
||||
// The following loop figures out how much extra to adjust the exponent
|
||||
// Shift until leading bit overflows into exponent bit
|
||||
do {
|
||||
e++;
|
||||
hm <<= 1;
|
||||
} while((hm & 0x0400u) == 0);
|
||||
|
||||
// Sign bit
|
||||
unsigned int32 xs = ((unsigned int32) hs) << 16;
|
||||
// Exponent: unbias the halfp, then bias the single
|
||||
int32 xes = ((int32) (he >> 10)) - 15 + 127;
|
||||
int32 xes = ((int32)(he >> 10)) - 15 + 127 - e;
|
||||
// Exponent
|
||||
unsigned int32 xe = (unsigned int32) (xes << 23);
|
||||
unsigned int32 xe = (unsigned int32) (xes << 23);
|
||||
// Mantissa
|
||||
unsigned int32 xm = ((unsigned int32) hm) << 13;
|
||||
unsigned int32 xm = ((unsigned int32) (hm & 0x03FFu)) << 13;
|
||||
return floatbits(xs | xe | xm);
|
||||
}
|
||||
else {
|
||||
if (he == 0x7C00u) {
|
||||
// Inf or NaN (all the exponent bits are set)
|
||||
if (hm == 0)
|
||||
// Zero mantissa -> signed inf
|
||||
return floatbits((((unsigned int32) hs) << 16) |
|
||||
((unsigned int32) 0x7F800000u));
|
||||
else
|
||||
// NaN
|
||||
return floatbits(0xFFC00000u);
|
||||
}
|
||||
else {
|
||||
// Normalized number
|
||||
// sign
|
||||
unsigned int32 xs = ((unsigned int32) hs) << 16;
|
||||
// Exponent: unbias the halfp, then bias the single
|
||||
int32 xes = ((int32) (he >> 10)) - 15 + 127;
|
||||
// Exponent
|
||||
unsigned int32 xe = (unsigned int32) (xes << 23);
|
||||
// Mantissa
|
||||
unsigned int32 xm = ((unsigned int32) hm) << 13;
|
||||
return floatbits(xs | xe | xm);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2879,209 +2949,237 @@ static inline float half_to_float(unsigned int16 h) {
|
||||
|
||||
|
||||
static inline uniform int16 float_to_half(uniform float f) {
|
||||
uniform int32 x = intbits(f);
|
||||
// Store the return value in an int32 until the very end; this ends up
|
||||
// generating better code...
|
||||
uniform int32 ret;
|
||||
if ((x & 0x7FFFFFFFu) == 0)
|
||||
// Signed zero
|
||||
ret = (x >> 16);
|
||||
if (__have_native_half) {
|
||||
return __float_to_half_uniform(f);
|
||||
}
|
||||
else {
|
||||
uniform unsigned int32 xs = x & 0x80000000u; // Pick off sign bit
|
||||
uniform unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits
|
||||
uniform unsigned int32 xm = x & 0x007FFFFFu; // Pick off mantissa bits
|
||||
if (xe == 0) {
|
||||
// Denormal will underflow, return a signed zero
|
||||
ret = (xs >> 16);
|
||||
}
|
||||
uniform int32 x = intbits(f);
|
||||
// Store the return value in an int32 until the very end; this ends up
|
||||
// generating better code...
|
||||
uniform int32 ret;
|
||||
if ((x & 0x7FFFFFFFu) == 0)
|
||||
// Signed zero
|
||||
ret = (x >> 16);
|
||||
else {
|
||||
if (xe == 0x7F800000u) {
|
||||
// Inf or NaN (all the exponent bits are set)
|
||||
if (xm == 0)
|
||||
// Zero mantissa -> signed infinity
|
||||
ret = ((xs >> 16) | 0x7C00u);
|
||||
else
|
||||
// NaN, only 1st mantissa bit set
|
||||
ret = 0xFE00u;
|
||||
}
|
||||
else {
|
||||
// Normalized number
|
||||
uniform unsigned int32 hs = (xs >> 16); // Sign bit
|
||||
uniform unsigned int32 hm;
|
||||
// Exponent unbias the single, then bias the halfp
|
||||
uniform int32 hes = ((int)(xe >> 23)) - 127 + 15;
|
||||
if (hes >= 0x1F)
|
||||
// Overflow: return signed infinity
|
||||
ret = ((xs >> 16) | 0x7C00u);
|
||||
else if (hes <= 0) {
|
||||
// Underflow
|
||||
if ((14 - hes) > 24) {
|
||||
// Mantissa shifted all the way off & no rounding possibility
|
||||
hm = 0u; // Set mantissa to zero
|
||||
uniform unsigned int32 xs = x & 0x80000000u; // Pick off sign bit
|
||||
uniform unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits
|
||||
uniform unsigned int32 xm = x & 0x007FFFFFu; // Pick off mantissa bits
|
||||
if (xe == 0) {
|
||||
// Denormal will underflow, return a signed zero
|
||||
ret = (xs >> 16);
|
||||
}
|
||||
else {
|
||||
if (xe == 0x7F800000u) {
|
||||
// Inf or NaN (all the exponent bits are set)
|
||||
if (xm == 0)
|
||||
// Zero mantissa -> signed infinity
|
||||
ret = ((xs >> 16) | 0x7C00u);
|
||||
else
|
||||
// NaN, only 1st mantissa bit set
|
||||
ret = 0xFE00u;
|
||||
}
|
||||
else {
|
||||
// Normalized number
|
||||
uniform unsigned int32 hs = (xs >> 16); // Sign bit
|
||||
uniform unsigned int32 hm;
|
||||
// Exponent unbias the single, then bias the halfp
|
||||
uniform int32 hes = ((int)(xe >> 23)) - 127 + 15;
|
||||
if (hes >= 0x1F)
|
||||
// Overflow: return signed infinity
|
||||
ret = ((xs >> 16) | 0x7C00u);
|
||||
else if (hes <= 0) {
|
||||
// Underflow
|
||||
if ((14 - hes) > 24) {
|
||||
// Mantissa shifted all the way off & no rounding possibility
|
||||
hm = 0u; // Set mantissa to zero
|
||||
}
|
||||
else {
|
||||
xm |= 0x00800000u; // Add the hidden leading bit
|
||||
hm = (xm >> (14 - hes)); // Mantissa
|
||||
if ((xm >> (13 - hes)) & 0x00000001u) // Check for rounding
|
||||
// Round, might overflow into exp bit, but this is OK
|
||||
hm += 1u;
|
||||
}
|
||||
ret = (hs | hm);
|
||||
}
|
||||
else {
|
||||
xm |= 0x00800000u; // Add the hidden leading bit
|
||||
hm = (xm >> (14 - hes)); // Mantissa
|
||||
if ((xm >> (13 - hes)) & 0x00000001u) // Check for rounding
|
||||
// Round, might overflow into exp bit, but this is OK
|
||||
hm += 1u;
|
||||
uniform unsigned int32 he = (hes << 10); // Exponent
|
||||
hm = (xm >> 13); // Mantissa
|
||||
if (xm & 0x00001000u) // Check for rounding
|
||||
// Round, might overflow to inf, this is OK
|
||||
ret = (hs | he | hm) + 1u;
|
||||
else
|
||||
ret = (hs | he | hm);
|
||||
}
|
||||
ret = (hs | hm);
|
||||
}
|
||||
else {
|
||||
uniform unsigned int32 he = (hes << 10); // Exponent
|
||||
hm = (xm >> 13); // Mantissa
|
||||
if (xm & 0x00001000u) // Check for rounding
|
||||
// Round, might overflow to inf, this is OK
|
||||
ret = (hs | he | hm) + 1u;
|
||||
else
|
||||
ret = (hs | he | hm);
|
||||
}
|
||||
}
|
||||
}
|
||||
return (int16)ret;
|
||||
}
|
||||
return (int16)ret;
|
||||
}
|
||||
|
||||
|
||||
static inline int16 float_to_half(float f) {
|
||||
int32 x = intbits(f);
|
||||
// Store the return value in an int32 until the very end; this ends up
|
||||
// generating better code...
|
||||
int32 ret;
|
||||
if ((x & 0x7FFFFFFFu) == 0)
|
||||
// Signed zero
|
||||
ret = (x >> 16);
|
||||
if (__have_native_half) {
|
||||
return __float_to_half_varying(f);
|
||||
}
|
||||
else {
|
||||
unsigned int32 xs = x & 0x80000000u; // Pick off sign bit
|
||||
unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits
|
||||
unsigned int32 xm = x & 0x007FFFFFu; // Pick off mantissa bits
|
||||
if (xe == 0) {
|
||||
// Denormal will underflow, return a signed zero
|
||||
ret = (xs >> 16);
|
||||
}
|
||||
int32 x = intbits(f);
|
||||
// Store the return value in an int32 until the very end; this ends up
|
||||
// generating better code...
|
||||
int32 ret;
|
||||
if ((x & 0x7FFFFFFFu) == 0)
|
||||
// Signed zero
|
||||
ret = (x >> 16);
|
||||
else {
|
||||
cif (xe == 0x7F800000u) {
|
||||
// Inf or NaN (all the exponent bits are set)
|
||||
if (xm == 0)
|
||||
// Zero mantissa -> signed infinity
|
||||
ret = ((xs >> 16) | 0x7C00u);
|
||||
else
|
||||
// NaN, only 1st mantissa bit set
|
||||
ret = 0xFE00u;
|
||||
}
|
||||
else {
|
||||
// Normalized number
|
||||
unsigned int32 hs = (xs >> 16); // Sign bit
|
||||
unsigned int32 hm;
|
||||
// Exponent unbias the single, then bias the halfp
|
||||
int32 hes = ((int)(xe >> 23)) - 127 + 15;
|
||||
if (hes >= 0x1F)
|
||||
// Overflow: return signed infinity
|
||||
ret = ((xs >> 16) | 0x7C00u);
|
||||
else if (hes <= 0) {
|
||||
// Underflow
|
||||
if ((14 - hes) > 24) {
|
||||
// Mantissa shifted all the way off & no rounding possibility
|
||||
hm = 0u; // Set mantissa to zero
|
||||
unsigned int32 xs = x & 0x80000000u; // Pick off sign bit
|
||||
unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits
|
||||
unsigned int32 xm = x & 0x007FFFFFu; // Pick off mantissa bits
|
||||
if (xe == 0) {
|
||||
// Denormal will underflow, return a signed zero
|
||||
ret = (xs >> 16);
|
||||
}
|
||||
else {
|
||||
cif (xe == 0x7F800000u) {
|
||||
// Inf or NaN (all the exponent bits are set)
|
||||
if (xm == 0)
|
||||
// Zero mantissa -> signed infinity
|
||||
ret = ((xs >> 16) | 0x7C00u);
|
||||
else
|
||||
// NaN, only 1st mantissa bit set
|
||||
ret = 0xFE00u;
|
||||
}
|
||||
else {
|
||||
// Normalized number
|
||||
unsigned int32 hs = (xs >> 16); // Sign bit
|
||||
unsigned int32 hm;
|
||||
// Exponent unbias the single, then bias the halfp
|
||||
int32 hes = ((int)(xe >> 23)) - 127 + 15;
|
||||
if (hes >= 0x1F)
|
||||
// Overflow: return signed infinity
|
||||
ret = ((xs >> 16) | 0x7C00u);
|
||||
else if (hes <= 0) {
|
||||
// Underflow
|
||||
if ((14 - hes) > 24) {
|
||||
// Mantissa shifted all the way off & no rounding possibility
|
||||
hm = 0u; // Set mantissa to zero
|
||||
}
|
||||
else {
|
||||
xm |= 0x00800000u; // Add the hidden leading bit
|
||||
hm = (xm >> (14 - hes)); // Mantissa
|
||||
if ((xm >> (13 - hes)) & 0x00000001u) // Check for rounding
|
||||
// Round, might overflow into exp bit, but this is OK
|
||||
hm += 1u;
|
||||
}
|
||||
ret = (hs | hm);
|
||||
}
|
||||
else {
|
||||
xm |= 0x00800000u; // Add the hidden leading bit
|
||||
hm = (xm >> (14 - hes)); // Mantissa
|
||||
if ((xm >> (13 - hes)) & 0x00000001u) // Check for rounding
|
||||
// Round, might overflow into exp bit, but this is OK
|
||||
hm += 1u;
|
||||
unsigned int32 he = (hes << 10); // Exponent
|
||||
hm = (xm >> 13); // Mantissa
|
||||
if (xm & 0x00001000u) // Check for rounding
|
||||
// Round, might overflow to inf, this is OK
|
||||
ret = (hs | he | hm) + 1u;
|
||||
else
|
||||
ret = (hs | he | hm);
|
||||
}
|
||||
ret = (hs | hm);
|
||||
}
|
||||
else {
|
||||
unsigned int32 he = (hes << 10); // Exponent
|
||||
hm = (xm >> 13); // Mantissa
|
||||
if (xm & 0x00001000u) // Check for rounding
|
||||
// Round, might overflow to inf, this is OK
|
||||
ret = (hs | he | hm) + 1u;
|
||||
else
|
||||
ret = (hs | he | hm);
|
||||
}
|
||||
}
|
||||
}
|
||||
return (int16)ret;
|
||||
}
|
||||
return (int16)ret;
|
||||
}
|
||||
|
||||
|
||||
static inline uniform float half_to_float_fast(uniform unsigned int16 h) {
|
||||
uniform unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit
|
||||
uniform unsigned int32 he = h & (int32)0x7C00u; // Pick off exponent bits
|
||||
uniform unsigned int32 hm = h & (int32)0x03FFu; // Pick off mantissa bits
|
||||
|
||||
// sign
|
||||
uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
|
||||
// Exponent: unbias the halfp, then bias the single
|
||||
uniform int32 xes = ((int32) (he >> 10)) - 15 + 127;
|
||||
// Exponent
|
||||
uniform unsigned int32 xe = (unsigned int32) (xes << 23);
|
||||
// Mantissa
|
||||
uniform unsigned int32 xm = ((unsigned int32) hm) << 13;
|
||||
return floatbits(xs | xe | xm);
|
||||
if (__have_native_half) {
|
||||
return __half_to_float_uniform(h);
|
||||
}
|
||||
else {
|
||||
uniform unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit
|
||||
uniform unsigned int32 he = h & (int32)0x7C00u; // Pick off exponent bits
|
||||
uniform unsigned int32 hm = h & (int32)0x03FFu; // Pick off mantissa bits
|
||||
|
||||
// sign
|
||||
uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
|
||||
// Exponent: unbias the halfp, then bias the single
|
||||
uniform int32 xes = ((int32) (he >> 10)) - 15 + 127;
|
||||
// Exponent
|
||||
uniform unsigned int32 xe = (unsigned int32) (xes << 23);
|
||||
// Mantissa
|
||||
uniform unsigned int32 xm = ((unsigned int32) hm) << 13;
|
||||
return floatbits(xs | xe | xm);
|
||||
}
|
||||
}
|
||||
|
||||
static inline float half_to_float_fast(unsigned int16 h) {
|
||||
unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit
|
||||
unsigned int32 he = h & (int32)0x7C00u; // Pick off exponent bits
|
||||
unsigned int32 hm = h & (int32)0x03FFu; // Pick off mantissa bits
|
||||
|
||||
// sign
|
||||
unsigned int32 xs = ((unsigned int32) hs) << 16;
|
||||
// Exponent: unbias the halfp, then bias the single
|
||||
int32 xes = ((int32) (he >> 10)) - 15 + 127;
|
||||
// Exponent
|
||||
unsigned int32 xe = (unsigned int32) (xes << 23);
|
||||
// Mantissa
|
||||
unsigned int32 xm = ((unsigned int32) hm) << 13;
|
||||
return floatbits(xs | xe | xm);
|
||||
if (__have_native_half) {
|
||||
return __half_to_float_varying(h);
|
||||
}
|
||||
else {
|
||||
unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit
|
||||
unsigned int32 he = h & (int32)0x7C00u; // Pick off exponent bits
|
||||
unsigned int32 hm = h & (int32)0x03FFu; // Pick off mantissa bits
|
||||
|
||||
// sign
|
||||
unsigned int32 xs = ((unsigned int32) hs) << 16;
|
||||
// Exponent: unbias the halfp, then bias the single
|
||||
int32 xes = ((int32) (he >> 10)) - 15 + 127;
|
||||
// Exponent
|
||||
unsigned int32 xe = (unsigned int32) (xes << 23);
|
||||
// Mantissa
|
||||
unsigned int32 xm = ((unsigned int32) hm) << 13;
|
||||
return floatbits(xs | xe | xm);
|
||||
}
|
||||
}
|
||||
|
||||
static inline uniform int16 float_to_half_fast(uniform float f) {
|
||||
uniform int32 x = intbits(f);
|
||||
uniform unsigned int32 xs = x & 0x80000000u; // Pick off sign bit
|
||||
uniform unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits
|
||||
uniform unsigned int32 xm = x & 0x007FFFFFu; // Pick off mantissa bits
|
||||
if (__have_native_half) {
|
||||
return __float_to_half_uniform(f);
|
||||
}
|
||||
else {
|
||||
uniform int32 x = intbits(f);
|
||||
uniform unsigned int32 xs = x & 0x80000000u; // Pick off sign bit
|
||||
uniform unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits
|
||||
uniform unsigned int32 xm = x & 0x007FFFFFu; // Pick off mantissa bits
|
||||
|
||||
uniform unsigned int32 hs = (xs >> 16); // Sign bit
|
||||
// Exponent unbias the single, then bias the halfp
|
||||
uniform int32 hes = ((int)(xe >> 23)) - 127 + 15;
|
||||
uniform unsigned int32 he = (hes << 10); // Exponent
|
||||
uniform int32 hm = (xm >> 13); // Mantissa
|
||||
uniform int32 ret = (hs | he | hm);
|
||||
uniform unsigned int32 hs = (xs >> 16); // Sign bit
|
||||
// Exponent unbias the single, then bias the halfp
|
||||
uniform int32 hes = ((int)(xe >> 23)) - 127 + 15;
|
||||
uniform unsigned int32 he = (hes << 10); // Exponent
|
||||
uniform int32 hm = (xm >> 13); // Mantissa
|
||||
uniform int32 ret = (hs | he | hm);
|
||||
|
||||
if (xm & 0x00001000u) // Check for rounding
|
||||
// Round, might overflow to inf, this is OK
|
||||
ret += 1u;
|
||||
if (xm & 0x00001000u) // Check for rounding
|
||||
// Round, might overflow to inf, this is OK
|
||||
ret += 1u;
|
||||
|
||||
return (int16)ret;
|
||||
return (int16)ret;
|
||||
}
|
||||
}
|
||||
|
||||
static inline int16 float_to_half_fast(float f) {
|
||||
int32 x = intbits(f);
|
||||
unsigned int32 xs = x & 0x80000000u; // Pick off sign bit
|
||||
unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits
|
||||
unsigned int32 xm = x & 0x007FFFFFu; // Pick off mantissa bits
|
||||
if (__have_native_half) {
|
||||
return __float_to_half_varying(f);
|
||||
}
|
||||
else {
|
||||
int32 x = intbits(f);
|
||||
unsigned int32 xs = x & 0x80000000u; // Pick off sign bit
|
||||
unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits
|
||||
unsigned int32 xm = x & 0x007FFFFFu; // Pick off mantissa bits
|
||||
|
||||
unsigned int32 hs = (xs >> 16); // Sign bit
|
||||
// Exponent unbias the single, then bias the halfp
|
||||
int32 hes = ((int)(xe >> 23)) - 127 + 15;
|
||||
unsigned int32 he = (hes << 10); // Exponent
|
||||
int32 hm = (xm >> 13); // Mantissa
|
||||
int32 ret = (hs | he | hm);
|
||||
unsigned int32 hs = (xs >> 16); // Sign bit
|
||||
// Exponent unbias the single, then bias the halfp
|
||||
int32 hes = ((int)(xe >> 23)) - 127 + 15;
|
||||
unsigned int32 he = (hes << 10); // Exponent
|
||||
int32 hm = (xm >> 13); // Mantissa
|
||||
int32 ret = (hs | he | hm);
|
||||
|
||||
if (xm & 0x00001000u) // Check for rounding
|
||||
// Round, might overflow to inf, this is OK
|
||||
ret += 1u;
|
||||
if (xm & 0x00001000u) // Check for rounding
|
||||
// Round, might overflow to inf, this is OK
|
||||
ret += 1u;
|
||||
|
||||
return (int16)ret;
|
||||
return (int16)ret;
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
@@ -3095,16 +3193,15 @@ static inline unsigned int random(RNGState * uniform state)
|
||||
{
|
||||
unsigned int b;
|
||||
|
||||
// FIXME: state->z1, etc..
|
||||
b = (((*state).z1 << 6) ^ (*state).z1) >> 13;
|
||||
(*state).z1 = (((*state).z1 & 4294967294U) << 18) ^ b;
|
||||
b = (((*state).z2 << 2) ^ (*state).z2) >> 27;
|
||||
(*state).z2 = (((*state).z2 & 4294967288U) << 2) ^ b;
|
||||
b = (((*state).z3 << 13) ^ (*state).z3) >> 21;
|
||||
(*state).z3 = (((*state).z3 & 4294967280U) << 7) ^ b;
|
||||
b = (((*state).z4 << 3) ^ (*state).z4) >> 12;
|
||||
(*state).z4 = (((*state).z4 & 4294967168U) << 13) ^ b;
|
||||
return ((*state).z1 ^ (*state).z2 ^ (*state).z3 ^ (*state).z4);
|
||||
b = ((state->z1 << 6) ^ state->z1) >> 13;
|
||||
state->z1 = ((state->z1 & 4294967294U) << 18) ^ b;
|
||||
b = ((state->z2 << 2) ^ state->z2) >> 27;
|
||||
state->z2 = ((state->z2 & 4294967288U) << 2) ^ b;
|
||||
b = ((state->z3 << 13) ^ state->z3) >> 21;
|
||||
state->z3 = ((state->z3 & 4294967280U) << 7) ^ b;
|
||||
b = ((state->z4 << 3) ^ state->z4) >> 12;
|
||||
state->z4 = ((state->z4 & 4294967168U) << 13) ^ b;
|
||||
return (state->z1 ^ state->z2 ^ state->z3 ^ state->z4);
|
||||
}
|
||||
|
||||
static inline float frandom(RNGState * uniform state)
|
||||
@@ -3120,30 +3217,30 @@ static inline uniform unsigned int __seed4(RNGState * uniform state,
|
||||
uniform unsigned int c1 = 0xf0f0f0f0;
|
||||
uniform unsigned int c2 = 0x0f0f0f0f;
|
||||
|
||||
(*state).z1 = insert((*state).z1, start + 0, seed);
|
||||
(*state).z1 = insert((*state).z1, start + 1, seed ^ c1);
|
||||
(*state).z1 = insert((*state).z1, start + 2, (seed << 3) ^ c1);
|
||||
(*state).z1 = insert((*state).z1, start + 3, (seed << 2) ^ c2);
|
||||
state->z1 = insert(state->z1, start + 0, seed);
|
||||
state->z1 = insert(state->z1, start + 1, seed ^ c1);
|
||||
state->z1 = insert(state->z1, start + 2, (seed << 3) ^ c1);
|
||||
state->z1 = insert(state->z1, start + 3, (seed << 2) ^ c2);
|
||||
|
||||
seed += 131;
|
||||
(*state).z2 = insert((*state).z2, start + 0, seed);
|
||||
(*state).z2 = insert((*state).z2, start + 1, seed ^ c1);
|
||||
(*state).z2 = insert((*state).z2, start + 2, (seed << 3) ^ c1);
|
||||
(*state).z2 = insert((*state).z2, start + 3, (seed << 2) ^ c2);
|
||||
state->z2 = insert(state->z2, start + 0, seed);
|
||||
state->z2 = insert(state->z2, start + 1, seed ^ c1);
|
||||
state->z2 = insert(state->z2, start + 2, (seed << 3) ^ c1);
|
||||
state->z2 = insert(state->z2, start + 3, (seed << 2) ^ c2);
|
||||
|
||||
seed ^= extract((*state).z2, 2);
|
||||
(*state).z3 = insert((*state).z3, start + 0, seed);
|
||||
(*state).z3 = insert((*state).z3, start + 1, seed ^ c1);
|
||||
(*state).z3 = insert((*state).z3, start + 2, (seed << 3) ^ c1);
|
||||
(*state).z3 = insert((*state).z3, start + 3, (seed << 2) ^ c2);
|
||||
seed ^= extract(state->z2, 2);
|
||||
state->z3 = insert(state->z3, start + 0, seed);
|
||||
state->z3 = insert(state->z3, start + 1, seed ^ c1);
|
||||
state->z3 = insert(state->z3, start + 2, (seed << 3) ^ c1);
|
||||
state->z3 = insert(state->z3, start + 3, (seed << 2) ^ c2);
|
||||
|
||||
seed <<= 4;
|
||||
seed += 3;
|
||||
seed ^= extract((*state).z1, 3);
|
||||
(*state).z4 = insert((*state).z4, start + 0, seed);
|
||||
(*state).z4 = insert((*state).z4, start + 1, seed ^ c1);
|
||||
(*state).z4 = insert((*state).z4, start + 2, (seed << 3) ^ c1);
|
||||
(*state).z4 = insert((*state).z4, start + 3, (seed << 2) ^ c2);
|
||||
seed ^= extract(state->z1, 3);
|
||||
state->z4 = insert(state->z4, start + 0, seed);
|
||||
state->z4 = insert(state->z4, start + 1, seed ^ c1);
|
||||
state->z4 = insert(state->z4, start + 2, (seed << 3) ^ c1);
|
||||
state->z4 = insert(state->z4, start + 3, (seed << 2) ^ c2);
|
||||
|
||||
return seed;
|
||||
}
|
||||
|
||||
787
stmt.cpp
787
stmt.cpp
@@ -494,6 +494,7 @@ lEmitIfStatements(FunctionEmitContext *ctx, Stmt *stmts, const char *trueOrFalse
|
||||
ctx->EndScope();
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
IfStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
// First check all of the things that might happen due to errors
|
||||
@@ -694,6 +695,23 @@ lCheckAllOffSafety(ASTNode *node, void *data) {
|
||||
}
|
||||
|
||||
// All indices are in-bounds
|
||||
return true;
|
||||
}
|
||||
|
||||
MemberExpr *me;
|
||||
if ((me = dynamic_cast<MemberExpr *>(node)) != NULL &&
|
||||
me->dereferenceExpr) {
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
DereferenceExpr *de;
|
||||
if ((de = dynamic_cast<DereferenceExpr *>(node)) != NULL) {
|
||||
const Type *exprType = de->expr->GetType();
|
||||
if (dynamic_cast<const PointerType *>(exprType) != NULL) {
|
||||
*okPtr = false;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -1132,7 +1150,7 @@ DoStmt::TypeCheck() {
|
||||
!lHasVaryingBreakOrContinue(bodyStmts));
|
||||
testExpr = new TypeCastExpr(uniformTest ? AtomicType::UniformBool :
|
||||
AtomicType::VaryingBool,
|
||||
testExpr, false, testExpr->pos);
|
||||
testExpr, testExpr->pos);
|
||||
}
|
||||
|
||||
return this;
|
||||
@@ -1317,8 +1335,7 @@ ForStmt::TypeCheck() {
|
||||
!g->opt.disableUniformControlFlow &&
|
||||
!lHasVaryingBreakOrContinue(stmts));
|
||||
test = new TypeCastExpr(uniformTest ? AtomicType::UniformBool :
|
||||
AtomicType::VaryingBool,
|
||||
test, false, test->pos);
|
||||
AtomicType::VaryingBool, test, test->pos);
|
||||
test = ::TypeCheck(test);
|
||||
if (test == NULL)
|
||||
return NULL;
|
||||
@@ -1558,9 +1575,8 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
if (ctx->GetCurrentBasicBlock() == NULL || stmts == NULL)
|
||||
return;
|
||||
|
||||
llvm::BasicBlock *bbCheckExtras = ctx->CreateBasicBlock("foreach_check_extras");
|
||||
llvm::BasicBlock *bbDoExtras = ctx->CreateBasicBlock("foreach_do_extras");
|
||||
llvm::BasicBlock *bbBody = ctx->CreateBasicBlock("foreach_body");
|
||||
llvm::BasicBlock *bbFullBody = ctx->CreateBasicBlock("foreach_full_body");
|
||||
llvm::BasicBlock *bbMaskedBody = ctx->CreateBasicBlock("foreach_masked_body");
|
||||
llvm::BasicBlock *bbExit = ctx->CreateBasicBlock("foreach_exit");
|
||||
|
||||
llvm::Value *oldMask = ctx->GetInternalMask();
|
||||
@@ -1578,8 +1594,7 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
// dimension and a number of derived values.
|
||||
std::vector<llvm::BasicBlock *> bbReset, bbStep, bbTest;
|
||||
std::vector<llvm::Value *> startVals, endVals, uniformCounterPtrs;
|
||||
std::vector<llvm::Value *> nItems, nExtras, alignedEnd;
|
||||
std::vector<llvm::Value *> extrasMaskPtrs;
|
||||
std::vector<llvm::Value *> nExtras, alignedEnd, extrasMaskPtrs;
|
||||
|
||||
std::vector<int> span(nDims, 0);
|
||||
lGetSpans(nDims-1, nDims, g->target.vectorWidth, isTiled, &span[0]);
|
||||
@@ -1588,7 +1603,9 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
// Basic blocks that we'll fill in later with the looping logic for
|
||||
// this dimension.
|
||||
bbReset.push_back(ctx->CreateBasicBlock("foreach_reset"));
|
||||
bbStep.push_back(ctx->CreateBasicBlock("foreach_step"));
|
||||
if (i < nDims-1)
|
||||
// stepping for the innermost dimension is handled specially
|
||||
bbStep.push_back(ctx->CreateBasicBlock("foreach_step"));
|
||||
bbTest.push_back(ctx->CreateBasicBlock("foreach_test"));
|
||||
|
||||
// Start and end value for this loop dimension
|
||||
@@ -1600,14 +1617,14 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
endVals.push_back(ev);
|
||||
|
||||
// nItems = endVal - startVal
|
||||
nItems.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv,
|
||||
"nitems"));
|
||||
llvm::Value *nItems =
|
||||
ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems");
|
||||
|
||||
// nExtras = nItems % (span for this dimension)
|
||||
// This gives us the number of extra elements we need to deal with
|
||||
// at the end of the loop for this dimension that don't fit cleanly
|
||||
// into a vector width.
|
||||
nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems[i],
|
||||
nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems,
|
||||
LLVMInt32(span[i]), "nextras"));
|
||||
|
||||
// alignedEnd = endVal - nExtras
|
||||
@@ -1626,8 +1643,9 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
// There is also a varying variable that holds the set of index
|
||||
// values for each dimension in the current loop iteration; this is
|
||||
// the value that is program-visible.
|
||||
dimVariables[i]->storagePtr = ctx->AllocaInst(LLVMTypes::Int32VectorType,
|
||||
dimVariables[i]->name.c_str());
|
||||
dimVariables[i]->storagePtr =
|
||||
ctx->AllocaInst(LLVMTypes::Int32VectorType,
|
||||
dimVariables[i]->name.c_str());
|
||||
dimVariables[i]->parentFunction = ctx->GetFunction();
|
||||
ctx->EmitVariableDebugInfo(dimVariables[i]);
|
||||
|
||||
@@ -1639,7 +1657,7 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]);
|
||||
}
|
||||
|
||||
ctx->StartForeach(bbStep[nDims-1]);
|
||||
ctx->StartForeach();
|
||||
|
||||
// On to the outermost loop's test
|
||||
ctx->BranchInst(bbTest[0]);
|
||||
@@ -1660,9 +1678,25 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// foreach_test
|
||||
// foreach_step: increment the uniform counter by the vector width.
|
||||
// Note that we don't increment the varying counter here as well but
|
||||
// just generate its value when we need it in the loop body. Don't do
|
||||
// this for the innermost dimension, which has a more complex stepping
|
||||
// structure..
|
||||
for (int i = 0; i < nDims-1; ++i) {
|
||||
ctx->SetCurrentBasicBlock(bbStep[i]);
|
||||
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]);
|
||||
llvm::Value *newCounter =
|
||||
ctx->BinaryOperator(llvm::Instruction::Add, counter,
|
||||
LLVMInt32(span[i]), "new_counter");
|
||||
ctx->StoreInst(newCounter, uniformCounterPtrs[i]);
|
||||
ctx->BranchInst(bbTest[i]);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// foreach_test (for all dimensions other than the innermost...)
|
||||
std::vector<llvm::Value *> inExtras;
|
||||
for (int i = 0; i < nDims; ++i) {
|
||||
for (int i = 0; i < nDims-1; ++i) {
|
||||
ctx->SetCurrentBasicBlock(bbTest[i]);
|
||||
|
||||
llvm::Value *haveExtras =
|
||||
@@ -1700,8 +1734,6 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
if (i == 0)
|
||||
ctx->StoreInst(emask, extrasMaskPtrs[i]);
|
||||
else {
|
||||
// FIXME: at least specialize the innermost loop to not do all
|
||||
// this mask stuff each time through the test...
|
||||
llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]);
|
||||
llvm::Value *newMask =
|
||||
ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask,
|
||||
@@ -1712,59 +1744,267 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
llvm::Value *notAtEnd =
|
||||
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
|
||||
counter, endVals[i]);
|
||||
if (i != nDims-1)
|
||||
ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd);
|
||||
ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// foreach_test (for innermost dimension)
|
||||
//
|
||||
// All of the outer dimensions are handled generically--basically as a
|
||||
// for() loop from the start value to the end value, where at each loop
|
||||
// test, we compute the mask of active elements for the current
|
||||
// dimension and then update an overall mask that is the AND
|
||||
// combination of all of the outer ones.
|
||||
//
|
||||
// The innermost loop is handled specially, for performance purposes.
|
||||
// When starting the innermost dimension, we start by checking once
|
||||
// whether any of the outer dimensions has set the mask to be
|
||||
// partially-active or not. We follow different code paths for these
|
||||
// two cases, taking advantage of the knowledge that the mask is all
|
||||
// on, when this is the case.
|
||||
//
|
||||
// In each of these code paths, we start with a loop from the starting
|
||||
// value to the aligned end value for the innermost dimension; we can
|
||||
// guarantee that the innermost loop will have an "all on" mask (as far
|
||||
// as its dimension is concerned) for the duration of this loop. Doing
|
||||
// so allows us to emit code that assumes the mask is all on (for the
|
||||
// case where none of the outer dimensions has set the mask to be
|
||||
// partially on), or allows us to emit code that just uses the mask
|
||||
// from the outer dimensions directly (for the case where they have).
|
||||
//
|
||||
// After this loop, we just need to deal with one vector's worth of
|
||||
// "ragged extra bits", where the mask used includes the effect of the
|
||||
// mask for the innermost dimension.
|
||||
//
|
||||
// We start out this process by emitting the check that determines
|
||||
// whether any of the enclosing dimensions is partially active
|
||||
// (i.e. processing extra elements that don't exactly fit into a
|
||||
// vector).
|
||||
llvm::BasicBlock *bbOuterInExtras =
|
||||
ctx->CreateBasicBlock("outer_in_extras");
|
||||
llvm::BasicBlock *bbOuterNotInExtras =
|
||||
ctx->CreateBasicBlock("outer_not_in_extras");
|
||||
|
||||
ctx->SetCurrentBasicBlock(bbTest[nDims-1]);
|
||||
if (inExtras.size())
|
||||
ctx->BranchInst(bbOuterInExtras, bbOuterNotInExtras,
|
||||
inExtras.back());
|
||||
else
|
||||
// for a 1D iteration domain, we certainly don't have any enclosing
|
||||
// dimensions that are processing extra elements.
|
||||
ctx->BranchInst(bbOuterNotInExtras);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// One or more outer dimensions in extras, so we need to mask for the loop
|
||||
// body regardless. We break this into two cases, roughly:
|
||||
// for (counter = start; counter < alignedEnd; counter += step) {
|
||||
// // mask is all on for inner, so set mask to outer mask
|
||||
// // run loop body with mask
|
||||
// }
|
||||
// // counter == alignedEnd
|
||||
// if (counter < end) {
|
||||
// // set mask to outermask & (counter+programCounter < end)
|
||||
// // run loop body with mask
|
||||
// }
|
||||
llvm::BasicBlock *bbAllInnerPartialOuter =
|
||||
ctx->CreateBasicBlock("all_inner_partial_outer");
|
||||
llvm::BasicBlock *bbPartial =
|
||||
ctx->CreateBasicBlock("both_partial");
|
||||
ctx->SetCurrentBasicBlock(bbOuterInExtras); {
|
||||
// Update the varying counter value here, since all subsequent
|
||||
// blocks along this path need it.
|
||||
lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
|
||||
dimVariables[nDims-1]->storagePtr, span);
|
||||
|
||||
// here we just check to see if counter < alignedEnd
|
||||
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
|
||||
llvm::Value *beforeAlignedEnd =
|
||||
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
|
||||
counter, alignedEnd[nDims-1], "before_aligned_end");
|
||||
ctx->BranchInst(bbAllInnerPartialOuter, bbPartial, beforeAlignedEnd);
|
||||
}
|
||||
|
||||
// Below we have a basic block that runs the loop body code for the
|
||||
// case where the mask is partially but not fully on. This same block
|
||||
// runs in multiple cases: both for handling any ragged extra data for
|
||||
// the innermost dimension but also when outer dimensions have set the
|
||||
// mask to be partially on.
|
||||
//
|
||||
// The value stored in stepIndexAfterMaskedBodyPtr is used after each
|
||||
// execution of the body code to determine whether the innermost index
|
||||
// value should be incremented by the step (we're running the "for"
|
||||
// loop of full vectors at the innermost dimension, with outer
|
||||
// dimensions having set the mask to be partially on), or whether we're
|
||||
// running once for the ragged extra bits at the end of the innermost
|
||||
// dimension, in which case we're done with the innermost dimension and
|
||||
// should step the loop counter for the next enclosing dimension
|
||||
// instead.
|
||||
llvm::Value *stepIndexAfterMaskedBodyPtr =
|
||||
ctx->AllocaInst(LLVMTypes::BoolType, "step_index");
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// We're in the inner loop part where the only masking is due to outer
|
||||
// dimensions but the innermost dimension fits fully into a vector's
|
||||
// width. Set the mask and jump to the masked loop body.
|
||||
ctx->SetCurrentBasicBlock(bbAllInnerPartialOuter); {
|
||||
llvm::Value *mask;
|
||||
if (extrasMaskPtrs.size() == 0)
|
||||
// 1D loop; we shouldn't ever get here anyway
|
||||
mask = LLVMMaskAllOff;
|
||||
else
|
||||
ctx->BranchInst(bbCheckExtras, bbReset[i], notAtEnd);
|
||||
mask = ctx->LoadInst(extrasMaskPtrs.back());
|
||||
ctx->SetInternalMask(mask);
|
||||
|
||||
ctx->StoreInst(LLVMTrue, stepIndexAfterMaskedBodyPtr);
|
||||
ctx->BranchInst(bbMaskedBody);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// foreach_step: increment the uniform counter by the vector width.
|
||||
// Note that we don't increment the varying counter here as well but
|
||||
// just generate its value when we need it in the loop body.
|
||||
for (int i = 0; i < nDims; ++i) {
|
||||
ctx->SetCurrentBasicBlock(bbStep[i]);
|
||||
if (i == nDims-1)
|
||||
ctx->RestoreContinuedLanes();
|
||||
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]);
|
||||
llvm::Value *newCounter =
|
||||
ctx->BinaryOperator(llvm::Instruction::Add, counter,
|
||||
LLVMInt32(span[i]), "new_counter");
|
||||
ctx->StoreInst(newCounter, uniformCounterPtrs[i]);
|
||||
ctx->BranchInst(bbTest[i]);
|
||||
// We need to include the effect of the innermost dimension in the mask
|
||||
// for the final bits here
|
||||
ctx->SetCurrentBasicBlock(bbPartial); {
|
||||
llvm::Value *varyingCounter =
|
||||
ctx->LoadInst(dimVariables[nDims-1]->storagePtr);
|
||||
llvm::Value *smearEnd = llvm::UndefValue::get(LLVMTypes::Int32VectorType);
|
||||
for (int j = 0; j < g->target.vectorWidth; ++j)
|
||||
smearEnd = ctx->InsertInst(smearEnd, endVals[nDims-1], j, "smear_end");
|
||||
llvm::Value *emask =
|
||||
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
|
||||
varyingCounter, smearEnd);
|
||||
emask = ctx->I1VecToBoolVec(emask);
|
||||
|
||||
if (nDims == 1)
|
||||
ctx->SetInternalMask(emask);
|
||||
else {
|
||||
llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]);
|
||||
llvm::Value *newMask =
|
||||
ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask,
|
||||
"extras_mask");
|
||||
ctx->SetInternalMask(newMask);
|
||||
}
|
||||
|
||||
ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr);
|
||||
ctx->BranchInst(bbMaskedBody);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// foreach_check_extras: see if we need to deal with any partial
|
||||
// vector's worth of work that's left.
|
||||
ctx->SetCurrentBasicBlock(bbCheckExtras);
|
||||
ctx->AddInstrumentationPoint("foreach loop check extras");
|
||||
ctx->BranchInst(bbDoExtras, bbBody, inExtras[nDims-1]);
|
||||
// None of the outer dimensions is processing extras; along the lines
|
||||
// of above, we can express this as:
|
||||
// for (counter = start; counter < alignedEnd; counter += step) {
|
||||
// // mask is all on
|
||||
// // run loop body with mask all on
|
||||
// }
|
||||
// // counter == alignedEnd
|
||||
// if (counter < end) {
|
||||
// // set mask to (counter+programCounter < end)
|
||||
// // run loop body with mask
|
||||
// }
|
||||
llvm::BasicBlock *bbPartialInnerAllOuter =
|
||||
ctx->CreateBasicBlock("partial_inner_all_outer");
|
||||
ctx->SetCurrentBasicBlock(bbOuterNotInExtras); {
|
||||
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
|
||||
llvm::Value *beforeAlignedEnd =
|
||||
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
|
||||
counter, alignedEnd[nDims-1], "before_aligned_end");
|
||||
ctx->BranchInst(bbFullBody, bbPartialInnerAllOuter,
|
||||
beforeAlignedEnd);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// foreach_body: do a full vector's worth of work. We know that all
|
||||
// full_body: do a full vector's worth of work. We know that all
|
||||
// lanes will be running here, so we explicitly set the mask to be 'all
|
||||
// on'. This ends up being relatively straightforward: just update the
|
||||
// value of the varying loop counter and have the statements in the
|
||||
// loop body emit their code.
|
||||
ctx->SetCurrentBasicBlock(bbBody);
|
||||
ctx->SetInternalMask(LLVMMaskAllOn);
|
||||
ctx->AddInstrumentationPoint("foreach loop body");
|
||||
stmts->EmitCode(ctx);
|
||||
Assert(ctx->GetCurrentBasicBlock() != NULL);
|
||||
ctx->BranchInst(bbStep[nDims-1]);
|
||||
llvm::BasicBlock *bbFullBodyContinue =
|
||||
ctx->CreateBasicBlock("foreach_full_continue");
|
||||
ctx->SetCurrentBasicBlock(bbFullBody); {
|
||||
ctx->SetInternalMask(LLVMMaskAllOn);
|
||||
lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
|
||||
dimVariables[nDims-1]->storagePtr, span);
|
||||
ctx->SetContinueTarget(bbFullBodyContinue);
|
||||
ctx->AddInstrumentationPoint("foreach loop body (all on)");
|
||||
stmts->EmitCode(ctx);
|
||||
Assert(ctx->GetCurrentBasicBlock() != NULL);
|
||||
ctx->BranchInst(bbFullBodyContinue);
|
||||
}
|
||||
ctx->SetCurrentBasicBlock(bbFullBodyContinue); {
|
||||
ctx->RestoreContinuedLanes();
|
||||
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]);
|
||||
llvm::Value *newCounter =
|
||||
ctx->BinaryOperator(llvm::Instruction::Add, counter,
|
||||
LLVMInt32(span[nDims-1]), "new_counter");
|
||||
ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]);
|
||||
ctx->BranchInst(bbOuterNotInExtras);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// foreach_doextras: set the mask and have the statements emit their
|
||||
// We're done running blocks with the mask all on; see if the counter is
|
||||
// less than the end value, in which case we need to run the body one
|
||||
// more time to get the extra bits.
|
||||
llvm::BasicBlock *bbSetInnerMask =
|
||||
ctx->CreateBasicBlock("partial_inner_only");
|
||||
ctx->SetCurrentBasicBlock(bbPartialInnerAllOuter); {
|
||||
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
|
||||
llvm::Value *beforeFullEnd =
|
||||
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
|
||||
counter, endVals[nDims-1], "before_full_end");
|
||||
ctx->BranchInst(bbSetInnerMask, bbReset[nDims-1], beforeFullEnd);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// The outer dimensions are all on, so the mask is just given by the
|
||||
// mask for the innermost dimension
|
||||
ctx->SetCurrentBasicBlock(bbSetInnerMask); {
|
||||
llvm::Value *varyingCounter =
|
||||
lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
|
||||
dimVariables[nDims-1]->storagePtr, span);
|
||||
llvm::Value *smearEnd = llvm::UndefValue::get(LLVMTypes::Int32VectorType);
|
||||
for (int j = 0; j < g->target.vectorWidth; ++j)
|
||||
smearEnd = ctx->InsertInst(smearEnd, endVals[nDims-1], j, "smear_end");
|
||||
llvm::Value *emask =
|
||||
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
|
||||
varyingCounter, smearEnd);
|
||||
emask = ctx->I1VecToBoolVec(emask);
|
||||
ctx->SetInternalMask(emask);
|
||||
|
||||
ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr);
|
||||
ctx->BranchInst(bbMaskedBody);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// masked_body: set the mask and have the statements emit their
|
||||
// code again. Note that it's generally worthwhile having two copies
|
||||
// of the statements' code, since the code above is emitted with the
|
||||
// mask known to be all-on, which in turn leads to more efficient code
|
||||
// for that case.
|
||||
ctx->SetCurrentBasicBlock(bbDoExtras);
|
||||
llvm::Value *mask = ctx->LoadInst(extrasMaskPtrs[nDims-1]);
|
||||
ctx->SetInternalMask(mask);
|
||||
stmts->EmitCode(ctx);
|
||||
ctx->BranchInst(bbStep[nDims-1]);
|
||||
llvm::BasicBlock *bbStepInnerIndex =
|
||||
ctx->CreateBasicBlock("step_inner_index");
|
||||
llvm::BasicBlock *bbMaskedBodyContinue =
|
||||
ctx->CreateBasicBlock("foreach_masked_continue");
|
||||
ctx->SetCurrentBasicBlock(bbMaskedBody); {
|
||||
ctx->AddInstrumentationPoint("foreach loop body (masked)");
|
||||
ctx->SetContinueTarget(bbMaskedBodyContinue);
|
||||
stmts->EmitCode(ctx);
|
||||
ctx->BranchInst(bbMaskedBodyContinue);
|
||||
}
|
||||
ctx->SetCurrentBasicBlock(bbMaskedBodyContinue); {
|
||||
ctx->RestoreContinuedLanes();
|
||||
llvm::Value *stepIndex = ctx->LoadInst(stepIndexAfterMaskedBodyPtr);
|
||||
ctx->BranchInst(bbStepInnerIndex, bbReset[nDims-1], stepIndex);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// step the innermost index, for the case where we're doing the
|
||||
// innermost for loop over full vectors.
|
||||
ctx->SetCurrentBasicBlock(bbStepInnerIndex); {
|
||||
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]);
|
||||
llvm::Value *newCounter =
|
||||
ctx->BinaryOperator(llvm::Instruction::Add, counter,
|
||||
LLVMInt32(span[nDims-1]), "new_counter");
|
||||
ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]);
|
||||
ctx->BranchInst(bbOuterInExtras);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// foreach_exit: All done. Restore the old mask and clean up
|
||||
@@ -1869,6 +2109,301 @@ ForeachStmt::Print(int indent) const {
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// CaseStmt
|
||||
|
||||
/** Given the statements following a 'case' or 'default' label, this
|
||||
function determines whether the mask should be checked to see if it is
|
||||
"all off" immediately after the label, before executing the code for
|
||||
the statements.
|
||||
*/
|
||||
static bool
|
||||
lCheckMask(Stmt *stmts) {
|
||||
if (stmts == NULL)
|
||||
return false;
|
||||
|
||||
int cost = EstimateCost(stmts);
|
||||
|
||||
bool safeToRunWithAllLanesOff = true;
|
||||
WalkAST(stmts, lCheckAllOffSafety, NULL, &safeToRunWithAllLanesOff);
|
||||
|
||||
// The mask should be checked if the code following the
|
||||
// 'case'/'default' is relatively complex, or if it would be unsafe to
|
||||
// run that code with the execution mask all off.
|
||||
return (cost > PREDICATE_SAFE_IF_STATEMENT_COST ||
|
||||
safeToRunWithAllLanesOff == false);
|
||||
}
|
||||
|
||||
|
||||
/** Creates the statement for a 'case' label.

    @param v    Integer value of the label.
    @param s    Statements that follow the label; stored as given.
    @param pos  Source position, forwarded to the Stmt base class.
 */
CaseStmt::CaseStmt(int v, Stmt *s, SourcePos pos)
    : Stmt(pos), value(v) {
    this->stmts = s;
}
|
||||
|
||||
|
||||
void
|
||||
CaseStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
ctx->EmitCaseLabel(value, lCheckMask(stmts), pos);
|
||||
if (stmts)
|
||||
stmts->EmitCode(ctx);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
CaseStmt::Print(int indent) const {
|
||||
printf("%*cCase [%d] label", indent, ' ', value);
|
||||
pos.Print();
|
||||
printf("\n");
|
||||
stmts->Print(indent+4);
|
||||
}
|
||||
|
||||
|
||||
/** No checking is performed on the label itself; the statement is
    returned unchanged.
 */
Stmt *CaseStmt::TypeCheck() {
    return this;
}
|
||||
|
||||
|
||||
int
|
||||
CaseStmt::EstimateCost() const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// DefaultStmt
|
||||
|
||||
/** Creates the statement for a 'default' label.

    @param s    Statements that follow the label; stored as given.
    @param pos  Source position, forwarded to the Stmt base class.
 */
DefaultStmt::DefaultStmt(Stmt *s, SourcePos pos)
    : Stmt(pos) {
    this->stmts = s;
}
|
||||
|
||||
|
||||
void
|
||||
DefaultStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
ctx->EmitDefaultLabel(lCheckMask(stmts), pos);
|
||||
if (stmts)
|
||||
stmts->EmitCode(ctx);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
DefaultStmt::Print(int indent) const {
|
||||
printf("%*cDefault Stmt", indent, ' ');
|
||||
pos.Print();
|
||||
printf("\n");
|
||||
stmts->Print(indent+4);
|
||||
}
|
||||
|
||||
|
||||
/** No checking is performed on the label itself; the statement is
    returned unchanged.
 */
Stmt *DefaultStmt::TypeCheck() {
    return this;
}
|
||||
|
||||
|
||||
int
|
||||
DefaultStmt::EstimateCost() const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// SwitchStmt
|
||||
|
||||
/** Creates a 'switch' statement.

    @param e    Expression being switched on.
    @param s    Statements making up the body of the switch.
    @param pos  Source position, forwarded to the Stmt base class.
 */
SwitchStmt::SwitchStmt(Expr *e, Stmt *s, SourcePos pos)
    : Stmt(pos) {
    this->expr = e;
    this->stmts = s;
}
|
||||
|
||||
|
||||
/* An instance of this structure is carried along as we traverse the AST
|
||||
nodes for the statements after a "switch" statement. We use this
|
||||
structure to record all of the 'case' and 'default' statements after the
|
||||
"switch". */
|
||||
struct SwitchVisitInfo {
|
||||
SwitchVisitInfo(FunctionEmitContext *c) {
|
||||
ctx = c;
|
||||
defaultBlock = NULL;
|
||||
lastBlock = NULL;
|
||||
}
|
||||
|
||||
FunctionEmitContext *ctx;
|
||||
|
||||
/* Basic block for the code following the "default" label (if any). */
|
||||
llvm::BasicBlock *defaultBlock;
|
||||
|
||||
/* Map from integer values after "case" labels to the basic blocks that
|
||||
follow the corresponding "case" label. */
|
||||
std::vector<std::pair<int, llvm::BasicBlock *> > caseBlocks;
|
||||
|
||||
/* For each basic block for a "case" label or a "default" label,
|
||||
nextBlock[block] stores the basic block pointer for the next
|
||||
subsequent "case" or "default" label in the program. */
|
||||
std::map<llvm::BasicBlock *, llvm::BasicBlock *> nextBlock;
|
||||
|
||||
/* The last basic block created for a "case" or "default" label; when
|
||||
we create the basic block for the next one, we'll use this to update
|
||||
the nextBlock map<> above. */
|
||||
llvm::BasicBlock *lastBlock;
|
||||
};
|
||||
|
||||
|
||||
static bool
|
||||
lSwitchASTPreVisit(ASTNode *node, void *d) {
|
||||
if (dynamic_cast<SwitchStmt *>(node) != NULL)
|
||||
// don't continue recursively into a nested switch--we only want
|
||||
// our own case and default statements!
|
||||
return false;
|
||||
|
||||
CaseStmt *cs = dynamic_cast<CaseStmt *>(node);
|
||||
DefaultStmt *ds = dynamic_cast<DefaultStmt *>(node);
|
||||
|
||||
SwitchVisitInfo *svi = (SwitchVisitInfo *)d;
|
||||
llvm::BasicBlock *bb = NULL;
|
||||
if (cs != NULL) {
|
||||
// Complain if we've seen a case statement with the same value
|
||||
// already
|
||||
for (int i = 0; i < (int)svi->caseBlocks.size(); ++i) {
|
||||
if (svi->caseBlocks[i].first == cs->value) {
|
||||
Error(cs->pos, "Duplicate case value \"%d\".", cs->value);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Otherwise create a new basic block for the code following this
|
||||
// 'case' statement and record the mappign between the case label
|
||||
// value and the basic block
|
||||
char buf[32];
|
||||
sprintf(buf, "case_%d", cs->value);
|
||||
bb = svi->ctx->CreateBasicBlock(buf);
|
||||
svi->caseBlocks.push_back(std::make_pair(cs->value, bb));
|
||||
}
|
||||
else if (ds != NULL) {
|
||||
// And complain if we've seen another 'default' label..
|
||||
if (svi->defaultBlock != NULL) {
|
||||
Error(ds->pos, "Multiple \"default\" lables in switch statement.");
|
||||
return true;
|
||||
}
|
||||
else {
|
||||
// Otherwise create a basic block for the code following the
|
||||
// "default".
|
||||
bb = svi->ctx->CreateBasicBlock("default");
|
||||
svi->defaultBlock = bb;
|
||||
}
|
||||
}
|
||||
|
||||
// If we saw a "case" or "default" label, then update the map to record
|
||||
// that the block we just created follows the block created for the
|
||||
// previous label in the "switch".
|
||||
if (bb != NULL) {
|
||||
svi->nextBlock[svi->lastBlock] = bb;
|
||||
svi->lastBlock = bb;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
SwitchStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
if (ctx->GetCurrentBasicBlock() == NULL)
|
||||
return;
|
||||
|
||||
const Type *type;
|
||||
if (expr == NULL || ((type = expr->GetType()) == NULL)) {
|
||||
Assert(m->errorCount > 0);
|
||||
return;
|
||||
}
|
||||
|
||||
// Basic block we'll end up after the switch statement
|
||||
llvm::BasicBlock *bbDone = ctx->CreateBasicBlock("switch_done");
|
||||
|
||||
// Walk the AST of the statements after the 'switch' to collect a bunch
|
||||
// of information about the structure of the 'case' and 'default'
|
||||
// statements.
|
||||
SwitchVisitInfo svi(ctx);
|
||||
WalkAST(stmts, lSwitchASTPreVisit, NULL, &svi);
|
||||
// Record that the basic block following the last one created for a
|
||||
// case/default is the block after the end of the switch statement.
|
||||
svi.nextBlock[svi.lastBlock] = bbDone;
|
||||
|
||||
llvm::Value *exprValue = expr->GetValue(ctx);
|
||||
if (exprValue == NULL) {
|
||||
Assert(m->errorCount > 0);
|
||||
return;
|
||||
}
|
||||
|
||||
bool isUniformCF = (type->IsUniformType() &&
|
||||
lHasVaryingBreakOrContinue(stmts) == false);
|
||||
ctx->StartSwitch(isUniformCF, bbDone);
|
||||
ctx->SwitchInst(exprValue, svi.defaultBlock ? svi.defaultBlock : bbDone,
|
||||
svi.caseBlocks, svi.nextBlock);
|
||||
|
||||
if (stmts != NULL)
|
||||
stmts->EmitCode(ctx);
|
||||
|
||||
if (ctx->GetCurrentBasicBlock() != NULL)
|
||||
ctx->BranchInst(bbDone);
|
||||
|
||||
ctx->SetCurrentBasicBlock(bbDone);
|
||||
ctx->EndSwitch();
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
SwitchStmt::Print(int indent) const {
|
||||
printf("%*cSwitch Stmt", indent, ' ');
|
||||
pos.Print();
|
||||
printf("\n");
|
||||
printf("%*cexpr = ", indent, ' ');
|
||||
expr->Print();
|
||||
printf("\n");
|
||||
stmts->Print(indent+4);
|
||||
}
|
||||
|
||||
|
||||
/** Type checks the switch expression and converts it to the integer type
    used for switching: 64-bit if the expression is a (signed or unsigned)
    64-bit integer, 32-bit otherwise, with the expression's
    uniform/varying variability preserved.

    @return this statement on success; NULL if the expression is missing,
            has no type, or can't be converted.
 */
Stmt *
SwitchStmt::TypeCheck() {
    // EmitCode() already copes with a missing expression; do the same
    // here rather than dereferencing NULL.
    if (expr == NULL)
        return NULL;

    const Type *exprType = expr->GetType();
    if (exprType == NULL)
        return NULL;

    exprType = exprType->GetAsConstType();
    const Type *asUniform = exprType->GetAsUniformType();
    const bool is64bit = (asUniform == AtomicType::UniformConstUInt64 ||
                          asUniform == AtomicType::UniformConstInt64);

    const Type *toType = NULL;
    if (exprType->IsUniformType())
        toType = is64bit ? AtomicType::UniformInt64 :
                           AtomicType::UniformInt32;
    else
        toType = is64bit ? AtomicType::VaryingInt64 :
                           AtomicType::VaryingInt32;

    expr = TypeConvertExpr(expr, toType, "switch expression");
    if (expr == NULL)
        return NULL;

    return this;
}
|
||||
|
||||
|
||||
int
|
||||
SwitchStmt::EstimateCost() const {
|
||||
const Type *type = expr->GetType();
|
||||
if (type && type->IsVaryingType())
|
||||
return COST_VARYING_SWITCH;
|
||||
else
|
||||
return COST_UNIFORM_SWITCH;
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// ReturnStmt
|
||||
|
||||
@@ -1915,14 +2450,137 @@ ReturnStmt::Print(int indent) const {
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// GotoStmt
|
||||
|
||||
/** Creates a 'goto' statement.

    @param l        Name of the label being jumped to.
    @param gotoPos  Source position of the statement, forwarded to the
                    Stmt base class.
    @param ip       Source position of the label identifier; used when
                    reporting an unknown label.
 */
GotoStmt::GotoStmt(const char *l, SourcePos gotoPos, SourcePos ip)
    : Stmt(gotoPos) {
    this->label = l;
    this->identifierPos = ip;
}
|
||||
|
||||
|
||||
void
|
||||
GotoStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
if (ctx->VaryingCFDepth() > 0) {
|
||||
Error(pos, "\"goto\" statements are only legal under \"uniform\" "
|
||||
"control flow.");
|
||||
return;
|
||||
}
|
||||
if (ctx->InForeachLoop()) {
|
||||
Error(pos, "\"goto\" statements are currently illegal inside "
|
||||
"\"foreach\" loops.");
|
||||
return;
|
||||
}
|
||||
|
||||
llvm::BasicBlock *bb = ctx->GetLabeledBasicBlock(label);
|
||||
if (bb == NULL) {
|
||||
// TODO: use the string distance stuff to suggest alternatives if
|
||||
// there are some with names close to the label name we have here..
|
||||
Error(identifierPos, "No label named \"%s\" found in current function.",
|
||||
label.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
ctx->BranchInst(bb);
|
||||
ctx->SetCurrentBasicBlock(NULL);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
GotoStmt::Print(int indent) const {
|
||||
printf("%*cGoto label \"%s\"\n", indent, ' ', label.c_str());
|
||||
}
|
||||
|
||||
|
||||
/** A goto holds only a label name and source positions, so there is
    nothing to optimize; the statement is returned unchanged. */
Stmt *
GotoStmt::Optimize() {
    Stmt *self = this;
    return self;
}
|
||||
|
||||
|
||||
/** No type checking is done here; the statement is returned unchanged.
    Whether the label exists is checked later, in EmitCode(). */
Stmt *
GotoStmt::TypeCheck() {
    Stmt *self = this;
    return self;
}
|
||||
|
||||
|
||||
int
|
||||
GotoStmt::EstimateCost() const {
|
||||
return COST_GOTO;
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// LabeledStmt
|
||||
|
||||
/** Builds a labeled statement.
    @param n  Name of the label.
    @param s  Statement that follows the label (may be NULL; the other
              methods check for that before using it).
    @param p  Source position passed to the Stmt base class. */
LabeledStmt::LabeledStmt(const char *n, Stmt *s, SourcePos p)
    : Stmt(p), name(n), stmt(s) {
}
|
||||
|
||||
|
||||
void
|
||||
LabeledStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
llvm::BasicBlock *bblock = ctx->GetLabeledBasicBlock(name);
|
||||
Assert(bblock != NULL);
|
||||
|
||||
// End the current basic block with a jump to our basic block and then
|
||||
// set things up for emission to continue there. Note that the current
|
||||
// basic block may validly be NULL going into this statement due to an
|
||||
// earlier goto that NULLed it out; that doesn't stop us from
|
||||
// re-establishing a current basic block starting at the label..
|
||||
if (ctx->GetCurrentBasicBlock() != NULL)
|
||||
ctx->BranchInst(bblock);
|
||||
ctx->SetCurrentBasicBlock(bblock);
|
||||
|
||||
if (stmt != NULL)
|
||||
stmt->EmitCode(ctx);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
LabeledStmt::Print(int indent) const {
|
||||
printf("%*cLabel \"%s\"\n", indent, ' ', name.c_str());
|
||||
if (stmt != NULL)
|
||||
stmt->Print(indent);
|
||||
}
|
||||
|
||||
|
||||
/** Returns the statement unchanged.  This method does not itself descend
    into the attached statement. */
Stmt *
LabeledStmt::Optimize() {
    Stmt *self = this;
    return self;
}
|
||||
|
||||
|
||||
/** Validates the spelling of the label name.  The first character must be
    alphabetic or '_'; every later character must be alphanumeric or '_'.

    @return this if the name is well-formed; NULL after an error has been
            reported at the statement's position otherwise. */
Stmt *
LabeledStmt::TypeCheck() {
    // <ctype.h> classifiers require a value representable as unsigned char
    // (or EOF), so plain char is converted before each call.  An empty
    // name is treated as starting with '\0' and is rejected below.
    const unsigned char first =
        name.empty() ? (unsigned char)'\0' : (unsigned char)name[0];

    // Reject only when the first character is neither a letter nor '_'.
    // (The earlier "!isalpha(c) || c == '_'" form rejected every label
    // starting with '_', contradicting the diagnostic text.)
    if (!isalpha(first) && first != '_') {
        Error(pos, "Label must start with either alphabetic character or '_'.");
        return NULL;
    }

    for (unsigned int i = 1; i < name.size(); ++i) {
        const unsigned char c = (unsigned char)name[i];
        if (!isalnum(c) && c != '_') {
            Error(pos, "Character \"%c\" is illegal in labels.", name[i]);
            return NULL;
        }
    }
    return this;
}
|
||||
|
||||
|
||||
int
|
||||
LabeledStmt::EstimateCost() const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// StmtList
|
||||
|
||||
void
|
||||
StmtList::EmitCode(FunctionEmitContext *ctx) const {
|
||||
if (!ctx->GetCurrentBasicBlock())
|
||||
return;
|
||||
|
||||
ctx->StartScope();
|
||||
ctx->SetDebugPos(pos);
|
||||
for (unsigned int i = 0; i < stmts.size(); ++i)
|
||||
@@ -2020,7 +2678,7 @@ lProcessPrintArg(Expr *expr, FunctionEmitContext *ctx, std::string &argTypes) {
|
||||
baseType == AtomicType::UniformUInt16) {
|
||||
expr = new TypeCastExpr(type->IsUniformType() ? AtomicType::UniformInt32 :
|
||||
AtomicType::VaryingInt32,
|
||||
expr, false, expr->pos);
|
||||
expr, expr->pos);
|
||||
type = expr->GetType();
|
||||
}
|
||||
|
||||
@@ -2173,16 +2831,6 @@ AssertStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
m->module->getFunction("__do_assert_varying");
|
||||
Assert(assertFunc != NULL);
|
||||
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
char errorString[2048];
|
||||
if (sprintf_s(errorString, sizeof(errorString),
|
||||
"%s(%d): Assertion failed: %s\n", pos.name,
|
||||
pos.first_line, message.c_str()) == -1) {
|
||||
Error(pos, "Fatal error in sprintf_s() call when generating assert "
|
||||
"string.");
|
||||
return;
|
||||
}
|
||||
#else
|
||||
char *errorString;
|
||||
if (asprintf(&errorString, "%s:%d:%d: Assertion failed: %s\n",
|
||||
pos.name, pos.first_line, pos.first_column,
|
||||
@@ -2191,7 +2839,6 @@ AssertStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
"unable to allocate memory!");
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
std::vector<llvm::Value *> args;
|
||||
args.push_back(ctx->GetStringPtr(errorString));
|
||||
@@ -2199,9 +2846,7 @@ AssertStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
args.push_back(ctx->GetFullMask());
|
||||
ctx->CallInst(assertFunc, NULL, args, "");
|
||||
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
free(errorString);
|
||||
#endif // !ISPC_IS_WINDOWS
|
||||
}
|
||||
|
||||
|
||||
@@ -2223,7 +2868,7 @@ AssertStmt::TypeCheck() {
|
||||
}
|
||||
expr = new TypeCastExpr(isUniform ? AtomicType::UniformBool :
|
||||
AtomicType::VaryingBool,
|
||||
expr, false, expr->pos);
|
||||
expr, expr->pos);
|
||||
expr = ::TypeCheck(expr);
|
||||
}
|
||||
return this;
|
||||
|
||||
91
stmt.h
91
stmt.h
@@ -282,6 +282,97 @@ public:
|
||||
};
|
||||
|
||||
|
||||
/** Statement corresponding to a "case" label in the program. In addition
|
||||
to the value associated with the "case", this statement also stores the
|
||||
statements following it. */
|
||||
class CaseStmt : public Stmt {
|
||||
public:
|
||||
CaseStmt(int value, Stmt *stmt, SourcePos pos);
|
||||
|
||||
void EmitCode(FunctionEmitContext *ctx) const;
|
||||
void Print(int indent) const;
|
||||
|
||||
Stmt *TypeCheck();
|
||||
int EstimateCost() const;
|
||||
|
||||
/** Integer value after the "case" statement */
|
||||
const int value;
|
||||
Stmt *stmts;
|
||||
};
|
||||
|
||||
|
||||
/** Statement for a "default" label (as would be found inside a "switch"
|
||||
statement). */
|
||||
class DefaultStmt : public Stmt {
|
||||
public:
|
||||
DefaultStmt(Stmt *stmt, SourcePos pos);
|
||||
|
||||
void EmitCode(FunctionEmitContext *ctx) const;
|
||||
void Print(int indent) const;
|
||||
|
||||
Stmt *TypeCheck();
|
||||
int EstimateCost() const;
|
||||
|
||||
Stmt *stmts;
|
||||
};
|
||||
|
||||
|
||||
/** A "switch" statement in the program. */
|
||||
class SwitchStmt : public Stmt {
|
||||
public:
|
||||
SwitchStmt(Expr *expr, Stmt *stmts, SourcePos pos);
|
||||
|
||||
void EmitCode(FunctionEmitContext *ctx) const;
|
||||
void Print(int indent) const;
|
||||
|
||||
Stmt *TypeCheck();
|
||||
int EstimateCost() const;
|
||||
|
||||
/** Expression that is used to determine which label to jump to. */
|
||||
Expr *expr;
|
||||
/** Statement block after the "switch" expression. */
|
||||
Stmt *stmts;
|
||||
};
|
||||
|
||||
|
||||
/** A "goto" in an ispc program. */
|
||||
class GotoStmt : public Stmt {
|
||||
public:
|
||||
GotoStmt(const char *label, SourcePos gotoPos, SourcePos idPos);
|
||||
|
||||
void EmitCode(FunctionEmitContext *ctx) const;
|
||||
void Print(int indent) const;
|
||||
|
||||
Stmt *Optimize();
|
||||
Stmt *TypeCheck();
|
||||
int EstimateCost() const;
|
||||
|
||||
/** Name of the label to jump to when the goto is executed. */
|
||||
std::string label;
|
||||
SourcePos identifierPos;
|
||||
};
|
||||
|
||||
|
||||
/** Statement corresponding to a label (as would be used as a goto target)
|
||||
in the program. */
|
||||
class LabeledStmt : public Stmt {
|
||||
public:
|
||||
LabeledStmt(const char *label, Stmt *stmt, SourcePos p);
|
||||
|
||||
void EmitCode(FunctionEmitContext *ctx) const;
|
||||
void Print(int indent) const;
|
||||
|
||||
Stmt *Optimize();
|
||||
Stmt *TypeCheck();
|
||||
int EstimateCost() const;
|
||||
|
||||
/** Name of the label. */
|
||||
std::string name;
|
||||
/** Statements following the label. */
|
||||
Stmt *stmt;
|
||||
};
|
||||
|
||||
|
||||
/** @brief Representation of a list of statements in the program.
|
||||
*/
|
||||
class StmtList : public Stmt {
|
||||
|
||||
73
sym.cpp
73
sym.cpp
@@ -72,8 +72,7 @@ SymbolTable::SymbolTable() {
|
||||
|
||||
SymbolTable::~SymbolTable() {
|
||||
// Otherwise we have mismatched push/pop scopes
|
||||
Assert(variables.size() == 1 && functions.size() == 1 &&
|
||||
types.size() == 1);
|
||||
Assert(variables.size() == 1 && types.size() == 1);
|
||||
PopScope();
|
||||
}
|
||||
|
||||
@@ -81,7 +80,6 @@ SymbolTable::~SymbolTable() {
|
||||
void
|
||||
SymbolTable::PushScope() {
|
||||
variables.push_back(new SymbolMapType);
|
||||
functions.push_back(new FunctionMapType);
|
||||
types.push_back(new TypeMapType);
|
||||
}
|
||||
|
||||
@@ -92,10 +90,6 @@ SymbolTable::PopScope() {
|
||||
delete variables.back();
|
||||
variables.pop_back();
|
||||
|
||||
Assert(functions.size() > 1);
|
||||
delete functions.back();
|
||||
functions.pop_back();
|
||||
|
||||
Assert(types.size() > 1);
|
||||
delete types.back();
|
||||
types.pop_back();
|
||||
@@ -160,7 +154,7 @@ SymbolTable::AddFunction(Symbol *symbol) {
|
||||
// the symbol table
|
||||
return false;
|
||||
|
||||
std::vector<Symbol *> &funOverloads = (*functions.back())[symbol->name];
|
||||
std::vector<Symbol *> &funOverloads = functions[symbol->name];
|
||||
funOverloads.push_back(symbol);
|
||||
return true;
|
||||
}
|
||||
@@ -168,17 +162,14 @@ SymbolTable::AddFunction(Symbol *symbol) {
|
||||
|
||||
bool
|
||||
SymbolTable::LookupFunction(const char *name, std::vector<Symbol *> *matches) {
|
||||
for (int i = (int)functions.size() - 1; i >= 0; --i) {
|
||||
FunctionMapType &fm = *(functions[i]);
|
||||
FunctionMapType::iterator iter = fm.find(name);
|
||||
if (iter != fm.end()) {
|
||||
if (matches == NULL)
|
||||
return true;
|
||||
else {
|
||||
const std::vector<Symbol *> &funcs = iter->second;
|
||||
for (int j = 0; j < (int)funcs.size(); ++j)
|
||||
matches->push_back(funcs[j]);
|
||||
}
|
||||
FunctionMapType::iterator iter = functions.find(name);
|
||||
if (iter != functions.end()) {
|
||||
if (matches == NULL)
|
||||
return true;
|
||||
else {
|
||||
const std::vector<Symbol *> &funcs = iter->second;
|
||||
for (int j = 0; j < (int)funcs.size(); ++j)
|
||||
matches->push_back(funcs[j]);
|
||||
}
|
||||
}
|
||||
return matches ? (matches->size() > 0) : false;
|
||||
@@ -187,15 +178,12 @@ SymbolTable::LookupFunction(const char *name, std::vector<Symbol *> *matches) {
|
||||
|
||||
Symbol *
|
||||
SymbolTable::LookupFunction(const char *name, const FunctionType *type) {
|
||||
for (int i = (int)functions.size() - 1; i >= 0; --i) {
|
||||
FunctionMapType &fm = *(functions[i]);
|
||||
FunctionMapType::iterator iter = fm.find(name);
|
||||
if (iter != fm.end()) {
|
||||
std::vector<Symbol *> funcs = iter->second;
|
||||
for (int j = 0; j < (int)funcs.size(); ++j) {
|
||||
if (Type::Equal(funcs[j]->type, type))
|
||||
return funcs[j];
|
||||
}
|
||||
FunctionMapType::iterator iter = functions.find(name);
|
||||
if (iter != functions.end()) {
|
||||
std::vector<Symbol *> funcs = iter->second;
|
||||
for (int j = 0; j < (int)funcs.size(); ++j) {
|
||||
if (Type::Equal(funcs[j]->type, type))
|
||||
return funcs[j];
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
@@ -261,14 +249,11 @@ SymbolTable::ClosestVariableOrFunctionMatch(const char *str) const {
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < (int)functions.size(); ++i) {
|
||||
const FunctionMapType &fm = *(functions[i]);
|
||||
FunctionMapType::const_iterator iter;
|
||||
for (iter = fm.begin(); iter != fm.end(); ++iter) {
|
||||
int dist = StringEditDistance(str, iter->first, maxDelta+1);
|
||||
if (dist <= maxDelta)
|
||||
matches[dist].push_back(iter->first);
|
||||
}
|
||||
FunctionMapType::const_iterator iter;
|
||||
for (iter = functions.begin(); iter != functions.end(); ++iter) {
|
||||
int dist = StringEditDistance(str, iter->first, maxDelta+1);
|
||||
if (dist <= maxDelta)
|
||||
matches[dist].push_back(iter->first);
|
||||
}
|
||||
|
||||
// Now, return the first entry of matches[] that is non-empty, if any.
|
||||
@@ -346,15 +331,13 @@ SymbolTable::Print() {
|
||||
}
|
||||
|
||||
fprintf(stderr, "Functions:\n----------------\n");
|
||||
for (int i = 0; i < (int)functions.size(); ++i) {
|
||||
FunctionMapType::iterator fiter = functions[i]->begin();
|
||||
while (fiter != functions[i]->end()) {
|
||||
fprintf(stderr, "%s\n", fiter->first.c_str());
|
||||
std::vector<Symbol *> &syms = fiter->second;
|
||||
for (unsigned int j = 0; j < syms.size(); ++j)
|
||||
fprintf(stderr, " %s\n", syms[j]->type->GetString().c_str());
|
||||
++fiter;
|
||||
}
|
||||
FunctionMapType::iterator fiter = functions.begin();
|
||||
while (fiter != functions.end()) {
|
||||
fprintf(stderr, "%s\n", fiter->first.c_str());
|
||||
std::vector<Symbol *> &syms = fiter->second;
|
||||
for (unsigned int j = 0; j < syms.size(); ++j)
|
||||
fprintf(stderr, " %s\n", syms[j]->type->GetString().c_str());
|
||||
++fiter;
|
||||
}
|
||||
|
||||
depth = 0;
|
||||
|
||||
26
sym.h
26
sym.h
@@ -257,12 +257,13 @@ private:
|
||||
typedef std::map<std::string, Symbol *> SymbolMapType;
|
||||
std::vector<SymbolMapType *> variables;
|
||||
|
||||
/** Function declarations are also scoped., A STL \c vector is used to
|
||||
store the function symbols for a given name since, due to function
|
||||
overloading, a name can have multiple function symbols associated
|
||||
with it. */
|
||||
/** Function declarations are *not* scoped. (C99, for example, allows
|
||||
an implementation to maintain function declarations in a single
|
||||
namespace.) A STL \c vector is used to store the function symbols
|
||||
for a given name since, due to function overloading, a name can
|
||||
have multiple function symbols associated with it. */
|
||||
typedef std::map<std::string, std::vector<Symbol *> > FunctionMapType;
|
||||
std::vector<FunctionMapType *> functions;
|
||||
FunctionMapType functions;
|
||||
|
||||
/** Type definitions can also be scoped. A new \c TypeMapType
|
||||
is added to the back of the \c types \c vector each time a new scope
|
||||
@@ -278,15 +279,12 @@ SymbolTable::GetMatchingFunctions(Predicate pred,
|
||||
std::vector<Symbol *> *matches) const {
|
||||
// Iterate through all function symbols and apply the given predicate.
|
||||
// If it returns true, add the Symbol * to the provided vector.
|
||||
for (unsigned int i = 0; i < functions.size(); ++i) {
|
||||
FunctionMapType &fm = *(functions[i]);
|
||||
FunctionMapType::const_iterator iter;
|
||||
for (iter = fm.begin(); iter != fm.end(); ++iter) {
|
||||
const std::vector<Symbol *> &syms = iter->second;
|
||||
for (unsigned int j = 0; j < syms.size(); ++j) {
|
||||
if (pred(syms[j]))
|
||||
matches->push_back(syms[j]);
|
||||
}
|
||||
FunctionMapType::const_iterator iter;
|
||||
for (iter = functions.begin(); iter != functions.end(); ++iter) {
|
||||
const std::vector<Symbol *> &syms = iter->second;
|
||||
for (unsigned int j = 0; j < syms.size(); ++j) {
|
||||
if (pred(syms[j]))
|
||||
matches->push_back(syms[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -46,7 +46,6 @@
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
#include <stdint.h>
|
||||
#ifdef ISPC_IS_LINUX
|
||||
#include <malloc.h>
|
||||
|
||||
17
tests/atomics-swap.ispc
Normal file
17
tests/atomics-swap.ispc
Normal file
@@ -0,0 +1,17 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
uniform int32 s = 1234;
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
float a = aFOO[programIndex];
|
||||
float b = 0;
|
||||
if (programIndex & 1) {
|
||||
b = atomic_swap_global(&s, programIndex);
|
||||
}
|
||||
RET[programIndex] = reduce_add(b) + s;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 1234 + reduce_add(programIndex & 1 ? programIndex : 0);
|
||||
}
|
||||
17
tests/goto-1.ispc
Normal file
17
tests/goto-1.ispc
Normal file
@@ -0,0 +1,17 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
float a = aFOO[programIndex];
|
||||
float b = 0.; b = a;
|
||||
RET[programIndex] = a+b;
|
||||
goto skip;
|
||||
RET[programIndex] = 0;
|
||||
skip:
|
||||
;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 2 + 2*programIndex;
|
||||
}
|
||||
18
tests/goto-2.ispc
Normal file
18
tests/goto-2.ispc
Normal file
@@ -0,0 +1,18 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
float a = aFOO[programIndex];
|
||||
float b = 0.; b = a;
|
||||
RET[programIndex] = a+b;
|
||||
if (all(a != 0))
|
||||
goto skip;
|
||||
RET[programIndex] = 0;
|
||||
skip:
|
||||
;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 2 + 2*programIndex;
|
||||
}
|
||||
18
tests/goto-3.ispc
Normal file
18
tests/goto-3.ispc
Normal file
@@ -0,0 +1,18 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
float a = aFOO[programIndex];
|
||||
float b = 0.; b = a;
|
||||
RET[programIndex] = a+b;
|
||||
if (all(a == 0))
|
||||
goto skip;
|
||||
RET[programIndex] = 0;
|
||||
skip:
|
||||
;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0;
|
||||
}
|
||||
19
tests/goto-4.ispc
Normal file
19
tests/goto-4.ispc
Normal file
@@ -0,0 +1,19 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
float a = aFOO[programIndex];
|
||||
float b = 0.; b = a;
|
||||
RET[programIndex] = 0;
|
||||
encore:
|
||||
++RET[programIndex];
|
||||
if (any(a != 0)) {
|
||||
a = max(a-1, 0);
|
||||
goto encore;
|
||||
}
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = programCount+1;
|
||||
}
|
||||
21
tests/half-3.ispc
Normal file
21
tests/half-3.ispc
Normal file
@@ -0,0 +1,21 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_v(uniform float RET[]) {
|
||||
int errors = 0;
|
||||
|
||||
foreach (i = 0 ... 65535) {
|
||||
unsigned int16 h = i;
|
||||
float f = half_to_float(i);
|
||||
h = float_to_half(f);
|
||||
|
||||
int mismatches = (f == f && i != h);
|
||||
errors += reduce_add(mismatches);
|
||||
}
|
||||
|
||||
RET[programIndex] = errors;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0;
|
||||
}
|
||||
13
tests/kilo-mega-giga-1.ispc
Normal file
13
tests/kilo-mega-giga-1.ispc
Normal file
@@ -0,0 +1,13 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
float a = aFOO[programIndex];
|
||||
a *= 1k;
|
||||
RET[programIndex] = a;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 1024*(programIndex+1);
|
||||
}
|
||||
12
tests/kilo-mega-giga-2.ispc
Normal file
12
tests/kilo-mega-giga-2.ispc
Normal file
@@ -0,0 +1,12 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
int a = b + 2M;
|
||||
RET[programIndex] = a;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 2*1024*1024 + 5;
|
||||
}
|
||||
14
tests/kilo-mega-giga-3.ispc
Normal file
14
tests/kilo-mega-giga-3.ispc
Normal file
@@ -0,0 +1,14 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
unsigned int32 a = 3G;
|
||||
a -= 2G;
|
||||
a -= 1024M;
|
||||
RET[programIndex] = a;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0;
|
||||
}
|
||||
16
tests/ptr-assign-lhs-math-1.ispc
Normal file
16
tests/ptr-assign-lhs-math-1.ispc
Normal file
@@ -0,0 +1,16 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
uniform float a[programCount];
|
||||
a[programIndex] = aFOO[programIndex];
|
||||
|
||||
uniform float * uniform ptr = a;
|
||||
*(ptr+1) = 0;
|
||||
RET[programIndex] = a[programIndex];
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 1+programIndex;
|
||||
RET[1] = 0;
|
||||
}
|
||||
15
tests/ptr-assign-lhs-math-2.ispc
Normal file
15
tests/ptr-assign-lhs-math-2.ispc
Normal file
@@ -0,0 +1,15 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
uniform float a[programCount];
|
||||
a[programIndex] = aFOO[programIndex];
|
||||
|
||||
uniform float * varying ptr = a;
|
||||
*(ptr+programIndex) = 0;
|
||||
RET[programIndex] = a[programIndex];
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0;
|
||||
}
|
||||
18
tests/switch-1.ispc
Normal file
18
tests/switch-1.ispc
Normal file
@@ -0,0 +1,18 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
int a = aFOO[programIndex];
|
||||
switch (b) {
|
||||
default:
|
||||
RET[programIndex] = -1;
|
||||
break;
|
||||
case 5:
|
||||
RET[programIndex] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0;
|
||||
}
|
||||
44
tests/switch-10.ispc
Normal file
44
tests/switch-10.ispc
Normal file
@@ -0,0 +1,44 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
int switchit(int a, uniform int b) {
|
||||
switch (a) {
|
||||
case 3:
|
||||
return 1;
|
||||
case 7:
|
||||
case 6:
|
||||
case 4:
|
||||
case 5:
|
||||
if (a & 1)
|
||||
break;
|
||||
return 2;
|
||||
case 1: {
|
||||
switch (a+b) {
|
||||
case 6:
|
||||
return 42;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return -1234;
|
||||
}
|
||||
case 32:
|
||||
*((int *)NULL) = 0;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
return 3;
|
||||
}
|
||||
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
int a = aFOO[programIndex];
|
||||
int x = switchit(a, b);
|
||||
RET[programIndex] = x;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0;
|
||||
RET[0] = 42;
|
||||
RET[2] = 1;
|
||||
RET[6] = RET[4] = 3;
|
||||
RET[5] = RET[3] = 2;
|
||||
}
|
||||
50
tests/switch-11.ispc
Normal file
50
tests/switch-11.ispc
Normal file
@@ -0,0 +1,50 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
int switchit(int a, uniform int b) {
|
||||
switch (a) {
|
||||
case 3:
|
||||
return 1;
|
||||
case 7:
|
||||
case 6:
|
||||
case 4:
|
||||
case 5:
|
||||
if (a & 1)
|
||||
break;
|
||||
return 2;
|
||||
case 1: {
|
||||
switch (a+b) {
|
||||
case 60:
|
||||
return -1234;
|
||||
default:
|
||||
break;
|
||||
case 6:
|
||||
if (b == 5)
|
||||
break;
|
||||
return -42;
|
||||
case 12:
|
||||
return -1;
|
||||
}
|
||||
return 42;
|
||||
}
|
||||
case 32:
|
||||
*((int *)NULL) = 0;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
return 3;
|
||||
}
|
||||
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
int a = aFOO[programIndex];
|
||||
int x = switchit(a, b);
|
||||
RET[programIndex] = x;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0;
|
||||
RET[0] = 42;
|
||||
RET[2] = 1;
|
||||
RET[6] = RET[4] = 3;
|
||||
RET[5] = RET[3] = 2;
|
||||
}
|
||||
54
tests/switch-12.ispc
Normal file
54
tests/switch-12.ispc
Normal file
@@ -0,0 +1,54 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
int switchit(int a, uniform int b) {
|
||||
switch (a) {
|
||||
case 3:
|
||||
return 1;
|
||||
case 7:
|
||||
case 6:
|
||||
case 4:
|
||||
case 5:
|
||||
if (a & 1)
|
||||
break;
|
||||
return 2;
|
||||
case 1: {
|
||||
switch (a+b) {
|
||||
case 60:
|
||||
return -1234;
|
||||
default:
|
||||
break;
|
||||
case 6:
|
||||
int count = 0;
|
||||
for (count = 0; count < 10; ++count) {
|
||||
a += b;
|
||||
if (a == 11)
|
||||
break;
|
||||
}
|
||||
return a;
|
||||
case 12:
|
||||
return -1;
|
||||
}
|
||||
return 42;
|
||||
}
|
||||
case 32:
|
||||
*((int *)NULL) = 0;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
return 3;
|
||||
}
|
||||
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
int a = aFOO[programIndex];
|
||||
int x = switchit(a, b);
|
||||
RET[programIndex] = x;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0;
|
||||
RET[0] = 11;
|
||||
RET[2] = 1;
|
||||
RET[6] = RET[4] = 3;
|
||||
RET[5] = RET[3] = 2;
|
||||
}
|
||||
28
tests/switch-13.ispc
Normal file
28
tests/switch-13.ispc
Normal file
@@ -0,0 +1,28 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
int switchit(int a, uniform int b) {
|
||||
int r = -1;
|
||||
switch (b) {
|
||||
case 5:
|
||||
if (a & 1) {
|
||||
r=3;
|
||||
break;
|
||||
}
|
||||
r= 2;
|
||||
break;
|
||||
default:
|
||||
r= 3;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
int a = aFOO[programIndex];
|
||||
int x = switchit(a, b);
|
||||
RET[programIndex] = x;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = (programIndex & 1) ? 2 : 3;
|
||||
}
|
||||
24
tests/switch-14.ispc
Normal file
24
tests/switch-14.ispc
Normal file
@@ -0,0 +1,24 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
int switchit(int a, uniform int b) {
|
||||
switch (b) {
|
||||
case 5:
|
||||
if (a & 1)
|
||||
break;
|
||||
return 2;
|
||||
default:
|
||||
return 42;
|
||||
}
|
||||
return 3;
|
||||
}
|
||||
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
int a = aFOO[programIndex];
|
||||
int x = switchit(a, b);
|
||||
RET[programIndex] = x;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = (programIndex & 1) ? 2 : 3;
|
||||
}
|
||||
17
tests/switch-2.ispc
Normal file
17
tests/switch-2.ispc
Normal file
@@ -0,0 +1,17 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
int a = aFOO[programIndex];
|
||||
switch (b) {
|
||||
default:
|
||||
RET[programIndex] = -1;
|
||||
case 5:
|
||||
RET[programIndex] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0;
|
||||
}
|
||||
18
tests/switch-3.ispc
Normal file
18
tests/switch-3.ispc
Normal file
@@ -0,0 +1,18 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
int a = aFOO[programIndex];
|
||||
switch (b) {
|
||||
case 5:
|
||||
RET[programIndex] = 0;
|
||||
break;
|
||||
default:
|
||||
RET[programIndex] = -1;
|
||||
}
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0;
|
||||
}
|
||||
24
tests/switch-4.ispc
Normal file
24
tests/switch-4.ispc
Normal file
@@ -0,0 +1,24 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
int switchit(int a, uniform int b) {
|
||||
int r = 0;
|
||||
switch (a) {
|
||||
case 3:
|
||||
r = 1;
|
||||
break;
|
||||
default:
|
||||
r = 0;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
int a = aFOO[programIndex];
|
||||
int x = switchit(a, b);
|
||||
RET[programIndex] = x;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = (programIndex == 2) ? 1 : 0;
|
||||
}
|
||||
22
tests/switch-5.ispc
Normal file
22
tests/switch-5.ispc
Normal file
@@ -0,0 +1,22 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
int switchit(int a, uniform int b) {
|
||||
int r = 0;
|
||||
switch (a) {
|
||||
case 3:
|
||||
return 1;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
int a = aFOO[programIndex];
|
||||
int x = switchit(a, b);
|
||||
RET[programIndex] = x;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = (programIndex == 2) ? 1 : 0;
|
||||
}
|
||||
27
tests/switch-6.ispc
Normal file
27
tests/switch-6.ispc
Normal file
@@ -0,0 +1,27 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
int switchit(int a, uniform int b) {
|
||||
switch (a) {
|
||||
case 3:
|
||||
return 1;
|
||||
case 7:
|
||||
if (b == 5)
|
||||
break;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
int a = aFOO[programIndex];
|
||||
int x = switchit(a, b);
|
||||
RET[programIndex] = x;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0;
|
||||
RET[2] = 1;
|
||||
RET[6] = -1;
|
||||
}
|
||||
32
tests/switch-7.ispc
Normal file
32
tests/switch-7.ispc
Normal file
@@ -0,0 +1,32 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
int switchit(int a, uniform int b) {
|
||||
switch (a) {
|
||||
case 3:
|
||||
return 1;
|
||||
case 7:
|
||||
case 6:
|
||||
case 4:
|
||||
case 5:
|
||||
if (a & 1)
|
||||
break;
|
||||
return 2;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
return 3;
|
||||
}
|
||||
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
int a = aFOO[programIndex];
|
||||
int x = switchit(a, b);
|
||||
RET[programIndex] = x;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0;
|
||||
RET[2] = 1;
|
||||
RET[6] = RET[4] = 3;
|
||||
RET[5] = RET[3] = 2;
|
||||
}
|
||||
36
tests/switch-8.ispc
Normal file
36
tests/switch-8.ispc
Normal file
@@ -0,0 +1,36 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
int switchit(int a, uniform int b) {
|
||||
switch (a) {
|
||||
case 3:
|
||||
return 1;
|
||||
case 7:
|
||||
case 6:
|
||||
case 4:
|
||||
case 5:
|
||||
if (a & 1)
|
||||
break;
|
||||
return 2;
|
||||
case 32:
|
||||
*((int *)NULL) = 0;
|
||||
default:
|
||||
case 1:
|
||||
case 2:
|
||||
return 0;
|
||||
}
|
||||
return 3;
|
||||
}
|
||||
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
int a = aFOO[programIndex];
|
||||
int x = switchit(a, b);
|
||||
RET[programIndex] = x;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0;
|
||||
RET[2] = 1;
|
||||
RET[6] = RET[4] = 3;
|
||||
RET[5] = RET[3] = 2;
|
||||
}
|
||||
34
tests/switch-9.ispc
Normal file
34
tests/switch-9.ispc
Normal file
@@ -0,0 +1,34 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
int switchit(int a, uniform int b) {
|
||||
switch (a) {
|
||||
case 3:
|
||||
return 1;
|
||||
case 7:
|
||||
case 6:
|
||||
case 4:
|
||||
case 5:
|
||||
if (a & 1)
|
||||
break;
|
||||
return 2;
|
||||
case 32:
|
||||
*((int *)NULL) = 0;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
return 3;
|
||||
}
|
||||
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
int a = aFOO[programIndex];
|
||||
int x = switchit(a, b);
|
||||
RET[programIndex] = x;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0;
|
||||
RET[2] = 1;
|
||||
RET[6] = RET[4] = 3;
|
||||
RET[5] = RET[3] = 2;
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
// Can't convert argument of type "void * const uniform" to type "float" for funcion call argument.
|
||||
// Can't convert argument of type "void * uniform" to type "float" for function call argument.
|
||||
|
||||
float bar(float a, float b);
|
||||
|
||||
|
||||
10
tests_errors/goto-1.ispc
Normal file
10
tests_errors/goto-1.ispc
Normal file
@@ -0,0 +1,10 @@
|
||||
// Multiple labels named "label" in function
|
||||
|
||||
void func(int x) {
|
||||
label:
|
||||
;
|
||||
label:
|
||||
;
|
||||
}
|
||||
|
||||
|
||||
11
tests_errors/goto-2.ispc
Normal file
11
tests_errors/goto-2.ispc
Normal file
@@ -0,0 +1,11 @@
|
||||
// "goto" statements are only legal under "uniform" control flow
|
||||
|
||||
void func(int x) {
|
||||
if (x < 0)
|
||||
goto label;
|
||||
|
||||
label:
|
||||
;
|
||||
}
|
||||
|
||||
|
||||
11
tests_errors/goto-3.ispc
Normal file
11
tests_errors/goto-3.ispc
Normal file
@@ -0,0 +1,11 @@
|
||||
// "goto" statements are only legal under "uniform" control flow
|
||||
|
||||
void func(int x) {
|
||||
cif (x < 0)
|
||||
goto label;
|
||||
|
||||
label:
|
||||
;
|
||||
}
|
||||
|
||||
|
||||
10
tests_errors/goto-4.ispc
Normal file
10
tests_errors/goto-4.ispc
Normal file
@@ -0,0 +1,10 @@
|
||||
// "goto" statements are only legal under "uniform" control flow
|
||||
|
||||
void func(int x) {
|
||||
label:
|
||||
|
||||
for(int i =0 ;i<x;)
|
||||
goto label;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
// Left hand side of assignment statement can't be assigned to
|
||||
// Left hand side of assignment expression can't be assigned to
|
||||
|
||||
int foo() {return 2;}
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
// Left hand side of assignment statement can't be assigned to
|
||||
// Can't assign to type "const uniform int32" on left-hand side of expression
|
||||
|
||||
int bar(){
|
||||
4 = 0;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
// Left hand side of assignment statement can't be assigned to
|
||||
// Can't assign to type "const uniform int32" on left-hand side of expression
|
||||
|
||||
int bar(){
|
||||
int x;
|
||||
|
||||
9
tests_errors/switch-1.ispc
Normal file
9
tests_errors/switch-1.ispc
Normal file
@@ -0,0 +1,9 @@
|
||||
// Case statement value must be a compile-time integer constant
|
||||
|
||||
void foo(float f) {
|
||||
switch (f) {
|
||||
case 1.5:
|
||||
++f;
|
||||
}
|
||||
}
|
||||
|
||||
12
tests_errors/switch-2.ispc
Normal file
12
tests_errors/switch-2.ispc
Normal file
@@ -0,0 +1,12 @@
|
||||
// Duplicate case value "1"
|
||||
|
||||
void foo(float f) {
|
||||
switch (f) {
|
||||
case 1:
|
||||
++f;
|
||||
case 2:
|
||||
case 1:
|
||||
f = 0;
|
||||
}
|
||||
}
|
||||
|
||||
13
tests_errors/switch-3.ispc
Normal file
13
tests_errors/switch-3.ispc
Normal file
@@ -0,0 +1,13 @@
|
||||
// "case" label illegal outside of "switch" statement
|
||||
|
||||
void foo(float f) {
|
||||
switch (f) {
|
||||
case 1:
|
||||
++f;
|
||||
case 2:
|
||||
f = 0;
|
||||
}
|
||||
case 3:
|
||||
--f;
|
||||
}
|
||||
|
||||
13
tests_errors/switch-4.ispc
Normal file
13
tests_errors/switch-4.ispc
Normal file
@@ -0,0 +1,13 @@
|
||||
// "default" label illegal outside of "switch" statement
|
||||
|
||||
void foo(float f) {
|
||||
default:
|
||||
++f;
|
||||
switch (f) {
|
||||
case 1:
|
||||
++f;
|
||||
case 2:
|
||||
f = 0;
|
||||
}
|
||||
}
|
||||
|
||||
14
tests_errors/switch-5.ispc
Normal file
14
tests_errors/switch-5.ispc
Normal file
@@ -0,0 +1,14 @@
|
||||
// "default" label illegal outside of "switch" statement
|
||||
|
||||
void foo(float f) {
|
||||
default:
|
||||
++f;
|
||||
switch (f) {
|
||||
case 1:
|
||||
++f;
|
||||
continue;
|
||||
case 2:
|
||||
f = 0;
|
||||
}
|
||||
}
|
||||
|
||||
12
tests_errors/switch-6.ispc
Normal file
12
tests_errors/switch-6.ispc
Normal file
@@ -0,0 +1,12 @@
|
||||
// "continue" statement illegal outside of for/while/do/foreach loops
|
||||
|
||||
void foo(float f) {
|
||||
switch (f) {
|
||||
case 1:
|
||||
++f;
|
||||
continue;
|
||||
case 2:
|
||||
f = 0;
|
||||
}
|
||||
}
|
||||
|
||||
151
type.h
151
type.h
@@ -78,20 +78,44 @@ public:
|
||||
/** Returns true if the underlying type is a float or integer type. */
|
||||
bool IsNumericType() const { return IsFloatType() || IsIntType(); }
|
||||
|
||||
/** Types may have uniform, varying, or not-yet-determined variability;
|
||||
this enumerant is used by Type implementations to record their
|
||||
variability. */
|
||||
enum Variability {
|
||||
Uniform,
|
||||
Varying,
|
||||
Unbound
|
||||
};
|
||||
|
||||
/** Returns the variability of the type. */
|
||||
virtual Variability GetVariability() const = 0;
|
||||
|
||||
/** Returns true if the underlying type is uniform */
|
||||
virtual bool IsUniformType() const = 0;
|
||||
bool IsUniformType() const { return GetVariability() == Uniform; }
|
||||
|
||||
/** Returns true if the underlying type is varying */
|
||||
bool IsVaryingType() const { return !IsUniformType(); }
|
||||
bool IsVaryingType() const { return GetVariability() == Varying; }
|
||||
|
||||
/** Returns true if the underlying type's uniform/varying-ness is
|
||||
unbound. */
|
||||
bool HasUnboundVariability() const { return GetVariability() == Unbound; }
|
||||
|
||||
/** Returns a type wherein any elements of the original type and
|
||||
contained types that have unbound variability have their variability
|
||||
set to the given variability. */
|
||||
virtual const Type *ResolveUnboundVariability(Variability v) const = 0;
|
||||
|
||||
/** Return a "uniform" instance of this type. If the type is already
|
||||
uniform, its "this" pointer will be returned. */
|
||||
virtual const Type *GetAsUniformType() const = 0;
|
||||
|
||||
/** Return a "varying" instance of this type. If the type is already
|
||||
uniform, its "this" pointer will be returned. */
|
||||
varying, its "this" pointer will be returned. */
|
||||
virtual const Type *GetAsVaryingType() const = 0;
|
||||
|
||||
/** Get an instance of the type with unbound variability. */
|
||||
virtual const Type *GetAsUnboundVariabilityType() const = 0;
|
||||
|
||||
/** If this is a signed integer type, return the unsigned version of
|
||||
the type. Otherwise, return the original type. */
|
||||
virtual const Type *GetAsUnsignedType() const;
|
||||
@@ -185,7 +209,8 @@ public:
|
||||
*/
|
||||
class AtomicType : public Type {
|
||||
public:
|
||||
bool IsUniformType() const;
|
||||
Variability GetVariability() const;
|
||||
|
||||
bool IsBoolType() const;
|
||||
bool IsFloatType() const;
|
||||
bool IsIntType() const;
|
||||
@@ -195,8 +220,10 @@ public:
|
||||
/** For AtomicTypes, the base type is just the same as the AtomicType
|
||||
itself. */
|
||||
const AtomicType *GetBaseType() const;
|
||||
const AtomicType *GetAsVaryingType() const;
|
||||
const AtomicType *GetAsUniformType() const;
|
||||
const AtomicType *GetAsVaryingType() const;
|
||||
const AtomicType *GetAsUnboundVariabilityType() const;
|
||||
const AtomicType *ResolveUnboundVariability(Variability v) const;
|
||||
const AtomicType *GetAsUnsignedType() const;
|
||||
const Type *GetSOAType(int width) const;
|
||||
const AtomicType *GetAsConstType() const;
|
||||
@@ -224,38 +251,45 @@ public:
|
||||
TYPE_INT64,
|
||||
TYPE_UINT64,
|
||||
TYPE_DOUBLE,
|
||||
NUM_BASIC_TYPES
|
||||
};
|
||||
|
||||
const BasicType basicType;
|
||||
|
||||
static const AtomicType *UniformBool, *VaryingBool;
|
||||
static const AtomicType *UniformInt8, *VaryingInt8;
|
||||
static const AtomicType *UniformInt16, *VaryingInt16;
|
||||
static const AtomicType *UniformInt32, *VaryingInt32;
|
||||
static const AtomicType *UniformUInt8, *VaryingUInt8;
|
||||
static const AtomicType *UniformUInt16, *VaryingUInt16;
|
||||
static const AtomicType *UniformUInt32, *VaryingUInt32;
|
||||
static const AtomicType *UniformFloat, *VaryingFloat;
|
||||
static const AtomicType *UniformInt64, *VaryingInt64;
|
||||
static const AtomicType *UniformUInt64, *VaryingUInt64;
|
||||
static const AtomicType *UniformDouble, *VaryingDouble;
|
||||
static const AtomicType *UniformConstBool, *VaryingConstBool;
|
||||
static const AtomicType *UniformConstInt8, *VaryingConstInt8;
|
||||
static const AtomicType *UniformConstInt16, *VaryingConstInt16;
|
||||
static const AtomicType *UniformConstInt32, *VaryingConstInt32;
|
||||
static const AtomicType *UniformConstUInt8, *VaryingConstUInt8;
|
||||
static const AtomicType *UniformConstUInt16, *VaryingConstUInt16;
|
||||
static const AtomicType *UniformConstUInt32, *VaryingConstUInt32;
|
||||
static const AtomicType *UniformConstFloat, *VaryingConstFloat;
|
||||
static const AtomicType *UniformConstInt64, *VaryingConstInt64;
|
||||
static const AtomicType *UniformConstUInt64, *VaryingConstUInt64;
|
||||
static const AtomicType *UniformConstDouble, *VaryingConstDouble;
|
||||
static const AtomicType *UniformBool, *VaryingBool, *UnboundBool;
|
||||
static const AtomicType *UniformInt8, *VaryingInt8, *UnboundInt8;
|
||||
static const AtomicType *UniformInt16, *VaryingInt16, *UnboundInt16;
|
||||
static const AtomicType *UniformInt32, *VaryingInt32, *UnboundInt32;
|
||||
static const AtomicType *UniformUInt8, *VaryingUInt8, *UnboundUInt8;
|
||||
static const AtomicType *UniformUInt16, *VaryingUInt16, *UnboundUInt16;
|
||||
static const AtomicType *UniformUInt32, *VaryingUInt32, *UnboundUInt32;
|
||||
static const AtomicType *UniformFloat, *VaryingFloat, *UnboundFloat;
|
||||
static const AtomicType *UniformInt64, *VaryingInt64, *UnboundInt64;
|
||||
static const AtomicType *UniformUInt64, *VaryingUInt64, *UnboundUInt64;
|
||||
static const AtomicType *UniformDouble, *VaryingDouble, *UnboundDouble;
|
||||
static const AtomicType *UniformConstBool, *VaryingConstBool, *UnboundConstBool;
|
||||
static const AtomicType *UniformConstInt8, *VaryingConstInt8, *UnboundConstInt8;
|
||||
static const AtomicType *UniformConstInt16, *VaryingConstInt16, *UnboundConstInt16;
|
||||
static const AtomicType *UniformConstInt32, *VaryingConstInt32, *UnboundConstInt32;
|
||||
static const AtomicType *UniformConstUInt8, *VaryingConstUInt8, *UnboundConstUInt8;
|
||||
static const AtomicType *UniformConstUInt16, *VaryingConstUInt16, *UnboundConstUInt16;
|
||||
static const AtomicType *UniformConstUInt32, *VaryingConstUInt32, *UnboundConstUInt32;
|
||||
static const AtomicType *UniformConstFloat, *VaryingConstFloat, *UnboundConstFloat;
|
||||
static const AtomicType *UniformConstInt64, *VaryingConstInt64, *UnboundConstInt64;
|
||||
static const AtomicType *UniformConstUInt64, *VaryingConstUInt64, *UnboundConstUInt64;
|
||||
static const AtomicType *UniformConstDouble, *VaryingConstDouble, *UnboundConstDouble;
|
||||
static const AtomicType *Void;
|
||||
|
||||
/** This function must be called before any of the above static const
|
||||
AtomicType values is used; in practice, we do it early in
|
||||
main(). */
|
||||
static void Init();
|
||||
|
||||
private:
|
||||
const bool isUniform;
|
||||
static const AtomicType *typeTable[NUM_BASIC_TYPES][3][2];
|
||||
const Variability variability;
|
||||
const bool isConst;
|
||||
AtomicType(BasicType basicType, bool isUniform, bool isConst);
|
||||
AtomicType(BasicType basicType, Variability v, bool isConst);
|
||||
};
|
||||
|
||||
|
||||
@@ -268,7 +302,8 @@ public:
|
||||
/** Constructor for named enumerated types */
|
||||
EnumType(const char *name, SourcePos pos);
|
||||
|
||||
bool IsUniformType() const;
|
||||
Variability GetVariability() const;
|
||||
|
||||
bool IsBoolType() const;
|
||||
bool IsFloatType() const;
|
||||
bool IsIntType() const;
|
||||
@@ -278,6 +313,8 @@ public:
|
||||
const EnumType *GetBaseType() const;
|
||||
const EnumType *GetAsVaryingType() const;
|
||||
const EnumType *GetAsUniformType() const;
|
||||
const EnumType *GetAsUnboundVariabilityType() const;
|
||||
const EnumType *ResolveUnboundVariability(Variability v) const;
|
||||
const Type *GetSOAType(int width) const;
|
||||
const EnumType *GetAsConstType() const;
|
||||
const EnumType *GetAsNonConstType() const;
|
||||
@@ -300,15 +337,17 @@ public:
|
||||
|
||||
private:
|
||||
const std::string name;
|
||||
bool isUniform, isConst;
|
||||
Variability variability;
|
||||
bool isConst;
|
||||
std::vector<Symbol *> enumerators;
|
||||
};
|
||||
|
||||
|
||||
/** @brief Type implementation for pointers to other types
|
||||
*/
|
||||
class PointerType : public Type {
|
||||
public:
|
||||
PointerType(const Type *t, bool isUniform, bool isConst);
|
||||
PointerType(const Type *t, Variability v, bool isConst);
|
||||
|
||||
/** Helper method to return a uniform pointer to the given type. */
|
||||
static PointerType *GetUniform(const Type *t);
|
||||
@@ -318,7 +357,8 @@ public:
|
||||
/** Returns true if the given type is a void * type. */
|
||||
static bool IsVoidPointer(const Type *t);
|
||||
|
||||
bool IsUniformType() const;
|
||||
Variability GetVariability() const;
|
||||
|
||||
bool IsBoolType() const;
|
||||
bool IsFloatType() const;
|
||||
bool IsIntType() const;
|
||||
@@ -328,6 +368,8 @@ public:
|
||||
const Type *GetBaseType() const;
|
||||
const PointerType *GetAsVaryingType() const;
|
||||
const PointerType *GetAsUniformType() const;
|
||||
const PointerType *GetAsUnboundVariabilityType() const;
|
||||
const PointerType *ResolveUnboundVariability(Variability v) const;
|
||||
const Type *GetSOAType(int width) const;
|
||||
const PointerType *GetAsConstType() const;
|
||||
const PointerType *GetAsNonConstType() const;
|
||||
@@ -342,7 +384,8 @@ public:
|
||||
static PointerType *Void;
|
||||
|
||||
private:
|
||||
const bool isUniform, isConst;
|
||||
const Variability variability;
|
||||
const bool isConst;
|
||||
const Type *baseType;
|
||||
};
|
||||
|
||||
@@ -408,7 +451,8 @@ public:
|
||||
*/
|
||||
ArrayType(const Type *elementType, int numElements);
|
||||
|
||||
bool IsUniformType() const;
|
||||
Variability GetVariability() const;
|
||||
|
||||
bool IsBoolType() const;
|
||||
bool IsFloatType() const;
|
||||
bool IsIntType() const;
|
||||
@@ -418,6 +462,9 @@ public:
|
||||
const Type *GetBaseType() const;
|
||||
const ArrayType *GetAsVaryingType() const;
|
||||
const ArrayType *GetAsUniformType() const;
|
||||
const ArrayType *GetAsUnboundVariabilityType() const;
|
||||
const ArrayType *ResolveUnboundVariability(Variability v) const;
|
||||
|
||||
const ArrayType *GetAsUnsignedType() const;
|
||||
const Type *GetSOAType(int width) const;
|
||||
const ArrayType *GetAsConstType() const;
|
||||
@@ -495,6 +542,9 @@ public:
|
||||
|
||||
const SOAArrayType *GetAsVaryingType() const;
|
||||
const SOAArrayType *GetAsUniformType() const;
|
||||
const SOAArrayType *GetAsUnboundVariabilityType() const;
|
||||
const SOAArrayType *ResolveUnboundVariability(Variability v) const;
|
||||
|
||||
const Type *GetSOAType(int width) const;
|
||||
const SOAArrayType *GetAsConstType() const;
|
||||
const SOAArrayType *GetAsNonConstType() const;
|
||||
@@ -536,7 +586,8 @@ class VectorType : public SequentialType {
|
||||
public:
|
||||
VectorType(const AtomicType *base, int size);
|
||||
|
||||
bool IsUniformType() const;
|
||||
Variability GetVariability() const;
|
||||
|
||||
bool IsBoolType() const;
|
||||
bool IsFloatType() const;
|
||||
bool IsIntType() const;
|
||||
@@ -546,6 +597,9 @@ public:
|
||||
const Type *GetBaseType() const;
|
||||
const VectorType *GetAsVaryingType() const;
|
||||
const VectorType *GetAsUniformType() const;
|
||||
const VectorType *GetAsUnboundVariabilityType() const;
|
||||
const VectorType *ResolveUnboundVariability(Variability v) const;
|
||||
|
||||
const Type *GetSOAType(int width) const;
|
||||
const VectorType *GetAsConstType() const;
|
||||
const VectorType *GetAsNonConstType() const;
|
||||
@@ -580,9 +634,10 @@ public:
|
||||
StructType(const std::string &name, const std::vector<const Type *> &elts,
|
||||
const std::vector<std::string> &eltNames,
|
||||
const std::vector<SourcePos> &eltPositions, bool isConst,
|
||||
bool isUniform, SourcePos pos);
|
||||
Variability variability, SourcePos pos);
|
||||
|
||||
Variability GetVariability() const;
|
||||
|
||||
bool IsUniformType() const;
|
||||
bool IsBoolType() const;
|
||||
bool IsFloatType() const;
|
||||
bool IsIntType() const;
|
||||
@@ -592,6 +647,9 @@ public:
|
||||
const Type *GetBaseType() const;
|
||||
const StructType *GetAsVaryingType() const;
|
||||
const StructType *GetAsUniformType() const;
|
||||
const StructType *GetAsUnboundVariabilityType() const;
|
||||
const StructType *ResolveUnboundVariability(Variability v) const;
|
||||
|
||||
const Type *GetSOAType(int width) const;
|
||||
const StructType *GetAsConstType() const;
|
||||
const StructType *GetAsNonConstType() const;
|
||||
@@ -641,7 +699,7 @@ private:
|
||||
/** Source file position at which each structure element declaration
|
||||
appeared. */
|
||||
const std::vector<SourcePos> elementPositions;
|
||||
const bool isUniform;
|
||||
const Variability variability;
|
||||
const bool isConst;
|
||||
const SourcePos pos;
|
||||
};
|
||||
@@ -653,7 +711,8 @@ class ReferenceType : public Type {
|
||||
public:
|
||||
ReferenceType(const Type *targetType);
|
||||
|
||||
bool IsUniformType() const;
|
||||
Variability GetVariability() const;
|
||||
|
||||
bool IsBoolType() const;
|
||||
bool IsFloatType() const;
|
||||
bool IsIntType() const;
|
||||
@@ -664,6 +723,9 @@ public:
|
||||
const Type *GetReferenceTarget() const;
|
||||
const ReferenceType *GetAsVaryingType() const;
|
||||
const ReferenceType *GetAsUniformType() const;
|
||||
const ReferenceType *GetAsUnboundVariabilityType() const;
|
||||
const ReferenceType *ResolveUnboundVariability(Variability v) const;
|
||||
|
||||
const Type *GetSOAType(int width) const;
|
||||
const ReferenceType *GetAsConstType() const;
|
||||
const ReferenceType *GetAsNonConstType() const;
|
||||
@@ -696,13 +758,14 @@ public:
|
||||
FunctionType(const Type *returnType,
|
||||
const std::vector<const Type *> &argTypes, SourcePos pos);
|
||||
FunctionType(const Type *returnType,
|
||||
const std::vector<const Type *> &argTypes, SourcePos pos,
|
||||
const std::vector<const Type *> &argTypes,
|
||||
const std::vector<std::string> &argNames,
|
||||
const std::vector<ConstExpr *> &argDefaults,
|
||||
const std::vector<SourcePos> &argPos,
|
||||
bool isTask, bool isExported, bool isExternC);
|
||||
|
||||
bool IsUniformType() const;
|
||||
Variability GetVariability() const;
|
||||
|
||||
bool IsBoolType() const;
|
||||
bool IsFloatType() const;
|
||||
bool IsIntType() const;
|
||||
@@ -712,6 +775,9 @@ public:
|
||||
const Type *GetBaseType() const;
|
||||
const Type *GetAsVaryingType() const;
|
||||
const Type *GetAsUniformType() const;
|
||||
const Type *GetAsUnboundVariabilityType() const;
|
||||
const FunctionType *ResolveUnboundVariability(Variability v) const;
|
||||
|
||||
const Type *GetSOAType(int width) const;
|
||||
const Type *GetAsConstType() const;
|
||||
const Type *GetAsNonConstType() const;
|
||||
@@ -752,6 +818,7 @@ public:
|
||||
|
||||
private:
|
||||
const Type * const returnType;
|
||||
|
||||
// The following four vectors should all have the same length (which is
|
||||
// in turn the length returned by GetNumParameters()).
|
||||
const std::vector<const Type *> paramTypes;
|
||||
|
||||
58
util.cpp
58
util.cpp
@@ -39,6 +39,9 @@
|
||||
#include "module.h"
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#include <shlwapi.h>
|
||||
#ifdef __MINGW32__
|
||||
#include <malloc.h> // for alloca()
|
||||
#endif
|
||||
#else
|
||||
#include <alloca.h>
|
||||
#endif
|
||||
@@ -75,7 +78,7 @@ lTerminalWidth() {
|
||||
HANDLE h = GetStdHandle(STD_OUTPUT_HANDLE);
|
||||
if (h == INVALID_HANDLE_VALUE || h == NULL)
|
||||
return 80;
|
||||
CONSOLE_SCREEN_BUFFER_INFO bufferInfo = { 0 };
|
||||
CONSOLE_SCREEN_BUFFER_INFO bufferInfo = { {0} };
|
||||
GetConsoleScreenBufferInfo(h, &bufferInfo);
|
||||
return bufferInfo.dwSize.X;
|
||||
#else
|
||||
@@ -187,6 +190,32 @@ lPrintWithWordBreaks(const char *buf, int columnWidth, FILE *out) {
|
||||
}
|
||||
|
||||
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
// We cover for the lack of vasprintf and asprintf on Windows (also covers MinGW).
|
||||
/** Formats into a freshly malloc()ed, NUL-terminated string.

    @param sptr  Receives the allocated buffer; the caller must free() it.
                 Set to NULL whenever the function fails.
    @param fmt   printf()-style format string.
    @param argv  Arguments for \c fmt.  The list is consumed by this call,
                 as with the other v*printf() functions.
    @return      Number of characters written (excluding the terminating
                 NUL), or -1 on a formatting or allocation failure.
 */
int
vasprintf(char **sptr, const char *fmt, va_list argv)
{
    va_list sizing;
    int wanted, written;

    *sptr = NULL;

    // The sizing pass must run on a copy: once a va_list has been handed
    // to a v*printf() function its value is indeterminate, so it cannot be
    // passed to a second formatting call.
    va_copy(sizing, argv);
    wanted = vsnprintf(NULL, 0, fmt, sizing);
    va_end(sizing);
    if (wanted < 0)
        return -1;

    *sptr = (char *)malloc((size_t)wanted + 1);
    if (*sptr == NULL)
        return -1;

    // Bound the write by the size we just allocated.
    written = vsnprintf(*sptr, (size_t)wanted + 1, fmt, argv);
    if (written < 0) {
        free(*sptr);
        *sptr = NULL;
        return -1;
    }
    return written;
}


/** Variadic front end for vasprintf(); see that function for the meaning
    of \c sptr and of the return value.
 */
int
asprintf(char **sptr, const char *fmt, ...)
{
    int retval;
    va_list argv;

    va_start(argv, fmt);
    retval = vasprintf(sptr, fmt, argv);
    va_end(argv);
    return retval;
}
|
||||
#endif
|
||||
|
||||
|
||||
/** Helper function for Error(), Warning(), etc.
|
||||
|
||||
@param type The type of message being printed (e.g. "Warning")
|
||||
@@ -197,30 +226,6 @@ lPrintWithWordBreaks(const char *buf, int columnWidth, FILE *out) {
|
||||
*/
|
||||
static void
|
||||
lPrint(const char *type, SourcePos p, const char *fmt, va_list args) {
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
char errorBuf[2048], formattedBuf[2048];
|
||||
if (vsnprintf_s(errorBuf, sizeof(errorBuf), _TRUNCATE, fmt, args) == -1) {
|
||||
fprintf(stderr, "vsnprintf_s() error!\n");
|
||||
return;
|
||||
}
|
||||
|
||||
if (p.first_line == 0) {
|
||||
// We don't have a valid SourcePos, so create a message without it
|
||||
if (sprintf_s(formattedBuf, sizeof(formattedBuf), "%s: %s\n",
|
||||
type, errorBuf) == -1) {
|
||||
fprintf(stderr, "vsnprintf_s() error!\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Create an error message that includes the file and line number
|
||||
if (sprintf_s(formattedBuf, sizeof(formattedBuf), "%s(%d): %s: %s\n",
|
||||
p.name, p.first_line, type, errorBuf) == -1) {
|
||||
fprintf(stderr, "vsnprintf_s() error!\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
#else
|
||||
char *errorBuf, *formattedBuf;
|
||||
if (vasprintf(&errorBuf, fmt, args) == -1) {
|
||||
fprintf(stderr, "vasprintf() unable to allocate memory!\n");
|
||||
@@ -241,7 +246,6 @@ lPrint(const char *type, SourcePos p, const char *fmt, va_list args) {
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Now that we've done all that work, see if we've already printed the
|
||||
// exact same error message. If so, return, so we don't redundantly
|
||||
@@ -254,10 +258,8 @@ lPrint(const char *type, SourcePos p, const char *fmt, va_list args) {
|
||||
lPrintWithWordBreaks(formattedBuf, lTerminalWidth(), stderr);
|
||||
lPrintFileLineContext(p);
|
||||
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
free(errorBuf);
|
||||
free(formattedBuf);
|
||||
#endif // !ISPC_IS_WINDOWS
|
||||
}
|
||||
|
||||
|
||||
|
||||
9
util.h
9
util.h
@@ -40,6 +40,9 @@
|
||||
#define ISPC_UTIL_H
|
||||
|
||||
#include "ispc.h"
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#include <stdarg.h>
|
||||
#endif
|
||||
|
||||
struct SourcePos;
|
||||
|
||||
@@ -62,6 +65,12 @@ inline uint32_t RoundUpPow2(uint32_t v) {
|
||||
#define PRINTF_FUNC
|
||||
#endif // __GNUG__
|
||||
|
||||
// for cross-platform compatibility
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
int vasprintf(char **sptr, const char *fmt, va_list argv);
|
||||
int asprintf(char **sptr, const char *fmt, ...);
|
||||
#endif
|
||||
|
||||
/** Prints a debugging message. These messages are only printed if
|
||||
g->debugPrint is \c true. In addition to a program source code
|
||||
position to associate with the message, a printf()-style format string
|
||||
|
||||
Reference in New Issue
Block a user