Merge remote-tracking branch 'matt/master'

This commit is contained in:
Jean-Luc Duprat
2012-01-26 10:41:13 -08:00
100 changed files with 5850 additions and 2116 deletions

View File

@@ -3,6 +3,11 @@
#
ARCH_OS = $(shell uname)
ifeq ($(ARCH_OS), Darwin)
ARCH_OS2 = "OSX"
else
ARCH_OS2 = $(shell uname -o)
endif
ARCH_TYPE = $(shell arch)
ifeq ($(shell llvm-config --version), 3.1svn)
@@ -26,7 +31,15 @@ CLANG_LIBS = -lclangFrontend -lclangDriver \
-lclangAnalysis -lclangAST -lclangLex -lclangBasic
ISPC_LIBS=$(shell llvm-config --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
-lpthread -ldl
-lpthread
ifeq ($(ARCH_OS),Linux)
ISPC_LIBS += -ldl
endif
ifeq ($(ARCH_OS2),Msys)
ISPC_LIBS += -lshlwapi -limagehlp -lpsapi
endif
LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
LLVM_VERSION=LLVM_$(shell llvm-config --version | sed s/\\./_/)
@@ -58,7 +71,8 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
type.cpp util.cpp
HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
opt.h stmt.h sym.h type.h util.h
TARGETS=avx avx-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 generic-16
TARGETS=avx1 avx1-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 \
generic-16
BUILTINS_SRC=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) \
builtins/dispatch.ll
BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC:.ll=.o))) \
@@ -129,22 +143,22 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
objs/builtins-%.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll)
@echo Creating C++ source from builtins definition file $<
@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) $< | ./bitcode2cpp.py $< > $@
@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) $< | python bitcode2cpp.py $< > $@
objs/builtins-c-32.cpp: builtins/builtins.c
@echo Creating C++ source from builtins definition file $<
@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py c-32 > $@
@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c-32 > $@
objs/builtins-c-64.cpp: builtins/builtins.c
@echo Creating C++ source from builtins definition file $<
@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py c-64 > $@
@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c-64 > $@
objs/stdlib_generic_ispc.cpp: stdlib.ispc
@echo Creating C++ source from $< for generic
@$(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \
./stdlib2cpp.py generic > $@
python stdlib2cpp.py generic > $@
objs/stdlib_x86_ispc.cpp: stdlib.ispc
@echo Creating C++ source from $< for x86
@$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \
./stdlib2cpp.py x86 > $@
python stdlib2cpp.py x86 > $@

View File

@@ -47,7 +47,7 @@ remarkable `LLVM Compiler Infrastructure <http://llvm.org>`_ for back-end
code generation and optimization and is `hosted on
github <http://github.com/ispc/ispc/>`_. It supports Windows, Mac, and
Linux, with both x86 and x86-64 targets. It currently supports the SSE2,
SSE4, and AVX instruction sets.
SSE4, AVX1, and AVX2 instruction sets.
Features
--------

20
ast.cpp
View File

@@ -90,7 +90,11 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
DoStmt *dos;
ForStmt *fs;
ForeachStmt *fes;
CaseStmt *cs;
DefaultStmt *defs;
SwitchStmt *ss;
ReturnStmt *rs;
LabeledStmt *ls;
StmtList *sl;
PrintStmt *ps;
AssertStmt *as;
@@ -130,10 +134,21 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
postFunc, data);
fes->stmts = (Stmt *)WalkAST(fes->stmts, preFunc, postFunc, data);
}
else if ((cs = dynamic_cast<CaseStmt *>(node)) != NULL)
cs->stmts = (Stmt *)WalkAST(cs->stmts, preFunc, postFunc, data);
else if ((defs = dynamic_cast<DefaultStmt *>(node)) != NULL)
defs->stmts = (Stmt *)WalkAST(defs->stmts, preFunc, postFunc, data);
else if ((ss = dynamic_cast<SwitchStmt *>(node)) != NULL) {
ss->expr = (Expr *)WalkAST(ss->expr, preFunc, postFunc, data);
ss->stmts = (Stmt *)WalkAST(ss->stmts, preFunc, postFunc, data);
}
else if (dynamic_cast<BreakStmt *>(node) != NULL ||
dynamic_cast<ContinueStmt *>(node) != NULL) {
dynamic_cast<ContinueStmt *>(node) != NULL ||
dynamic_cast<GotoStmt *>(node) != NULL) {
// nothing
}
else if ((ls = dynamic_cast<LabeledStmt *>(node)) != NULL)
ls->stmt = (Stmt *)WalkAST(ls->stmt, preFunc, postFunc, data);
else if ((rs = dynamic_cast<ReturnStmt *>(node)) != NULL)
rs->val = (Expr *)WalkAST(rs->val, preFunc, postFunc, data);
else if ((sl = dynamic_cast<StmtList *>(node)) != NULL) {
@@ -151,7 +166,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
else {
///////////////////////////////////////////////////////////////////////////
// Handle expressions
assert(dynamic_cast<Expr *>(node) != NULL);
Assert(dynamic_cast<Expr *>(node) != NULL);
UnaryExpr *ue;
BinaryExpr *be;
AssignExpr *ae;
@@ -289,3 +304,4 @@ EstimateCost(ASTNode *root) {
WalkAST(root, lCostCallback, NULL, &cost);
return cost;
}

View File

@@ -386,6 +386,7 @@ lSetInternalFunctions(llvm::Module *module) {
"__ceil_uniform_float",
"__ceil_varying_double",
"__ceil_varying_float",
"__clock",
"__count_trailing_zeros_i32",
"__count_trailing_zeros_i64",
"__count_leading_zeros_i32",
@@ -717,11 +718,13 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
extern int builtins_bitcode_sse4_x2_length;
switch (g->target.vectorWidth) {
case 4:
AddBitcodeToModule(builtins_bitcode_sse4, builtins_bitcode_sse4_length,
AddBitcodeToModule(builtins_bitcode_sse4,
builtins_bitcode_sse4_length,
module, symbolTable);
break;
case 8:
AddBitcodeToModule(builtins_bitcode_sse4_x2, builtins_bitcode_sse4_x2_length,
AddBitcodeToModule(builtins_bitcode_sse4_x2,
builtins_bitcode_sse4_x2_length,
module, symbolTable);
break;
default:
@@ -729,18 +732,39 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
}
break;
case Target::AVX:
case Target::AVX2:
switch (g->target.vectorWidth) {
case 8:
extern unsigned char builtins_bitcode_avx[];
extern int builtins_bitcode_avx_length;
AddBitcodeToModule(builtins_bitcode_avx, builtins_bitcode_avx_length,
extern unsigned char builtins_bitcode_avx1[];
extern int builtins_bitcode_avx1_length;
AddBitcodeToModule(builtins_bitcode_avx1,
builtins_bitcode_avx1_length,
module, symbolTable);
break;
case 16:
extern unsigned char builtins_bitcode_avx_x2[];
extern int builtins_bitcode_avx_x2_length;
AddBitcodeToModule(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
extern unsigned char builtins_bitcode_avx1_x2[];
extern int builtins_bitcode_avx1_x2_length;
AddBitcodeToModule(builtins_bitcode_avx1_x2,
builtins_bitcode_avx1_x2_length,
module, symbolTable);
break;
default:
FATAL("logic error in DefineStdlib");
}
break;
case Target::AVX2:
switch (g->target.vectorWidth) {
case 8:
extern unsigned char builtins_bitcode_avx2[];
extern int builtins_bitcode_avx2_length;
AddBitcodeToModule(builtins_bitcode_avx2,
builtins_bitcode_avx2_length,
module, symbolTable);
break;
case 16:
extern unsigned char builtins_bitcode_avx2_x2[];
extern int builtins_bitcode_avx2_x2_length;
AddBitcodeToModule(builtins_bitcode_avx2_x2,
builtins_bitcode_avx2_x2_length,
module, symbolTable);
break;
default:
@@ -798,6 +822,9 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
symbolTable);
lDefineConstantInt("__have_native_half", (g->target.isa == Target::AVX2),
module, symbolTable);
if (includeStdlibISPC) {
// If the user wants the standard library to be included, parse the
// serialized version of the stdlib.ispc file to get its

View File

@@ -149,7 +149,7 @@ void __do_print(const char *format, const char *types, int width, int mask,
int __num_cores() {
#ifdef _MSC_VER
#if defined(_MSC_VER) || defined(__MINGW32__)
// This is quite a hack. Including all of windows.h to get this definition
// pulls in a bunch of stuff that leads to undefined symbols at link time.
// So we don't #include <windows.h> but instead have the equivalent declarations

View File

@@ -48,23 +48,42 @@ declare void @abort() noreturn
;; corresponding to one of the Target::ISA enumerant values that gives the
;; most capable ISA that the curremt system can run.
;;
;; #ifdef _MSC_VER
;; extern void __stdcall __cpuid(int info[4], int infoType);
;; #else
;; Note: clang from LLVM 2.9 should be used if this is updated, for maximum
;; backwards compatibility for anyone building ispc with LLVM 2.9.
;;
;; #include <stdint.h>
;; #include <stdlib.h>
;;
;; static void __cpuid(int info[4], int infoType) {
;; __asm__ __volatile__ ("cpuid"
;; : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
;; : "0" (infoType));
;; }
;; #endif
;;
;; /* Save %ebx in case it's the PIC register */
;; static void __cpuid_count(int info[4], int level, int count) {
;; __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t"
;; "cpuid\n\t"
;; "xchg{l}\t{%%}ebx, %1\n\t"
;; : "=a" (info[0]), "=r" (info[1]), "=c" (info[2]), "=d" (info[3])
;; : "0" (level), "2" (count));
;; }
;;
;; int32_t __get_system_isa() {
;; int info[4];
;; __cpuid(info, 1);
;;
;; /* NOTE: the values returned below must be the same as the
;; corresponding enumerant values in Target::ISA. */
;; if ((info[2] & (1 << 28)) != 0)
;; return 2; // AVX
;; if ((info[2] & (1 << 28)) != 0) {
;; // AVX1 for sure. Do we have AVX2?
;; // Call cpuid with eax=7, ecx=0
;; __cpuid_count(info, 7, 0);
;; if ((info[1] & (1 << 5)) != 0)
;; return 3; // AVX2
;; else
;; return 2; // AVX1
;; }
;; else if ((info[2] & (1 << 19)) != 0)
;; return 1; // SSE4
;; else if ((info[3] & (1 << 26)) != 0)
@@ -76,33 +95,42 @@ declare void @abort() noreturn
%0 = type { i32, i32, i32, i32 }
define i32 @__get_system_isa() nounwind ssp {
%1 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
%2 = extractvalue %0 %1, 2
%3 = extractvalue %0 %1, 3
%4 = and i32 %2, 268435456
%5 = icmp eq i32 %4, 0
br i1 %5, label %6, label %13
entry:
%0 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
%asmresult9.i = extractvalue %0 %0, 2
%asmresult10.i = extractvalue %0 %0, 3
%and = and i32 %asmresult9.i, 268435456
%cmp = icmp eq i32 %and, 0
br i1 %cmp, label %if.else7, label %if.then
; <label>:6 ; preds = %0
%7 = and i32 %2, 524288
%8 = icmp eq i32 %7, 0
br i1 %8, label %9, label %13
if.then: ; preds = %entry
%1 = tail call %0 asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
%asmresult9.i24 = extractvalue %0 %1, 1
%and4 = lshr i32 %asmresult9.i24, 5
%2 = and i32 %and4, 1
%3 = or i32 %2, 2
br label %return
; <label>:9 ; preds = %6
%10 = and i32 %3, 67108864
%11 = icmp eq i32 %10, 0
br i1 %11, label %12, label %13
if.else7: ; preds = %entry
%and10 = and i32 %asmresult9.i, 524288
%cmp11 = icmp eq i32 %and10, 0
br i1 %cmp11, label %if.else13, label %return
; <label>:12 ; preds = %9
if.else13: ; preds = %if.else7
%and16 = and i32 %asmresult10.i, 67108864
%cmp17 = icmp eq i32 %and16, 0
br i1 %cmp17, label %if.else19, label %return
if.else19: ; preds = %if.else13
tail call void @abort() noreturn nounwind
unreachable
; <label>:13 ; preds = %9, %6, %0
%.0 = phi i32 [ 2, %0 ], [ 1, %6 ], [ 0, %9 ]
ret i32 %.0
return: ; preds = %if.else13, %if.else7, %if.then
%retval.0 = phi i32 [ %3, %if.then ], [ 1, %if.else7 ], [ 0, %if.else13 ]
ret i32 %retval.0
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; This function is called by each of the dispatch functions we generate;
;; it sets @__system_best_isa if it is unset.

View File

@@ -170,33 +170,6 @@ define <16 x float> @__min_varying_float(<16 x float>,
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <16 x i32> %ret
}
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <16 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <16 x i32> %ret
}
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <16 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops
@@ -622,12 +595,7 @@ define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %new
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
gen_gather(16, i8)
gen_gather(16, i16)
gen_gather(16, i32)
gen_gather(16, i64)
;; scatter
gen_scatter(16, i8)
gen_scatter(16, i16)

View File

@@ -170,33 +170,6 @@ define <8 x float> @__min_varying_float(<8 x float>,
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <8 x i32> %ret
}
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <8 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <8 x i32> %ret
}
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <8 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops
@@ -403,9 +376,6 @@ define <8 x i64> @__masked_load_64(i8 *, <8 x i32> %mask) nounwind alwaysinline
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
; FIXME: there is no AVX instruction for these, but we could be clever
; by packing the bits down and setting the last 3/4 or half, respectively,
; of the mask to zero... Not sure if this would be a win in the end
gen_masked_store(8, i8, 8)
gen_masked_store(8, i16, 16)
@@ -520,12 +490,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)
;; scatter
gen_scatter(8, i8)
gen_scatter(8, i16)

View File

@@ -0,0 +1,77 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(`target-avx-x2.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <16 x i32> %ret
}
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <16 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <16 x i32> %ret
}
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <16 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(16, i8)
gen_gather(16, i16)
gen_gather(16, i32)
gen_gather(16, i64)

75
builtins/target-avx1.ll Normal file
View File

@@ -0,0 +1,75 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(`target-avx.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <8 x i32> %ret
}
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <8 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <8 x i32> %ret
}
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <8 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)

129
builtins/target-avx2-x2.ll Normal file
View File

@@ -0,0 +1,129 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(`target-avx-x2.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary8to16(m, i32, @llvm.x86.avx2.pmins.d, %0, %1)
ret <16 x i32> %m
}
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary8to16(m, i32, @llvm.x86.avx2.pmaxs.d, %0, %1)
ret <16 x i32> %m
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary8to16(m, i32, @llvm.x86.avx2.pminu.d, %0, %1)
ret <16 x i32> %m
}
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary8to16(m, i32, @llvm.x86.avx2.pmaxu.d, %0, %1)
ret <16 x i32> %m
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
%r_0 = shufflevector <16 x i16> %v, <16 x i16> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%vr_0 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_0)
%r_1 = shufflevector <16 x i16> %v, <16 x i16> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vr_1 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_1)
%r = shufflevector <8 x float> %vr_0, <8 x float> %vr_1,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x float> %r
}
define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
%r_0 = shufflevector <16 x float> %v, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%vr_0 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_0, i32 0)
%r_1 = shufflevector <16 x float> %v, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vr_1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_1, i32 0)
%r = shufflevector <8 x i16> %vr_0, <8 x i16> %vr_1,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %r
}
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
%v1 = bitcast i16 %v to <1 x i16>
%vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
%r = extractelement <8 x float> %rv, i32 0
ret float %r
}
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%v1 = bitcast float %v to <1 x float>
%vv = shufflevector <1 x float> %v1, <1 x float> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
; round to nearest even
%rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(16, i8)
gen_gather(16, i16)
gen_gather(16, i32)
gen_gather(16, i64)

110
builtins/target-avx2.ll Normal file
View File

@@ -0,0 +1,110 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(`target-avx.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
%m = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %0, <8 x i32> %1)
ret <8 x i32> %m
}
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
%m = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %0, <8 x i32> %1)
ret <8 x i32> %m
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
%m = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %0, <8 x i32> %1)
ret <8 x i32> %m
}
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
%m = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %0, <8 x i32> %1)
ret <8 x i32> %m
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
%r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
ret <8 x float> %r
}
define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
%r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
ret <8 x i16> %r
}
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
%v1 = bitcast i16 %v to <1 x i16>
%vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
%r = extractelement <8 x float> %rv, i32 0
ret float %r
}
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%v1 = bitcast float %v to <1 x float>
%vv = shufflevector <1 x float> %v1, <1 x float> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
; round to nearest even
%rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)

View File

@@ -241,8 +241,9 @@ declare void @__masked_store_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
declare void @__masked_store_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind
ifelse(LLVM_VERSION, `LLVM_3_1svn',`
define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind {
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i8> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i8> %1, <WIDTH x i8> %v
store <WIDTH x i8> %v1, <WIDTH x i8> * %0
@@ -250,7 +251,7 @@ define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
}
define void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind {
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i16> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i16> %1, <WIDTH x i16> %v
store <WIDTH x i16> %v1, <WIDTH x i16> * %0
@@ -258,7 +259,7 @@ define void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
}
define void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind {
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i32> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i32> %1, <WIDTH x i32> %v
store <WIDTH x i32> %v1, <WIDTH x i32> * %0
@@ -266,30 +267,40 @@ define void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
}
define void @__masked_store_blend_64(<WIDTH x i64>* nocapture,
<WIDTH x i64>, <WIDTH x i1>) nounwind {
<WIDTH x i64>, <WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i64> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i64> %1, <WIDTH x i64> %v
store <WIDTH x i64> %v1, <WIDTH x i64> * %0
ret void
}
',`
declare void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
define(`gather_scatter', `
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, <WIDTH x i32>,
i32, <WIDTH x i1>) nounwind readonly
i32, <WIDTH x i32>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, <WIDTH x i64>,
i32, <WIDTH x i1>) nounwind readonly
i32, <WIDTH x i64>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64>,
<WIDTH x i1>) nounwind readonly
declare void @__scatter_base_offsets32_$1(i8* nocapture, <WIDTH x i32>,
i32, <WIDTH x $1>, <WIDTH x i1>) nounwind
i32, <WIDTH x i32>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets64_$1(i8* nocapture, <WIDTH x i64>,
i32, <WIDTH x $1>, <WIDTH x i1>) nounwind
i32, <WIDTH x i64>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter32_$1(<WIDTH x i32>, <WIDTH x $1>,
<WIDTH x i1>) nounwind
declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,

View File

@@ -47,6 +47,14 @@ int64minmax()
include(`target-sse2-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

View File

@@ -44,6 +44,14 @@ int64minmax()
include(`target-sse2-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding
;;

View File

@@ -47,6 +47,14 @@ int64minmax()
include(`target-sse4-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

View File

@@ -44,6 +44,14 @@ int64minmax()
include(`target-sse4-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

View File

@@ -760,14 +760,12 @@ define(`global_atomic_uniform', `
ifelse(LLVM_VERSION, `LLVM_2_9',`
declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)
define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val,
<$1 x MASK> %mask) nounwind alwaysinline {
define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline {
%r = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %val)
ret $3 %r
}
', `
define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val,
<$1 x MASK> %mask) nounwind alwaysinline {
define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline {
%r = atomicrmw $2 $3 * %ptr, $3 %val seq_cst
ret $3 %r
}
@@ -786,26 +784,7 @@ declare i32 @llvm.atomic.swap.i32.p0i32(i32 * %ptr, i32 %val)
declare i64 @llvm.atomic.swap.i64.p0i64(i64 * %ptr, i64 %val)')
define(`global_swap', `
define <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val,
<$1 x MASK> %mask) nounwind alwaysinline {
%rptr = alloca <$1 x $2>
%rptr32 = bitcast <$1 x $2> * %rptr to $2 *
per_lane($1, <$1 x MASK> %mask, `
%val_LANE_ID = extractelement <$1 x $2> %val, i32 LANE
ifelse(LLVM_VERSION, `LLVM_2_9',`
%r_LANE_ID = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val_LANE_ID)', `
%r_LANE_ID = atomicrmw xchg $2 * %ptr, $2 %val_LANE_ID seq_cst')
%rp_LANE_ID = getelementptr $2 * %rptr32, i32 LANE
store $2 %r_LANE_ID, $2 * %rp_LANE_ID')
%r = load <$1 x $2> * %rptr
ret <$1 x $2> %r
}
define $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val,
<$1 x MASK> %mask) nounwind alwaysinline {
define $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val) nounwind alwaysinline {
ifelse(LLVM_VERSION, `LLVM_2_9',`
%r = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val)', `
%r = atomicrmw xchg $2 * %ptr, $2 %val seq_cst')
@@ -845,7 +824,7 @@ ifelse(LLVM_VERSION, `LLVM_2_9',`
}
define $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
$2 %val, <$1 x MASK> %mask) nounwind alwaysinline {
$2 %val) nounwind alwaysinline {
ifelse(LLVM_VERSION, `LLVM_2_9',`
%r = call $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val)', `
%r = cmpxchg $2 * %ptr, $2 %cmp, $2 %val seq_cst')
@@ -1586,17 +1565,15 @@ declare void @__pseudo_masked_store_64(<WIDTH x i64> * nocapture, <WIDTH x i64>,
; these represent gathers from a common base pointer with offsets. The
; offset_scale factor scales the offsets before they are added to the base
; pointer--it should have the value 1, 2, 4, or 8. (It can always just be 1.)
; The 2, 4, 8 cases are used to match LLVM patterns that use the free 2/4/8 scaling
; available in x86 addressing calculations...
; Then, the offset delta_value (guaranteed to be a compile-time constant value),
; is added to the final address. The 2, 4, 8 scales are used to match LLVM patterns
; that use the free 2/4/8 scaling available in x86 addressing calculations, and
; offset_delta feeds into the free offset calculation.
;
; varying int8 __pseudo_gather_base_offsets{32,64}_8(uniform int8 *base,
; int{32,64} offsets, int32 offset_scale, mask)
; varying int16 __pseudo_gather_base_offsets{32,64}_16(uniform int16 *base,
; int{32,64} offsets, int32 offset_scale, mask)
; varying int32 __pseudo_gather_base_offsets{32,64}_32(uniform int32 *base,
; int{32,64} offsets, int32 offset_scale, mask)
; varying int64 __pseudo_gather_base_offsets{32,64}_64(uniform int64 *base,
; int{32,64} offsets, int32 offset_scale, mask)
; varying int{8,16,32,64}
; __pseudo_gather_base_offsets{32,64}_{8,16,32,64}(uniform int8 *base,
; int{32,64} offsets, uniform int32 offset_scale,
; int{32,64} offset_delta, mask)
;
; Then, the GSImprovementsPass optimizations finds these and either
; converts them to native gather functions or converts them to vector
@@ -1612,22 +1589,22 @@ declare <WIDTH x i16> @__pseudo_gather64_16(<WIDTH x i64>, <WIDTH x MASK>) nounw
declare <WIDTH x i32> @__pseudo_gather64_32(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather64_64(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i8> @__pseudo_gather_base_offsets32_8(i8 *, <WIDTH x i32>, i32,
declare <WIDTH x i8> @__pseudo_gather_base_offsets32_8(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16> @__pseudo_gather_base_offsets32_16(i8 *, <WIDTH x i32>, i32,
declare <WIDTH x i16> @__pseudo_gather_base_offsets32_16(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32> @__pseudo_gather_base_offsets32_32(i8 *, <WIDTH x i32>, i32,
declare <WIDTH x i32> @__pseudo_gather_base_offsets32_32(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather_base_offsets32_64(i8 *, <WIDTH x i32>, i32,
declare <WIDTH x i64> @__pseudo_gather_base_offsets32_64(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i8> @__pseudo_gather_base_offsets64_8(i8 *, <WIDTH x i64>, i32,
declare <WIDTH x i8> @__pseudo_gather_base_offsets64_8(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16> @__pseudo_gather_base_offsets64_16(i8 *, <WIDTH x i64>, i32,
declare <WIDTH x i16> @__pseudo_gather_base_offsets64_16(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32> @__pseudo_gather_base_offsets64_32(i8 *, <WIDTH x i64>, i32,
declare <WIDTH x i32> @__pseudo_gather_base_offsets64_32(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather_base_offsets64_64(i8 *, <WIDTH x i64>, i32,
declare <WIDTH x i64> @__pseudo_gather_base_offsets64_64(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
; Similarly to the pseudo-gathers defined above, we also declare undefined
@@ -1642,13 +1619,9 @@ declare <WIDTH x i64> @__pseudo_gather_base_offsets64_64(i8 *, <WIDTH x i64>, i3
; transforms them to scatters like:
;
; void __pseudo_scatter_base_offsets{32,64}_8(uniform int8 *base,
; varying int32 offsets, int32 offset_scale, varying int8 values, mask)
; void __pseudo_scatter_base_offsets{32,64}_16(uniform int16 *base,
; varying int32 offsets, int32 offset_scale, varying int16 values, mask)
; void __pseudo_scatter_base_offsets{32,64}_32(uniform int32 *base,
; varying int32 offsets, int32 offset_scale, varying int32 values, mask)
; void __pseudo_scatter_base_offsets{32,64}_64(uniform int64 *base,
; varying int32 offsets, int32 offset_scale, varying int64 values, mask)
; varying int32 offsets, uniform int32 offset_scale,
; varying int{32,64} offset_delta, varying int8 values, mask)
; (and similarly for 16/32/64 bit values)
;
; And the GSImprovementsPass in turn converts these to actual native
; scatters or masked stores.
@@ -1663,22 +1636,22 @@ declare void @__pseudo_scatter64_16(<WIDTH x i64>, <WIDTH x i16>, <WIDTH x MASK>
declare void @__pseudo_scatter64_32(<WIDTH x i64>, <WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter64_64(<WIDTH x i64>, <WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <WIDTH x i32>, i32,
declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <WIDTH x i32>, i32,
declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <WIDTH x i32>, i32,
declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <WIDTH x i32>, i32,
declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <WIDTH x i64>, i32,
declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <WIDTH x i64>, i32,
declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <WIDTH x i64>, i32,
declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <WIDTH x i64>, i32,
declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -1832,6 +1805,22 @@ ok:
ret void
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; read hw clock
define i64 @__clock() nounwind uwtable ssp {
entry:
tail call void asm sideeffect "xorl %eax,%eax \0A cpuid", "~{rax},~{rbx},~{rcx},~{rdx},~{dirflag},~{fpsr},~{flags}"() nounwind
%0 = tail call { i32, i32 } asm sideeffect "rdtsc", "={ax},={dx},~{dirflag},~{fpsr},~{flags}"() nounwind
%asmresult = extractvalue { i32, i32 } %0, 0
%asmresult1 = extractvalue { i32, i32 } %0, 1
%conv = zext i32 %asmresult1 to i64
%shl = shl nuw i64 %conv, 32
%conv2 = zext i32 %asmresult to i64
%or = or i64 %shl, %conv2
ret i64 %or
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; stdlib transcendentals
;;
@@ -1997,38 +1986,18 @@ global_atomic_uniform(WIDTH, umax, i64, uint64)
global_swap(WIDTH, i32, int32)
global_swap(WIDTH, i64, int64)
define <WIDTH x float> @__atomic_swap_float_global(float * %ptr, <WIDTH x float> %val,
<WIDTH x MASK> %mask) nounwind alwaysinline {
%iptr = bitcast float * %ptr to i32 *
%ival = bitcast <WIDTH x float> %val to <WIDTH x i32>
%iret = call <WIDTH x i32> @__atomic_swap_int32_global(i32 * %iptr, <WIDTH x i32> %ival, <WIDTH x MASK> %mask)
%ret = bitcast <WIDTH x i32> %iret to <WIDTH x float>
ret <WIDTH x float> %ret
}
define <WIDTH x double> @__atomic_swap_double_global(double * %ptr, <WIDTH x double> %val,
<WIDTH x MASK> %mask) nounwind alwaysinline {
%iptr = bitcast double * %ptr to i64 *
%ival = bitcast <WIDTH x double> %val to <WIDTH x i64>
%iret = call <WIDTH x i64> @__atomic_swap_int64_global(i64 * %iptr, <WIDTH x i64> %ival, <WIDTH x MASK> %mask)
%ret = bitcast <WIDTH x i64> %iret to <WIDTH x double>
ret <WIDTH x double> %ret
}
define float @__atomic_swap_uniform_float_global(float * %ptr, float %val,
<WIDTH x MASK> %mask) nounwind alwaysinline {
define float @__atomic_swap_uniform_float_global(float * %ptr, float %val) nounwind alwaysinline {
%iptr = bitcast float * %ptr to i32 *
%ival = bitcast float %val to i32
%iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival, <WIDTH x MASK> %mask)
%iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival)
%ret = bitcast i32 %iret to float
ret float %ret
}
define double @__atomic_swap_uniform_double_global(double * %ptr, double %val,
<WIDTH x MASK> %mask) nounwind alwaysinline {
define double @__atomic_swap_uniform_double_global(double * %ptr, double %val) nounwind alwaysinline {
%iptr = bitcast double * %ptr to i64 *
%ival = bitcast double %val to i64
%iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival, <WIDTH x MASK> %mask)
%iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival)
%ret = bitcast i64 %iret to double
ret double %ret
}
@@ -2058,24 +2027,23 @@ define <WIDTH x double> @__atomic_compare_exchange_double_global(double * %ptr,
ret <WIDTH x double> %ret
}
define float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp, float %val,
<WIDTH x MASK> %mask) nounwind alwaysinline {
define float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp,
float %val) nounwind alwaysinline {
%iptr = bitcast float * %ptr to i32 *
%icmp = bitcast float %cmp to i32
%ival = bitcast float %val to i32
%iret = call i32 @__atomic_compare_exchange_uniform_int32_global(i32 * %iptr, i32 %icmp,
i32 %ival, <WIDTH x MASK> %mask)
i32 %ival)
%ret = bitcast i32 %iret to float
ret float %ret
}
define double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp,
double %val, <WIDTH x MASK> %mask) nounwind alwaysinline {
double %val) nounwind alwaysinline {
%iptr = bitcast double * %ptr to i64 *
%icmp = bitcast double %cmp to i64
%ival = bitcast double %val to i64
%iret = call i64 @__atomic_compare_exchange_uniform_int64_global(i64 * %iptr, i64 %icmp,
i64 %ival, <WIDTH x MASK> %mask)
%iret = call i64 @__atomic_compare_exchange_uniform_int64_global(i64 * %iptr, i64 %icmp, i64 %ival)
%ret = bitcast i64 %iret to double
ret double %ret
}
@@ -2727,7 +2695,8 @@ define(`gen_gather', `
;; Define the utility function to do the gather operation for a single element
;; of the type
define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x $2> %ret, i32 %lane) nounwind readonly alwaysinline {
<$1 x i32> %offset_delta, <$1 x $2> %ret,
i32 %lane) nounwind readonly alwaysinline {
; compute address for this one from the base
%offset32 = extractelement <$1 x i32> %offsets, i32 %lane
; the order and details of the next 4 lines are important--they match LLVMs
@@ -2737,15 +2706,20 @@ define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_
%offset = mul i64 %offset64, %scale64
%ptroffset = getelementptr i8 * %ptr, i64 %offset
%delta = extractelement <$1 x i32> %offset_delta, i32 %lane
%delta64 = sext i32 %delta to i64
%finalptr = getelementptr i8 * %ptroffset, i64 %delta64
; load value and insert into returned value
%ptrcast = bitcast i8 * %ptroffset to $2 *
%ptrcast = bitcast i8 * %finalptr to $2 *
%val = load $2 *%ptrcast
%updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane
ret <$1 x $2> %updatedret
}
define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x $2> %ret, i32 %lane) nounwind readonly alwaysinline {
<$1 x i64> %offset_delta, <$1 x $2> %ret,
i32 %lane) nounwind readonly alwaysinline {
; compute address for this one from the base
%offset64 = extractelement <$1 x i64> %offsets, i32 %lane
; the order and details of the next 4 lines are important--they match LLVMs
@@ -2754,8 +2728,11 @@ define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_
%offset = mul i64 %offset64, %offset_scale64
%ptroffset = getelementptr i8 * %ptr, i64 %offset
%delta64 = extractelement <$1 x i64> %offset_delta, i32 %lane
%finalptr = getelementptr i8 * %ptroffset, i64 %delta64
; load value and insert into returned value
%ptrcast = bitcast i8 * %ptroffset to $2 *
%ptrcast = bitcast i8 * %finalptr to $2 *
%val = load $2 *%ptrcast
%updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane
ret <$1 x $2> %updatedret
@@ -2763,6 +2740,7 @@ define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_
define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x i32> %offset_delta,
<$1 x i32> %vecmask) nounwind readonly alwaysinline {
; We can be clever and avoid the per-lane stuff for gathers if we are willing
; to require that the 0th element of the array being gathered from is always
@@ -2775,16 +2753,25 @@ define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32
<$1 x i32> %vecmask)
%newOffsets = load <$1 x i32> * %offsetsPtr
%deltaPtr = alloca <$1 x i32>
store <$1 x i32> zeroinitializer, <$1 x i32> * %deltaPtr
call void @__masked_store_blend_32(<$1 x i32> * %deltaPtr, <$1 x i32> %offset_delta,
<$1 x i32> %vecmask)
%newDelta = load <$1 x i32> * %deltaPtr
%ret0 = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %newOffsets,
i32 %offset_scale, <$1 x $2> undef, i32 0)
i32 %offset_scale, <$1 x i32> %offset_delta,
<$1 x $2> undef, i32 0)
forloop(lane, 1, eval($1-1),
`patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr,
<$1 x i32> %newOffsets, i32 %offset_scale, <$1 x $2> %retPREV, i32 LANE)
<$1 x i32> %newOffsets, i32 %offset_scale, <$1 x i32> %offset_delta,
<$1 x $2> %retPREV, i32 LANE)
', `LANE', lane), `PREV', eval(lane-1))')
ret <$1 x $2> %ret`'eval($1-1)
}
define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x i64> %offset_delta,
<$1 x i32> %vecmask) nounwind readonly alwaysinline {
; We can be clever and avoid the per-lane stuff for gathers if we are willing
; to require that the 0th element of the array being gathered from is always
@@ -2797,11 +2784,19 @@ define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets, i32
<$1 x i32> %vecmask)
%newOffsets = load <$1 x i64> * %offsetsPtr
%deltaPtr = alloca <$1 x i64>
store <$1 x i64> zeroinitializer, <$1 x i64> * %deltaPtr
call void @__masked_store_blend_64(<$1 x i64> * %deltaPtr, <$1 x i64> %offset_delta,
<$1 x i32> %vecmask)
%newDelta = load <$1 x i64> * %deltaPtr
%ret0 = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %newOffsets,
i32 %offset_scale, <$1 x $2> undef, i32 0)
i32 %offset_scale, <$1 x i64> %newDelta,
<$1 x $2> undef, i32 0)
forloop(lane, 1, eval($1-1),
`patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr,
<$1 x i64> %newOffsets, i32 %offset_scale, <$1 x $2> %retPREV, i32 LANE)
<$1 x i64> %newOffsets, i32 %offset_scale, <$1 x i64> %newDelta,
<$1 x $2> %retPREV, i32 LANE)
', `LANE', lane), `PREV', eval(lane-1))')
ret <$1 x $2> %ret`'eval($1-1)
}
@@ -2852,7 +2847,8 @@ define(`gen_scatter', `
;; Define the function that descripes the work to do to scatter a single
;; value
define void @__scatter_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x $2> %values, i32 %lane) nounwind alwaysinline {
<$1 x i32> %offset_delta, <$1 x $2> %values,
i32 %lane) nounwind alwaysinline {
%offset32 = extractelement <$1 x i32> %offsets, i32 %lane
; the order and details of the next 4 lines are important--they match LLVMs
; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
@@ -2861,42 +2857,52 @@ define void @__scatter_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scal
%offset = mul i64 %offset64, %scale64
%ptroffset = getelementptr i8 * %ptr, i64 %offset
%ptrcast = bitcast i8 * %ptroffset to $2 *
%delta = extractelement <$1 x i32> %offset_delta, i32 %lane
%delta64 = sext i32 %delta to i64
%finalptr = getelementptr i8 * %ptroffset, i64 %delta64
%ptrcast = bitcast i8 * %finalptr to $2 *
%storeval = extractelement <$1 x $2> %values, i32 %lane
store $2 %storeval, $2 * %ptrcast
ret void
}
define void @__scatter_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x $2> %values, i32 %lane) nounwind alwaysinline {
<$1 x i64> %offset_delta, <$1 x $2> %values,
i32 %lane) nounwind alwaysinline {
%offset64 = extractelement <$1 x i64> %offsets, i32 %lane
; the order and details of the next 4 lines are important--they match LLVMs
; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
%scale64 = sext i32 %offset_scale to i64
%offset = mul i64 %offset64, %scale64
%ptroffset = getelementptr i8 * %ptr, i64 %offset
%ptrcast = bitcast i8 * %ptroffset to $2 *
%delta64 = extractelement <$1 x i64> %offset_delta, i32 %lane
%finalptr = getelementptr i8 * %ptroffset, i64 %delta64
%ptrcast = bitcast i8 * %finalptr to $2 *
%storeval = extractelement <$1 x $2> %values, i32 %lane
store $2 %storeval, $2 * %ptrcast
ret void
}
define void @__scatter_base_offsets32_$2(i8* %base, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x $2> %values, <$1 x i32> %mask) nounwind alwaysinline {
<$1 x i32> %offset_delta, <$1 x $2> %values,
<$1 x i32> %mask) nounwind alwaysinline {
;; And use the `per_lane' macro to do all of the per-lane work for scatter...
per_lane($1, <$1 x i32> %mask, `
call void @__scatter_elt32_$2(i8 * %base, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x $2> %values, i32 LANE)')
<$1 x i32> %offset_delta, <$1 x $2> %values, i32 LANE)')
ret void
}
define void @__scatter_base_offsets64_$2(i8* %base, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x $2> %values, <$1 x i32> %mask) nounwind alwaysinline {
<$1 x i64> %offset_delta, <$1 x $2> %values,
<$1 x i32> %mask) nounwind alwaysinline {
;; And use the `per_lane' macro to do all of the per-lane work for scatter...
per_lane($1, <$1 x i32> %mask, `
call void @__scatter_elt64_$2(i8 * %base, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x $2> %values, i32 LANE)')
<$1 x i64> %offset_delta, <$1 x $2> %values, i32 LANE)')
ret void
}

View File

@@ -16,6 +16,16 @@
#warning "The C++ backend isn't supported when building with LLVM 2.9"
#else
#ifndef _MSC_VER
#include <inttypes.h>
#endif
#ifndef PRIx64
#define PRIx64 "llx"
#endif
#include "llvmutil.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
@@ -224,6 +234,7 @@ namespace {
unsigned NextAnonValueNumber;
std::string includeName;
int vectorWidth;
/// UnnamedStructIDs - This contains a unique ID for each struct that is
/// either anonymous or has no name.
@@ -232,11 +243,13 @@ namespace {
public:
static char ID;
explicit CWriter(formatted_raw_ostream &o, const char *incname)
explicit CWriter(formatted_raw_ostream &o, const char *incname,
int vecwidth)
: FunctionPass(ID), Out(o), IL(0), Mang(0), LI(0),
TheModule(0), TAsm(0), MRI(0), MOFI(0), TCtx(0), TD(0),
OpaqueCounter(0), NextAnonValueNumber(0),
includeName(incname ? incname : "generic_defs.h") {
includeName(incname ? incname : "generic_defs.h"),
vectorWidth(vecwidth) {
initializeLoopInfoPass(*PassRegistry::getPassRegistry());
FPCounter = 0;
}
@@ -376,7 +389,7 @@ namespace {
if (I.getType() == Type::getVoidTy(I.getContext()) || !I.hasOneUse() ||
isa<TerminatorInst>(I) || isa<CallInst>(I) || isa<PHINode>(I) ||
isa<LoadInst>(I) || isa<VAArgInst>(I) || isa<InsertElementInst>(I) ||
isa<InsertValueInst>(I) || isa<ExtractValueInst>(I))
isa<InsertValueInst>(I) || isa<ExtractValueInst>(I) || isa<SelectInst>(I))
// Don't inline a load across a store or other bad things!
return false;
@@ -765,6 +778,16 @@ raw_ostream &CWriter::printType(raw_ostream &Out, Type *Ty,
Out << " return ret;\n";
Out << " }\n ";
// if it's an array of i8s, also provide a version that takes a const
// char *
if (ATy->getElementType() == LLVMTypes::Int8Type) {
Out << " static " << NameSoFar << " init(const char *p) {\n";
Out << " " << NameSoFar << " ret;\n";
Out << " strncpy((char *)ret.array, p, " << NumElements << ");\n";
Out << " return ret;\n";
Out << " }\n";
}
printType(Out, ATy->getElementType(), false,
"array[" + utostr(NumElements) + "]");
return Out << ";\n} ";
@@ -834,6 +857,7 @@ void CWriter::printConstantArray(ConstantArray *CPA, bool Static) {
}
Out << '\"';
} else {
if (Static)
Out << '{';
if (CPA->getNumOperands()) {
Out << ' ';
@@ -843,6 +867,7 @@ void CWriter::printConstantArray(ConstantArray *CPA, bool Static) {
printConstant(cast<Constant>(CPA->getOperand(i)), Static);
}
}
if (Static)
Out << " }";
}
}
@@ -1280,7 +1305,7 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
char Buffer[100];
uint64_t ll = DoubleToBits(V);
sprintf(Buffer, "0x%llx", static_cast<long long>(ll));
sprintf(Buffer, "0x%"PRIx64, static_cast<long long>(ll));
std::string Num(&Buffer[0], &Buffer[6]);
unsigned long Val = strtoul(Num.c_str(), 0, 16);
@@ -1313,7 +1338,8 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
break;
}
case Type::ArrayTyID:
case Type::ArrayTyID: {
ArrayType *AT = cast<ArrayType>(CPV->getType());
if (Static)
// arrays are wrapped in structs...
Out << "{ ";
@@ -1326,7 +1352,6 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
printConstantArray(CA, Static);
} else {
assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV));
ArrayType *AT = cast<ArrayType>(CPV->getType());
if (AT->getNumElements()) {
Out << ' ';
Constant *CZ = Constant::getNullValue(AT->getElementType());
@@ -1342,7 +1367,7 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
else
Out << ")";
break;
}
case Type::VectorTyID:
printType(Out, CPV->getType());
Out << "(";
@@ -1741,17 +1766,6 @@ void CWriter::writeOperandWithCast(Value* Operand, const ICmpInst &Cmp) {
//
static void generateCompilerSpecificCode(formatted_raw_ostream& Out,
const TargetData *TD) {
// Alloca, ...
Out << "#include <stdlib.h>\n"
<< "#include <stdint.h>\n"
<< "/* get a declaration for alloca */\n"
<< "#ifdef _MSC_VER\n"
<< "#include <malloc.h>\n"
<< "#define alloca _alloca\n"
<< "#else\n"
<< "#include <alloca.h>\n"
<< "#endif\n\n";
// We output GCC specific attributes to preserve 'linkonce'ness on globals.
// If we aren't being compiled with GCC, just drop these attributes.
Out << "#ifndef __GNUC__ /* Can only support \"linkonce\" vars with GCC */\n"
@@ -1976,7 +1990,6 @@ bool CWriter::doInitialization(Module &M) {
Out << " DO NOT EDIT THIS FILE DIRECTLY\n";
Out << " *******************************************************************/\n\n";
// get declaration for alloca
Out << "/* Provide Declarations */\n";
Out << "#include <stdarg.h>\n"; // Varargs support
Out << "#include <setjmp.h>\n"; // Unwind support
@@ -1987,6 +2000,15 @@ bool CWriter::doInitialization(Module &M) {
Out << " #define NOMINMAX\n";
Out << " #include <windows.h>\n";
Out << "#endif // _MSC_VER\n";
Out << "#include <stdlib.h>\n";
Out << "#include <stdint.h>\n";
Out << "/* get a declaration for alloca */\n";
Out << "#ifdef _MSC_VER\n";
Out << " #include <malloc.h>\n";
Out << " #define alloca _alloca\n";
Out << "#else\n";
Out << " #include <alloca.h>\n";
Out << "#endif\n\n";
Out << "#include \"" << includeName << "\"\n";
@@ -2198,7 +2220,7 @@ bool CWriter::doInitialization(Module &M) {
// FIXME common linkage should avoid this problem.
if (!I->getInitializer()->isNullValue()) {
Out << " = " ;
writeOperand(I->getInitializer(), true);
writeOperand(I->getInitializer(), false);
} else if (I->hasWeakLinkage()) {
// We have to specify an initializer, but it doesn't have to be
// complete. If the value is an aggregate, print out { 0 }, and let
@@ -2213,7 +2235,7 @@ bool CWriter::doInitialization(Module &M) {
Out << "{ { 0 } }";
} else {
// Just print it out normally.
writeOperand(I->getInitializer(), true);
writeOperand(I->getInitializer(), false);
}
}
Out << ";\n";
@@ -2887,6 +2909,20 @@ void CWriter::visitBinaryOperator(Instruction &I) {
Out << "(";
writeOperand(I.getOperand(0));
Out << ", ";
if ((I.getOpcode() == Instruction::Shl ||
I.getOpcode() == Instruction::LShr ||
I.getOpcode() == Instruction::AShr)) {
std::vector<PHINode *> phis;
if (LLVMVectorValuesAllEqual(I.getOperand(1),
vectorWidth, phis)) {
Out << "__extract_element(";
writeOperand(I.getOperand(1));
Out << ", 0) ";
}
else
writeOperand(I.getOperand(1));
}
else
writeOperand(I.getOperand(1));
Out << ")";
return;
@@ -3628,7 +3664,7 @@ std::string CWriter::InterpretASMConstraint(InlineAsm::ConstraintInfo& c) {
#endif
std::string E;
if (const Target *Match = TargetRegistry::lookupTarget(Triple, E))
if (const llvm::Target *Match = TargetRegistry::lookupTarget(Triple, E))
TargetAsm = Match->createMCAsmInfo(Triple);
else
return c.Codes[0];
@@ -4330,7 +4366,7 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth,
pm.add(new BitcastCleanupPass);
pm.add(createDeadCodeEliminationPass()); // clean up after smear pass
//CO pm.add(createPrintModulePass(&fos));
pm.add(new CWriter(fos, includeName));
pm.add(new CWriter(fos, includeName, vectorWidth));
pm.add(createGCInfoDeleter());
//CO pm.add(createVerifierPass());

550
ctx.cpp
View File

@@ -74,18 +74,35 @@ struct CFInfo {
llvm::Value *savedContinueLanesPtr,
llvm::Value *savedMask, llvm::Value *savedLoopMask);
static CFInfo *GetSwitch(bool isUniform, llvm::BasicBlock *breakTarget,
llvm::BasicBlock *continueTarget,
llvm::Value *savedBreakLanesPtr,
llvm::Value *savedContinueLanesPtr,
llvm::Value *savedMask, llvm::Value *savedLoopMask,
llvm::Value *switchExpr,
llvm::BasicBlock *bbDefault,
const std::vector<std::pair<int, llvm::BasicBlock *> > *bbCases,
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *bbNext,
bool scUniform);
bool IsIf() { return type == If; }
bool IsLoop() { return type == Loop; }
bool IsForeach() { return type == Foreach; }
bool IsVaryingType() { return !isUniform; }
bool IsSwitch() { return type == Switch; }
bool IsVarying() { return !isUniform; }
bool IsUniform() { return isUniform; }
enum CFType { If, Loop, Foreach };
enum CFType { If, Loop, Foreach, Switch };
CFType type;
bool isUniform;
llvm::BasicBlock *savedBreakTarget, *savedContinueTarget;
llvm::Value *savedBreakLanesPtr, *savedContinueLanesPtr;
llvm::Value *savedMask, *savedLoopMask;
llvm::Value *savedSwitchExpr;
llvm::BasicBlock *savedDefaultBlock;
const std::vector<std::pair<int, llvm::BasicBlock *> > *savedCaseBlocks;
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *savedNextBlocks;
bool savedSwitchConditionWasUniform;
private:
CFInfo(CFType t, bool uniformIf, llvm::Value *sm) {
@@ -95,11 +112,18 @@ private:
savedBreakTarget = savedContinueTarget = NULL;
savedBreakLanesPtr = savedContinueLanesPtr = NULL;
savedMask = savedLoopMask = sm;
savedSwitchExpr = NULL;
savedDefaultBlock = NULL;
savedCaseBlocks = NULL;
savedNextBlocks = NULL;
}
CFInfo(CFType t, bool iu, llvm::BasicBlock *bt, llvm::BasicBlock *ct,
llvm::Value *sb, llvm::Value *sc, llvm::Value *sm,
llvm::Value *lm) {
Assert(t == Loop);
llvm::Value *lm, llvm::Value *sse = NULL, llvm::BasicBlock *bbd = NULL,
const std::vector<std::pair<int, llvm::BasicBlock *> > *bbc = NULL,
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *bbn = NULL,
bool scu = false) {
Assert(t == Loop || t == Switch);
type = t;
isUniform = iu;
savedBreakTarget = bt;
@@ -108,6 +132,11 @@ private:
savedContinueLanesPtr = sc;
savedMask = sm;
savedLoopMask = lm;
savedSwitchExpr = sse;
savedDefaultBlock = bbd;
savedCaseBlocks = bbc;
savedNextBlocks = bbn;
savedSwitchConditionWasUniform = scu;
}
CFInfo(CFType t, llvm::BasicBlock *bt, llvm::BasicBlock *ct,
llvm::Value *sb, llvm::Value *sc, llvm::Value *sm,
@@ -121,6 +150,10 @@ private:
savedContinueLanesPtr = sc;
savedMask = sm;
savedLoopMask = lm;
savedSwitchExpr = NULL;
savedDefaultBlock = NULL;
savedCaseBlocks = NULL;
savedNextBlocks = NULL;
}
};
@@ -154,12 +187,30 @@ CFInfo::GetForeach(llvm::BasicBlock *breakTarget,
savedMask, savedForeachMask);
}
CFInfo *
CFInfo::GetSwitch(bool isUniform, llvm::BasicBlock *breakTarget,
llvm::BasicBlock *continueTarget,
llvm::Value *savedBreakLanesPtr,
llvm::Value *savedContinueLanesPtr, llvm::Value *savedMask,
llvm::Value *savedLoopMask, llvm::Value *savedSwitchExpr,
llvm::BasicBlock *savedDefaultBlock,
const std::vector<std::pair<int, llvm::BasicBlock *> > *savedCases,
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *savedNext,
bool savedSwitchConditionUniform) {
return new CFInfo(Switch, isUniform, breakTarget, continueTarget,
savedBreakLanesPtr, savedContinueLanesPtr,
savedMask, savedLoopMask, savedSwitchExpr, savedDefaultBlock,
savedCases, savedNext, savedSwitchConditionUniform);
}
///////////////////////////////////////////////////////////////////////////
FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
llvm::Function *llvmFunction,
llvm::Function *lf,
SourcePos firstStmtPos) {
function = func;
llvmFunction = lf;
/* Create a new basic block to store all of the allocas */
allocaBlock = llvm::BasicBlock::Create(*g->ctx, "allocas", llvmFunction, 0);
@@ -181,6 +232,11 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
breakLanesPtr = continueLanesPtr = NULL;
breakTarget = continueTarget = NULL;
switchExpr = NULL;
caseBlocks = NULL;
defaultBlock = NULL;
nextBlocks = NULL;
returnedLanesPtr = AllocaInst(LLVMTypes::MaskType, "returned_lanes_memory");
StoreInst(LLVMMaskAllOff, returnedLanesPtr);
@@ -421,14 +477,15 @@ FunctionEmitContext::StartVaryingIf(llvm::Value *oldMask) {
void
FunctionEmitContext::EndIf() {
CFInfo *ci = popCFState();
// Make sure we match up with a Start{Uniform,Varying}If().
Assert(controlFlowInfo.size() > 0 && controlFlowInfo.back()->IsIf());
CFInfo *ci = controlFlowInfo.back();
controlFlowInfo.pop_back();
Assert(ci->IsIf());
// 'uniform' ifs don't change the mask so we only need to restore the
// mask going into the if for 'varying' if statements
if (!ci->IsUniform() && bblock != NULL) {
if (ci->IsUniform() || bblock == NULL)
return;
// We can't just restore the mask as it was going into the 'if'
// statement. First we have to take into account any program
// instances that have executed 'return' statements; the restored
@@ -436,7 +493,7 @@ FunctionEmitContext::EndIf() {
restoreMaskGivenReturns(ci->savedMask);
// If the 'if' statement is inside a loop with a 'varying'
// consdition, we also need to account for any break or continue
// condition, we also need to account for any break or continue
// statements that executed inside the 'if' statmeent; we also must
// leave the lane masks for the program instances that ran those
// off after we restore the mask after the 'if'. The code below
@@ -444,29 +501,38 @@ FunctionEmitContext::EndIf() {
// or continue statements (and breakLanesPtr and continueLanesPtr
// have their initial 'all off' values), so we don't need to check
// for that here.
if (continueLanesPtr != NULL) {
//
// There are three general cases to deal with here:
// - Loops: both break and continue are allowed, and thus the corresponding
// lane mask pointers are non-NULL
// - Foreach: only continueLanesPtr may be non-NULL
// - Switch: only breakLanesPtr may be non-NULL
if (continueLanesPtr != NULL || breakLanesPtr != NULL) {
// We want to compute:
// newMask = (oldMask & ~(breakLanes | continueLanes))
llvm::Value *oldMask = GetInternalMask();
llvm::Value *continueLanes = LoadInst(continueLanesPtr,
"continue_lanes");
llvm::Value *bcLanes = continueLanes;
// newMask = (oldMask & ~(breakLanes | continueLanes)),
// treading breakLanes or continueLanes as "all off" if the
// corresponding pointer is NULL.
llvm::Value *bcLanes = NULL;
if (continueLanesPtr != NULL)
bcLanes = LoadInst(continueLanesPtr, "continue_lanes");
else
bcLanes = LLVMMaskAllOff;
if (breakLanesPtr != NULL) {
// breakLanesPtr will be NULL if we're inside a 'foreach' loop
llvm::Value *breakLanes = LoadInst(breakLanesPtr, "break_lanes");
bcLanes = BinaryOperator(llvm::Instruction::Or, breakLanes,
continueLanes, "break|continue_lanes");
bcLanes = BinaryOperator(llvm::Instruction::Or, bcLanes,
breakLanes, "|break_lanes");
}
llvm::Value *notBreakOrContinue =
NotOperator(bcLanes, "!(break|continue)_lanes");
llvm::Value *oldMask = GetInternalMask();
llvm::Value *newMask =
BinaryOperator(llvm::Instruction::And, oldMask,
notBreakOrContinue, "new_mask");
SetInternalMask(newMask);
}
}
}
@@ -501,17 +567,8 @@ FunctionEmitContext::StartLoop(llvm::BasicBlock *bt, llvm::BasicBlock *ct,
void
FunctionEmitContext::EndLoop() {
Assert(controlFlowInfo.size() && controlFlowInfo.back()->IsLoop());
CFInfo *ci = controlFlowInfo.back();
controlFlowInfo.pop_back();
// Restore the break/continue state information to what it was before
// we went into this loop.
breakTarget = ci->savedBreakTarget;
continueTarget = ci->savedContinueTarget;
breakLanesPtr = ci->savedBreakLanesPtr;
continueLanesPtr = ci->savedContinueLanesPtr;
loopMask = ci->savedLoopMask;
CFInfo *ci = popCFState();
Assert(ci->IsLoop());
if (!ci->IsUniform())
// If the loop had a 'uniform' test, then it didn't make any
@@ -524,7 +581,7 @@ FunctionEmitContext::EndLoop() {
void
FunctionEmitContext::StartForeach(llvm::BasicBlock *ct) {
FunctionEmitContext::StartForeach() {
// Store the current values of various loop-related state so that we
// can restore it when we exit this loop.
llvm::Value *oldMask = GetInternalMask();
@@ -536,7 +593,7 @@ FunctionEmitContext::StartForeach(llvm::BasicBlock *ct) {
continueLanesPtr = AllocaInst(LLVMTypes::MaskType, "foreach_continue_lanes");
StoreInst(LLVMMaskAllOff, continueLanesPtr);
continueTarget = ct;
continueTarget = NULL; // should be set by SetContinueTarget()
loopMask = NULL;
}
@@ -544,17 +601,8 @@ FunctionEmitContext::StartForeach(llvm::BasicBlock *ct) {
void
FunctionEmitContext::EndForeach() {
Assert(controlFlowInfo.size() && controlFlowInfo.back()->IsForeach());
CFInfo *ci = controlFlowInfo.back();
controlFlowInfo.pop_back();
// Restore the break/continue state information to what it was before
// we went into this loop.
breakTarget = ci->savedBreakTarget;
continueTarget = ci->savedContinueTarget;
breakLanesPtr = ci->savedBreakLanesPtr;
continueLanesPtr = ci->savedContinueLanesPtr;
loopMask = ci->savedLoopMask;
CFInfo *ci = popCFState();
Assert(ci->IsForeach());
}
@@ -575,28 +623,64 @@ FunctionEmitContext::restoreMaskGivenReturns(llvm::Value *oldMask) {
}
/** Returns "true" if the first enclosing non-if control flow expression is
a "switch" statement.
*/
bool
FunctionEmitContext::inSwitchStatement() const {
// Go backwards through controlFlowInfo, since we add new nested scopes
// to the back.
int i = controlFlowInfo.size() - 1;
while (i >= 0 && controlFlowInfo[i]->IsIf())
--i;
// Got to the first non-if (or end of CF info)
if (i == -1)
return false;
return controlFlowInfo[i]->IsSwitch();
}
void
FunctionEmitContext::Break(bool doCoherenceCheck) {
Assert(controlFlowInfo.size() > 0);
if (breakTarget == NULL) {
Error(currentPos, "\"break\" statement is illegal outside of "
"for/while/do loops.");
"for/while/do loops and \"switch\" statements.");
return;
}
if (bblock == NULL)
return;
if (inSwitchStatement() == true &&
switchConditionWasUniform == true &&
ifsInCFAllUniform(CFInfo::Switch)) {
// We know that all program instances are executing the break, so
// just jump to the block immediately after the switch.
Assert(breakTarget != NULL);
BranchInst(breakTarget);
bblock = NULL;
return;
}
// If all of the enclosing 'if' tests in the loop have uniform control
// flow or if we can tell that the mask is all on, then we can just
// jump to the break location.
if (ifsInLoopAllUniform() || GetInternalMask() == LLVMMaskAllOn) {
if (inSwitchStatement() == false &&
(ifsInCFAllUniform(CFInfo::Loop) ||
GetInternalMask() == LLVMMaskAllOn)) {
BranchInst(breakTarget);
if (ifsInLoopAllUniform() && doCoherenceCheck)
Warning(currentPos, "Coherent break statement not necessary in fully uniform "
"control flow.");
if (ifsInCFAllUniform(CFInfo::Loop) && doCoherenceCheck)
Warning(currentPos, "Coherent break statement not necessary in "
"fully uniform control flow.");
// Set bblock to NULL since the jump has terminated the basic block
bblock = NULL;
}
else {
// Otherwise we need to update the mask of the lanes that have
// executed a 'break' statement:
// Varying switch, uniform switch where the 'break' is under
// varying control flow, or a loop with varying 'if's above the
// break. In these cases, we need to update the mask of the lanes
// that have executed a 'break' statement:
// breakLanes = breakLanes | mask
Assert(breakLanesPtr != NULL);
llvm::Value *mask = GetInternalMask();
@@ -612,16 +696,20 @@ FunctionEmitContext::Break(bool doCoherenceCheck) {
// an 'if' statement and restore the mask then.
SetInternalMask(LLVMMaskAllOff);
if (doCoherenceCheck)
// If the user has indicated that this is a 'coherent' break
// statement, then check to see if the mask is all off. If so,
// we have to conservatively jump to the continueTarget, not
// the breakTarget, since part of the reason the mask is all
// off may be due to 'continue' statements that executed in the
// current loop iteration.
// FIXME: if the loop only has break statements and no
// continues, we can jump to breakTarget in that case.
if (doCoherenceCheck) {
if (continueTarget != NULL)
// If the user has indicated that this is a 'coherent'
// break statement, then check to see if the mask is all
// off. If so, we have to conservatively jump to the
// continueTarget, not the breakTarget, since part of the
// reason the mask is all off may be due to 'continue'
// statements that executed in the current loop iteration.
jumpIfAllLoopLanesAreDone(continueTarget);
else if (breakTarget != NULL)
// Similarly handle these for switch statements, where we
// only have a break target.
jumpIfAllLoopLanesAreDone(breakTarget);
}
}
}
@@ -634,12 +722,12 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
return;
}
if (ifsInLoopAllUniform() || GetInternalMask() == LLVMMaskAllOn) {
if (ifsInCFAllUniform(CFInfo::Loop) || GetInternalMask() == LLVMMaskAllOn) {
// Similarly to 'break' statements, we can immediately jump to the
// continue target if we're only in 'uniform' control flow within
// loop or if we can tell that the mask is all on.
AddInstrumentationPoint("continue: uniform CF, jumped");
if (ifsInLoopAllUniform() && doCoherenceCheck)
if (ifsInCFAllUniform(CFInfo::Loop) && doCoherenceCheck)
Warning(currentPos, "Coherent continue statement not necessary in "
"fully uniform control flow.");
BranchInst(continueTarget);
@@ -652,8 +740,9 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
llvm::Value *mask = GetInternalMask();
llvm::Value *continueMask =
LoadInst(continueLanesPtr, "continue_mask");
llvm::Value *newMask = BinaryOperator(llvm::Instruction::Or,
mask, continueMask, "mask|continueMask");
llvm::Value *newMask =
BinaryOperator(llvm::Instruction::Or, mask, continueMask,
"mask|continueMask");
StoreInst(newMask, continueLanesPtr);
// And set the current mask to be all off in case there are any
@@ -670,22 +759,23 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
/** This function checks to see if all of the 'if' statements (if any)
between the current scope and the first enclosing loop have 'uniform'
tests.
between the current scope and the first enclosing loop/switch of given
control flow type have 'uniform' tests.
*/
bool
FunctionEmitContext::ifsInLoopAllUniform() const {
FunctionEmitContext::ifsInCFAllUniform(int type) const {
Assert(controlFlowInfo.size() > 0);
// Go backwards through controlFlowInfo, since we add new nested scopes
// to the back. Stop once we come to the first enclosing loop.
// to the back. Stop once we come to the first enclosing control flow
// structure of the desired type.
int i = controlFlowInfo.size() - 1;
while (i >= 0 && controlFlowInfo[i]->type != CFInfo::Loop) {
while (i >= 0 && controlFlowInfo[i]->type != type) {
if (controlFlowInfo[i]->isUniform == false)
// Found a scope due to an 'if' statement with a varying test
return false;
--i;
}
Assert(i >= 0); // else we didn't find a loop!
Assert(i >= 0); // else we didn't find the expected control flow type!
return true;
}
@@ -758,11 +848,249 @@ FunctionEmitContext::RestoreContinuedLanes() {
}
void
FunctionEmitContext::StartSwitch(bool cfIsUniform, llvm::BasicBlock *bbBreak) {
llvm::Value *oldMask = GetInternalMask();
controlFlowInfo.push_back(CFInfo::GetSwitch(cfIsUniform, breakTarget,
continueTarget, breakLanesPtr,
continueLanesPtr, oldMask,
loopMask, switchExpr, defaultBlock,
caseBlocks, nextBlocks,
switchConditionWasUniform));
breakLanesPtr = AllocaInst(LLVMTypes::MaskType, "break_lanes_memory");
StoreInst(LLVMMaskAllOff, breakLanesPtr);
breakTarget = bbBreak;
continueLanesPtr = NULL;
continueTarget = NULL;
loopMask = NULL;
// These will be set by the SwitchInst() method
switchExpr = NULL;
defaultBlock = NULL;
caseBlocks = NULL;
nextBlocks = NULL;
}
void
FunctionEmitContext::EndSwitch() {
Assert(bblock != NULL);
CFInfo *ci = popCFState();
if (ci->IsVarying() && bblock != NULL)
restoreMaskGivenReturns(ci->savedMask);
}
/** Emit code to check for an "all off" mask before the code for a
case or default label in a "switch" statement.
*/
void
FunctionEmitContext::addSwitchMaskCheck(llvm::Value *mask) {
llvm::Value *allOff = None(mask);
llvm::BasicBlock *bbSome = CreateBasicBlock("case_default_on");
// Find the basic block for the case or default label immediately after
// the current one in the switch statement--that's where we want to
// jump if the mask is all off at this label.
Assert(nextBlocks->find(bblock) != nextBlocks->end());
llvm::BasicBlock *bbNext = nextBlocks->find(bblock)->second;
// Jump to the next one of the mask is all off; otherwise jump to the
// newly created block that will hold the actual code for this label.
BranchInst(bbNext, bbSome, allOff);
SetCurrentBasicBlock(bbSome);
}
/** Returns the execution mask at entry to the first enclosing "switch"
statement. */
llvm::Value *
FunctionEmitContext::getMaskAtSwitchEntry() {
Assert(controlFlowInfo.size() > 0);
int i = controlFlowInfo.size() - 1;
while (i >= 0 && controlFlowInfo[i]->type != CFInfo::Switch)
--i;
Assert(i != -1);
return controlFlowInfo[i]->savedMask;
}
void
FunctionEmitContext::EmitDefaultLabel(bool checkMask, SourcePos pos) {
if (inSwitchStatement() == false) {
Error(pos, "\"default\" label illegal outside of \"switch\" "
"statement.");
return;
}
// If there's a default label in the switch, a basic block for it
// should have been provided in the previous call to SwitchInst().
Assert(defaultBlock != NULL);
if (bblock != NULL)
// The previous case in the switch fell through, or we're in a
// varying switch; terminate the current block with a jump to the
// block for the code for the default label.
BranchInst(defaultBlock);
SetCurrentBasicBlock(defaultBlock);
if (switchConditionWasUniform)
// Nothing more to do for this case; return back to the caller,
// which will then emit the code for the default case.
return;
// For a varying switch, we need to update the execution mask.
//
// First, compute the mask that corresponds to which program instances
// should execute the "default" code; this corresponds to the set of
// program instances that don't match any of the case statements.
// Therefore, we generate code that compares the value of the switch
// expression to the value associated with each of the "case"
// statements such that the surviving lanes didn't match any of them.
llvm::Value *matchesDefault = getMaskAtSwitchEntry();
for (int i = 0; i < (int)caseBlocks->size(); ++i) {
int value = (*caseBlocks)[i].first;
llvm::Value *valueVec = (switchExpr->getType() == LLVMTypes::Int32VectorType) ?
LLVMInt32Vector(value) : LLVMInt64Vector(value);
// TODO: for AVX2 at least, the following generates better code
// than doing ICMP_NE and skipping the NotOperator() below; file a
// LLVM bug?
llvm::Value *matchesCaseValue =
CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, switchExpr,
valueVec, "cmp_case_value");
matchesCaseValue = I1VecToBoolVec(matchesCaseValue);
llvm::Value *notMatchesCaseValue = NotOperator(matchesCaseValue);
matchesDefault = BinaryOperator(llvm::Instruction::And, matchesDefault,
notMatchesCaseValue, "default&~case_match");
}
// The mask may have some lanes on, which corresponds to the previous
// label falling through; compute the updated mask by ANDing with the
// current mask.
llvm::Value *oldMask = GetInternalMask();
llvm::Value *newMask = BinaryOperator(llvm::Instruction::Or, oldMask,
matchesDefault, "old_mask|matches_default");
SetInternalMask(newMask);
if (checkMask)
addSwitchMaskCheck(newMask);
}
void
FunctionEmitContext::EmitCaseLabel(int value, bool checkMask, SourcePos pos) {
if (inSwitchStatement() == false) {
Error(pos, "\"case\" label illegal outside of \"switch\" statement.");
return;
}
// Find the basic block for this case statement.
llvm::BasicBlock *bbCase = NULL;
Assert(caseBlocks != NULL);
for (int i = 0; i < (int)caseBlocks->size(); ++i)
if ((*caseBlocks)[i].first == value) {
bbCase = (*caseBlocks)[i].second;
break;
}
Assert(bbCase != NULL);
if (bblock != NULL)
// fall through from the previous case
BranchInst(bbCase);
SetCurrentBasicBlock(bbCase);
if (switchConditionWasUniform)
return;
// update the mask: first, get a mask that indicates which program
// instances have a value for the switch expression that matches this
// case statement.
llvm::Value *valueVec = (switchExpr->getType() == LLVMTypes::Int32VectorType) ?
LLVMInt32Vector(value) : LLVMInt64Vector(value);
llvm::Value *matchesCaseValue =
CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, switchExpr,
valueVec, "cmp_case_value");
matchesCaseValue = I1VecToBoolVec(matchesCaseValue);
// If a lane was off going into the switch, we don't care if has a
// value in the switch expression that happens to match this case.
llvm::Value *entryMask = getMaskAtSwitchEntry();
matchesCaseValue = BinaryOperator(llvm::Instruction::And, entryMask,
matchesCaseValue, "entry_mask&case_match");
// Take the surviving lanes and turn on the mask for them.
llvm::Value *oldMask = GetInternalMask();
llvm::Value *newMask = BinaryOperator(llvm::Instruction::Or, oldMask,
matchesCaseValue, "mask|case_match");
SetInternalMask(newMask);
if (checkMask)
addSwitchMaskCheck(newMask);
}
void
FunctionEmitContext::SwitchInst(llvm::Value *expr, llvm::BasicBlock *bbDefault,
const std::vector<std::pair<int, llvm::BasicBlock *> > &bbCases,
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> &bbNext) {
// The calling code should have called StartSwitch() before calling
// SwitchInst().
Assert(controlFlowInfo.size() &&
controlFlowInfo.back()->IsSwitch());
switchExpr = expr;
defaultBlock = bbDefault;
caseBlocks = new std::vector<std::pair<int, llvm::BasicBlock *> >(bbCases);
nextBlocks = new std::map<llvm::BasicBlock *, llvm::BasicBlock *>(bbNext);
switchConditionWasUniform =
(llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(expr->getType()) == false);
if (switchConditionWasUniform == true) {
// For a uniform switch condition, just wire things up to the LLVM
// switch instruction.
llvm::SwitchInst *s = llvm::SwitchInst::Create(expr, bbDefault,
bbCases.size(), bblock);
for (int i = 0; i < (int)bbCases.size(); ++i) {
if (expr->getType() == LLVMTypes::Int32Type)
s->addCase(LLVMInt32(bbCases[i].first), bbCases[i].second);
else {
Assert(expr->getType() == LLVMTypes::Int64Type);
s->addCase(LLVMInt64(bbCases[i].first), bbCases[i].second);
}
}
AddDebugPos(s);
// switch is a terminator
bblock = NULL;
}
else {
// For a varying switch, we first turn off all lanes of the mask
SetInternalMask(LLVMMaskAllOff);
if (nextBlocks->size() > 0) {
// If there are any labels inside the switch, jump to the first
// one; any code before the first label won't be executed by
// anyone.
std::map<llvm::BasicBlock *, llvm::BasicBlock *>::const_iterator iter;
iter = nextBlocks->find(NULL);
Assert(iter != nextBlocks->end());
llvm::BasicBlock *bbFirst = iter->second;
BranchInst(bbFirst);
bblock = NULL;
}
}
}
int
FunctionEmitContext::VaryingCFDepth() const {
int sum = 0;
for (unsigned int i = 0; i < controlFlowInfo.size(); ++i)
if (controlFlowInfo[i]->IsVaryingType())
if (controlFlowInfo[i]->IsVarying())
++sum;
return sum;
}
@@ -777,6 +1105,41 @@ FunctionEmitContext::InForeachLoop() const {
}
bool
FunctionEmitContext::initLabelBBlocks(ASTNode *node, void *data) {
LabeledStmt *ls = dynamic_cast<LabeledStmt *>(node);
if (ls == NULL)
return true;
FunctionEmitContext *ctx = (FunctionEmitContext *)data;
if (ctx->labelMap.find(ls->name) != ctx->labelMap.end())
Error(ls->pos, "Multiple labels named \"%s\" in function.",
ls->name.c_str());
else {
llvm::BasicBlock *bb = ctx->CreateBasicBlock(ls->name.c_str());
ctx->labelMap[ls->name] = bb;
}
return true;
}
void
FunctionEmitContext::InitializeLabelMap(Stmt *code) {
labelMap.erase(labelMap.begin(), labelMap.end());
WalkAST(code, initLabelBBlocks, NULL, this);
}
llvm::BasicBlock *
FunctionEmitContext::GetLabeledBasicBlock(const std::string &label) {
if (labelMap.find(label) != labelMap.end())
return labelMap[label];
else
return NULL;
}
void
FunctionEmitContext::CurrentLanesReturned(Expr *expr, bool doCoherenceCheck) {
const Type *returnType = function->GetReturnType();
@@ -869,6 +1232,14 @@ FunctionEmitContext::All(llvm::Value *mask) {
}
llvm::Value *
FunctionEmitContext::None(llvm::Value *mask) {
llvm::Value *mmval = LaneMask(mask);
return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mmval,
LLVMInt32(0), "none_mm_cmp");
}
llvm::Value *
FunctionEmitContext::LaneMask(llvm::Value *v) {
// Call the target-dependent movmsk function to turn the vector mask
@@ -920,8 +1291,7 @@ FunctionEmitContext::GetStringPtr(const std::string &str) {
llvm::BasicBlock *
FunctionEmitContext::CreateBasicBlock(const char *name) {
llvm::Function *function = bblock->getParent();
return llvm::BasicBlock::Create(*g->ctx, name, function);
return llvm::BasicBlock::Create(*g->ctx, name, llvmFunction);
}
@@ -2597,3 +2967,37 @@ FunctionEmitContext::addVaryingOffsetsIfNeeded(llvm::Value *ptr,
return BinaryOperator(llvm::Instruction::Add, ptr, offset);
}
CFInfo *
FunctionEmitContext::popCFState() {
Assert(controlFlowInfo.size() > 0);
CFInfo *ci = controlFlowInfo.back();
controlFlowInfo.pop_back();
if (ci->IsSwitch()) {
breakTarget = ci->savedBreakTarget;
continueTarget = ci->savedContinueTarget;
breakLanesPtr = ci->savedBreakLanesPtr;
continueLanesPtr = ci->savedContinueLanesPtr;
loopMask = ci->savedLoopMask;
switchExpr = ci->savedSwitchExpr;
defaultBlock = ci->savedDefaultBlock;
caseBlocks = ci->savedCaseBlocks;
nextBlocks = ci->savedNextBlocks;
switchConditionWasUniform = ci->savedSwitchConditionWasUniform;
}
else if (ci->IsLoop() || ci->IsForeach()) {
breakTarget = ci->savedBreakTarget;
continueTarget = ci->savedContinueTarget;
breakLanesPtr = ci->savedBreakLanesPtr;
continueLanesPtr = ci->savedContinueLanesPtr;
loopMask = ci->savedLoopMask;
}
else {
Assert(ci->IsIf());
// nothing to do
}
return ci;
}

124
ctx.h
View File

@@ -39,6 +39,7 @@
#define ISPC_CTX_H 1
#include "ispc.h"
#include <map>
#include <llvm/InstrTypes.h>
#include <llvm/Instructions.h>
#include <llvm/Analysis/DIBuilder.h>
@@ -160,10 +161,8 @@ public:
void EndLoop();
/** Indicates that code generation for a 'foreach' or 'foreach_tiled'
loop is about to start. The provided basic block pointer indicates
where control flow should go if a 'continue' statement is executed
in the loop. */
void StartForeach(llvm::BasicBlock *continueTarget);
loop is about to start. */
void StartForeach();
void EndForeach();
/** Emit code for a 'break' statement in a loop. If doCoherenceCheck
@@ -186,12 +185,62 @@ public:
previous iteration. */
void RestoreContinuedLanes();
/** Indicates that code generation for a "switch" statement is about to
start. isUniform indicates whether the "switch" value is uniform,
and bbAfterSwitch gives the basic block immediately following the
"switch" statement. (For example, if the switch condition is
uniform, we jump here upon executing a "break" statement.) */
void StartSwitch(bool isUniform, llvm::BasicBlock *bbAfterSwitch);
/** Indicates the end of code generation for a "switch" statement. */
void EndSwitch();
/** Emits code for a "switch" statement in the program.
@param expr Gives the value of the expression after the "switch"
@param defaultBlock Basic block to execute for the "default" case. This
should be NULL if there is no "default" label inside
the switch.
@param caseBlocks vector that stores the mapping from label values
after "case" statements to basic blocks corresponding
to the "case" labels.
@param nextBlocks For each basic block for a "case" or "default"
label, this gives the basic block for the
immediately-following "case" or "default" label (or
the basic block after the "switch" statement for the
last label.)
*/
void SwitchInst(llvm::Value *expr, llvm::BasicBlock *defaultBlock,
const std::vector<std::pair<int, llvm::BasicBlock *> > &caseBlocks,
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> &nextBlocks);
/** Generates code for a "default" label after a "switch" statement.
The checkMask parameter indicates whether additional code should be
generated to check to see if the execution mask is all off after
the default label (in which case a jump to the following label will
be issued. */
void EmitDefaultLabel(bool checkMask, SourcePos pos);
/** Generates code for a "case" label after a "switch" statement. See
the documentation for EmitDefaultLabel() for discussion of the
checkMask parameter. */
void EmitCaseLabel(int value, bool checkMask, SourcePos pos);
/** Returns the current number of nested levels of 'varying' control
flow */
int VaryingCFDepth() const;
bool InForeachLoop() const;
void SetContinueTarget(llvm::BasicBlock *bb) { continueTarget = bb; }
/** Step through the code and find label statements; create a basic
block for each one, so that subsequent calls to
GetLabeledBasicBlock() return the corresponding basic block. */
void InitializeLabelMap(Stmt *code);
/** If there is a label in the function with the given name, return the
new basic block that it starts. */
llvm::BasicBlock *GetLabeledBasicBlock(const std::string &label);
/** Called to generate code for 'return' statement; value is the
expression in the return statement (if non-NULL), and
doCoherenceCheck indicates whether instructions should be generated
@@ -211,6 +260,10 @@ public:
i1 value that indicates if all of the mask lanes are on. */
llvm::Value *All(llvm::Value *mask);
/** Given a boolean mask value of type LLVMTypes::MaskType, return an
i1 value that indicates if all of the mask lanes are off. */
llvm::Value *None(llvm::Value *mask);
/** Given a boolean mask value of type LLVMTypes::MaskType, return an
i32 value wherein the i'th bit is on if and only if the i'th lane
of the mask is on. */
@@ -446,6 +499,9 @@ private:
/** Pointer to the Function for which we're currently generating code. */
Function *function;
/** LLVM function representation for the current function. */
llvm::Function *llvmFunction;
/** The basic block into which we add any alloca instructions that need
to go at the very start of the function. */
llvm::BasicBlock *allocaBlock;
@@ -479,10 +535,10 @@ private:
the loop. */
llvm::Value *loopMask;
/** If currently in a loop body, this is a pointer to memory to store a
mask value that represents which of the lanes have executed a
'break' statement. If we're not in a loop body, this should be
NULL. */
/** If currently in a loop body or switch statement, this is a pointer
to memory to store a mask value that represents which of the lanes
have executed a 'break' statement. If we're not in a loop body or
switch, this should be NULL. */
llvm::Value *breakLanesPtr;
/** Similar to breakLanesPtr, if we're inside a loop, this is a pointer
@@ -490,16 +546,49 @@ private:
'continue' statement. */
llvm::Value *continueLanesPtr;
/** If we're inside a loop, this gives the basic block immediately
after the current loop, which we will jump to if all of the lanes
have executed a break statement or are otherwise done with the
loop. */
/** If we're inside a loop or switch statement, this gives the basic
block immediately after the current loop or switch, which we will
jump to if all of the lanes have executed a break statement or are
otherwise done with it. */
llvm::BasicBlock *breakTarget;
/** If we're inside a loop, this gives the block to jump to if all of
the running lanes have executed a 'continue' statement. */
llvm::BasicBlock *continueTarget;
/** @name Switch statement state
These variables store various state that's active when we're
generating code for a switch statement. They should all be NULL
outside of a switch.
@{
*/
/** The value of the expression used to determine which case in the
statements after the switch to execute. */
llvm::Value *switchExpr;
/** Map from case label numbers to the basic block that will hold code
for that case. */
const std::vector<std::pair<int, llvm::BasicBlock *> > *caseBlocks;
/** The basic block of code to run for the "default" label in the
switch statement. */
llvm::BasicBlock *defaultBlock;
/** For each basic block for the code for cases (and the default label,
if present), this map gives the basic block for the immediately
following case/default label. */
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *nextBlocks;
/** Records whether the switch condition was uniform; this is a
distinct notion from whether the switch represents uniform or
varying control flow; we may have varying control flow from a
uniform switch condition if there is a 'break' inside the switch
that's under varying control flow. */
bool switchConditionWasUniform;
/** @} */
/** A pointer to memory that records which of the program instances
have executed a 'return' statement (and are thus really truly done
running any more instructions in this functions. */
@@ -537,9 +626,13 @@ private:
tasks launched from the current function. */
llvm::Value *launchGroupHandlePtr;
std::map<std::string, llvm::BasicBlock *> labelMap;
static bool initLabelBBlocks(ASTNode *node, void *data);
llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
static void addGSMetadata(llvm::Value *inst, SourcePos pos);
bool ifsInLoopAllUniform() const;
bool ifsInCFAllUniform(int cfType) const;
void jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target);
llvm::Value *emitGatherCallback(llvm::Value *lvalue, llvm::Value *retPtr);
@@ -547,6 +640,11 @@ private:
const Type *ptrType);
void restoreMaskGivenReturns(llvm::Value *oldMask);
void addSwitchMaskCheck(llvm::Value *mask);
bool inSwitchStatement() const;
llvm::Value *getMaskAtSwitchEntry();
CFInfo *popCFState();
void scatter(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
llvm::Value *mask);

206
decl.cpp
View File

@@ -46,6 +46,18 @@
#include <stdio.h>
#include <set>
static void
lPrintTypeQualifiers(int typeQualifiers) {
if (typeQualifiers & TYPEQUAL_INLINE) printf("inline ");
if (typeQualifiers & TYPEQUAL_CONST) printf("const ");
if (typeQualifiers & TYPEQUAL_UNIFORM) printf("uniform ");
if (typeQualifiers & TYPEQUAL_VARYING) printf("varying ");
if (typeQualifiers & TYPEQUAL_TASK) printf("task ");
if (typeQualifiers & TYPEQUAL_SIGNED) printf("signed ");
if (typeQualifiers & TYPEQUAL_UNSIGNED) printf("unsigned ");
}
/** Given a Type and a set of type qualifiers, apply the type qualifiers to
the type, returning the type that is the result.
*/
@@ -54,6 +66,16 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
if (type == NULL)
return NULL;
if ((typeQualifiers & TYPEQUAL_CONST) != 0)
type = type->GetAsConstType();
if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0)
type = type->GetAsUniformType();
else if ((typeQualifiers & TYPEQUAL_VARYING) != 0)
type = type->GetAsVaryingType();
else
type = type->GetAsUnboundVariabilityType();
if ((typeQualifiers & TYPEQUAL_UNSIGNED) != 0) {
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0)
Error(pos, "Illegal to apply both \"signed\" and \"unsigned\" "
@@ -64,29 +86,13 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
type = unsignedType;
else
Error(pos, "\"unsigned\" qualifier is illegal with \"%s\" type.",
type->GetString().c_str());
type->ResolveUnboundVariability(Type::Varying)->GetString().c_str());
}
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0 && type->IsIntType() == false)
Error(pos, "\"signed\" qualifier is illegal with non-integer type "
"\"%s\".", type->GetString().c_str());
if ((typeQualifiers & TYPEQUAL_CONST) != 0)
type = type->GetAsConstType();
if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0)
type = type->GetAsUniformType();
else if ((typeQualifiers & TYPEQUAL_VARYING) != 0)
type = type->GetAsVaryingType();
else {
// otherwise, structs are uniform by default and everything
// else is varying by default
if (dynamic_cast<const StructType *>(type->GetBaseType()) != NULL)
type = type->GetAsUniformType();
else
type = type->GetAsVaryingType();
}
"\"%s\".",
type->ResolveUnboundVariability(Type::Varying)->GetString().c_str());
return type;
}
@@ -138,21 +144,14 @@ lGetStorageClassName(StorageClass storageClass) {
void
DeclSpecs::Print() const {
printf("%s ", lGetStorageClassName(storageClass));
printf("Declspecs: [%s ", lGetStorageClassName(storageClass));
if (soaWidth > 0) printf("soa<%d> ", soaWidth);
if (typeQualifiers & TYPEQUAL_INLINE) printf("inline ");
if (typeQualifiers & TYPEQUAL_CONST) printf("const ");
if (typeQualifiers & TYPEQUAL_UNIFORM) printf("uniform ");
if (typeQualifiers & TYPEQUAL_VARYING) printf("varying ");
if (typeQualifiers & TYPEQUAL_TASK) printf("task ");
if (typeQualifiers & TYPEQUAL_SIGNED) printf("signed ");
if (typeQualifiers & TYPEQUAL_UNSIGNED) printf("unsigned ");
printf("%s", baseType->GetString().c_str());
lPrintTypeQualifiers(typeQualifiers);
printf("base type: %s", baseType->GetString().c_str());
if (vectorSize > 0) printf("<%d>", vectorSize);
printf("]");
}
@@ -192,19 +191,46 @@ Declarator::GetSymbol() const {
void
Declarator::Print() const {
Declarator::Print(int indent) const {
printf("%*cdeclarator: [", indent, ' ');
pos.Print();
lPrintTypeQualifiers(typeQualifiers);
Symbol *sym = GetSymbol();
if (sym != NULL)
printf("%s", sym->name.c_str());
else
printf("(null symbol)");
printf(", array size = %d", arraySize);
printf(", kind = ");
switch (kind) {
case DK_BASE: printf("base"); break;
case DK_POINTER: printf("pointer"); break;
case DK_REFERENCE: printf("reference"); break;
case DK_ARRAY: printf("array"); break;
case DK_FUNCTION: printf("function"); break;
default: FATAL("Unhandled declarator kind");
}
if (initExpr != NULL) {
printf(" = (");
initExpr->Print();
printf(")");
}
pos.Print();
if (functionParams.size() > 0) {
for (unsigned int i = 0; i < functionParams.size(); ++i) {
printf("\n%*cfunc param %d:\n", indent, ' ', i);
functionParams[i]->Print(indent+4);
}
}
if (child != NULL)
child->Print(indent + 4);
printf("]\n");
}
@@ -235,11 +261,13 @@ Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
Assert(d != NULL);
for (unsigned int i = 0; i < d->functionParams.size(); ++i) {
Declaration *pdecl = d->functionParams[i];
Assert(pdecl->declarators.size() == 1);
funArgs->push_back(pdecl->declarators[0]->GetSymbol());
Symbol *sym = d->GetSymbolForFunctionParameter(i);
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
funArgs->push_back(sym);
}
funSym->type = funSym->type->ResolveUnboundVariability(Type::Varying);
return funSym;
}
@@ -258,6 +286,12 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
if (kind != DK_FUNCTION && isTask)
Error(pos, "\"task\" qualifier illegal in variable declaration.");
Type::Variability variability = Type::Unbound;
if (hasUniformQual)
variability = Type::Uniform;
else if (hasVaryingQual)
variability = Type::Varying;
const Type *type = base;
switch (kind) {
case DK_BASE:
@@ -268,7 +302,7 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
return type;
case DK_POINTER:
type = new PointerType(type, hasUniformQual, isConst);
type = new PointerType(type, variability, isConst);
if (child != NULL)
return child->GetType(type, ds);
else
@@ -316,25 +350,7 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
for (unsigned int i = 0; i < functionParams.size(); ++i) {
Declaration *d = functionParams[i];
char buf[32];
Symbol *sym;
if (d->declarators.size() == 0) {
// function declaration like foo(float), w/o a name for
// the parameter
sprintf(buf, "__anon_parameter_%d", i);
sym = new Symbol(buf, pos);
sym->type = d->declSpecs->GetBaseType(pos);
}
else {
sym = d->declarators[0]->GetSymbol();
if (sym == NULL) {
// Handle more complex anonymous declarations like
// float (float **).
sprintf(buf, "__anon_parameter_%d", i);
sym = new Symbol(buf, d->declarators[0]->pos);
sym->type = d->declarators[0]->GetType(d->declSpecs);
}
}
Symbol *sym = GetSymbolForFunctionParameter(i);
if (d->declSpecs->storageClass != SC_NONE)
Error(sym->pos, "Storage class \"%s\" is illegal in "
@@ -418,9 +434,10 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
return NULL;
}
Type *functionType =
new FunctionType(returnType, args, pos, argNames, argDefaults,
const Type *functionType =
new FunctionType(returnType, args, argNames, argDefaults,
argPos, isTask, isExported, isExternC);
functionType = functionType->ResolveUnboundVariability(Type::Varying);
return child->GetType(functionType, ds);
}
default:
@@ -461,6 +478,35 @@ Declarator::GetType(DeclSpecs *ds) const {
}
Symbol *
Declarator::GetSymbolForFunctionParameter(int paramNum) const {
Assert(paramNum < (int)functionParams.size());
Declaration *d = functionParams[paramNum];
char buf[32];
Symbol *sym;
if (d->declarators.size() == 0) {
// function declaration like foo(float), w/o a name for
// the parameter
sprintf(buf, "__anon_parameter_%d", paramNum);
sym = new Symbol(buf, pos);
sym->type = d->declSpecs->GetBaseType(pos);
}
else {
Assert(d->declarators.size() == 1);
sym = d->declarators[0]->GetSymbol();
if (sym == NULL) {
// Handle more complex anonymous declarations like
// float (float **).
sprintf(buf, "__anon_parameter_%d", paramNum);
sym = new Symbol(buf, d->declarators[0]->pos);
sym->type = d->declarators[0]->GetType(d->declSpecs);
}
}
return sym;
}
///////////////////////////////////////////////////////////////////////////
// Declaration
@@ -489,19 +535,15 @@ Declaration::GetVariableDeclarations() const {
std::vector<VariableDeclaration> vars;
for (unsigned int i = 0; i < declarators.size(); ++i) {
if (declarators[i] == NULL)
continue;
Declarator *decl = declarators[i];
if (decl == NULL)
// Ignore earlier errors
continue;
Symbol *sym = decl->GetSymbol();
if (dynamic_cast<const FunctionType *>(sym->type) != NULL) {
// function declaration
m->symbolTable->AddFunction(sym);
}
else {
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
if (dynamic_cast<const FunctionType *>(sym->type) == NULL) {
m->symbolTable->AddVariable(sym);
vars.push_back(VariableDeclaration(sym, decl->initExpr));
}
@@ -511,16 +553,36 @@ Declaration::GetVariableDeclarations() const {
void
Declaration::Print() const {
printf("Declaration: specs [");
declSpecs->Print();
printf("], declarators [");
for (unsigned int i = 0 ; i < declarators.size(); ++i) {
declarators[i]->Print();
printf("%s", (i == declarators.size() - 1) ? "]" : ", ");
Declaration::DeclareFunctions() {
Assert(declSpecs->storageClass != SC_TYPEDEF);
for (unsigned int i = 0; i < declarators.size(); ++i) {
Declarator *decl = declarators[i];
if (decl == NULL)
// Ignore earlier errors
continue;
Symbol *sym = decl->GetSymbol();
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
if (dynamic_cast<const FunctionType *>(sym->type) == NULL)
continue;
bool isInline = (declSpecs->typeQualifiers & TYPEQUAL_INLINE);
m->AddFunctionDeclaration(sym, isInline);
}
}
void
Declaration::Print(int indent) const {
printf("%*cDeclaration: specs [", indent, ' ');
declSpecs->Print();
printf("], declarators:\n");
for (unsigned int i = 0 ; i < declarators.size(); ++i)
declarators[i]->Print(indent+4);
}
///////////////////////////////////////////////////////////////////////////
void
@@ -539,7 +601,7 @@ GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
DeclSpecs ds(type);
if (type->IsUniformType())
ds.typeQualifiers |= TYPEQUAL_UNIFORM;
else
else if (type->IsVaryingType())
ds.typeQualifiers |= TYPEQUAL_VARYING;
for (unsigned int j = 0; j < sd[i]->declarators->size(); ++j) {

10
decl.h
View File

@@ -153,10 +153,12 @@ public:
declarator and symbols for its arguments in *args. */
Symbol *GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *args);
Symbol *GetSymbolForFunctionParameter(int paramNum) const;
/** Returns the symbol associated with the declarator. */
Symbol *GetSymbol() const;
void Print() const;
void Print(int indent) const;
/** Position of the declarator in the source program. */
const SourcePos pos;
@@ -199,7 +201,7 @@ public:
Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist = NULL);
Declaration(DeclSpecs *ds, Declarator *d);
void Print() const;
void Print(int indent) const;
/** This method walks through all of the Declarators in a declaration
and returns a fully-initialized Symbol and (possibly) and
@@ -208,6 +210,10 @@ public:
Declarator representation.) */
std::vector<VariableDeclaration> GetVariableDeclarations() const;
/** For any function declarations in the Declaration, add the
declaration to the module. */
void DeclareFunctions();
DeclSpecs *declSpecs;
std::vector<Declarator *> declarators;
};

View File

@@ -1,3 +1,43 @@
=== v1.1.3 === (20 January 2012)
With this release, the language now supports "switch" statements, with the
same semantics and syntax as in C.
This release includes fixes for two important performance related issues:
the quality of code generated for "foreach" statements has been
substantially improved (https://github.com/ispc/ispc/issues/151), and a
performance regression with code for "gathers" that was introduced in
v1.1.2 has been fixed in this release.
A number of other small bugs were fixed in this release as well, including
one where invalid memory would sometimes be incorrectly accessed
(https://github.com/ispc/ispc/issues/160).
Thanks to Jean-Luc Duprat for a number of patches that improve support for
building on various platforms, and to Pierre-Antoine Lacaze for patches so
that ispc builds under MinGW.
=== v1.1.2 === (9 January 2012)
The major new feature in this release is support for "generic" C++
vectorized output; in other words, ispc can emit C++ code that corresponds
to the vectorized computation that the ispc program represents. See the
examples/intrinsics directory in the ispc distribution for two example
implementations of the set of functions that must be provided map the
vector calls generated by ispc to target specific functions.
ispc now has partial support for 'goto' statements; specifically, goto is
allowed if any enclosing control flow statements (if/for/while/do) have
'uniform' test expressions, but not if they have 'varying' tests.
A number of improvements have been made to the code generated for gathers
and scatters--one of them (better matching x86's "free" scale by 2/4/8 for
addressing calculations) improved the performance of the noise example by
14%.
Many small bugs have been fixed in this release as well, including issue
numbers 138, 129, 135, 127, 149, and 142.
=== v1.1.1 === (15 December 2011)
This release doesn't include any significant new functionality, but does

View File

@@ -2,11 +2,11 @@
for i in ispc perfguide faq; do
rst2html.py --template=template.txt --link-stylesheet \
--stylesheet-path=css/style.css $i.txt > $i.html
--stylesheet-path=css/style.css $i.rst > $i.html
done
rst2html.py --template=template-perf.txt --link-stylesheet \
--stylesheet-path=css/style.css perf.txt > perf.html
--stylesheet-path=css/style.css perf.rst > perf.html
#rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex
#pdflatex ispc.tex

View File

@@ -1,10 +1,10 @@
=============================================================
Intel® SPMD Program Compiler Frequently Asked Questions (FAQ)
=============================================================
=====================================
Frequently Asked Questions About ispc
=====================================
This document includes a number of frequently (and not frequently) asked
questions about ispc, the Intel® SPMD Program Compiler. The source to this
document is in the file ``docs/faq.txt`` in the ``ispc`` source
document is in the file ``docs/faq.rst`` in the ``ispc`` source
distribution.
* Understanding ispc's Output

View File

@@ -99,7 +99,9 @@ Contents:
+ `Control Flow`_
* `Conditional Statements: "if"`_
* `Conditional Statements: "switch"`_
* `Basic Iteration Statements: "for", "while", and "do"`_
* `Unstructured Control Flow: "goto"`_
* `"Coherent" Control Flow Statements: "cif" and Friends`_
* `Parallel Iteration Statements: "foreach" and "foreach_tiled"`_
* `Parallel Iteration with "programIndex" and "programCount"`_
@@ -1140,7 +1142,7 @@ in C:
* Expression syntax and basic types
* Syntax for variable declarations
* Control flow structures: if, for, while, do
* Control flow structures: ``if``, ``for``, ``while``, ``do``, and ``switch``.
* Pointers, including function pointers, ``void *``, and C's array/pointer
duality (arrays are converted to pointers when passed to functions, etc.)
* Structs and arrays
@@ -1184,7 +1186,7 @@ but are likely to be supported in future releases:
``int64`` types
* Character constants
* String constants and arrays of characters as strings
* ``switch`` and ``goto`` statements
* ``goto`` statements are partially supported (see `Unstructured Control Flow: "goto"`_)
* ``union`` types
* Bitfield members of ``struct`` types
* Variable numbers of arguments to functions
@@ -1245,6 +1247,18 @@ Here are three ways of specifying the integer value "15":
int fifteen_hex = 0xf;
int fifteen_binary = 0b1111;
A number of suffixes can be provided with integer numeric constants.
First, "u" denotes that the constant is unsigned, and "ll" denotes a 64-bit
integer constant (while "l" denotes a 32-bit integer constant). It is also
possible to denote units of 1024, 1024*1024, or 1024*1024*1024 with the
SI-inspired suffixes "k", "M", and "G" respectively:
::
int two_kb = 2k; // 2048
int two_megs = 2M; // 2 * 1024 * 1024
int one_gig = 1G; // 1024 * 1024 * 1024
Floating-point constants can be specified in one of three ways. First,
they may be a sequence of zero or more digits from 0 to 9, followed by a
period, followed by zero or more digits from 0 to 9. (There must be at
@@ -1980,6 +1994,31 @@ executes if the condition is false.
else
x *= 2.;
Conditional Statements: "switch"
--------------------------------
The ``switch`` conditional statement is also available, again with the same
behavior as in C; the expression used in the ``switch`` must be of integer
type (but it can be uniform or varying). As in C, if there is no ``break``
statement at the end of the code for a given case, execution "falls
through" to the following case. These features are demonstrated in the
code below.
::
int x = ...;
switch (x) {
case 0:
case 1:
foo(x);
/* fall through */
case 5:
x = 0;
break;
default:
x *= x;
}
Basic Iteration Statements: "for", "while", and "do"
----------------------------------------------------
@@ -2005,6 +2044,37 @@ one of them executes a ``continue`` statement, other program instances
executing code in the loop body that didn't execute the ``continue`` will
be unaffected by it.
Unstructured Control Flow: "goto"
---------------------------------
``goto`` statements are allowed in ``ispc`` programs under limited
circumstances; specifically, only when the compiler can determine that if
any program instance executes a ``goto`` statement, then all of the program
instances will be running at that statement, such that all will follow the
``goto``.
Put another way: it's illegal for there to be "varying" control flow
statements in scopes that enclose a ``goto`` statement. An error is issued
if a ``goto`` is used in this situation.
The syntax for adding labels to ``ispc`` programs and jumping to them with
``goto`` is the same as in C. The following code shows a ``goto`` based
equivalent of a ``for`` loop where the induction variable ``i`` goes from
zero to ten.
::
uniform int i = 0;
check:
if (i > 10)
goto done;
// loop body
++i;
goto check;
done:
// ...
"Coherent" Control Flow Statements: "cif" and Friends
-----------------------------------------------------
@@ -3374,12 +3444,27 @@ pointer types.
System Information
------------------
A routine is available to find the number of CPU cores available in the
system:
The value of a high-precision hardware clock counter is returned by the
``clock()`` routine; its value increments by one each processor cycle.
Thus, taking the difference between the values returned by ``clock()`` and
different points in program execution gives the number of cycles between
those points in the program.
::
int num_cores()
uniform int64 clock()
Note that ``clock()`` flushes the processor pipeline. It has an overhead
of a hundred or so cycles, so for very fine-grained measurements, it may be
worthwhile to measure the cost of calling ``clock()`` and subtracting that
value from reported results.
A routine is also available to find the number of CPU cores available in
the system:
::
uniform int num_cores()
This value can be useful for adapting the granularity of parallel task
decomposition depending on the number of processors in the system.

View File

@@ -45,8 +45,7 @@
developers mailing list</a></li>
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
<li><a href="doxygen/index.html">Doxygen documentation of
<tt>ispc</tt> source code</a></li>
<li><a href="doxygen/index.html">Doxygen</a></li>
</ul>
</div>
</div>

View File

@@ -45,8 +45,7 @@
developers mailing list</a></li>
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
<li><a href="doxygen/index.html">Doxygen documentation of
<tt>ispc</tt> source code</a></li>
<li><a href="doxygen/index.html">Doxygen</a></li>
</ul>
</div>
</div>

View File

@@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler"
# This could be handy for archiving the generated documentation or
# if some version control system is used.
PROJECT_NUMBER = 1.1.1
PROJECT_NUMBER = 1.1.3
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
# base path where the generated documentation will be put.

View File

@@ -82,7 +82,7 @@ static inline void vnormalize(vec &v) {
}
static inline void
static void
ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
float d = -dot(plane.p, plane.n);
float v = dot(ray.dir, plane.n);
@@ -124,7 +124,7 @@ ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
}
static inline void
static void
orthoBasis(vec basis[3], vec n) {
basis[2] = n;
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
@@ -147,7 +147,7 @@ orthoBasis(vec basis[3], vec n) {
}
static inline float
static float
ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
RNGState &rngstate) {
float eps = 0.0001f;

View File

@@ -14,13 +14,13 @@ dirs:
clean:
/bin/rm -rf objs *~ ao
ao: dirs objs/ao.o objs/instrument.o objs/ao_ispc.o
$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/instrument.o -lm -lpthread
ao: objs/ao.o objs/instrument.o objs/ao_ispc.o ../tasksys.cpp
$(CXX) $(CXXFLAGS) -o $@ $^ -lm -lpthread
objs/%.o: %.cpp
objs/%.o: %.cpp dirs
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/ao.o: objs/ao_ispc.h
objs/%_ispc.h objs/%_ispc.o: %.ispc
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
objs/%_ispc.h objs/%_ispc.o: %.ispc dirs
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_instrumented_ispc.h

View File

@@ -251,6 +251,14 @@ static FORCEINLINE TYPE __select(bool cond, TYPE a, TYPE b) { \
return cond ? a : b; \
}
#define SHIFT_UNIFORM(TYPE, CAST, NAME, OP) \
static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \
TYPE ret; \
for (int i = 0; i < 16; ++i) \
ret.v[i] = (CAST)(a.v[i]) OP b; \
return ret; \
}
#define SMEAR(VTYPE, NAME, STYPE) \
static FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \
VTYPE ret; \
@@ -307,6 +315,12 @@ static FORCEINLINE uint32_t __movmsk(__vec16_i1 mask) {
return mask.v;
}
static FORCEINLINE __vec16_i1 __equal(__vec16_i1 a, __vec16_i1 b) {
__vec16_i1 r;
r.v = (a.v & b.v) | (~a.v & ~b.v);
return r;
}
static FORCEINLINE __vec16_i1 __and(__vec16_i1 a, __vec16_i1 b) {
__vec16_i1 r;
r.v = a.v & b.v;
@@ -380,6 +394,10 @@ BINARY_OP_CAST(__vec16_i8, int8_t, __srem, %)
BINARY_OP_CAST(__vec16_i8, uint8_t, __lshr, >>)
BINARY_OP_CAST(__vec16_i8, int8_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i8, uint8_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i8, int8_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i8, int8_t, __shl, <<)
CMP_OP(__vec16_i8, int8_t, __equal, ==)
CMP_OP(__vec16_i8, int8_t, __not_equal, !=)
CMP_OP(__vec16_i8, uint8_t, __unsigned_less_equal, <=)
@@ -419,6 +437,10 @@ BINARY_OP_CAST(__vec16_i16, int16_t, __srem, %)
BINARY_OP_CAST(__vec16_i16, uint16_t, __lshr, >>)
BINARY_OP_CAST(__vec16_i16, int16_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i16, uint16_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i16, int16_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i16, int16_t, __shl, <<)
CMP_OP(__vec16_i16, int16_t, __equal, ==)
CMP_OP(__vec16_i16, int16_t, __not_equal, !=)
CMP_OP(__vec16_i16, uint16_t, __unsigned_less_equal, <=)
@@ -458,6 +480,10 @@ BINARY_OP_CAST(__vec16_i32, int32_t, __srem, %)
BINARY_OP_CAST(__vec16_i32, uint32_t, __lshr, >>)
BINARY_OP_CAST(__vec16_i32, int32_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i32, uint32_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i32, int32_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i32, int32_t, __shl, <<)
CMP_OP(__vec16_i32, int32_t, __equal, ==)
CMP_OP(__vec16_i32, int32_t, __not_equal, !=)
CMP_OP(__vec16_i32, uint32_t, __unsigned_less_equal, <=)
@@ -497,6 +523,10 @@ BINARY_OP_CAST(__vec16_i64, int64_t, __srem, %)
BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>)
BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<)
CMP_OP(__vec16_i64, int64_t, __equal, ==)
CMP_OP(__vec16_i64, int64_t, __not_equal, !=)
CMP_OP(__vec16_i64, uint64_t, __unsigned_less_equal, <=)
@@ -932,7 +962,7 @@ REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >)
///////////////////////////////////////////////////////////////////////////
// masked load/store
static FORCEINLINE __vec16_i8 __masked_load_8(unsigned char *p,
static FORCEINLINE __vec16_i8 __masked_load_8(void *p,
__vec16_i1 mask) {
__vec16_i8 ret;
int8_t *ptr = (int8_t *)p;
@@ -942,7 +972,7 @@ static FORCEINLINE __vec16_i8 __masked_load_8(unsigned char *p,
return ret;
}
static FORCEINLINE __vec16_i16 __masked_load_16(unsigned char *p,
static FORCEINLINE __vec16_i16 __masked_load_16(void *p,
__vec16_i1 mask) {
__vec16_i16 ret;
int16_t *ptr = (int16_t *)p;
@@ -952,7 +982,7 @@ static FORCEINLINE __vec16_i16 __masked_load_16(unsigned char *p,
return ret;
}
static FORCEINLINE __vec16_i32 __masked_load_32(unsigned char *p,
static FORCEINLINE __vec16_i32 __masked_load_32(void *p,
__vec16_i1 mask) {
__vec16_i32 ret;
int32_t *ptr = (int32_t *)p;
@@ -962,7 +992,7 @@ static FORCEINLINE __vec16_i32 __masked_load_32(unsigned char *p,
return ret;
}
static FORCEINLINE __vec16_i64 __masked_load_64(unsigned char *p,
static FORCEINLINE __vec16_i64 __masked_load_64(void *p,
__vec16_i1 mask) {
__vec16_i64 ret;
int64_t *ptr = (int64_t *)p;
@@ -972,7 +1002,7 @@ static FORCEINLINE __vec16_i64 __masked_load_64(unsigned char *p,
return ret;
}
static FORCEINLINE void __masked_store_8(unsigned char *p, __vec16_i8 val,
static FORCEINLINE void __masked_store_8(void *p, __vec16_i8 val,
__vec16_i1 mask) {
int8_t *ptr = (int8_t *)p;
for (int i = 0; i < 16; ++i)
@@ -980,7 +1010,7 @@ static FORCEINLINE void __masked_store_8(unsigned char *p, __vec16_i8 val,
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_16(unsigned char *p, __vec16_i16 val,
static FORCEINLINE void __masked_store_16(void *p, __vec16_i16 val,
__vec16_i1 mask) {
int16_t *ptr = (int16_t *)p;
for (int i = 0; i < 16; ++i)
@@ -988,7 +1018,7 @@ static FORCEINLINE void __masked_store_16(unsigned char *p, __vec16_i16 val,
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_32(unsigned char *p, __vec16_i32 val,
static FORCEINLINE void __masked_store_32(void *p, __vec16_i32 val,
__vec16_i1 mask) {
int32_t *ptr = (int32_t *)p;
for (int i = 0; i < 16; ++i)
@@ -996,7 +1026,7 @@ static FORCEINLINE void __masked_store_32(unsigned char *p, __vec16_i32 val,
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_64(unsigned char *p, __vec16_i64 val,
static FORCEINLINE void __masked_store_64(void *p, __vec16_i64 val,
__vec16_i1 mask) {
int64_t *ptr = (int64_t *)p;
for (int i = 0; i < 16; ++i)
@@ -1004,19 +1034,41 @@ static FORCEINLINE void __masked_store_64(unsigned char *p, __vec16_i64 val,
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_blend_8(void *p, __vec16_i8 val,
__vec16_i1 mask) {
__masked_store_8(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_16(void *p, __vec16_i16 val,
__vec16_i1 mask) {
__masked_store_16(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_32(void *p, __vec16_i32 val,
__vec16_i1 mask) {
__masked_store_32(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_64(void *p, __vec16_i64 val,
__vec16_i1 mask) {
__masked_store_64(p, val, mask);
}
///////////////////////////////////////////////////////////////////////////
// gather/scatter
// offsets * offsetScale is in bytes (for all of these)
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE offsets, uint32_t scale,\
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
__vec16_i1 mask) { \
VTYPE ret; \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * offsets.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
ret.v[i] = *ptr; \
} \
return ret; \
@@ -1054,13 +1106,15 @@ GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)
// scatter
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE void FUNC(unsigned char *b, OTYPE offsets, uint32_t scale,\
#define SCATTER_BASE_VARYINGOFFSET(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
VTYPE val, __vec16_i1 mask) { \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * offsets.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
*ptr = val.v[i]; \
} \
}

View File

@@ -51,8 +51,8 @@
#define FORCEINLINE __attribute__((always_inline)) inline
#endif
//CO#undef FORCEINLINE
//CO#define FORCEINLINE
#undef FORCEINLINE
#define FORCEINLINE
typedef float __vec1_f;
typedef double __vec1_d;
@@ -228,6 +228,10 @@ static FORCEINLINE uint32_t __movmsk(__vec4_i1 mask) {
return _mm_movemask_ps(mask.v);
}
static FORCEINLINE __vec4_i1 __equal(__vec4_i1 a, __vec4_i1 b) {
return _mm_cmpeq_epi32(_mm_castps_si128(a.v), _mm_castps_si128(b.v));
}
static FORCEINLINE __vec4_i1 __and(__vec4_i1 a, __vec4_i1 b) {
return _mm_and_ps(a.v, b.v);
}
@@ -299,6 +303,13 @@ static FORCEINLINE __vec4_i8 __shl(__vec4_i8 a, __vec4_i8 b) {
_mm_extract_epi8(a.v, 3) << _mm_extract_epi8(b.v, 3));
}
static FORCEINLINE __vec4_i8 __shl(__vec4_i8 a, int32_t b) {
return __vec4_i8(_mm_extract_epi8(a.v, 0) << b,
_mm_extract_epi8(a.v, 1) << b,
_mm_extract_epi8(a.v, 2) << b,
_mm_extract_epi8(a.v, 3) << b);
}
static FORCEINLINE __vec4_i8 __udiv(__vec4_i8 a, __vec4_i8 b) {
return __vec4_i8((uint8_t)_mm_extract_epi8(a.v, 0) /
(uint8_t)_mm_extract_epi8(b.v, 0),
@@ -354,6 +365,13 @@ static FORCEINLINE __vec4_i8 __lshr(__vec4_i8 a, __vec4_i8 b) {
(uint8_t)_mm_extract_epi8(b.v, 3));
}
static FORCEINLINE __vec4_i8 __lshr(__vec4_i8 a, int32_t b) {
return __vec4_i8((uint8_t)_mm_extract_epi8(a.v, 0) >> b,
(uint8_t)_mm_extract_epi8(a.v, 1) >> b,
(uint8_t)_mm_extract_epi8(a.v, 2) >> b,
(uint8_t)_mm_extract_epi8(a.v, 3) >> b);
}
static FORCEINLINE __vec4_i8 __ashr(__vec4_i8 a, __vec4_i8 b) {
return __vec4_i8((int8_t)_mm_extract_epi8(a.v, 0) >>
(int8_t)_mm_extract_epi8(b.v, 0),
@@ -365,6 +383,13 @@ static FORCEINLINE __vec4_i8 __ashr(__vec4_i8 a, __vec4_i8 b) {
(int8_t)_mm_extract_epi8(b.v, 3));
}
static FORCEINLINE __vec4_i8 __ashr(__vec4_i8 a, int32_t b) {
return __vec4_i8((int8_t)_mm_extract_epi8(a.v, 0) >> b,
(int8_t)_mm_extract_epi8(a.v, 1) >> b,
(int8_t)_mm_extract_epi8(a.v, 2) >> b,
(int8_t)_mm_extract_epi8(a.v, 3) >> b);
}
static FORCEINLINE __vec4_i1 __equal(__vec4_i8 a, __vec4_i8 b) {
__m128i cmp = _mm_cmpeq_epi8(a.v, b.v);
return __vec4_i1(_mm_extract_epi8(cmp, 0),
@@ -543,6 +568,10 @@ static FORCEINLINE __vec4_i16 __shl(__vec4_i16 a, __vec4_i16 b) {
_mm_extract_epi16(a.v, 3) << _mm_extract_epi16(b.v, 3));
}
static FORCEINLINE __vec4_i16 __shl(__vec4_i16 a, int32_t b) {
return _mm_sll_epi16(a.v, _mm_set_epi32(0, 0, 0, b));
}
static FORCEINLINE __vec4_i16 __udiv(__vec4_i16 a, __vec4_i16 b) {
return __vec4_i16((uint16_t)_mm_extract_epi16(a.v, 0) /
(uint16_t)_mm_extract_epi16(b.v, 0),
@@ -598,6 +627,10 @@ static FORCEINLINE __vec4_i16 __lshr(__vec4_i16 a, __vec4_i16 b) {
(uint16_t)_mm_extract_epi16(b.v, 3));
}
static FORCEINLINE __vec4_i16 __lshr(__vec4_i16 a, int32_t b) {
return _mm_srl_epi16(a.v, _mm_set_epi32(0, 0, 0, b));
}
static FORCEINLINE __vec4_i16 __ashr(__vec4_i16 a, __vec4_i16 b) {
return __vec4_i16((int16_t)_mm_extract_epi16(a.v, 0) >>
(int16_t)_mm_extract_epi16(b.v, 0),
@@ -609,6 +642,10 @@ static FORCEINLINE __vec4_i16 __ashr(__vec4_i16 a, __vec4_i16 b) {
(int16_t)_mm_extract_epi16(b.v, 3));
}
static FORCEINLINE __vec4_i16 __ashr(__vec4_i16 a, int32_t b) {
return _mm_sra_epi16(a.v, _mm_set_epi32(0, 0, 0, b));
}
static FORCEINLINE __vec4_i1 __equal(__vec4_i16 a, __vec4_i16 b) {
__m128i cmp = _mm_cmpeq_epi16(a.v, b.v);
return __vec4_i1(_mm_extract_epi16(cmp, 0),
@@ -785,9 +822,6 @@ static FORCEINLINE __vec4_i32 __xor(__vec4_i32 a, __vec4_i32 b) {
}
static FORCEINLINE __vec4_i32 __shl(__vec4_i32 a, __vec4_i32 b) {
// FIXME: if we can determine at compile time that b has the same value
// across all elements, then we can use _mm_sll_epi32.
/* fixme: llvm generates thie code for shift left, which is presumably
more efficient than doing each component individually as below.
@@ -809,57 +843,92 @@ _f___ii: ## @f___ii
ret
*/
return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) << _mm_extract_epi32(b.v, 0),
(uint32_t)_mm_extract_epi32(a.v, 1) << _mm_extract_epi32(b.v, 1),
(uint32_t)_mm_extract_epi32(a.v, 2) << _mm_extract_epi32(b.v, 2),
(uint32_t)_mm_extract_epi32(a.v, 3) << _mm_extract_epi32(b.v, 3));
return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) <<
_mm_extract_epi32(b.v, 0),
(uint32_t)_mm_extract_epi32(a.v, 1) <<
_mm_extract_epi32(b.v, 1),
(uint32_t)_mm_extract_epi32(a.v, 2) <<
_mm_extract_epi32(b.v, 2),
(uint32_t)_mm_extract_epi32(a.v, 3) <<
_mm_extract_epi32(b.v, 3));
}
static FORCEINLINE __vec4_i32 __shl(__vec4_i32 a, int32_t b) {
return _mm_sll_epi32(a.v, _mm_set_epi32(0, 0, 0, b));
}
static FORCEINLINE __vec4_i32 __udiv(__vec4_i32 a, __vec4_i32 b) {
return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) / (uint32_t)_mm_extract_epi32(b.v, 0),
(uint32_t)_mm_extract_epi32(a.v, 1) / (uint32_t)_mm_extract_epi32(b.v, 1),
(uint32_t)_mm_extract_epi32(a.v, 2) / (uint32_t)_mm_extract_epi32(b.v, 2),
(uint32_t)_mm_extract_epi32(a.v, 3) / (uint32_t)_mm_extract_epi32(b.v, 3));
return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) /
(uint32_t)_mm_extract_epi32(b.v, 0),
(uint32_t)_mm_extract_epi32(a.v, 1) /
(uint32_t)_mm_extract_epi32(b.v, 1),
(uint32_t)_mm_extract_epi32(a.v, 2) /
(uint32_t)_mm_extract_epi32(b.v, 2),
(uint32_t)_mm_extract_epi32(a.v, 3) /
(uint32_t)_mm_extract_epi32(b.v, 3));
}
static FORCEINLINE __vec4_i32 __sdiv(__vec4_i32 a, __vec4_i32 b) {
return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) / (int32_t)_mm_extract_epi32(b.v, 0),
(int32_t)_mm_extract_epi32(a.v, 1) / (int32_t)_mm_extract_epi32(b.v, 1),
(int32_t)_mm_extract_epi32(a.v, 2) / (int32_t)_mm_extract_epi32(b.v, 2),
(int32_t)_mm_extract_epi32(a.v, 3) / (int32_t)_mm_extract_epi32(b.v, 3));
return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) /
(int32_t)_mm_extract_epi32(b.v, 0),
(int32_t)_mm_extract_epi32(a.v, 1) /
(int32_t)_mm_extract_epi32(b.v, 1),
(int32_t)_mm_extract_epi32(a.v, 2) /
(int32_t)_mm_extract_epi32(b.v, 2),
(int32_t)_mm_extract_epi32(a.v, 3) /
(int32_t)_mm_extract_epi32(b.v, 3));
}
static FORCEINLINE __vec4_i32 __urem(__vec4_i32 a, __vec4_i32 b) {
return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) % (uint32_t)_mm_extract_epi32(b.v, 0),
(uint32_t)_mm_extract_epi32(a.v, 1) % (uint32_t)_mm_extract_epi32(b.v, 1),
(uint32_t)_mm_extract_epi32(a.v, 2) % (uint32_t)_mm_extract_epi32(b.v, 2),
(uint32_t)_mm_extract_epi32(a.v, 3) % (uint32_t)_mm_extract_epi32(b.v, 3));
return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) %
(uint32_t)_mm_extract_epi32(b.v, 0),
(uint32_t)_mm_extract_epi32(a.v, 1) %
(uint32_t)_mm_extract_epi32(b.v, 1),
(uint32_t)_mm_extract_epi32(a.v, 2) %
(uint32_t)_mm_extract_epi32(b.v, 2),
(uint32_t)_mm_extract_epi32(a.v, 3) %
(uint32_t)_mm_extract_epi32(b.v, 3));
}
static FORCEINLINE __vec4_i32 __srem(__vec4_i32 a, __vec4_i32 b) {
return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) % (int32_t)_mm_extract_epi32(b.v, 0),
(int32_t)_mm_extract_epi32(a.v, 1) % (int32_t)_mm_extract_epi32(b.v, 1),
(int32_t)_mm_extract_epi32(a.v, 2) % (int32_t)_mm_extract_epi32(b.v, 2),
(int32_t)_mm_extract_epi32(a.v, 3) % (int32_t)_mm_extract_epi32(b.v, 3));
return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) %
(int32_t)_mm_extract_epi32(b.v, 0),
(int32_t)_mm_extract_epi32(a.v, 1) %
(int32_t)_mm_extract_epi32(b.v, 1),
(int32_t)_mm_extract_epi32(a.v, 2) %
(int32_t)_mm_extract_epi32(b.v, 2),
(int32_t)_mm_extract_epi32(a.v, 3) %
(int32_t)_mm_extract_epi32(b.v, 3));
}
static FORCEINLINE __vec4_i32 __lshr(__vec4_i32 a, __vec4_i32 b) {
// FIXME: if we can determine at compile time that b has the same value
// across all elements, e.g. using gcc's __builtin_constant_p, then we
// can use _mm_srl_epi32.
return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) >> _mm_extract_epi32(b.v, 0),
(uint32_t)_mm_extract_epi32(a.v, 1) >> _mm_extract_epi32(b.v, 1),
(uint32_t)_mm_extract_epi32(a.v, 2) >> _mm_extract_epi32(b.v, 2),
(uint32_t)_mm_extract_epi32(a.v, 3) >> _mm_extract_epi32(b.v, 3));
return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) >>
_mm_extract_epi32(b.v, 0),
(uint32_t)_mm_extract_epi32(a.v, 1) >>
_mm_extract_epi32(b.v, 1),
(uint32_t)_mm_extract_epi32(a.v, 2) >>
_mm_extract_epi32(b.v, 2),
(uint32_t)_mm_extract_epi32(a.v, 3) >>
_mm_extract_epi32(b.v, 3));
}
static FORCEINLINE __vec4_i32 __lshr(__vec4_i32 a, int32_t b) {
return _mm_srl_epi32(a.v, _mm_set_epi32(0, 0, 0, b));
}
static FORCEINLINE __vec4_i32 __ashr(__vec4_i32 a, __vec4_i32 b) {
// FIXME: if we can determine at compile time that b has the same value
// across all elements, then we can use _mm_sra_epi32.
return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) >> _mm_extract_epi32(b.v, 0),
(int32_t)_mm_extract_epi32(a.v, 1) >> _mm_extract_epi32(b.v, 1),
(int32_t)_mm_extract_epi32(a.v, 2) >> _mm_extract_epi32(b.v, 2),
(int32_t)_mm_extract_epi32(a.v, 3) >> _mm_extract_epi32(b.v, 3));
return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) >>
_mm_extract_epi32(b.v, 0),
(int32_t)_mm_extract_epi32(a.v, 1) >>
_mm_extract_epi32(b.v, 1),
(int32_t)_mm_extract_epi32(a.v, 2) >>
_mm_extract_epi32(b.v, 2),
(int32_t)_mm_extract_epi32(a.v, 3) >>
_mm_extract_epi32(b.v, 3));
}
static FORCEINLINE __vec4_i32 __ashr(__vec4_i32 a, int32_t b) {
return _mm_sra_epi32(a.v, _mm_set_epi32(0, 0, 0, b));
}
static FORCEINLINE __vec4_i1 __equal(__vec4_i32 a, __vec4_i32 b) {
@@ -1012,6 +1081,12 @@ static FORCEINLINE __vec4_i64 __shl(__vec4_i64 a, __vec4_i64 b) {
_mm_extract_epi64(a.v[1], 1) << _mm_extract_epi64(b.v[1], 1));
}
static FORCEINLINE __vec4_i64 __shl(__vec4_i64 a, int32_t b) {
__m128i amt = _mm_set_epi32(0, 0, 0, b);
return __vec4_i64(_mm_sll_epi64(a.v[0], amt),
_mm_sll_epi64(a.v[1], amt));
}
static FORCEINLINE __vec4_i64 __udiv(__vec4_i64 a, __vec4_i64 b) {
return __vec4_i64((uint64_t)_mm_extract_epi64(a.v[0], 0) /
(uint64_t)_mm_extract_epi64(b.v[0], 0),
@@ -1067,6 +1142,12 @@ static FORCEINLINE __vec4_i64 __lshr(__vec4_i64 a, __vec4_i64 b) {
(uint64_t)_mm_extract_epi64(b.v[1], 1));
}
static FORCEINLINE __vec4_i64 __lshr(__vec4_i64 a, int32_t b) {
__m128i amt = _mm_set_epi32(0, 0, 0, b);
return __vec4_i64(_mm_srl_epi64(a.v[0], amt),
_mm_srl_epi64(a.v[1], amt));
}
static FORCEINLINE __vec4_i64 __ashr(__vec4_i64 a, __vec4_i64 b) {
return __vec4_i64((int64_t)_mm_extract_epi64(a.v[0], 0) >>
(int64_t)_mm_extract_epi64(b.v[0], 0),
@@ -1078,6 +1159,13 @@ static FORCEINLINE __vec4_i64 __ashr(__vec4_i64 a, __vec4_i64 b) {
(int64_t)_mm_extract_epi64(b.v[1], 1));
}
static FORCEINLINE __vec4_i64 __ashr(__vec4_i64 a, int32_t b) {
return __vec4_i64((int64_t)_mm_extract_epi64(a.v[0], 0) >> b,
(int64_t)_mm_extract_epi64(a.v[0], 1) >> b,
(int64_t)_mm_extract_epi64(a.v[1], 0) >> b,
(int64_t)_mm_extract_epi64(a.v[1], 1) >> b);
}
static FORCEINLINE __vec4_i1 __equal(__vec4_i64 a, __vec4_i64 b) {
__m128i cmp0 = _mm_cmpeq_epi64(a.v[0], b.v[0]);
__m128i cmp1 = _mm_cmpeq_epi64(a.v[1], b.v[1]);
@@ -2324,7 +2412,7 @@ static FORCEINLINE uint64_t __reduce_max_uint64(__vec4_i64 v) {
///////////////////////////////////////////////////////////////////////////
// masked load/store
static FORCEINLINE __vec4_i8 __masked_load_8(unsigned char *p,
static FORCEINLINE __vec4_i8 __masked_load_8(void *p,
__vec4_i1 mask) {
int8_t r[4];
int8_t *ptr = (int8_t *)p;
@@ -2344,7 +2432,7 @@ static FORCEINLINE __vec4_i8 __masked_load_8(unsigned char *p,
return __vec4_i8(r[0], r[1], r[2], r[3]);
}
static FORCEINLINE __vec4_i16 __masked_load_16(unsigned char *p,
static FORCEINLINE __vec4_i16 __masked_load_16(void *p,
__vec4_i1 mask) {
int16_t r[4];
int16_t *ptr = (int16_t *)p;
@@ -2368,7 +2456,7 @@ static FORCEINLINE __vec4_i16 __masked_load_16(unsigned char *p,
return __vec4_i16(r[0], r[1], r[2], r[3]);
}
static FORCEINLINE __vec4_i32 __masked_load_32(unsigned char *p,
static FORCEINLINE __vec4_i32 __masked_load_32(void *p,
__vec4_i1 mask) {
__m128i r = _mm_set_epi32(0, 0, 0, 0);
int32_t *ptr = (int32_t *)p;
@@ -2391,7 +2479,7 @@ static FORCEINLINE __vec4_i32 __masked_load_32(unsigned char *p,
return r;
}
static FORCEINLINE __vec4_i64 __masked_load_64(unsigned char *p,
static FORCEINLINE __vec4_i64 __masked_load_64(void *p,
__vec4_i1 mask) {
uint64_t r[4];
uint64_t *ptr = (uint64_t *)p;
@@ -2414,7 +2502,7 @@ static FORCEINLINE __vec4_i64 __masked_load_64(unsigned char *p,
return __vec4_i64(r[0], r[1], r[2], r[3]);
}
static FORCEINLINE void __masked_store_8(unsigned char *p, __vec4_i8 val,
static FORCEINLINE void __masked_store_8(void *p, __vec4_i8 val,
__vec4_i1 mask) {
int8_t *ptr = (int8_t *)p;
@@ -2435,7 +2523,8 @@ static FORCEINLINE void __masked_store_8(unsigned char *p, __vec4_i8 val,
ptr[3] = _mm_extract_epi8(val.v, 3);
}
static FORCEINLINE void __masked_store_16(unsigned char *p, __vec4_i16 val, __vec4_i1 mask) {
static FORCEINLINE void __masked_store_16(void *p, __vec4_i16 val,
__vec4_i1 mask) {
int16_t *ptr = (int16_t *)p;
uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2455,7 +2544,7 @@ static FORCEINLINE void __masked_store_16(unsigned char *p, __vec4_i16 val, __ve
ptr[3] = _mm_extract_epi16(val.v, 3);
}
static FORCEINLINE void __masked_store_32(unsigned char *p, __vec4_i32 val,
static FORCEINLINE void __masked_store_32(void *p, __vec4_i32 val,
__vec4_i1 mask) {
int32_t *ptr = (int32_t *)p;
uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2475,7 +2564,7 @@ static FORCEINLINE void __masked_store_32(unsigned char *p, __vec4_i32 val,
ptr[3] = _mm_extract_epi32(val.v, 3);
}
static FORCEINLINE void __masked_store_64(unsigned char *p, __vec4_i64 val,
static FORCEINLINE void __masked_store_64(void *p, __vec4_i64 val,
__vec4_i1 mask) {
int64_t *ptr = (int64_t *)p;
uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2495,58 +2584,82 @@ static FORCEINLINE void __masked_store_64(unsigned char *p, __vec4_i64 val,
ptr[3] = _mm_extract_epi64(val.v[1], 1);
}
static FORCEINLINE void __masked_store_blend_8(void *p, __vec4_i8 val,
__vec4_i1 mask) {
__masked_store_8(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_16(void *p, __vec4_i16 val,
__vec4_i1 mask) {
__masked_store_16(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_32(void *p, __vec4_i32 val,
__vec4_i1 mask) {
// FIXME: do a load, blendvps, store here...
__masked_store_32(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_64(void *p, __vec4_i64 val,
__vec4_i1 mask) {
// FIXME: do a 2x (load, blendvps, store) here...
__masked_store_64(p, val, mask);
}
///////////////////////////////////////////////////////////////////////////
// gather/scatter
// offsets * offsetScale is in bytes (for all of these)
template<typename RetVec, typename RetScalar>
static FORCEINLINE RetVec
lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p,
__vec4_i32 offsets, uint32_t scale, __vec4_i1 mask) {
lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets,
uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
RetScalar r[4];
#if 1
// "Fast gather" trick...
offsets = __select(mask, offsets, __smear_i32(0));
int offset = scale * _mm_extract_epi32(offsets.v, 0);
constOffset = __select(mask, constOffset, __smear_i32(0));
int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[0] = *ptr;
offset = scale * _mm_extract_epi32(offsets.v, 1);
offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1);
ptr = (RetScalar *)(p + offset);
r[1] = *ptr;
offset = scale * _mm_extract_epi32(offsets.v, 2);
offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2);
ptr = (RetScalar *)(p + offset);
r[2] = *ptr;
offset = scale * _mm_extract_epi32(offsets.v, 3);
offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3);
ptr = (RetScalar *)(p + offset);
r[3] = *ptr;
#else
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 0);
int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[0] = *ptr;
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 1);
int offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1);
RetScalar *ptr = (RetScalar *)(p + offset);
r[1] = *ptr;
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 2);
int offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2);
RetScalar *ptr = (RetScalar *)(p + offset);
r[2] = *ptr;
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 3);
int offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3);
RetScalar *ptr = (RetScalar *)(p + offset);
r[3] = *ptr;
}
@@ -2554,54 +2667,57 @@ lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p,
return RetVec(r[0], r[1], r[2], r[3]);
}
template<typename RetVec, typename RetScalar>
static FORCEINLINE RetVec
lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i1 mask) {
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
RetScalar r[4];
#if 1
// "Fast gather" trick...
offsets = __select(mask, offsets, __smear_i64(0));
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
constOffset = __select(mask, constOffset, __smear_i64(0));
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[0] = *ptr;
offset = scale * _mm_extract_epi64(offsets.v[0], 1);
offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1);
ptr = (RetScalar *)(p + offset);
r[1] = *ptr;
offset = scale * _mm_extract_epi64(offsets.v[1], 0);
offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0);
ptr = (RetScalar *)(p + offset);
r[2] = *ptr;
offset = scale * _mm_extract_epi64(offsets.v[1], 1);
offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1);
ptr = (RetScalar *)(p + offset);
r[3] = *ptr;
#else
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[0] = *ptr;
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1);
RetScalar *ptr = (RetScalar *)(p + offset);
r[1] = *ptr;
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[2] = *ptr;
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1);
RetScalar *ptr = (RetScalar *)(p + offset);
r[3] = *ptr;
}
@@ -2612,80 +2728,89 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
static FORCEINLINE __vec4_i8
__gather_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets,
uint32_t scale, __vec4_i1 mask) {
uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, offsets, scale,
mask);
constOffset, mask);
}
static FORCEINLINE __vec4_i8
__gather_base_offsets64_i8(unsigned char *b, __vec4_i64 offsets,
uint32_t scale, __vec4_i1 mask) {
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, offsets, scale,
mask);
constOffset, mask);
}
static FORCEINLINE __vec4_i16
__gather_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets,
uint32_t scale, __vec4_i1 mask) {
uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, offsets, scale,
mask);
constOffset, mask);
}
static FORCEINLINE __vec4_i16
__gather_base_offsets64_i16(unsigned char *b, __vec4_i64 offsets,
uint32_t scale, __vec4_i1 mask) {
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, offsets, scale,
mask);
constOffset, mask);
}
static FORCEINLINE __vec4_i32
__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets,
uint32_t scale, __vec4_i1 mask) {
__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale,
__vec4_i32 constOffset, __vec4_i1 mask) {
__m128i r = _mm_set_epi32(0, 0, 0, 0);
#if 1
// "Fast gather"...
offsets = __select(mask, offsets, __smear_i32(0));
constOffset = __select(mask, constOffset, __smear_i32(0));
int offset = scale * _mm_extract_epi32(offsets.v, 0);
int offset = scale * _mm_extract_epi32(offsets.v, 0) +
_mm_extract_epi32(constOffset.v, 0);
uint32_t *ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 0);
offset = scale * _mm_extract_epi32(offsets.v, 1);
offset = scale * _mm_extract_epi32(offsets.v, 1) +
_mm_extract_epi32(constOffset.v, 1);
ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 1);
offset = scale * _mm_extract_epi32(offsets.v, 2);
offset = scale * _mm_extract_epi32(offsets.v, 2) +
_mm_extract_epi32(constOffset.v, 2);
ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 2);
offset = scale * _mm_extract_epi32(offsets.v, 3);
offset = scale * _mm_extract_epi32(offsets.v, 3) +
_mm_extract_epi32(constOffset.v, 3);
ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 3);
#else
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 0);
int offset = scale * _mm_extract_epi32(offsets.v, 0) +
_mm_extract_epi32(constOffset.v, 0);
uint32_t *ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 1);
int offset = scale * _mm_extract_epi32(offsets.v, 1) +
_mm_extract_epi32(constOffset.v, 1);
uint32_t *ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 2);
int offset = scale * _mm_extract_epi32(offsets.v, 2) +
_mm_extract_epi32(constOffset.v, 2);
uint32_t *ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 2);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 3);
int offset = scale * _mm_extract_epi32(offsets.v, 3) +
_mm_extract_epi32(constOffset.v, 3);
uint32_t *ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 3);
}
@@ -2695,23 +2820,23 @@ __gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets,
static FORCEINLINE __vec4_i32
__gather_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i1 mask) {
uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, offsets, scale,
mask);
delta, mask);
}
static FORCEINLINE __vec4_i64
__gather_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
uint32_t scale, __vec4_i1 mask) {
uint32_t scale, __vec4_i32 delta, __vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, offsets, scale,
mask);
delta, mask);
}
static FORCEINLINE __vec4_i64
__gather_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i1 mask) {
uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, offsets, scale,
mask);
delta, mask);
}
template<typename RetVec, typename RetScalar>
@@ -2858,217 +2983,108 @@ static FORCEINLINE __vec4_i64 __gather64_i64(__vec4_i64 ptrs, __vec4_i1 mask) {
// scatter
static FORCEINLINE void
__scatter_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets,
uint32_t scale, __vec4_i8 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 0));
*ptr = _mm_extract_epi8(val.v, 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 1));
*ptr = _mm_extract_epi8(val.v, 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 2));
*ptr = _mm_extract_epi8(val.v, 2);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 3));
*ptr = _mm_extract_epi8(val.v, 3);
}
#define SCATTER32_64(SUFFIX, TYPE, EXTRACT) \
static FORCEINLINE void \
__scatter_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \
uint32_t scale, __vec4_i32 constOffset, \
__vec4_##SUFFIX val, __vec4_i1 mask) { \
uint32_t m = _mm_extract_ps(mask.v, 0); \
if (m != 0) { \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 0) + \
_mm_extract_epi32(constOffset.v, 0)); \
*ptr = EXTRACT(val.v, 0); \
} \
m = _mm_extract_ps(mask.v, 1); \
if (m != 0) { \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 1) + \
_mm_extract_epi32(constOffset.v, 1)); \
*ptr = EXTRACT(val.v, 1); \
} \
m = _mm_extract_ps(mask.v, 2); \
if (m != 0) { \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 2) + \
_mm_extract_epi32(constOffset.v, 2)); \
*ptr = EXTRACT(val.v, 2); \
} \
m = _mm_extract_ps(mask.v, 3); \
if (m != 0) { \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 3) + \
_mm_extract_epi32(constOffset.v, 3)); \
*ptr = EXTRACT(val.v, 3); \
} \
} \
static FORCEINLINE void \
__scatter_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets, \
uint32_t scale, __vec4_i64 constOffset, \
__vec4_##SUFFIX val, __vec4_i1 mask) { \
uint32_t m = _mm_extract_ps(mask.v, 0); \
if (m != 0) { \
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + \
_mm_extract_epi64(constOffset.v[0], 0); \
TYPE *ptr = (TYPE *)(p + offset); \
*ptr = EXTRACT(val.v, 0); \
} \
m = _mm_extract_ps(mask.v, 1); \
if (m != 0) { \
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + \
_mm_extract_epi64(constOffset.v[0], 1); \
TYPE *ptr = (TYPE *)(p + offset); \
*ptr = EXTRACT(val.v, 1); \
} \
m = _mm_extract_ps(mask.v, 2); \
if (m != 0) { \
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + \
_mm_extract_epi64(constOffset.v[1], 0); \
TYPE *ptr = (TYPE *)(p + offset); \
*ptr = EXTRACT(val.v, 2); \
} \
m = _mm_extract_ps(mask.v, 3); \
if (m != 0) { \
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + \
_mm_extract_epi64(constOffset.v[1], 1); \
TYPE *ptr = (TYPE *)(p + offset); \
*ptr = EXTRACT(val.v, 3); \
} \
}
static FORCEINLINE void
__scatter_base_offsets64_i8(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i8 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
uint8_t *ptr = (uint8_t *)(p + offset);
*ptr = _mm_extract_epi8(val.v, 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
uint8_t *ptr = (uint8_t *)(p + offset);
*ptr = _mm_extract_epi8(val.v, 1);
}
SCATTER32_64(i8, int8_t, _mm_extract_epi8)
SCATTER32_64(i16, int16_t, _mm_extract_epi16)
SCATTER32_64(i32, int32_t, _mm_extract_epi32)
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
uint8_t *ptr = (uint8_t *)(p + offset);
*ptr = _mm_extract_epi8(val.v, 2);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
uint8_t *ptr = (uint8_t *)(p + offset);
*ptr = _mm_extract_epi8(val.v, 3);
}
}
static FORCEINLINE void
__scatter_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets,
uint32_t scale, __vec4_i16 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 0));
*ptr = _mm_extract_epi16(val.v, 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 1));
*ptr = _mm_extract_epi16(val.v, 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 2));
*ptr = _mm_extract_epi16(val.v, 2);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 3));
*ptr = _mm_extract_epi16(val.v, 3);
}
}
static FORCEINLINE void
__scatter_base_offsets64_i16(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i16 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
uint16_t *ptr = (uint16_t *)(p + offset);
*ptr = _mm_extract_epi16(val.v, 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
uint16_t *ptr = (uint16_t *)(p + offset);
*ptr = _mm_extract_epi16(val.v, 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
uint16_t *ptr = (uint16_t *)(p + offset);
*ptr = _mm_extract_epi16(val.v, 2);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
uint16_t *ptr = (uint16_t *)(p + offset);
*ptr = _mm_extract_epi16(val.v, 3);
}
}
static FORCEINLINE void
__scatter_base_offsets32_i32(unsigned char *b, __vec4_i32 offsets,
uint32_t scale, __vec4_i32 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int32_t *ptr = (int32_t *)(b + scale *
_mm_extract_epi32(offsets.v, 0));
*ptr = _mm_extract_epi32(val.v, 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int32_t *ptr = (int32_t *)(b + scale *
_mm_extract_epi32(offsets.v, 1));
*ptr = _mm_extract_epi32(val.v, 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int32_t *ptr = (int32_t *)(b + scale *
_mm_extract_epi32(offsets.v, 2));
*ptr = _mm_extract_epi32(val.v, 2);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int32_t *ptr = (int32_t *)(b + scale *
_mm_extract_epi32(offsets.v, 3));
*ptr = _mm_extract_epi32(val.v, 3);
}
}
static FORCEINLINE void
__scatter_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i32 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
uint32_t *ptr = (uint32_t *)(p + offset);
*ptr = _mm_extract_epi32(val.v, 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
uint32_t *ptr = (uint32_t *)(p + offset);
*ptr = _mm_extract_epi32(val.v, 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
uint32_t *ptr = (uint32_t *)(p + offset);
*ptr = _mm_extract_epi32(val.v, 2);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
uint32_t *ptr = (uint32_t *)(p + offset);
*ptr = _mm_extract_epi32(val.v, 3);
}
}
static FORCEINLINE void
__scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
uint32_t scale, __vec4_i64 val, __vec4_i1 mask) {
uint32_t scale, __vec4_i32 constOffset, __vec4_i64 val,
__vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int32_t offset = scale * _mm_extract_epi32(offsets.v, 0);
int32_t offset = scale * _mm_extract_epi32(offsets.v, 0) +
_mm_extract_epi32(constOffset.v, 0);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int32_t offset = scale * _mm_extract_epi32(offsets.v, 1);
int32_t offset = scale * _mm_extract_epi32(offsets.v, 1) +
_mm_extract_epi32(constOffset.v, 1);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int32_t offset = scale * _mm_extract_epi32(offsets.v, 2);
int32_t offset = scale * _mm_extract_epi32(offsets.v, 2) +
_mm_extract_epi32(constOffset.v, 2);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 0);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int32_t offset = scale * _mm_extract_epi32(offsets.v, 3);
int32_t offset = scale * _mm_extract_epi32(offsets.v, 3) +
_mm_extract_epi32(constOffset.v, 3);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 1);
}
@@ -3076,31 +3092,36 @@ __scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
static FORCEINLINE void
__scatter_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i64 val, __vec4_i1 mask) {
uint32_t scale, __vec4_i64 constOffset,
__vec4_i64 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) +
_mm_extract_epi64(constOffset.v[0], 0);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) +
_mm_extract_epi64(constOffset.v[0], 1);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) +
_mm_extract_epi64(constOffset.v[1], 0);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 0);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) +
_mm_extract_epi64(constOffset.v[1], 1);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 1);
}

View File

@@ -104,7 +104,7 @@ static void generateRay(uniform const float raster2camera[4][4],
}
static inline bool BBoxIntersect(const uniform float bounds[2][3],
static bool BBoxIntersect(const uniform float bounds[2][3],
const Ray &ray) {
uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
@@ -143,7 +143,7 @@ static inline bool BBoxIntersect(const uniform float bounds[2][3],
static inline bool TriIntersect(const Triangle &tri, Ray &ray) {
static bool TriIntersect(const Triangle &tri, Ray &ray) {
uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };

View File

@@ -129,7 +129,7 @@ static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
}
static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
static float Density(float3 Pobj, float3 pMin, float3 pMax,
uniform float density[], uniform int nVoxels[3]) {
if (!Inside(Pobj, pMin, pMax))
return 0;

View File

@@ -36,12 +36,22 @@
*/
#include "expr.h"
#include "ast.h"
#include "type.h"
#include "sym.h"
#include "ctx.h"
#include "module.h"
#include "util.h"
#include "llvmutil.h"
#ifndef _MSC_VER
#include <inttypes.h>
#endif
#ifndef PRId64
#define PRId64 "lld"
#endif
#ifndef PRIu64
#define PRIu64 "llu"
#endif
#include <list>
#include <set>
@@ -224,7 +234,7 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr,
eltType = eltType->GetAsConstType();
if (Type::Equal(toPointerType,
new PointerType(eltType,
toPointerType->IsUniformType(),
toPointerType->GetVariability(),
toPointerType->IsConstType())))
goto typecast_ok;
else {
@@ -466,7 +476,7 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr,
typecast_ok:
if (expr != NULL)
*expr = new TypeCastExpr(toType, *expr, false, pos);
*expr = new TypeCastExpr(toType, *expr, pos);
return true;
}
@@ -638,6 +648,9 @@ lLLVMConstantValue(const Type *type, llvm::LLVMContext *ctx, double value) {
static llvm::Value *
lMaskForSymbol(Symbol *baseSym, FunctionEmitContext *ctx) {
if (baseSym == NULL)
return ctx->GetFullMask();
if (dynamic_cast<const PointerType *>(baseSym->type) != NULL ||
dynamic_cast<const ReferenceType *>(baseSym->type) != NULL)
// FIXME: for pointers, we really only want to do this for
@@ -658,10 +671,11 @@ lMaskForSymbol(Symbol *baseSym, FunctionEmitContext *ctx) {
static void
lStoreAssignResult(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
FunctionEmitContext *ctx, Symbol *baseSym) {
Assert(baseSym != NULL &&
Assert(baseSym == NULL ||
baseSym->varyingCFDepth <= ctx->VaryingCFDepth());
if (!g->opt.disableMaskedStoreToStore &&
!g->opt.disableMaskAllOnOptimizations &&
baseSym != NULL &&
baseSym->varyingCFDepth == ctx->VaryingCFDepth() &&
baseSym->storageClass != SC_STATIC &&
dynamic_cast<const ReferenceType *>(baseSym->type) == NULL &&
@@ -2016,14 +2030,13 @@ AssignExpr::GetValue(FunctionEmitContext *ctx) const {
ctx->SetDebugPos(pos);
Symbol *baseSym = lvalue->GetBaseSymbol();
// Should be caught during type-checking...
assert(baseSym != NULL);
switch (op) {
case Assign: {
llvm::Value *lv = lvalue->GetLValue(ctx);
if (lv == NULL) {
Assert(m->errorCount > 0);
Error(lvalue->pos, "Left hand side of assignment expression can't "
"be assigned to.");
return NULL;
}
const Type *lvalueType = lvalue->GetLValueType();
@@ -2146,13 +2159,13 @@ AssignExpr::TypeCheck() {
}
}
if (lvalue->GetBaseSymbol() == NULL) {
Error(lvalue->pos, "Left hand side of assignment statement can't be "
"assigned to.");
const Type *lhsType = lvalue->GetType();
if (lhsType->IsConstType()) {
Error(lvalue->pos, "Can't assign to type \"%s\" on left-hand side of "
"expression.", lhsType->GetString().c_str());
return NULL;
}
const Type *lhsType = lvalue->GetType();
if (dynamic_cast<const PointerType *>(lhsType) != NULL) {
if (op == AddAssign || op == SubAssign) {
if (PointerType::IsVoidPointer(lhsType)) {
@@ -2186,12 +2199,6 @@ AssignExpr::TypeCheck() {
if (rvalue == NULL)
return NULL;
if (lhsType->IsConstType()) {
Error(pos, "Can't assign to type \"%s\" on left-hand side of "
"expression.", lhsType->GetString().c_str());
return NULL;
}
// Make sure we're not assigning to a struct that has a constant member
const StructType *st = dynamic_cast<const StructType *>(lhsType);
if (st != NULL && lCheckForConstStructMember(pos, st, st))
@@ -2709,7 +2716,7 @@ FunctionCallExpr::TypeCheck() {
!(argCouldBeNULL[i] == true &&
dynamic_cast<const PointerType *>(paramType) != NULL)) {
Error(args->exprs[i]->pos, "Can't convert argument of "
"type \"%s\" to type \"%s\" for funcion call "
"type \"%s\" to type \"%s\" for function call "
"argument.", argTypes[i]->GetString().c_str(),
paramType->GetString().c_str());
return NULL;
@@ -3525,6 +3532,12 @@ VectorMemberExpr::getElementType() const {
MemberExpr *
MemberExpr::create(Expr *e, const char *id, SourcePos p, SourcePos idpos,
bool derefLValue) {
// FIXME: we need to call TypeCheck() here so that we can call
// e->GetType() in the following. But really we just shouldn't try to
// resolve this now but just have a generic MemberExpr type that
// handles all cases so that this is unnecessary.
e = ::TypeCheck(e);
const Type *exprType;
if (e == NULL || (exprType = e->GetType()) == NULL)
return NULL;
@@ -4536,18 +4549,10 @@ ConstExpr::Print() const {
printf("%f", floatVal[i]);
break;
case AtomicType::TYPE_INT64:
#ifdef ISPC_IS_LINUX
printf("%ld", int64Val[i]);
#else
printf("%lld", int64Val[i]);
#endif
printf("%"PRId64, int64Val[i]);
break;
case AtomicType::TYPE_UINT64:
#ifdef ISPC_IS_LINUX
printf("%lu", uint64Val[i]);
#else
printf("%llu", uint64Val[i]);
#endif
printf("%"PRIu64, uint64Val[i]);
break;
case AtomicType::TYPE_DOUBLE:
printf("%f", doubleVal[i]);
@@ -4566,11 +4571,10 @@ ConstExpr::Print() const {
///////////////////////////////////////////////////////////////////////////
// TypeCastExpr
TypeCastExpr::TypeCastExpr(const Type *t, Expr *e, bool pu, SourcePos p)
TypeCastExpr::TypeCastExpr(const Type *t, Expr *e, SourcePos p)
: Expr(p) {
type = t;
expr = e;
preserveUniformity = pu;
}
@@ -5213,7 +5217,7 @@ TypeCastExpr::GetValue(FunctionEmitContext *ctx) const {
if (Type::EqualIgnoringConst(arrayAsPtr->GetType(), toPointerType) == false) {
Assert(Type::EqualIgnoringConst(arrayAsPtr->GetType()->GetAsVaryingType(),
toPointerType) == true);
arrayAsPtr = new TypeCastExpr(toPointerType, arrayAsPtr, false, pos);
arrayAsPtr = new TypeCastExpr(toPointerType, arrayAsPtr, pos);
arrayAsPtr = ::TypeCheck(arrayAsPtr);
Assert(arrayAsPtr != NULL);
arrayAsPtr = ::Optimize(arrayAsPtr);
@@ -5364,6 +5368,7 @@ TypeCastExpr::GetValue(FunctionEmitContext *ctx) const {
const Type *
TypeCastExpr::GetType() const {
Assert(type->HasUnboundVariability() == false);
return type;
}
@@ -5373,7 +5378,7 @@ lDeconstifyType(const Type *t) {
const PointerType *pt = dynamic_cast<const PointerType *>(t);
if (pt != NULL)
return new PointerType(lDeconstifyType(pt->GetBaseType()),
pt->IsUniformType(), false);
pt->GetVariability(), false);
else
return t->GetAsNonConstType();
}
@@ -5384,16 +5389,16 @@ TypeCastExpr::TypeCheck() {
if (expr == NULL)
return NULL;
const Type *toType = GetType(), *fromType = expr->GetType();
const Type *toType = type, *fromType = expr->GetType();
if (toType == NULL || fromType == NULL)
return NULL;
if (preserveUniformity == true && fromType->IsUniformType() &&
toType->IsVaryingType()) {
if (toType->HasUnboundVariability() && fromType->IsUniformType()) {
TypeCastExpr *tce = new TypeCastExpr(toType->GetAsUniformType(),
expr, false, pos);
expr, pos);
return ::TypeCheck(tce);
}
type = toType = type->ResolveUnboundVariability(Type::Varying);
fromType = lDeconstifyType(fromType);
toType = lDeconstifyType(toType);
@@ -5862,6 +5867,8 @@ SizeOfExpr::SizeOfExpr(Expr *e, SourcePos p)
SizeOfExpr::SizeOfExpr(const Type *t, SourcePos p)
: Expr(p), expr(NULL), type(t) {
if (type->HasUnboundVariability())
type = type->ResolveUnboundVariability(Type::Varying);
}
@@ -6026,7 +6033,8 @@ FunctionSymbolExpr::GetType() const {
return NULL;
}
return matchingFunc ? new PointerType(matchingFunc->type, true, true) : NULL;
return matchingFunc ?
new PointerType(matchingFunc->type, Type::Uniform, true) : NULL;
}

5
expr.h
View File

@@ -314,7 +314,6 @@ public:
std::string identifier;
const SourcePos identifierPos;
protected:
MemberExpr(Expr *expr, const char *identifier, SourcePos pos,
SourcePos identifierPos, bool derefLValue);
@@ -493,8 +492,7 @@ private:
probably-different type. */
class TypeCastExpr : public Expr {
public:
TypeCastExpr(const Type *t, Expr *e, bool preserveUniformity,
SourcePos p);
TypeCastExpr(const Type *t, Expr *e, SourcePos p);
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
const Type *GetType() const;
@@ -507,7 +505,6 @@ public:
const Type *type;
Expr *expr;
bool preserveUniformity;
};

View File

@@ -290,8 +290,10 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
llvm::BasicBlock *bbAllOn = ctx->CreateBasicBlock("all_on");
llvm::BasicBlock *bbNotAll = ctx->CreateBasicBlock("not_all_on");
ctx->BranchInst(bbAllOn, bbNotAll, allOn);
// Set up basic blocks for goto targets
ctx->InitializeLabelMap(code);
ctx->BranchInst(bbAllOn, bbNotAll, allOn);
// all on: we've determined dynamically that the mask is all
// on. Set the current mask to "all on" explicitly so that
// codegen for this path can be improved with this knowledge in
@@ -322,15 +324,23 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
// above
ctx->SetCurrentBasicBlock(bbSomeOn);
ctx->SetFunctionMask(mask);
// Set up basic blocks for goto targets again; we want to have
// one set of them for gotos in the 'all on' case, and a
// distinct set for the 'mixed mask' case.
ctx->InitializeLabelMap(code);
code->EmitCode(ctx);
if (ctx->GetCurrentBasicBlock())
ctx->ReturnInst();
}
else
else {
// Set up basic blocks for goto targets
ctx->InitializeLabelMap(code);
// No check, just emit the code
code->EmitCode(ctx);
}
}
if (ctx->GetCurrentBasicBlock()) {
// FIXME: We'd like to issue a warning if we've reached the end of

View File

@@ -210,7 +210,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->isa = Target::AVX2;
t->nativeVectorWidth = 8;
t->vectorWidth = 8;
t->attributes = "+avx2,+popcnt,+cmov";
t->attributes = "+avx2,+popcnt,+cmov,+f16c";
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
@@ -219,7 +219,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->isa = Target::AVX2;
t->nativeVectorWidth = 16;
t->vectorWidth = 16;
t->attributes = "+avx2,+popcnt,+cmov";
t->attributes = "+avx2,+popcnt,+cmov,+f16c";
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
@@ -358,10 +358,45 @@ Target::GetISAString() const {
}
static bool
lGenericTypeLayoutIndeterminate(LLVM_TYPE_CONST llvm::Type *type) {
if (type->isPrimitiveType() || type->isIntegerTy())
return false;
if (type == LLVMTypes::BoolVectorType ||
type == LLVMTypes::MaskType ||
type == LLVMTypes::Int1VectorType)
return true;
LLVM_TYPE_CONST llvm::ArrayType *at =
llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(type);
if (at != NULL)
return lGenericTypeLayoutIndeterminate(at->getElementType());
LLVM_TYPE_CONST llvm::PointerType *pt =
llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(type);
if (pt != NULL)
return false;
LLVM_TYPE_CONST llvm::StructType *st =
llvm::dyn_cast<LLVM_TYPE_CONST llvm::StructType>(type);
if (st != NULL) {
for (int i = 0; i < (int)st->getNumElements(); ++i)
if (lGenericTypeLayoutIndeterminate(st->getElementType(i)))
return true;
return false;
}
Assert(llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(type));
return true;
}
llvm::Value *
Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type,
llvm::BasicBlock *insertAtEnd) {
if (isa == Target::GENERIC && type->isPrimitiveType() == false) {
if (isa == Target::GENERIC &&
lGenericTypeLayoutIndeterminate(type)) {
llvm::Value *index[1] = { LLVMInt32(1) };
LLVM_TYPE_CONST llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
llvm::Value *voidPtr = llvm::ConstantPointerNull::get(ptrType);
@@ -396,7 +431,8 @@ Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type,
llvm::Value *
Target::StructOffset(LLVM_TYPE_CONST llvm::Type *type, int element,
llvm::BasicBlock *insertAtEnd) {
if (isa == Target::GENERIC && type->isPrimitiveType() == false) {
if (isa == Target::GENERIC &&
lGenericTypeLayoutIndeterminate(type) == true) {
llvm::Value *indices[2] = { LLVMInt32(0), LLVMInt32(element) };
LLVM_TYPE_CONST llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
llvm::Value *voidPtr = llvm::ConstantPointerNull::get(ptrType);

5
ispc.h
View File

@@ -98,6 +98,8 @@ namespace llvm {
#endif
class ArrayType;
class AST;
class ASTNode;
class AtomicType;
class FunctionEmitContext;
class Expr;
@@ -421,6 +423,7 @@ enum {
COST_FUNPTR_UNIFORM = 12,
COST_FUNPTR_VARYING = 24,
COST_GATHER = 8,
COST_GOTO = 4,
COST_LOAD = 2,
COST_REGULAR_BREAK_CONTINUE = 2,
COST_RETURN = 4,
@@ -434,6 +437,8 @@ enum {
COST_VARYING_IF = 3,
COST_UNIFORM_LOOP = 4,
COST_VARYING_LOOP = 6,
COST_UNIFORM_SWITCH = 4,
COST_VARYING_SWITCH = 12,
COST_ASSERT = 8,
CHECK_MASK_AT_FUNCTION_START_COST = 16,

View File

@@ -18,8 +18,10 @@
<ClCompile Include="decl.cpp" />
<ClCompile Include="expr.cpp" />
<ClCompile Include="func.cpp" />
<ClCompile Include="gen-bitcode-avx.cpp" />
<ClCompile Include="gen-bitcode-avx-x2.cpp" />
<ClCompile Include="gen-bitcode-avx1.cpp" />
<ClCompile Include="gen-bitcode-avx1-x2.cpp" />
<ClCompile Include="gen-bitcode-avx2.cpp" />
<ClCompile Include="gen-bitcode-avx2-x2.cpp" />
<ClCompile Include="gen-bitcode-c-32.cpp" />
<ClCompile Include="gen-bitcode-c-64.cpp" />
<ClCompile Include="gen-bitcode-dispatch.cpp" />
@@ -158,29 +160,55 @@
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-avx.ll">
<CustomBuild Include="builtins\target-avx1.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx.ll | python bitcode2cpp.py builtins\target-avx.ll &gt; gen-bitcode-avx.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx.ll | python bitcode2cpp.py builtins\target-avx.ll &gt; gen-bitcode-avx.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx.cpp</Message>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll &gt; gen-bitcode-avx1.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx1.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll &gt; gen-bitcode-avx1.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx1.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx1.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx1.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-avx-x2.ll">
<CustomBuild Include="builtins\target-avx1-x2.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx-x2.ll | python bitcode2cpp.py builtins\target-avx-x2.ll &gt; gen-bitcode-avx-x2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx-x2.ll | python bitcode2cpp.py builtins\target-avx-x2.ll &gt; gen-bitcode-avx-x2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll &gt; gen-bitcode-avx1-x2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx1-x2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll &gt; gen-bitcode-avx1-x2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx1-x2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\targets-avx-x2.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx1-x2.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx1-x2.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-avx2.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll &gt; gen-bitcode-avx2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll &gt; gen-bitcode-avx2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx2.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx2.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-avx2-x2.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll &gt; gen-bitcode-avx2-x2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx2-x2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll &gt; gen-bitcode-avx2-x2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx2-x2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\targets-avx-x2.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx2-x2.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx2-x2.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>

40
lex.ll
View File

@@ -42,7 +42,7 @@
#include <stdlib.h>
#include <stdint.h>
static uint64_t lParseBinary(const char *ptr, SourcePos pos);
static uint64_t lParseBinary(const char *ptr, SourcePos pos, char **endPtr);
static void lCComment(SourcePos *);
static void lCppComment(SourcePos *);
static void lHandleCppHash(SourcePos *);
@@ -67,7 +67,7 @@ inline int isatty(int) { return 0; }
%option nounistd
WHITESPACE [ \t\r]+
INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))
INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[kMG]?
FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?)
HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?)
@@ -151,29 +151,43 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
{INT_NUMBER}+(u|U|l|L)*? {
int ls = 0, us = 0;
if (yytext[0] == '0' && yytext[1] == 'b')
yylval->intVal = lParseBinary(yytext+2, *yylloc);
else {
char *endPtr = NULL;
#ifdef ISPC_IS_WINDOWS
if (yytext[0] == '0' && yytext[1] == 'b')
yylval->intVal = lParseBinary(yytext+2, *yylloc, &endPtr);
else {
#if defined(ISPC_IS_WINDOWS) && !defined(__MINGW32__)
yylval->intVal = _strtoi64(yytext, &endPtr, 0);
#else
// FIXME: should use strtouq and then issue an error if we can't
// fit into 64 bits...
yylval->intVal = strtoull(yytext, &endPtr, 0);
#endif
}
bool kilo = false, mega = false, giga = false;
for (; *endPtr; endPtr++) {
if (*endPtr == 'l' || *endPtr == 'L')
if (*endPtr == 'k')
kilo = true;
else if (*endPtr == 'M')
mega = true;
else if (*endPtr == 'G')
giga = true;
else if (*endPtr == 'l' || *endPtr == 'L')
ls++;
else if (*endPtr == 'u' || *endPtr == 'U')
us++;
}
if (kilo)
yylval->intVal *= 1024;
if (mega)
yylval->intVal *= 1024*1024;
if (giga)
yylval->intVal *= 1024*1024*1024;
if (ls >= 2)
return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
else if (ls == 1)
return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
}
// See if we can fit this into a 32-bit integer...
if ((yylval->intVal & 0xffffffff) == yylval->intVal)
@@ -268,14 +282,11 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
/** Return the integer version of a binary constant from a string.
*/
static uint64_t
lParseBinary(const char *ptr, SourcePos pos) {
lParseBinary(const char *ptr, SourcePos pos, char **endPtr) {
uint64_t val = 0;
bool warned = false;
while (*ptr != '\0') {
/* if this hits, the regexp for 0b... constants is broken */
Assert(*ptr == '0' || *ptr == '1');
while (*ptr == '0' || *ptr == '1') {
if ((val & (((int64_t)1)<<63)) && warned == false) {
// We're about to shift out a set bit
Warning(pos, "Can't represent binary constant with a 64-bit integer type");
@@ -285,6 +296,7 @@ lParseBinary(const char *ptr, SourcePos pos) {
val = (val << 1) | (*ptr == '0' ? 0 : 1);
++ptr;
}
*endPtr = (char *)ptr;
return val;
}

View File

@@ -36,7 +36,9 @@
*/
#include "llvmutil.h"
#include "ispc.h"
#include "type.h"
#include <llvm/Instructions.h>
LLVM_TYPE_CONST llvm::Type *LLVMTypes::VoidType = NULL;
LLVM_TYPE_CONST llvm::PointerType *LLVMTypes::VoidPointerType = NULL;
@@ -109,7 +111,7 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
LLVMTypes::MaskType = LLVMTypes::BoolVectorType =
llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.vectorWidth);
else {
assert(target.maskBitCount == 32);
Assert(target.maskBitCount == 32);
LLVMTypes::MaskType = LLVMTypes::BoolVectorType =
llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.vectorWidth);
}
@@ -465,3 +467,239 @@ LLVMBoolVector(const bool *bvec) {
}
return llvm::ConstantVector::get(vals);
}
/** Conservative test to see if two llvm::Values are equal. There are
(potentially many) cases where the two values actually are equal but
this will return false. However, if it does return true, the two
vectors definitely are equal.
@todo This seems to catch all of the cases we currently need it for in
practice, but it's be nice to make it a little more robust/general. In
general, though, a little something called the halting problem means we
won't get all of them.
*/
static bool
lValuesAreEqual(llvm::Value *v0, llvm::Value *v1,
std::vector<llvm::PHINode *> &seenPhi0,
std::vector<llvm::PHINode *> &seenPhi1) {
// Thanks to the fact that LLVM hashes and returns the same pointer for
// constants (of all sorts, even constant expressions), this first test
// actually catches a lot of cases. LLVM's SSA form also helps a lot
// with this..
if (v0 == v1)
return true;
Assert(seenPhi0.size() == seenPhi1.size());
for (unsigned int i = 0; i < seenPhi0.size(); ++i)
if (v0 == seenPhi0[i] && v1 == seenPhi1[i])
return true;
llvm::BinaryOperator *bo0 = llvm::dyn_cast<llvm::BinaryOperator>(v0);
llvm::BinaryOperator *bo1 = llvm::dyn_cast<llvm::BinaryOperator>(v1);
if (bo0 != NULL && bo1 != NULL) {
if (bo0->getOpcode() != bo1->getOpcode())
return false;
return (lValuesAreEqual(bo0->getOperand(0), bo1->getOperand(0),
seenPhi0, seenPhi1) &&
lValuesAreEqual(bo0->getOperand(1), bo1->getOperand(1),
seenPhi0, seenPhi1));
}
llvm::PHINode *phi0 = llvm::dyn_cast<llvm::PHINode>(v0);
llvm::PHINode *phi1 = llvm::dyn_cast<llvm::PHINode>(v1);
if (phi0 != NULL && phi1 != NULL) {
if (phi0->getNumIncomingValues() != phi1->getNumIncomingValues())
return false;
seenPhi0.push_back(phi0);
seenPhi1.push_back(phi1);
unsigned int numIncoming = phi0->getNumIncomingValues();
// Check all of the incoming values: if all of them are all equal,
// then we're good.
bool anyFailure = false;
for (unsigned int i = 0; i < numIncoming; ++i) {
Assert(phi0->getIncomingBlock(i) == phi1->getIncomingBlock(i));
if (!lValuesAreEqual(phi0->getIncomingValue(i),
phi1->getIncomingValue(i), seenPhi0, seenPhi1)) {
anyFailure = true;
break;
}
}
seenPhi0.pop_back();
seenPhi1.pop_back();
return !anyFailure;
}
return false;
}
/** Given an llvm::Value known to be an integer, return its value as
an int64_t.
*/
static int64_t
lGetIntValue(llvm::Value *offset) {
llvm::ConstantInt *intOffset = llvm::dyn_cast<llvm::ConstantInt>(offset);
Assert(intOffset && (intOffset->getBitWidth() == 32 ||
intOffset->getBitWidth() == 64));
return intOffset->getSExtValue();
}
/** This function takes chains of InsertElement instructions along the
lines of:
%v0 = insertelement undef, value_0, i32 index_0
%v1 = insertelement %v1, value_1, i32 index_1
...
%vn = insertelement %vn-1, value_n-1, i32 index_n-1
and initializes the provided elements array such that the i'th
llvm::Value * in the array is the element that was inserted into the
i'th element of the vector.
*/
void
LLVMFlattenInsertChain(llvm::InsertElementInst *ie, int vectorWidth,
llvm::Value **elements) {
for (int i = 0; i < vectorWidth; ++i)
elements[i] = NULL;
while (ie != NULL) {
int64_t iOffset = lGetIntValue(ie->getOperand(2));
Assert(iOffset >= 0 && iOffset < vectorWidth);
Assert(elements[iOffset] == NULL);
elements[iOffset] = ie->getOperand(1);
llvm::Value *insertBase = ie->getOperand(0);
ie = llvm::dyn_cast<llvm::InsertElementInst>(insertBase);
if (ie == NULL) {
if (llvm::isa<llvm::UndefValue>(insertBase))
return;
llvm::ConstantVector *cv =
llvm::dyn_cast<llvm::ConstantVector>(insertBase);
Assert(cv != NULL);
Assert(iOffset < (int)cv->getNumOperands());
elements[iOffset] = cv->getOperand(iOffset);
}
}
}
/** Tests to see if all of the elements of the vector in the 'v' parameter
are equal. Like lValuesAreEqual(), this is a conservative test and may
return false for arrays where the values are actually all equal. */
bool
LLVMVectorValuesAllEqual(llvm::Value *v, int vectorLength,
std::vector<llvm::PHINode *> &seenPhis) {
if (llvm::isa<llvm::ConstantAggregateZero>(v))
return true;
llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(v);
if (cv != NULL)
return (cv->getSplatValue() != NULL);
llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(v);
if (bop != NULL)
return (LLVMVectorValuesAllEqual(bop->getOperand(0), vectorLength,
seenPhis) &&
LLVMVectorValuesAllEqual(bop->getOperand(1), vectorLength,
seenPhis));
llvm::CastInst *cast = llvm::dyn_cast<llvm::CastInst>(v);
if (cast != NULL)
return LLVMVectorValuesAllEqual(cast->getOperand(0), vectorLength,
seenPhis);
llvm::InsertElementInst *ie = llvm::dyn_cast<llvm::InsertElementInst>(v);
if (ie != NULL) {
llvm::Value *elements[ISPC_MAX_NVEC];
LLVMFlattenInsertChain(ie, vectorLength, elements);
// We will ignore any values of elements[] that are NULL; as they
// correspond to undefined values--we just want to see if all of
// the defined values have the same value.
int lastNonNull = 0;
while (lastNonNull < vectorLength && elements[lastNonNull] == NULL)
++lastNonNull;
if (lastNonNull == vectorLength)
// all of them are undef!
return true;
for (int i = lastNonNull; i < vectorLength; ++i) {
if (elements[i] == NULL)
continue;
std::vector<llvm::PHINode *> seenPhi0;
std::vector<llvm::PHINode *> seenPhi1;
if (lValuesAreEqual(elements[lastNonNull], elements[i], seenPhi0,
seenPhi1) == false)
return false;
lastNonNull = i;
}
return true;
}
llvm::PHINode *phi = llvm::dyn_cast<llvm::PHINode>(v);
if (phi) {
for (unsigned int i = 0; i < seenPhis.size(); ++i)
if (seenPhis[i] == phi)
return true;
seenPhis.push_back(phi);
unsigned int numIncoming = phi->getNumIncomingValues();
// Check all of the incoming values: if all of them are all equal,
// then we're good.
for (unsigned int i = 0; i < numIncoming; ++i) {
if (!LLVMVectorValuesAllEqual(phi->getIncomingValue(i), vectorLength,
seenPhis)) {
seenPhis.pop_back();
return false;
}
}
seenPhis.pop_back();
return true;
}
Assert(!llvm::isa<llvm::Constant>(v));
if (llvm::isa<llvm::CallInst>(v) || llvm::isa<llvm::LoadInst>(v) ||
!llvm::isa<llvm::Instruction>(v))
return false;
llvm::ShuffleVectorInst *shuffle = llvm::dyn_cast<llvm::ShuffleVectorInst>(v);
if (shuffle != NULL) {
llvm::Value *indices = shuffle->getOperand(2);
if (LLVMVectorValuesAllEqual(indices, vectorLength, seenPhis))
// The easy case--just a smear of the same element across the
// whole vector.
return true;
// TODO: handle more general cases?
return false;
}
#if 0
fprintf(stderr, "all equal: ");
v->dump();
fprintf(stderr, "\n");
llvm::Instruction *inst = llvm::dyn_cast<llvm::Instruction>(v);
if (inst) {
inst->getParent()->dump();
fprintf(stderr, "\n");
fprintf(stderr, "\n");
}
#endif
return false;
}

View File

@@ -38,12 +38,23 @@
#ifndef ISPC_LLVMUTIL_H
#define ISPC_LLVMUTIL_H 1
#include "ispc.h"
#include <llvm/LLVMContext.h>
#include <llvm/Type.h>
#include <llvm/DerivedTypes.h>
#include <llvm/Constants.h>
namespace llvm {
class PHINode;
class InsertElementInst;
}
// llvm::Type *s are no longer const in llvm 3.0
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
#define LLVM_TYPE_CONST
#else
#define LLVM_TYPE_CONST const
#endif
/** This structure holds pointers to a variety of LLVM types; code
elsewhere can use them from here, ratherthan needing to make more
@@ -99,6 +110,7 @@ extern llvm::Constant *LLVMTrue, *LLVMFalse;
of LLVMTypes and the LLVMTrue/LLVMFalse constants. However, it can't
be called until the compilation target is known.
*/
struct Target;
extern void InitLLVMUtil(llvm::LLVMContext *ctx, Target target);
/** Returns an LLVM i8 constant of the given value */
@@ -205,4 +217,13 @@ extern llvm::Constant *LLVMMaskAllOn;
/** LLVM constant value representing an 'all off' SIMD lane mask */
extern llvm::Constant *LLVMMaskAllOff;
/** Tests to see if all of the elements of the vector in the 'v' parameter
are equal. Like lValuesAreEqual(), this is a conservative test and may
return false for arrays where the values are actually all equal. */
extern bool LLVMVectorValuesAllEqual(llvm::Value *v, int vectorLength,
std::vector<llvm::PHINode *> &seenPhis);
void LLVMFlattenInsertChain(llvm::InsertElementInst *ie, int vectorWidth,
llvm::Value **elements);
#endif // ISPC_LLVMUTIL_H

View File

@@ -38,6 +38,7 @@
#include "ispc.h"
#include "module.h"
#include "util.h"
#include "type.h"
#include <stdio.h>
#include <stdlib.h>
#include <llvm/Support/PrettyStackTrace.h>
@@ -53,14 +54,33 @@
#ifdef ISPC_IS_WINDOWS
#define strcasecmp stricmp
#ifndef BUILD_DATE
#define BUILD_DATE __DATE__
#endif
#define BUILD_VERSION ""
#endif // ISPC_IS_WINDOWS
static void usage(int ret) {
printf("This is the Intel(r) SPMD Program Compiler (ispc), build %s (%s)\n\n",
BUILD_DATE, BUILD_VERSION);
printf("usage: ispc\n");
static void
lPrintVersion() {
printf("Intel(r) SPMD Program Compiler (ispc), build %s (%s, LLVM %s)\n",
BUILD_DATE, BUILD_VERSION,
#ifdef LLVM_2_9
"2.9"
#elif defined(LLVM_3_0) || defined(LLVM_3_0svn)
"3.0"
#elif defined(LLVM_3_1) || defined(LLVM_3_1svn)
"3.1"
#else
#error "Unhandled LLVM version"
#endif
);
}
static void
usage(int ret) {
lPrintVersion();
printf("\nusage: ispc\n");
printf(" [--addressing={32,64}]\t\tSelect 32- or 64-bit addressing. (Note that 32-bit\n");
printf(" \t\taddressing calculations are done by default, even\n");
printf(" \t\ton 64-bit target architectures.)\n");
@@ -188,6 +208,8 @@ int main(int Argc, char *Argv[]) {
LLVMInitializeX86TargetMC();
#endif
AtomicType::Init();
char *file = NULL;
const char *headerFileName = NULL;
const char *outFileName = NULL;
@@ -362,8 +384,7 @@ int main(int Argc, char *Argv[]) {
generatePIC = true;
#endif // !ISPC_IS_WINDOWS
else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) {
printf("Intel(r) SPMD Program Compiler (ispc) build %s (%s)\n",
BUILD_DATE, BUILD_VERSION);
lPrintVersion();
return 0;
}
else if (argv[i][0] == '-') {

1004
opt.cpp

File diff suppressed because it is too large Load Diff

View File

@@ -224,7 +224,7 @@ struct ForeachDimension {
%type <declSpecs> declaration_specifiers
%type <stringVal> string_constant
%type <constCharPtr> struct_or_union_name enum_identifier
%type <constCharPtr> struct_or_union_name enum_identifier goto_identifier
%type <intVal> int_constant soa_width_specifier
%type <foreachDimension> foreach_dimension_specifier
@@ -362,13 +362,7 @@ cast_expression
: unary_expression
| '(' type_name ')' cast_expression
{
// Pass true here to try to preserve uniformity
// so that things like:
// uniform int y = ...;
// uniform float x = 1. / (float)y;
// don't issue an error due to (float)y being inadvertently
// and undesirably-to-the-user "varying"...
$$ = new TypeCastExpr($2, $4, true, Union(@1,@4));
$$ = new TypeCastExpr($2, $4, Union(@1,@4));
}
;
@@ -500,6 +494,7 @@ declaration_statement
$$ = NULL;
}
else {
$1->DeclareFunctions();
std::vector<VariableDeclaration> vars = $1->GetVariableDeclarations();
$$ = new DeclStmt(vars, @1);
}
@@ -638,13 +633,13 @@ type_specifier
atomic_var_type_specifier
: TOKEN_VOID { $$ = AtomicType::Void; }
| TOKEN_BOOL { $$ = AtomicType::VaryingBool; }
| TOKEN_INT8 { $$ = AtomicType::VaryingInt8; }
| TOKEN_INT16 { $$ = AtomicType::VaryingInt16; }
| TOKEN_INT { $$ = AtomicType::VaryingInt32; }
| TOKEN_FLOAT { $$ = AtomicType::VaryingFloat; }
| TOKEN_DOUBLE { $$ = AtomicType::VaryingDouble; }
| TOKEN_INT64 { $$ = AtomicType::VaryingInt64; }
| TOKEN_BOOL { $$ = AtomicType::UnboundBool; }
| TOKEN_INT8 { $$ = AtomicType::UnboundInt8; }
| TOKEN_INT16 { $$ = AtomicType::UnboundInt16; }
| TOKEN_INT { $$ = AtomicType::UnboundInt32; }
| TOKEN_FLOAT { $$ = AtomicType::UnboundFloat; }
| TOKEN_DOUBLE { $$ = AtomicType::UnboundDouble; }
| TOKEN_INT64 { $$ = AtomicType::UnboundInt64; }
;
short_vec_specifier
@@ -670,7 +665,7 @@ struct_or_union_specifier
GetStructTypesNamesPositions(*$4, &elementTypes, &elementNames,
&elementPositions);
StructType *st = new StructType($2, elementTypes, elementNames,
elementPositions, false, true, @2);
elementPositions, false, Type::Unbound, @2);
m->symbolTable->AddType($2, st, @2);
$$ = st;
}
@@ -681,8 +676,9 @@ struct_or_union_specifier
std::vector<SourcePos> elementPositions;
GetStructTypesNamesPositions(*$3, &elementTypes, &elementNames,
&elementPositions);
// FIXME: should be unbound
$$ = new StructType("", elementTypes, elementNames, elementPositions,
false, true, @1);
false, Type::Unbound, @1);
}
| struct_or_union '{' '}'
{
@@ -748,7 +744,7 @@ specifier_qualifier_list
else if ($1 == TYPEQUAL_SIGNED) {
if ($2->IsIntType() == false) {
Error(@1, "Can't apply \"signed\" qualifier to \"%s\" type.",
$2->GetString().c_str());
$2->ResolveUnboundVariability(Type::Varying)->GetString().c_str());
$$ = $2;
}
}
@@ -758,7 +754,7 @@ specifier_qualifier_list
$$ = t;
else {
Error(@1, "Can't apply \"unsigned\" qualifier to \"%s\" type. Ignoring.",
$2->GetString().c_str());
$2->ResolveUnboundVariability(Type::Varying)->GetString().c_str());
$$ = $2;
}
}
@@ -775,9 +771,12 @@ specifier_qualifier_list
else
FATAL("Unhandled type qualifier in parser.");
}
else
else {
if (m->errorCount == 0)
Error(@1, "Lost type qualifier in parser.");
$$ = NULL;
}
}
;
@@ -1112,8 +1111,7 @@ type_name
abstract_declarator
: pointer
{
Declarator *d = new Declarator(DK_POINTER, @1);
$$ = d;
$$ = $1;
}
| direct_abstract_declarator
| pointer direct_abstract_declarator
@@ -1262,10 +1260,22 @@ statement
;
labeled_statement
: TOKEN_CASE constant_expression ':' statement
{ UNIMPLEMENTED; }
: goto_identifier ':' statement
{
$$ = new LabeledStmt($1, $3, @1);
}
| TOKEN_CASE constant_expression ':' statement
{
int value;
if ($2 != NULL &&
lGetConstantInt($2, &value, @2, "Case statement value")) {
$$ = new CaseStmt(value, $4, Union(@1, @2));
}
else
$$ = NULL;
}
| TOKEN_DEFAULT ':' statement
{ UNIMPLEMENTED; }
{ $$ = new DefaultStmt($3, @1); }
;
start_scope
@@ -1311,7 +1321,7 @@ selection_statement
| TOKEN_CIF '(' expression ')' statement TOKEN_ELSE statement
{ $$ = new IfStmt($3, $5, $7, true, @1); }
| TOKEN_SWITCH '(' expression ')' statement
{ UNIMPLEMENTED; }
{ $$ = new SwitchStmt($3, $5, @1); }
;
for_test
@@ -1433,9 +1443,13 @@ iteration_statement
}
;
goto_identifier
: TOKEN_IDENTIFIER { $$ = yylval.stringVal->c_str(); }
;
jump_statement
: TOKEN_GOTO TOKEN_IDENTIFIER ';'
{ UNIMPLEMENTED; }
: TOKEN_GOTO goto_identifier ';'
{ $$ = new GotoStmt($2, @1, @2); }
| TOKEN_CONTINUE ';'
{ $$ = new ContinueStmt(false, @1); }
| TOKEN_BREAK ';'
@@ -1551,19 +1565,21 @@ lAddDeclaration(DeclSpecs *ds, Declarator *decl) {
const Type *t = decl->GetType(ds);
if (t == NULL)
return;
Symbol *sym = decl->GetSymbol();
Assert(sym != NULL);
const FunctionType *ft = dynamic_cast<const FunctionType *>(t);
if (ft != NULL) {
Symbol *funSym = decl->GetSymbol();
Assert(funSym != NULL);
funSym->type = ft;
funSym->storageClass = ds->storageClass;
sym->type = ft;
sym->storageClass = ds->storageClass;
bool isInline = (ds->typeQualifiers & TYPEQUAL_INLINE);
m->AddFunctionDeclaration(funSym, isInline);
m->AddFunctionDeclaration(sym, isInline);
}
else {
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
bool isConst = (ds->typeQualifiers & TYPEQUAL_CONST) != 0;
m->AddGlobalVariable(sym, decl->initExpr, isConst);
}
else
m->AddGlobalVariable(decl->GetSymbol(), decl->initExpr,
(ds->typeQualifiers & TYPEQUAL_CONST) != 0);
}
}
@@ -1589,6 +1605,7 @@ lAddFunctionParams(Declarator *decl) {
continue;
Assert(pdecl->declarators.size() == 1);
Symbol *sym = pdecl->declarators[0]->GetSymbol();
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
#ifndef NDEBUG
bool ok = m->symbolTable->AddVariable(sym);
if (ok == false)
@@ -1754,7 +1771,7 @@ lFinalizeEnumeratorSymbols(std::vector<Symbol *> &enums,
the actual enum type here and optimize it, which will have
us end up with a ConstExpr with the desired EnumType... */
Expr *castExpr = new TypeCastExpr(enumType, enums[i]->constValue,
false, enums[i]->pos);
enums[i]->pos);
castExpr = Optimize(castExpr);
enums[i]->constValue = dynamic_cast<ConstExpr *>(castExpr);
Assert(enums[i]->constValue != NULL);

View File

@@ -15,6 +15,7 @@ import string
import subprocess
import shlex
import platform
import tempfile
# This script is affected by http://bugs.python.org/issue5261 on OSX 10.5 Leopard
# git history has a workaround for that issue.
@@ -79,7 +80,12 @@ if len(args) == 0:
files = glob.glob("tests/*ispc") + glob.glob("failing_tests/*ispc") + \
glob.glob("tests_errors/*ispc")
else:
files = args
files = [ ]
for f in args:
if os.path.splitext(string.lower(f))[1] != ".ispc":
print "Ignoring file %s, which doesn't have an .ispc extension." % f
else:
files += [ f ]
# randomly shuffle the tests if asked to do so
if (options.random):
@@ -146,16 +152,22 @@ def run_cmds(compile_cmds, run_cmd, filename, expect_failure):
def run_test(filename):
global is_windows
if is_windows:
input_prefix = "../"
else:
input_prefix = ""
# is this a test to make sure an error is issued?
want_error = (filename.find("tests_errors") != -1)
if want_error == True:
ispc_cmd = ispc_exe + " --werror --nowrap %s --arch=%s --target=%s" % \
(filename, options.arch, options.target)
(input_prefix + filename, options.arch, options.target)
(return_code, output) = run_command(ispc_cmd)
got_error = (return_code != 0)
# figure out the error message we're expecting
file = open(filename, 'r')
file = open(input_prefix + filename, 'r')
firstline = file.readline()
firstline = firstline.replace("//", "")
firstline = firstline.lstrip()
@@ -179,7 +191,7 @@ def run_test(filename):
# function that this test has.
sig2def = { "f_v(" : 0, "f_f(" : 1, "f_fu(" : 2, "f_fi(" : 3,
"f_du(" : 4, "f_duf(" : 5, "f_di(" : 6 }
file = open(filename, 'r')
file = open(input_prefix + filename, 'r')
match = -1
for line in file:
# look for lines with 'export'...
@@ -201,14 +213,13 @@ def run_test(filename):
if is_generic_target:
obj_name = "%s.cpp" % filename
global is_windows
if is_windows:
if not is_generic_target:
obj_name = "%s.obj" % filename
exe_name = "%s.exe" % filename
obj_name = "%s%s.obj" % (input_prefix, filename)
exe_name = "%s%s.exe" % (input_prefix, filename)
cc_cmd = "%s /I. /Iwinstuff /Zi /nologo /DTEST_SIG=%d test_static.cpp %s /Fe%s" % \
(options.compiler_exe, match, obj_name, exe_name)
cc_cmd = "%s /I. /Iwinstuff /Zi /nologo /DTEST_SIG=%d %stest_static.cpp %s /Fe%s" % \
(options.compiler_exe, match, input_prefix, obj_name, exe_name)
if should_fail:
cc_cmd += " /DEXPECT_FAILURE"
else:
@@ -220,7 +231,7 @@ def run_test(filename):
gcc_arch = '-m32'
else:
gcc_arch = '-m64'
cc_cmd = "%s -msse4.2 -I. %s test_static.cpp -DTEST_SIG=%d %s -o %s" % \
cc_cmd = "%s -O2 -msse4.2 -I. %s test_static.cpp -DTEST_SIG=%d %s -o %s" % \
(options.compiler_exe, gcc_arch, match, obj_name, exe_name)
if platform.system() == 'Darwin':
cc_cmd += ' -Wl,-no_pie'
@@ -228,7 +239,7 @@ def run_test(filename):
cc_cmd += " -DEXPECT_FAILURE"
ispc_cmd = ispc_exe + " --woff %s -o %s --arch=%s --target=%s" % \
(filename, obj_name, options.arch, options.target)
(input_prefix+filename, obj_name, options.arch, options.target)
if options.no_opt:
ispc_cmd += " -O0"
if is_generic_target:
@@ -257,12 +268,28 @@ def run_test(filename):
# this function will be running in parallel across all of the CPU cores of
# the system.
def run_tasks_from_queue(queue, queue_ret):
if is_windows:
tmpdir = "tmp%d" % os.getpid()
os.mkdir(tmpdir)
os.chdir(tmpdir)
else:
olddir = ""
compile_error_files = [ ]
run_error_files = [ ]
while True:
filename = queue.get()
if (filename == 'STOP'):
queue_ret.put((compile_error_files, run_error_files))
if is_windows:
try:
os.remove("test_static.obj")
os.remove("/vc100.pdb")
os.chdir("..")
os.rmdir(tmpdir)
except:
None
sys.exit(0)
(compile_error, run_error) = run_test(filename)
@@ -286,32 +313,9 @@ if __name__ == '__main__':
compile_error_files = [ ]
run_error_files = [ ]
if is_windows:
# cl.exe gets itself all confused if we have multiple instances of
# it running concurrently and operating on the same .cpp file
# (test_static.cpp), even if we are generating a differently-named
# exe in the end. So run serially. :-(
nthreads = 1
num_done = 0
sys.stdout.write("Running %d tests.\n" % (total_tests))
for fn in files:
fn = fn.replace("\\",'/')
(compile_error, run_error) = run_test(fn)
if compile_error != 0:
compile_error_files += [ fn ]
if run_error != 0:
run_error_files += [ fn ]
num_done += 1
progress_str = " Done %d / %d [%s]\n" % (num_done, total_tests, fn)
# spaces to clear out detrius from previous printing...
for x in range(30):
progress_str += ' '
progress_str += '\r'
sys.stdout.write(progress_str)
sys.stdout.flush()
else:
nthreads = multiprocessing.cpu_count()
sys.stdout.write("Found %d CPUs. Running %d tests.\n" % (nthreads, total_tests))
print "Found %d CPUs. Running %d tests." % (nthreads, total_tests)
# put each of the test filenames into a queue
q = multiprocessing.Queue()
@@ -335,7 +339,7 @@ if __name__ == '__main__':
# (i.e. return 0 if all is ok)
for t in task_threads:
t.join()
sys.stdout.write("\n")
print
while not qret.empty():
(c, r) = qret.get()

View File

@@ -787,10 +787,14 @@ packed_store_active(uniform int * uniform a, int vals) {
///////////////////////////////////////////////////////////////////////////
// System information
static inline int num_cores() {
static inline uniform int num_cores() {
return __num_cores();
}
static inline uniform int64 clock() {
return __clock();
}
///////////////////////////////////////////////////////////////////////////
// Atomics and memory barriers
@@ -808,8 +812,7 @@ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
uniform TA value) { \
memory_barrier(); \
uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
(MASKTYPE)__mask); \
uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
memory_barrier(); \
return ret; \
} \
@@ -824,22 +827,80 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
continue; \
uniform TA * uniform p = ptrArray[i]; \
uniform TA v = extract(value, i); \
uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v, \
(MASKTYPE)__mask); \
uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
ret = insert(ret, i, r); \
} \
memory_barrier(); \
return ret; \
} \
#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB, MASKTYPE) \
#define DEFINE_ATOMIC_SWAP(TA,TB) \
static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
memory_barrier(); \
uniform int i = 0; \
TA ret[programCount]; \
TA memVal; \
uniform int lastSwap; \
uniform int mask = lanemask(); \
/* First, have the first running program instance (if any) perform \
the swap with memory with its value of "value"; record the \
value returned. */ \
for (; i < programCount; ++i) { \
if ((mask & (1 << i)) == 0) \
continue; \
memVal = __atomic_swap_uniform_##TB##_global(ptr, extract(value, i)); \
lastSwap = i; \
break; \
} \
/* Now, for all of the remaining running program instances, set the \
return value of the last instance that did a swap with this \
instance's value of "value"; this gives the same effect as if the \
current instance had executed a hardware atomic swap right before \
the last one that did a swap. */ \
for (; i < programCount; ++i) { \
if ((mask & (1 << i)) == 0) \
continue; \
ret[lastSwap] = extract(value, i); \
lastSwap = i; \
} \
/* And the last instance that wanted to swap gets the value we \
originally got back from memory... */ \
ret[lastSwap] = memVal; \
memory_barrier(); \
return ret[programIndex]; \
} \
static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \
uniform TA value) { \
memory_barrier(); \
uniform TA ret = __atomic_swap_uniform_##TB##_global(ptr, value); \
memory_barrier(); \
return ret; \
} \
static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
uniform TA * uniform ptrArray[programCount]; \
ptrArray[programIndex] = ptr; \
memory_barrier(); \
TA ret; \
uniform int mask = lanemask(); \
for (uniform int i = 0; i < programCount; ++i) { \
if ((mask & (1 << i)) == 0) \
continue; \
uniform TA * uniform p = ptrArray[i]; \
uniform TA v = extract(value, i); \
uniform TA r = __atomic_swap_uniform_##TB##_global(p, v); \
ret = insert(ret, i, r); \
} \
memory_barrier(); \
return ret; \
} \
#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB) \
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
uniform TA oneval = reduce_##OPA(value); \
TA ret; \
if (lanemask() != 0) { \
memory_barrier(); \
ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval, \
(MASKTYPE)__mask); \
ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval); \
memory_barrier(); \
} \
return ret; \
@@ -847,8 +908,7 @@ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
uniform TA value) { \
memory_barrier(); \
uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
(MASKTYPE)__mask); \
uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
memory_barrier(); \
return ret; \
} \
@@ -864,8 +924,7 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
continue; \
uniform TA * uniform p = ptrArray[i]; \
uniform TA v = extract(value, i); \
uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v, \
(MASKTYPE)__mask); \
uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
ret = insert(ret, i, r); \
} \
memory_barrier(); \
@@ -874,49 +933,51 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,swap,swap,IntMaskType)
DEFINE_ATOMIC_SWAP(int32,int32)
// For everything but atomic min and max, we can use the same
// implementations for unsigned as for signed.
DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,UIntMaskType)
DEFINE_ATOMIC_SWAP(unsigned int32,int32)
DEFINE_ATOMIC_OP(float,float,swap,swap,IntMaskType)
DEFINE_ATOMIC_SWAP(float,float)
DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,swap,swap,IntMaskType)
DEFINE_ATOMIC_SWAP(int64,int64)
// For everything but atomic min and max, we can use the same
// implementations for unsigned as for signed.
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,UIntMaskType)
DEFINE_ATOMIC_SWAP(unsigned int64,int64)
DEFINE_ATOMIC_OP(double,double,swap,swap,IntMaskType)
DEFINE_ATOMIC_SWAP(double,double)
#undef DEFINE_ATOMIC_OP
#undef DEFINE_ATOMIC_MINMAX_OP
#undef DEFINE_ATOMIC_SWAP
#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE) \
static inline TA atomic_compare_exchange_global( \
@@ -931,8 +992,7 @@ static inline uniform TA atomic_compare_exchange_global( \
uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
memory_barrier(); \
uniform TA ret = \
__atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval, \
(MASKTYPE)__mask); \
__atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval); \
memory_barrier(); \
return ret; \
}
@@ -2764,6 +2824,10 @@ static inline uniform double pow(uniform double a, uniform double b) {
// half-precision floats
static inline uniform float half_to_float(uniform unsigned int16 h) {
if (__have_native_half) {
return __half_to_float_uniform(h);
}
else {
if ((h & 0x7FFFu) == 0)
// Signed zero
return floatbits(((unsigned int32) h) << 16);
@@ -2818,9 +2882,14 @@ static inline uniform float half_to_float(uniform unsigned int16 h) {
}
}
}
}
}
static inline float half_to_float(unsigned int16 h) {
if (__have_native_half) {
return __half_to_float_varying(h);
}
else {
if ((h & 0x7FFFu) == 0)
// Signed zero
return floatbits(((unsigned int32) h) << 16);
@@ -2875,10 +2944,15 @@ static inline float half_to_float(unsigned int16 h) {
}
}
}
}
}
static inline uniform int16 float_to_half(uniform float f) {
if (__have_native_half) {
return __float_to_half_uniform(f);
}
else {
uniform int32 x = intbits(f);
// Store the return value in an int32 until the very end; this ends up
// generating better code...
@@ -2941,10 +3015,15 @@ static inline uniform int16 float_to_half(uniform float f) {
}
}
return (int16)ret;
}
}
static inline int16 float_to_half(float f) {
if (__have_native_half) {
return __float_to_half_varying(f);
}
else {
int32 x = intbits(f);
// Store the return value in an int32 until the very end; this ends up
// generating better code...
@@ -3007,10 +3086,15 @@ static inline int16 float_to_half(float f) {
}
}
return (int16)ret;
}
}
static inline uniform float half_to_float_fast(uniform unsigned int16 h) {
if (__have_native_half) {
return __half_to_float_uniform(h);
}
else {
uniform unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit
uniform unsigned int32 he = h & (int32)0x7C00u; // Pick off exponent bits
uniform unsigned int32 hm = h & (int32)0x03FFu; // Pick off mantissa bits
@@ -3024,10 +3108,14 @@ static inline uniform float half_to_float_fast(uniform unsigned int16 h) {
// Mantissa
uniform unsigned int32 xm = ((unsigned int32) hm) << 13;
return floatbits(xs | xe | xm);
}
}
static inline float half_to_float_fast(unsigned int16 h) {
if (__have_native_half) {
return __half_to_float_varying(h);
}
else {
unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit
unsigned int32 he = h & (int32)0x7C00u; // Pick off exponent bits
unsigned int32 hm = h & (int32)0x03FFu; // Pick off mantissa bits
@@ -3041,10 +3129,14 @@ static inline float half_to_float_fast(unsigned int16 h) {
// Mantissa
unsigned int32 xm = ((unsigned int32) hm) << 13;
return floatbits(xs | xe | xm);
}
}
static inline uniform int16 float_to_half_fast(uniform float f) {
if (__have_native_half) {
return __float_to_half_uniform(f);
}
else {
uniform int32 x = intbits(f);
uniform unsigned int32 xs = x & 0x80000000u; // Pick off sign bit
uniform unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits
@@ -3062,9 +3154,14 @@ static inline uniform int16 float_to_half_fast(uniform float f) {
ret += 1u;
return (int16)ret;
}
}
static inline int16 float_to_half_fast(float f) {
if (__have_native_half) {
return __float_to_half_varying(f);
}
else {
int32 x = intbits(f);
unsigned int32 xs = x & 0x80000000u; // Pick off sign bit
unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits
@@ -3082,6 +3179,7 @@ static inline int16 float_to_half_fast(float f) {
ret += 1u;
return (int16)ret;
}
}
///////////////////////////////////////////////////////////////////////////
@@ -3095,16 +3193,15 @@ static inline unsigned int random(RNGState * uniform state)
{
unsigned int b;
// FIXME: state->z1, etc..
b = (((*state).z1 << 6) ^ (*state).z1) >> 13;
(*state).z1 = (((*state).z1 & 4294967294U) << 18) ^ b;
b = (((*state).z2 << 2) ^ (*state).z2) >> 27;
(*state).z2 = (((*state).z2 & 4294967288U) << 2) ^ b;
b = (((*state).z3 << 13) ^ (*state).z3) >> 21;
(*state).z3 = (((*state).z3 & 4294967280U) << 7) ^ b;
b = (((*state).z4 << 3) ^ (*state).z4) >> 12;
(*state).z4 = (((*state).z4 & 4294967168U) << 13) ^ b;
return ((*state).z1 ^ (*state).z2 ^ (*state).z3 ^ (*state).z4);
b = ((state->z1 << 6) ^ state->z1) >> 13;
state->z1 = ((state->z1 & 4294967294U) << 18) ^ b;
b = ((state->z2 << 2) ^ state->z2) >> 27;
state->z2 = ((state->z2 & 4294967288U) << 2) ^ b;
b = ((state->z3 << 13) ^ state->z3) >> 21;
state->z3 = ((state->z3 & 4294967280U) << 7) ^ b;
b = ((state->z4 << 3) ^ state->z4) >> 12;
state->z4 = ((state->z4 & 4294967168U) << 13) ^ b;
return (state->z1 ^ state->z2 ^ state->z3 ^ state->z4);
}
static inline float frandom(RNGState * uniform state)
@@ -3120,30 +3217,30 @@ static inline uniform unsigned int __seed4(RNGState * uniform state,
uniform unsigned int c1 = 0xf0f0f0f0;
uniform unsigned int c2 = 0x0f0f0f0f;
(*state).z1 = insert((*state).z1, start + 0, seed);
(*state).z1 = insert((*state).z1, start + 1, seed ^ c1);
(*state).z1 = insert((*state).z1, start + 2, (seed << 3) ^ c1);
(*state).z1 = insert((*state).z1, start + 3, (seed << 2) ^ c2);
state->z1 = insert(state->z1, start + 0, seed);
state->z1 = insert(state->z1, start + 1, seed ^ c1);
state->z1 = insert(state->z1, start + 2, (seed << 3) ^ c1);
state->z1 = insert(state->z1, start + 3, (seed << 2) ^ c2);
seed += 131;
(*state).z2 = insert((*state).z2, start + 0, seed);
(*state).z2 = insert((*state).z2, start + 1, seed ^ c1);
(*state).z2 = insert((*state).z2, start + 2, (seed << 3) ^ c1);
(*state).z2 = insert((*state).z2, start + 3, (seed << 2) ^ c2);
state->z2 = insert(state->z2, start + 0, seed);
state->z2 = insert(state->z2, start + 1, seed ^ c1);
state->z2 = insert(state->z2, start + 2, (seed << 3) ^ c1);
state->z2 = insert(state->z2, start + 3, (seed << 2) ^ c2);
seed ^= extract((*state).z2, 2);
(*state).z3 = insert((*state).z3, start + 0, seed);
(*state).z3 = insert((*state).z3, start + 1, seed ^ c1);
(*state).z3 = insert((*state).z3, start + 2, (seed << 3) ^ c1);
(*state).z3 = insert((*state).z3, start + 3, (seed << 2) ^ c2);
seed ^= extract(state->z2, 2);
state->z3 = insert(state->z3, start + 0, seed);
state->z3 = insert(state->z3, start + 1, seed ^ c1);
state->z3 = insert(state->z3, start + 2, (seed << 3) ^ c1);
state->z3 = insert(state->z3, start + 3, (seed << 2) ^ c2);
seed <<= 4;
seed += 3;
seed ^= extract((*state).z1, 3);
(*state).z4 = insert((*state).z4, start + 0, seed);
(*state).z4 = insert((*state).z4, start + 1, seed ^ c1);
(*state).z4 = insert((*state).z4, start + 2, (seed << 3) ^ c1);
(*state).z4 = insert((*state).z4, start + 3, (seed << 2) ^ c2);
seed ^= extract(state->z1, 3);
state->z4 = insert(state->z4, start + 0, seed);
state->z4 = insert(state->z4, start + 1, seed ^ c1);
state->z4 = insert(state->z4, start + 2, (seed << 3) ^ c1);
state->z4 = insert(state->z4, start + 3, (seed << 2) ^ c2);
return seed;
}

773
stmt.cpp
View File

@@ -494,6 +494,7 @@ lEmitIfStatements(FunctionEmitContext *ctx, Stmt *stmts, const char *trueOrFalse
ctx->EndScope();
}
void
IfStmt::EmitCode(FunctionEmitContext *ctx) const {
// First check all of the things that might happen due to errors
@@ -694,6 +695,23 @@ lCheckAllOffSafety(ASTNode *node, void *data) {
}
// All indices are in-bounds
return true;
}
MemberExpr *me;
if ((me = dynamic_cast<MemberExpr *>(node)) != NULL &&
me->dereferenceExpr) {
*okPtr = false;
return false;
}
DereferenceExpr *de;
if ((de = dynamic_cast<DereferenceExpr *>(node)) != NULL) {
const Type *exprType = de->expr->GetType();
if (dynamic_cast<const PointerType *>(exprType) != NULL) {
*okPtr = false;
return false;
}
}
return true;
@@ -1132,7 +1150,7 @@ DoStmt::TypeCheck() {
!lHasVaryingBreakOrContinue(bodyStmts));
testExpr = new TypeCastExpr(uniformTest ? AtomicType::UniformBool :
AtomicType::VaryingBool,
testExpr, false, testExpr->pos);
testExpr, testExpr->pos);
}
return this;
@@ -1317,8 +1335,7 @@ ForStmt::TypeCheck() {
!g->opt.disableUniformControlFlow &&
!lHasVaryingBreakOrContinue(stmts));
test = new TypeCastExpr(uniformTest ? AtomicType::UniformBool :
AtomicType::VaryingBool,
test, false, test->pos);
AtomicType::VaryingBool, test, test->pos);
test = ::TypeCheck(test);
if (test == NULL)
return NULL;
@@ -1558,9 +1575,8 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
if (ctx->GetCurrentBasicBlock() == NULL || stmts == NULL)
return;
llvm::BasicBlock *bbCheckExtras = ctx->CreateBasicBlock("foreach_check_extras");
llvm::BasicBlock *bbDoExtras = ctx->CreateBasicBlock("foreach_do_extras");
llvm::BasicBlock *bbBody = ctx->CreateBasicBlock("foreach_body");
llvm::BasicBlock *bbFullBody = ctx->CreateBasicBlock("foreach_full_body");
llvm::BasicBlock *bbMaskedBody = ctx->CreateBasicBlock("foreach_masked_body");
llvm::BasicBlock *bbExit = ctx->CreateBasicBlock("foreach_exit");
llvm::Value *oldMask = ctx->GetInternalMask();
@@ -1578,8 +1594,7 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
// dimension and a number of derived values.
std::vector<llvm::BasicBlock *> bbReset, bbStep, bbTest;
std::vector<llvm::Value *> startVals, endVals, uniformCounterPtrs;
std::vector<llvm::Value *> nItems, nExtras, alignedEnd;
std::vector<llvm::Value *> extrasMaskPtrs;
std::vector<llvm::Value *> nExtras, alignedEnd, extrasMaskPtrs;
std::vector<int> span(nDims, 0);
lGetSpans(nDims-1, nDims, g->target.vectorWidth, isTiled, &span[0]);
@@ -1588,6 +1603,8 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
// Basic blocks that we'll fill in later with the looping logic for
// this dimension.
bbReset.push_back(ctx->CreateBasicBlock("foreach_reset"));
if (i < nDims-1)
// stepping for the innermost dimension is handled specially
bbStep.push_back(ctx->CreateBasicBlock("foreach_step"));
bbTest.push_back(ctx->CreateBasicBlock("foreach_test"));
@@ -1600,14 +1617,14 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
endVals.push_back(ev);
// nItems = endVal - startVal
nItems.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv,
"nitems"));
llvm::Value *nItems =
ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems");
// nExtras = nItems % (span for this dimension)
// This gives us the number of extra elements we need to deal with
// at the end of the loop for this dimension that don't fit cleanly
// into a vector width.
nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems[i],
nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems,
LLVMInt32(span[i]), "nextras"));
// alignedEnd = endVal - nExtras
@@ -1626,7 +1643,8 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
// There is also a varying variable that holds the set of index
// values for each dimension in the current loop iteration; this is
// the value that is program-visible.
dimVariables[i]->storagePtr = ctx->AllocaInst(LLVMTypes::Int32VectorType,
dimVariables[i]->storagePtr =
ctx->AllocaInst(LLVMTypes::Int32VectorType,
dimVariables[i]->name.c_str());
dimVariables[i]->parentFunction = ctx->GetFunction();
ctx->EmitVariableDebugInfo(dimVariables[i]);
@@ -1639,7 +1657,7 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]);
}
ctx->StartForeach(bbStep[nDims-1]);
ctx->StartForeach();
// On to the outermost loop's test
ctx->BranchInst(bbTest[0]);
@@ -1660,9 +1678,25 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
}
///////////////////////////////////////////////////////////////////////////
// foreach_test
// foreach_step: increment the uniform counter by the vector width.
// Note that we don't increment the varying counter here as well but
// just generate its value when we need it in the loop body. Don't do
// this for the innermost dimension, which has a more complex stepping
// structure..
for (int i = 0; i < nDims-1; ++i) {
ctx->SetCurrentBasicBlock(bbStep[i]);
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]);
llvm::Value *newCounter =
ctx->BinaryOperator(llvm::Instruction::Add, counter,
LLVMInt32(span[i]), "new_counter");
ctx->StoreInst(newCounter, uniformCounterPtrs[i]);
ctx->BranchInst(bbTest[i]);
}
///////////////////////////////////////////////////////////////////////////
// foreach_test (for all dimensions other than the innermost...)
std::vector<llvm::Value *> inExtras;
for (int i = 0; i < nDims; ++i) {
for (int i = 0; i < nDims-1; ++i) {
ctx->SetCurrentBasicBlock(bbTest[i]);
llvm::Value *haveExtras =
@@ -1700,8 +1734,6 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
if (i == 0)
ctx->StoreInst(emask, extrasMaskPtrs[i]);
else {
// FIXME: at least specialize the innermost loop to not do all
// this mask stuff each time through the test...
llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]);
llvm::Value *newMask =
ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask,
@@ -1712,59 +1744,267 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
llvm::Value *notAtEnd =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
counter, endVals[i]);
if (i != nDims-1)
ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd);
}
///////////////////////////////////////////////////////////////////////////
// foreach_test (for innermost dimension)
//
// All of the outer dimensions are handled generically--basically as a
// for() loop from the start value to the end value, where at each loop
// test, we compute the mask of active elements for the current
// dimension and then update an overall mask that is the AND
// combination of all of the outer ones.
//
// The innermost loop is handled specially, for performance purposes.
// When starting the innermost dimension, we start by checking once
// whether any of the outer dimensions has set the mask to be
// partially-active or not. We follow different code paths for these
// two cases, taking advantage of the knowledge that the mask is all
// on, when this is the case.
//
// In each of these code paths, we start with a loop from the starting
// value to the aligned end value for the innermost dimension; we can
// guarantee that the innermost loop will have an "all on" mask (as far
// as its dimension is concerned) for the duration of this loop. Doing
// so allows us to emit code that assumes the mask is all on (for the
// case where none of the outer dimensions has set the mask to be
// partially on), or allows us to emit code that just uses the mask
// from the outer dimensions directly (for the case where they have).
//
// After this loop, we just need to deal with one vector's worth of
// "ragged extra bits", where the mask used includes the effect of the
// mask for the innermost dimension.
//
// We start out this process by emitting the check that determines
// whether any of the enclosing dimensions is partially active
// (i.e. processing extra elements that don't exactly fit into a
// vector).
llvm::BasicBlock *bbOuterInExtras =
ctx->CreateBasicBlock("outer_in_extras");
llvm::BasicBlock *bbOuterNotInExtras =
ctx->CreateBasicBlock("outer_not_in_extras");
ctx->SetCurrentBasicBlock(bbTest[nDims-1]);
if (inExtras.size())
ctx->BranchInst(bbOuterInExtras, bbOuterNotInExtras,
inExtras.back());
else
ctx->BranchInst(bbCheckExtras, bbReset[i], notAtEnd);
// for a 1D iteration domain, we certainly don't have any enclosing
// dimensions that are processing extra elements.
ctx->BranchInst(bbOuterNotInExtras);
///////////////////////////////////////////////////////////////////////////
// One or more outer dimensions in extras, so we need to mask for the loop
// body regardless. We break this into two cases, roughly:
// for (counter = start; counter < alignedEnd; counter += step) {
// // mask is all on for inner, so set mask to outer mask
// // run loop body with mask
// }
// // counter == alignedEnd
// if (counter < end) {
// // set mask to outermask & (counter+programCounter < end)
// // run loop body with mask
// }
llvm::BasicBlock *bbAllInnerPartialOuter =
ctx->CreateBasicBlock("all_inner_partial_outer");
llvm::BasicBlock *bbPartial =
ctx->CreateBasicBlock("both_partial");
ctx->SetCurrentBasicBlock(bbOuterInExtras); {
// Update the varying counter value here, since all subsequent
// blocks along this path need it.
lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
dimVariables[nDims-1]->storagePtr, span);
// here we just check to see if counter < alignedEnd
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
llvm::Value *beforeAlignedEnd =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
counter, alignedEnd[nDims-1], "before_aligned_end");
ctx->BranchInst(bbAllInnerPartialOuter, bbPartial, beforeAlignedEnd);
}
// Below we have a basic block that runs the loop body code for the
// case where the mask is partially but not fully on. This same block
// runs in multiple cases: both for handling any ragged extra data for
// the innermost dimension but also when outer dimensions have set the
// mask to be partially on.
//
// The value stored in stepIndexAfterMaskedBodyPtr is used after each
// execution of the body code to determine whether the innermost index
// value should be incremented by the step (we're running the "for"
// loop of full vectors at the innermost dimension, with outer
// dimensions having set the mask to be partially on), or whether we're
// running once for the ragged extra bits at the end of the innermost
// dimension, in which case we're done with the innermost dimension and
// should step the loop counter for the next enclosing dimension
// instead.
llvm::Value *stepIndexAfterMaskedBodyPtr =
ctx->AllocaInst(LLVMTypes::BoolType, "step_index");
///////////////////////////////////////////////////////////////////////////
// We're in the inner loop part where the only masking is due to outer
// dimensions but the innermost dimension fits fully into a vector's
// width. Set the mask and jump to the masked loop body.
ctx->SetCurrentBasicBlock(bbAllInnerPartialOuter); {
llvm::Value *mask;
if (extrasMaskPtrs.size() == 0)
// 1D loop; we shouldn't ever get here anyway
mask = LLVMMaskAllOff;
else
mask = ctx->LoadInst(extrasMaskPtrs.back());
ctx->SetInternalMask(mask);
ctx->StoreInst(LLVMTrue, stepIndexAfterMaskedBodyPtr);
ctx->BranchInst(bbMaskedBody);
}
///////////////////////////////////////////////////////////////////////////
// foreach_step: increment the uniform counter by the vector width.
// Note that we don't increment the varying counter here as well but
// just generate its value when we need it in the loop body.
for (int i = 0; i < nDims; ++i) {
ctx->SetCurrentBasicBlock(bbStep[i]);
if (i == nDims-1)
ctx->RestoreContinuedLanes();
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]);
llvm::Value *newCounter =
ctx->BinaryOperator(llvm::Instruction::Add, counter,
LLVMInt32(span[i]), "new_counter");
ctx->StoreInst(newCounter, uniformCounterPtrs[i]);
ctx->BranchInst(bbTest[i]);
// We need to include the effect of the innermost dimension in the mask
// for the final bits here
ctx->SetCurrentBasicBlock(bbPartial); {
llvm::Value *varyingCounter =
ctx->LoadInst(dimVariables[nDims-1]->storagePtr);
llvm::Value *smearEnd = llvm::UndefValue::get(LLVMTypes::Int32VectorType);
for (int j = 0; j < g->target.vectorWidth; ++j)
smearEnd = ctx->InsertInst(smearEnd, endVals[nDims-1], j, "smear_end");
llvm::Value *emask =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
varyingCounter, smearEnd);
emask = ctx->I1VecToBoolVec(emask);
if (nDims == 1)
ctx->SetInternalMask(emask);
else {
llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]);
llvm::Value *newMask =
ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask,
"extras_mask");
ctx->SetInternalMask(newMask);
}
ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr);
ctx->BranchInst(bbMaskedBody);
}
///////////////////////////////////////////////////////////////////////////
// foreach_check_extras: see if we need to deal with any partial
// vector's worth of work that's left.
ctx->SetCurrentBasicBlock(bbCheckExtras);
ctx->AddInstrumentationPoint("foreach loop check extras");
ctx->BranchInst(bbDoExtras, bbBody, inExtras[nDims-1]);
// None of the outer dimensions is processing extras; along the lines
// of above, we can express this as:
// for (counter = start; counter < alignedEnd; counter += step) {
// // mask is all on
// // run loop body with mask all on
// }
// // counter == alignedEnd
// if (counter < end) {
// // set mask to (counter+programCounter < end)
// // run loop body with mask
// }
llvm::BasicBlock *bbPartialInnerAllOuter =
ctx->CreateBasicBlock("partial_inner_all_outer");
ctx->SetCurrentBasicBlock(bbOuterNotInExtras); {
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
llvm::Value *beforeAlignedEnd =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
counter, alignedEnd[nDims-1], "before_aligned_end");
ctx->BranchInst(bbFullBody, bbPartialInnerAllOuter,
beforeAlignedEnd);
}
///////////////////////////////////////////////////////////////////////////
// foreach_body: do a full vector's worth of work. We know that all
// full_body: do a full vector's worth of work. We know that all
// lanes will be running here, so we explicitly set the mask to be 'all
// on'. This ends up being relatively straightforward: just update the
// value of the varying loop counter and have the statements in the
// loop body emit their code.
ctx->SetCurrentBasicBlock(bbBody);
llvm::BasicBlock *bbFullBodyContinue =
ctx->CreateBasicBlock("foreach_full_continue");
ctx->SetCurrentBasicBlock(bbFullBody); {
ctx->SetInternalMask(LLVMMaskAllOn);
ctx->AddInstrumentationPoint("foreach loop body");
lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
dimVariables[nDims-1]->storagePtr, span);
ctx->SetContinueTarget(bbFullBodyContinue);
ctx->AddInstrumentationPoint("foreach loop body (all on)");
stmts->EmitCode(ctx);
Assert(ctx->GetCurrentBasicBlock() != NULL);
ctx->BranchInst(bbStep[nDims-1]);
ctx->BranchInst(bbFullBodyContinue);
}
ctx->SetCurrentBasicBlock(bbFullBodyContinue); {
ctx->RestoreContinuedLanes();
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]);
llvm::Value *newCounter =
ctx->BinaryOperator(llvm::Instruction::Add, counter,
LLVMInt32(span[nDims-1]), "new_counter");
ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]);
ctx->BranchInst(bbOuterNotInExtras);
}
///////////////////////////////////////////////////////////////////////////
// foreach_doextras: set the mask and have the statements emit their
// We're done running blocks with the mask all on; see if the counter is
// less than the end value, in which case we need to run the body one
// more time to get the extra bits.
llvm::BasicBlock *bbSetInnerMask =
ctx->CreateBasicBlock("partial_inner_only");
ctx->SetCurrentBasicBlock(bbPartialInnerAllOuter); {
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
llvm::Value *beforeFullEnd =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
counter, endVals[nDims-1], "before_full_end");
ctx->BranchInst(bbSetInnerMask, bbReset[nDims-1], beforeFullEnd);
}
///////////////////////////////////////////////////////////////////////////
// The outer dimensions are all on, so the mask is just given by the
// mask for the innermost dimension
ctx->SetCurrentBasicBlock(bbSetInnerMask); {
llvm::Value *varyingCounter =
lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
dimVariables[nDims-1]->storagePtr, span);
llvm::Value *smearEnd = llvm::UndefValue::get(LLVMTypes::Int32VectorType);
for (int j = 0; j < g->target.vectorWidth; ++j)
smearEnd = ctx->InsertInst(smearEnd, endVals[nDims-1], j, "smear_end");
llvm::Value *emask =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
varyingCounter, smearEnd);
emask = ctx->I1VecToBoolVec(emask);
ctx->SetInternalMask(emask);
ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr);
ctx->BranchInst(bbMaskedBody);
}
///////////////////////////////////////////////////////////////////////////
// masked_body: set the mask and have the statements emit their
// code again. Note that it's generally worthwhile having two copies
// of the statements' code, since the code above is emitted with the
// mask known to be all-on, which in turn leads to more efficient code
// for that case.
ctx->SetCurrentBasicBlock(bbDoExtras);
llvm::Value *mask = ctx->LoadInst(extrasMaskPtrs[nDims-1]);
ctx->SetInternalMask(mask);
llvm::BasicBlock *bbStepInnerIndex =
ctx->CreateBasicBlock("step_inner_index");
llvm::BasicBlock *bbMaskedBodyContinue =
ctx->CreateBasicBlock("foreach_masked_continue");
ctx->SetCurrentBasicBlock(bbMaskedBody); {
ctx->AddInstrumentationPoint("foreach loop body (masked)");
ctx->SetContinueTarget(bbMaskedBodyContinue);
stmts->EmitCode(ctx);
ctx->BranchInst(bbStep[nDims-1]);
ctx->BranchInst(bbMaskedBodyContinue);
}
ctx->SetCurrentBasicBlock(bbMaskedBodyContinue); {
ctx->RestoreContinuedLanes();
llvm::Value *stepIndex = ctx->LoadInst(stepIndexAfterMaskedBodyPtr);
ctx->BranchInst(bbStepInnerIndex, bbReset[nDims-1], stepIndex);
}
///////////////////////////////////////////////////////////////////////////
// step the innermost index, for the case where we're doing the
// innermost for loop over full vectors.
ctx->SetCurrentBasicBlock(bbStepInnerIndex); {
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]);
llvm::Value *newCounter =
ctx->BinaryOperator(llvm::Instruction::Add, counter,
LLVMInt32(span[nDims-1]), "new_counter");
ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]);
ctx->BranchInst(bbOuterInExtras);
}
///////////////////////////////////////////////////////////////////////////
// foreach_exit: All done. Restore the old mask and clean up
@@ -1869,6 +2109,301 @@ ForeachStmt::Print(int indent) const {
}
///////////////////////////////////////////////////////////////////////////
// CaseStmt
/** Given the statements following a 'case' or 'default' label, this
function determines whether the mask should be checked to see if it is
"all off" immediately after the label, before executing the code for
the statements.
*/
static bool
lCheckMask(Stmt *stmts) {
if (stmts == NULL)
return false;
int cost = EstimateCost(stmts);
bool safeToRunWithAllLanesOff = true;
WalkAST(stmts, lCheckAllOffSafety, NULL, &safeToRunWithAllLanesOff);
// The mask should be checked if the code following the
// 'case'/'default' is relatively complex, or if it would be unsafe to
// run that code with the execution mask all off.
return (cost > PREDICATE_SAFE_IF_STATEMENT_COST ||
safeToRunWithAllLanesOff == false);
}
CaseStmt::CaseStmt(int v, Stmt *s, SourcePos pos)
: Stmt(pos), value(v) {
stmts = s;
}
void
CaseStmt::EmitCode(FunctionEmitContext *ctx) const {
ctx->EmitCaseLabel(value, lCheckMask(stmts), pos);
if (stmts)
stmts->EmitCode(ctx);
}
void
CaseStmt::Print(int indent) const {
printf("%*cCase [%d] label", indent, ' ', value);
pos.Print();
printf("\n");
stmts->Print(indent+4);
}
Stmt *
CaseStmt::TypeCheck() {
return this;
}
int
CaseStmt::EstimateCost() const {
return 0;
}
///////////////////////////////////////////////////////////////////////////
// DefaultStmt
DefaultStmt::DefaultStmt(Stmt *s, SourcePos pos)
: Stmt(pos) {
stmts = s;
}
void
DefaultStmt::EmitCode(FunctionEmitContext *ctx) const {
ctx->EmitDefaultLabel(lCheckMask(stmts), pos);
if (stmts)
stmts->EmitCode(ctx);
}
void
DefaultStmt::Print(int indent) const {
printf("%*cDefault Stmt", indent, ' ');
pos.Print();
printf("\n");
stmts->Print(indent+4);
}
Stmt *
DefaultStmt::TypeCheck() {
return this;
}
int
DefaultStmt::EstimateCost() const {
return 0;
}
///////////////////////////////////////////////////////////////////////////
// SwitchStmt
SwitchStmt::SwitchStmt(Expr *e, Stmt *s, SourcePos pos)
: Stmt(pos) {
expr = e;
stmts = s;
}
/* An instance of this structure is carried along as we traverse the AST
nodes for the statements after a "switch" statement. We use this
structure to record all of the 'case' and 'default' statements after the
"switch". */
struct SwitchVisitInfo {
SwitchVisitInfo(FunctionEmitContext *c) {
ctx = c;
defaultBlock = NULL;
lastBlock = NULL;
}
FunctionEmitContext *ctx;
/* Basic block for the code following the "default" label (if any). */
llvm::BasicBlock *defaultBlock;
/* Map from integer values after "case" labels to the basic blocks that
follow the corresponding "case" label. */
std::vector<std::pair<int, llvm::BasicBlock *> > caseBlocks;
/* For each basic block for a "case" label or a "default" label,
nextBlock[block] stores the basic block pointer for the next
subsequent "case" or "default" label in the program. */
std::map<llvm::BasicBlock *, llvm::BasicBlock *> nextBlock;
/* The last basic block created for a "case" or "default" label; when
we create the basic block for the next one, we'll use this to update
the nextBlock map<> above. */
llvm::BasicBlock *lastBlock;
};
static bool
lSwitchASTPreVisit(ASTNode *node, void *d) {
if (dynamic_cast<SwitchStmt *>(node) != NULL)
// don't continue recursively into a nested switch--we only want
// our own case and default statements!
return false;
CaseStmt *cs = dynamic_cast<CaseStmt *>(node);
DefaultStmt *ds = dynamic_cast<DefaultStmt *>(node);
SwitchVisitInfo *svi = (SwitchVisitInfo *)d;
llvm::BasicBlock *bb = NULL;
if (cs != NULL) {
// Complain if we've seen a case statement with the same value
// already
for (int i = 0; i < (int)svi->caseBlocks.size(); ++i) {
if (svi->caseBlocks[i].first == cs->value) {
Error(cs->pos, "Duplicate case value \"%d\".", cs->value);
return true;
}
}
// Otherwise create a new basic block for the code following this
// 'case' statement and record the mappign between the case label
// value and the basic block
char buf[32];
sprintf(buf, "case_%d", cs->value);
bb = svi->ctx->CreateBasicBlock(buf);
svi->caseBlocks.push_back(std::make_pair(cs->value, bb));
}
else if (ds != NULL) {
// And complain if we've seen another 'default' label..
if (svi->defaultBlock != NULL) {
Error(ds->pos, "Multiple \"default\" lables in switch statement.");
return true;
}
else {
// Otherwise create a basic block for the code following the
// "default".
bb = svi->ctx->CreateBasicBlock("default");
svi->defaultBlock = bb;
}
}
// If we saw a "case" or "default" label, then update the map to record
// that the block we just created follows the block created for the
// previous label in the "switch".
if (bb != NULL) {
svi->nextBlock[svi->lastBlock] = bb;
svi->lastBlock = bb;
}
return true;
}
void
SwitchStmt::EmitCode(FunctionEmitContext *ctx) const {
if (ctx->GetCurrentBasicBlock() == NULL)
return;
const Type *type;
if (expr == NULL || ((type = expr->GetType()) == NULL)) {
Assert(m->errorCount > 0);
return;
}
// Basic block we'll end up after the switch statement
llvm::BasicBlock *bbDone = ctx->CreateBasicBlock("switch_done");
// Walk the AST of the statements after the 'switch' to collect a bunch
// of information about the structure of the 'case' and 'default'
// statements.
SwitchVisitInfo svi(ctx);
WalkAST(stmts, lSwitchASTPreVisit, NULL, &svi);
// Record that the basic block following the last one created for a
// case/default is the block after the end of the switch statement.
svi.nextBlock[svi.lastBlock] = bbDone;
llvm::Value *exprValue = expr->GetValue(ctx);
if (exprValue == NULL) {
Assert(m->errorCount > 0);
return;
}
bool isUniformCF = (type->IsUniformType() &&
lHasVaryingBreakOrContinue(stmts) == false);
ctx->StartSwitch(isUniformCF, bbDone);
ctx->SwitchInst(exprValue, svi.defaultBlock ? svi.defaultBlock : bbDone,
svi.caseBlocks, svi.nextBlock);
if (stmts != NULL)
stmts->EmitCode(ctx);
if (ctx->GetCurrentBasicBlock() != NULL)
ctx->BranchInst(bbDone);
ctx->SetCurrentBasicBlock(bbDone);
ctx->EndSwitch();
}
void
SwitchStmt::Print(int indent) const {
printf("%*cSwitch Stmt", indent, ' ');
pos.Print();
printf("\n");
printf("%*cexpr = ", indent, ' ');
expr->Print();
printf("\n");
stmts->Print(indent+4);
}
Stmt *
SwitchStmt::TypeCheck() {
const Type *exprType = expr->GetType();
if (exprType == NULL)
return NULL;
const Type *toType = NULL;
exprType = exprType->GetAsConstType();
bool is64bit = (exprType->GetAsUniformType() ==
AtomicType::UniformConstUInt64 ||
exprType->GetAsUniformType() ==
AtomicType::UniformConstInt64);
if (exprType->IsUniformType()) {
if (is64bit) toType = AtomicType::UniformInt64;
else toType = AtomicType::UniformInt32;
}
else {
if (is64bit) toType = AtomicType::VaryingInt64;
else toType = AtomicType::VaryingInt32;
}
expr = TypeConvertExpr(expr, toType, "switch expression");
if (expr == NULL)
return NULL;
return this;
}
int
SwitchStmt::EstimateCost() const {
const Type *type = expr->GetType();
if (type && type->IsVaryingType())
return COST_VARYING_SWITCH;
else
return COST_UNIFORM_SWITCH;
}
///////////////////////////////////////////////////////////////////////////
// ReturnStmt
@@ -1915,14 +2450,137 @@ ReturnStmt::Print(int indent) const {
}
///////////////////////////////////////////////////////////////////////////
// GotoStmt
GotoStmt::GotoStmt(const char *l, SourcePos gotoPos, SourcePos ip)
: Stmt(gotoPos) {
label = l;
identifierPos = ip;
}
void
GotoStmt::EmitCode(FunctionEmitContext *ctx) const {
if (ctx->VaryingCFDepth() > 0) {
Error(pos, "\"goto\" statements are only legal under \"uniform\" "
"control flow.");
return;
}
if (ctx->InForeachLoop()) {
Error(pos, "\"goto\" statements are currently illegal inside "
"\"foreach\" loops.");
return;
}
llvm::BasicBlock *bb = ctx->GetLabeledBasicBlock(label);
if (bb == NULL) {
// TODO: use the string distance stuff to suggest alternatives if
// there are some with names close to the label name we have here..
Error(identifierPos, "No label named \"%s\" found in current function.",
label.c_str());
return;
}
ctx->BranchInst(bb);
ctx->SetCurrentBasicBlock(NULL);
}
void
GotoStmt::Print(int indent) const {
printf("%*cGoto label \"%s\"\n", indent, ' ', label.c_str());
}
Stmt *
GotoStmt::Optimize() {
return this;
}
Stmt *
GotoStmt::TypeCheck() {
return this;
}
int
GotoStmt::EstimateCost() const {
return COST_GOTO;
}
///////////////////////////////////////////////////////////////////////////
// LabeledStmt
LabeledStmt::LabeledStmt(const char *n, Stmt *s, SourcePos p)
: Stmt(p) {
name = n;
stmt = s;
}
void
LabeledStmt::EmitCode(FunctionEmitContext *ctx) const {
llvm::BasicBlock *bblock = ctx->GetLabeledBasicBlock(name);
Assert(bblock != NULL);
// End the current basic block with a jump to our basic block and then
// set things up for emission to continue there. Note that the current
// basic block may validly be NULL going into this statement due to an
// earlier goto that NULLed it out; that doesn't stop us from
// re-establishing a current basic block starting at the label..
if (ctx->GetCurrentBasicBlock() != NULL)
ctx->BranchInst(bblock);
ctx->SetCurrentBasicBlock(bblock);
if (stmt != NULL)
stmt->EmitCode(ctx);
}
void
LabeledStmt::Print(int indent) const {
printf("%*cLabel \"%s\"\n", indent, ' ', name.c_str());
if (stmt != NULL)
stmt->Print(indent);
}
Stmt *
LabeledStmt::Optimize() {
return this;
}
Stmt *
LabeledStmt::TypeCheck() {
if (!isalpha(name[0]) || name[0] == '_') {
Error(pos, "Label must start with either alphabetic character or '_'.");
return NULL;
}
for (unsigned int i = 1; i < name.size(); ++i) {
if (!isalnum(name[i]) && name[i] != '_') {
Error(pos, "Character \"%c\" is illegal in labels.", name[i]);
return NULL;
}
}
return this;
}
int
LabeledStmt::EstimateCost() const {
return 0;
}
///////////////////////////////////////////////////////////////////////////
// StmtList
void
StmtList::EmitCode(FunctionEmitContext *ctx) const {
if (!ctx->GetCurrentBasicBlock())
return;
ctx->StartScope();
ctx->SetDebugPos(pos);
for (unsigned int i = 0; i < stmts.size(); ++i)
@@ -2020,7 +2678,7 @@ lProcessPrintArg(Expr *expr, FunctionEmitContext *ctx, std::string &argTypes) {
baseType == AtomicType::UniformUInt16) {
expr = new TypeCastExpr(type->IsUniformType() ? AtomicType::UniformInt32 :
AtomicType::VaryingInt32,
expr, false, expr->pos);
expr, expr->pos);
type = expr->GetType();
}
@@ -2173,16 +2831,6 @@ AssertStmt::EmitCode(FunctionEmitContext *ctx) const {
m->module->getFunction("__do_assert_varying");
Assert(assertFunc != NULL);
#ifdef ISPC_IS_WINDOWS
char errorString[2048];
if (sprintf_s(errorString, sizeof(errorString),
"%s(%d): Assertion failed: %s\n", pos.name,
pos.first_line, message.c_str()) == -1) {
Error(pos, "Fatal error in sprintf_s() call when generating assert "
"string.");
return;
}
#else
char *errorString;
if (asprintf(&errorString, "%s:%d:%d: Assertion failed: %s\n",
pos.name, pos.first_line, pos.first_column,
@@ -2191,7 +2839,6 @@ AssertStmt::EmitCode(FunctionEmitContext *ctx) const {
"unable to allocate memory!");
return;
}
#endif
std::vector<llvm::Value *> args;
args.push_back(ctx->GetStringPtr(errorString));
@@ -2199,9 +2846,7 @@ AssertStmt::EmitCode(FunctionEmitContext *ctx) const {
args.push_back(ctx->GetFullMask());
ctx->CallInst(assertFunc, NULL, args, "");
#ifndef ISPC_IS_WINDOWS
free(errorString);
#endif // !ISPC_IS_WINDOWS
}
@@ -2223,7 +2868,7 @@ AssertStmt::TypeCheck() {
}
expr = new TypeCastExpr(isUniform ? AtomicType::UniformBool :
AtomicType::VaryingBool,
expr, false, expr->pos);
expr, expr->pos);
expr = ::TypeCheck(expr);
}
return this;

91
stmt.h
View File

@@ -282,6 +282,97 @@ public:
};
/** Statement corresponding to a "case" label in the program. In addition
to the value associated with the "case", this statement also stores the
statements following it. */
class CaseStmt : public Stmt {
public:
CaseStmt(int value, Stmt *stmt, SourcePos pos);
void EmitCode(FunctionEmitContext *ctx) const;
void Print(int indent) const;
Stmt *TypeCheck();
int EstimateCost() const;
/** Integer value after the "case" statement */
const int value;
Stmt *stmts;
};
/** Statement for a "default" label (as would be found inside a "switch"
statement). */
class DefaultStmt : public Stmt {
public:
DefaultStmt(Stmt *stmt, SourcePos pos);
void EmitCode(FunctionEmitContext *ctx) const;
void Print(int indent) const;
Stmt *TypeCheck();
int EstimateCost() const;
Stmt *stmts;
};
/** A "switch" statement in the program. */
class SwitchStmt : public Stmt {
public:
SwitchStmt(Expr *expr, Stmt *stmts, SourcePos pos);
void EmitCode(FunctionEmitContext *ctx) const;
void Print(int indent) const;
Stmt *TypeCheck();
int EstimateCost() const;
/** Expression that is used to determine which label to jump to. */
Expr *expr;
/** Statement block after the "switch" expression. */
Stmt *stmts;
};
/** A "goto" in an ispc program. */
class GotoStmt : public Stmt {
public:
GotoStmt(const char *label, SourcePos gotoPos, SourcePos idPos);
void EmitCode(FunctionEmitContext *ctx) const;
void Print(int indent) const;
Stmt *Optimize();
Stmt *TypeCheck();
int EstimateCost() const;
/** Name of the label to jump to when the goto is executed. */
std::string label;
SourcePos identifierPos;
};
/** Statement corresponding to a label (as would be used as a goto target)
in the program. */
class LabeledStmt : public Stmt {
public:
LabeledStmt(const char *label, Stmt *stmt, SourcePos p);
void EmitCode(FunctionEmitContext *ctx) const;
void Print(int indent) const;
Stmt *Optimize();
Stmt *TypeCheck();
int EstimateCost() const;
/** Name of the label. */
std::string name;
/** Statements following the label. */
Stmt *stmt;
};
/** @brief Representation of a list of statements in the program.
*/
class StmtList : public Stmt {

35
sym.cpp
View File

@@ -72,8 +72,7 @@ SymbolTable::SymbolTable() {
SymbolTable::~SymbolTable() {
// Otherwise we have mismatched push/pop scopes
Assert(variables.size() == 1 && functions.size() == 1 &&
types.size() == 1);
Assert(variables.size() == 1 && types.size() == 1);
PopScope();
}
@@ -81,7 +80,6 @@ SymbolTable::~SymbolTable() {
void
SymbolTable::PushScope() {
variables.push_back(new SymbolMapType);
functions.push_back(new FunctionMapType);
types.push_back(new TypeMapType);
}
@@ -92,10 +90,6 @@ SymbolTable::PopScope() {
delete variables.back();
variables.pop_back();
Assert(functions.size() > 1);
delete functions.back();
functions.pop_back();
Assert(types.size() > 1);
delete types.back();
types.pop_back();
@@ -160,7 +154,7 @@ SymbolTable::AddFunction(Symbol *symbol) {
// the symbol table
return false;
std::vector<Symbol *> &funOverloads = (*functions.back())[symbol->name];
std::vector<Symbol *> &funOverloads = functions[symbol->name];
funOverloads.push_back(symbol);
return true;
}
@@ -168,10 +162,8 @@ SymbolTable::AddFunction(Symbol *symbol) {
bool
SymbolTable::LookupFunction(const char *name, std::vector<Symbol *> *matches) {
for (int i = (int)functions.size() - 1; i >= 0; --i) {
FunctionMapType &fm = *(functions[i]);
FunctionMapType::iterator iter = fm.find(name);
if (iter != fm.end()) {
FunctionMapType::iterator iter = functions.find(name);
if (iter != functions.end()) {
if (matches == NULL)
return true;
else {
@@ -180,24 +172,20 @@ SymbolTable::LookupFunction(const char *name, std::vector<Symbol *> *matches) {
matches->push_back(funcs[j]);
}
}
}
return matches ? (matches->size() > 0) : false;
}
Symbol *
SymbolTable::LookupFunction(const char *name, const FunctionType *type) {
for (int i = (int)functions.size() - 1; i >= 0; --i) {
FunctionMapType &fm = *(functions[i]);
FunctionMapType::iterator iter = fm.find(name);
if (iter != fm.end()) {
FunctionMapType::iterator iter = functions.find(name);
if (iter != functions.end()) {
std::vector<Symbol *> funcs = iter->second;
for (int j = 0; j < (int)funcs.size(); ++j) {
if (Type::Equal(funcs[j]->type, type))
return funcs[j];
}
}
}
return NULL;
}
@@ -261,15 +249,12 @@ SymbolTable::ClosestVariableOrFunctionMatch(const char *str) const {
}
}
for (int i = 0; i < (int)functions.size(); ++i) {
const FunctionMapType &fm = *(functions[i]);
FunctionMapType::const_iterator iter;
for (iter = fm.begin(); iter != fm.end(); ++iter) {
for (iter = functions.begin(); iter != functions.end(); ++iter) {
int dist = StringEditDistance(str, iter->first, maxDelta+1);
if (dist <= maxDelta)
matches[dist].push_back(iter->first);
}
}
// Now, return the first entry of matches[] that is non-empty, if any.
for (int i = 0; i <= maxDelta; ++i) {
@@ -346,16 +331,14 @@ SymbolTable::Print() {
}
fprintf(stderr, "Functions:\n----------------\n");
for (int i = 0; i < (int)functions.size(); ++i) {
FunctionMapType::iterator fiter = functions[i]->begin();
while (fiter != functions[i]->end()) {
FunctionMapType::iterator fiter = functions.begin();
while (fiter != functions.end()) {
fprintf(stderr, "%s\n", fiter->first.c_str());
std::vector<Symbol *> &syms = fiter->second;
for (unsigned int j = 0; j < syms.size(); ++j)
fprintf(stderr, " %s\n", syms[j]->type->GetString().c_str());
++fiter;
}
}
depth = 0;
fprintf(stderr, "Named types:\n---------------\n");

16
sym.h
View File

@@ -257,12 +257,13 @@ private:
typedef std::map<std::string, Symbol *> SymbolMapType;
std::vector<SymbolMapType *> variables;
/** Function declarations are also scoped., A STL \c vector is used to
store the function symbols for a given name since, due to function
overloading, a name can have multiple function symbols associated
with it. */
/** Function declarations are *not* scoped. (C99, for example, allows
an implementation to maintain function declarations in a single
namespace.) A STL \c vector is used to store the function symbols
for a given name since, due to function overloading, a name can
have multiple function symbols associated with it. */
typedef std::map<std::string, std::vector<Symbol *> > FunctionMapType;
std::vector<FunctionMapType *> functions;
FunctionMapType functions;
/** Type definitions can also be scoped. A new \c TypeMapType
is added to the back of the \c types \c vector each time a new scope
@@ -278,17 +279,14 @@ SymbolTable::GetMatchingFunctions(Predicate pred,
std::vector<Symbol *> *matches) const {
// Iterate through all function symbols and apply the given predicate.
// If it returns true, add the Symbol * to the provided vector.
for (unsigned int i = 0; i < functions.size(); ++i) {
FunctionMapType &fm = *(functions[i]);
FunctionMapType::const_iterator iter;
for (iter = fm.begin(); iter != fm.end(); ++iter) {
for (iter = functions.begin(); iter != functions.end(); ++iter) {
const std::vector<Symbol *> &syms = iter->second;
for (unsigned int j = 0; j < syms.size(); ++j) {
if (pred(syms[j]))
matches->push_back(syms[j]);
}
}
}
}

View File

@@ -46,7 +46,6 @@
#include <assert.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <stdint.h>
#ifdef ISPC_IS_LINUX
#include <malloc.h>

17
tests/atomics-swap.ispc Normal file
View File

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
uniform int32 s = 1234;
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
float b = 0;
if (programIndex & 1) {
b = atomic_swap_global(&s, programIndex);
}
RET[programIndex] = reduce_add(b) + s;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1234 + reduce_add(programIndex & 1 ? programIndex : 0);
}

17
tests/goto-1.ispc Normal file
View File

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
float b = 0.; b = a;
RET[programIndex] = a+b;
goto skip;
RET[programIndex] = 0;
skip:
;
}
export void result(uniform float RET[]) {
RET[programIndex] = 2 + 2*programIndex;
}

18
tests/goto-2.ispc Normal file
View File

@@ -0,0 +1,18 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
float b = 0.; b = a;
RET[programIndex] = a+b;
if (all(a != 0))
goto skip;
RET[programIndex] = 0;
skip:
;
}
export void result(uniform float RET[]) {
RET[programIndex] = 2 + 2*programIndex;
}

18
tests/goto-3.ispc Normal file
View File

@@ -0,0 +1,18 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
float b = 0.; b = a;
RET[programIndex] = a+b;
if (all(a == 0))
goto skip;
RET[programIndex] = 0;
skip:
;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
}

19
tests/goto-4.ispc Normal file
View File

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
float b = 0.; b = a;
RET[programIndex] = 0;
encore:
++RET[programIndex];
if (any(a != 0)) {
a = max(a-1, 0);
goto encore;
}
}
export void result(uniform float RET[]) {
RET[programIndex] = programCount+1;
}

21
tests/half-3.ispc Normal file
View File

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
export void f_v(uniform float RET[]) {
int errors = 0;
foreach (i = 0 ... 65535) {
unsigned int16 h = i;
float f = half_to_float(i);
h = float_to_half(f);
int mismatches = (f == f && i != h);
errors += reduce_add(mismatches);
}
RET[programIndex] = errors;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
}

View File

@@ -0,0 +1,13 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
a *= 1k;
RET[programIndex] = a;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1024*(programIndex+1);
}

View File

@@ -0,0 +1,12 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int a = b + 2M;
RET[programIndex] = a;
}
export void result(uniform float RET[]) {
RET[programIndex] = 2*1024*1024 + 5;
}

View File

@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
unsigned int32 a = 3G;
a -= 2G;
a -= 1024M;
RET[programIndex] = a;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
}

View File

@@ -0,0 +1,16 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform float a[programCount];
a[programIndex] = aFOO[programIndex];
uniform float * uniform ptr = a;
*(ptr+1) = 0;
RET[programIndex] = a[programIndex];
}
export void result(uniform float RET[]) {
RET[programIndex] = 1+programIndex;
RET[1] = 0;
}

View File

@@ -0,0 +1,15 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform float a[programCount];
a[programIndex] = aFOO[programIndex];
uniform float * varying ptr = a;
*(ptr+programIndex) = 0;
RET[programIndex] = a[programIndex];
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
}

18
tests/switch-1.ispc Normal file
View File

@@ -0,0 +1,18 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int a = aFOO[programIndex];
switch (b) {
default:
RET[programIndex] = -1;
break;
case 5:
RET[programIndex] = 0;
}
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
}

44
tests/switch-10.ispc Normal file
View File

@@ -0,0 +1,44 @@
export uniform int width() { return programCount; }
int switchit(int a, uniform int b) {
switch (a) {
case 3:
return 1;
case 7:
case 6:
case 4:
case 5:
if (a & 1)
break;
return 2;
case 1: {
switch (a+b) {
case 6:
return 42;
default:
break;
}
return -1234;
}
case 32:
*((int *)NULL) = 0;
default:
return 0;
}
return 3;
}
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int a = aFOO[programIndex];
int x = switchit(a, b);
RET[programIndex] = x;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 42;
RET[2] = 1;
RET[6] = RET[4] = 3;
RET[5] = RET[3] = 2;
}

50
tests/switch-11.ispc Normal file
View File

@@ -0,0 +1,50 @@
export uniform int width() { return programCount; }
int switchit(int a, uniform int b) {
switch (a) {
case 3:
return 1;
case 7:
case 6:
case 4:
case 5:
if (a & 1)
break;
return 2;
case 1: {
switch (a+b) {
case 60:
return -1234;
default:
break;
case 6:
if (b == 5)
break;
return -42;
case 12:
return -1;
}
return 42;
}
case 32:
*((int *)NULL) = 0;
default:
return 0;
}
return 3;
}
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int a = aFOO[programIndex];
int x = switchit(a, b);
RET[programIndex] = x;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 42;
RET[2] = 1;
RET[6] = RET[4] = 3;
RET[5] = RET[3] = 2;
}

54
tests/switch-12.ispc Normal file
View File

@@ -0,0 +1,54 @@
export uniform int width() { return programCount; }
int switchit(int a, uniform int b) {
switch (a) {
case 3:
return 1;
case 7:
case 6:
case 4:
case 5:
if (a & 1)
break;
return 2;
case 1: {
switch (a+b) {
case 60:
return -1234;
default:
break;
case 6:
int count = 0;
for (count = 0; count < 10; ++count) {
a += b;
if (a == 11)
break;
}
return a;
case 12:
return -1;
}
return 42;
}
case 32:
*((int *)NULL) = 0;
default:
return 0;
}
return 3;
}
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int a = aFOO[programIndex];
int x = switchit(a, b);
RET[programIndex] = x;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 11;
RET[2] = 1;
RET[6] = RET[4] = 3;
RET[5] = RET[3] = 2;
}

28
tests/switch-13.ispc Normal file
View File

@@ -0,0 +1,28 @@
export uniform int width() { return programCount; }
int switchit(int a, uniform int b) {
int r = -1;
switch (b) {
case 5:
if (a & 1) {
r=3;
break;
}
r= 2;
break;
default:
r= 3;
}
return r;
}
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int a = aFOO[programIndex];
int x = switchit(a, b);
RET[programIndex] = x;
}
export void result(uniform float RET[]) {
RET[programIndex] = (programIndex & 1) ? 2 : 3;
}

24
tests/switch-14.ispc Normal file
View File

@@ -0,0 +1,24 @@
export uniform int width() { return programCount; }
int switchit(int a, uniform int b) {
switch (b) {
case 5:
if (a & 1)
break;
return 2;
default:
return 42;
}
return 3;
}
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int a = aFOO[programIndex];
int x = switchit(a, b);
RET[programIndex] = x;
}
export void result(uniform float RET[]) {
RET[programIndex] = (programIndex & 1) ? 2 : 3;
}

17
tests/switch-2.ispc Normal file
View File

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int a = aFOO[programIndex];
switch (b) {
default:
RET[programIndex] = -1;
case 5:
RET[programIndex] = 0;
}
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
}

18
tests/switch-3.ispc Normal file
View File

@@ -0,0 +1,18 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int a = aFOO[programIndex];
switch (b) {
case 5:
RET[programIndex] = 0;
break;
default:
RET[programIndex] = -1;
}
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
}

24
tests/switch-4.ispc Normal file
View File

@@ -0,0 +1,24 @@
export uniform int width() { return programCount; }
int switchit(int a, uniform int b) {
int r = 0;
switch (a) {
case 3:
r = 1;
break;
default:
r = 0;
}
return r;
}
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int a = aFOO[programIndex];
int x = switchit(a, b);
RET[programIndex] = x;
}
export void result(uniform float RET[]) {
RET[programIndex] = (programIndex == 2) ? 1 : 0;
}

22
tests/switch-5.ispc Normal file
View File

@@ -0,0 +1,22 @@
export uniform int width() { return programCount; }
int switchit(int a, uniform int b) {
int r = 0;
switch (a) {
case 3:
return 1;
default:
return 0;
}
}
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int a = aFOO[programIndex];
int x = switchit(a, b);
RET[programIndex] = x;
}
export void result(uniform float RET[]) {
RET[programIndex] = (programIndex == 2) ? 1 : 0;
}

27
tests/switch-6.ispc Normal file
View File

@@ -0,0 +1,27 @@
export uniform int width() { return programCount; }
int switchit(int a, uniform int b) {
switch (a) {
case 3:
return 1;
case 7:
if (b == 5)
break;
default:
return 0;
}
return -1;
}
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int a = aFOO[programIndex];
int x = switchit(a, b);
RET[programIndex] = x;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[2] = 1;
RET[6] = -1;
}

32
tests/switch-7.ispc Normal file
View File

@@ -0,0 +1,32 @@
export uniform int width() { return programCount; }
int switchit(int a, uniform int b) {
switch (a) {
case 3:
return 1;
case 7:
case 6:
case 4:
case 5:
if (a & 1)
break;
return 2;
default:
return 0;
}
return 3;
}
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int a = aFOO[programIndex];
int x = switchit(a, b);
RET[programIndex] = x;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[2] = 1;
RET[6] = RET[4] = 3;
RET[5] = RET[3] = 2;
}

36
tests/switch-8.ispc Normal file
View File

@@ -0,0 +1,36 @@
export uniform int width() { return programCount; }
int switchit(int a, uniform int b) {
switch (a) {
case 3:
return 1;
case 7:
case 6:
case 4:
case 5:
if (a & 1)
break;
return 2;
case 32:
*((int *)NULL) = 0;
default:
case 1:
case 2:
return 0;
}
return 3;
}
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int a = aFOO[programIndex];
int x = switchit(a, b);
RET[programIndex] = x;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[2] = 1;
RET[6] = RET[4] = 3;
RET[5] = RET[3] = 2;
}

34
tests/switch-9.ispc Normal file
View File

@@ -0,0 +1,34 @@
export uniform int width() { return programCount; }
int switchit(int a, uniform int b) {
switch (a) {
case 3:
return 1;
case 7:
case 6:
case 4:
case 5:
if (a & 1)
break;
return 2;
case 32:
*((int *)NULL) = 0;
default:
return 0;
}
return 3;
}
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int a = aFOO[programIndex];
int x = switchit(a, b);
RET[programIndex] = x;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[2] = 1;
RET[6] = RET[4] = 3;
RET[5] = RET[3] = 2;
}

View File

@@ -1,4 +1,4 @@
// Can't convert argument of type "void * const uniform" to type "float" for funcion call argument.
// Can't convert argument of type "void * uniform" to type "float" for function call argument.
float bar(float a, float b);

10
tests_errors/goto-1.ispc Normal file
View File

@@ -0,0 +1,10 @@
// Multiple labels named "label" in function
void func(int x) {
label:
;
label:
;
}

11
tests_errors/goto-2.ispc Normal file
View File

@@ -0,0 +1,11 @@
// "goto" statements are only legal under "uniform" control flow
void func(int x) {
if (x < 0)
goto label;
label:
;
}

11
tests_errors/goto-3.ispc Normal file
View File

@@ -0,0 +1,11 @@
// "goto" statements are only legal under "uniform" control flow
void func(int x) {
cif (x < 0)
goto label;
label:
;
}

10
tests_errors/goto-4.ispc Normal file
View File

@@ -0,0 +1,10 @@
// "goto" statements are only legal under "uniform" control flow
void func(int x) {
label:
for(int i =0 ;i<x;)
goto label;
}

View File

@@ -1,4 +1,4 @@
// Left hand side of assignment statement can't be assigned to
// Left hand side of assignment expression can't be assigned to
int foo() {return 2;}

View File

@@ -1,4 +1,4 @@
// Left hand side of assignment statement can't be assigned to
// Can't assign to type "const uniform int32" on left-hand side of expression
int bar(){
4 = 0;

View File

@@ -1,4 +1,4 @@
// Left hand side of assignment statement can't be assigned to
// Can't assign to type "const uniform int32" on left-hand side of expression
int bar(){
int x;

View File

@@ -0,0 +1,9 @@
// Case statement value must be a compile-time integer constant
void foo(float f) {
switch (f) {
case 1.5:
++f;
}
}

View File

@@ -0,0 +1,12 @@
// Duplicate case value "1"
void foo(float f) {
switch (f) {
case 1:
++f;
case 2:
case 1:
f = 0;
}
}

View File

@@ -0,0 +1,13 @@
// "case" label illegal outside of "switch" statement
void foo(float f) {
switch (f) {
case 1:
++f;
case 2:
f = 0;
}
case 3:
--f;
}

View File

@@ -0,0 +1,13 @@
// "default" label illegal outside of "switch" statement
void foo(float f) {
default:
++f;
switch (f) {
case 1:
++f;
case 2:
f = 0;
}
}

View File

@@ -0,0 +1,14 @@
// "default" label illegal outside of "switch" statement
void foo(float f) {
default:
++f;
switch (f) {
case 1:
++f;
continue;
case 2:
f = 0;
}
}

View File

@@ -0,0 +1,12 @@
// "continue" statement illegal outside of for/while/do/foreach loops
void foo(float f) {
switch (f) {
case 1:
++f;
continue;
case 2:
f = 0;
}
}

735
type.cpp

File diff suppressed because it is too large Load Diff

151
type.h
View File

@@ -78,20 +78,44 @@ public:
/** Returns true if the underlying type is a float or integer type. */
bool IsNumericType() const { return IsFloatType() || IsIntType(); }
/** Types may have uniform, varying, or not-yet-determined variability;
this enumerant is used by Type implementations to record their
variability. */
enum Variability {
Uniform,
Varying,
Unbound
};
/** Returns the variability of the type. */
virtual Variability GetVariability() const = 0;
/** Returns true if the underlying type is uniform */
virtual bool IsUniformType() const = 0;
bool IsUniformType() const { return GetVariability() == Uniform; }
/** Returns true if the underlying type is varying */
bool IsVaryingType() const { return !IsUniformType(); }
bool IsVaryingType() const { return GetVariability() == Varying; }
/** Returns true if the underlying type's uniform/varying-ness is
unbound. */
bool HasUnboundVariability() const { return GetVariability() == Unbound; }
/* Returns a type wherein any elements of the original type and
contained types that have unbound variability have their variability
set to the given variability. */
virtual const Type *ResolveUnboundVariability(Variability v) const = 0;
/** Return a "uniform" instance of this type. If the type is already
uniform, its "this" pointer will be returned. */
virtual const Type *GetAsUniformType() const = 0;
/** Return a "varying" instance of this type. If the type is already
uniform, its "this" pointer will be returned. */
varying, its "this" pointer will be returned. */
virtual const Type *GetAsVaryingType() const = 0;
/** Get an instance of the type with unbound variability. */
virtual const Type *GetAsUnboundVariabilityType() const = 0;
/** If this is a signed integer type, return the unsigned version of
the type. Otherwise, return the original type. */
virtual const Type *GetAsUnsignedType() const;
@@ -185,7 +209,8 @@ public:
*/
class AtomicType : public Type {
public:
bool IsUniformType() const;
Variability GetVariability() const;
bool IsBoolType() const;
bool IsFloatType() const;
bool IsIntType() const;
@@ -195,8 +220,10 @@ public:
/** For AtomicTypes, the base type is just the same as the AtomicType
itself. */
const AtomicType *GetBaseType() const;
const AtomicType *GetAsVaryingType() const;
const AtomicType *GetAsUniformType() const;
const AtomicType *GetAsVaryingType() const;
const AtomicType *GetAsUnboundVariabilityType() const;
const AtomicType *ResolveUnboundVariability(Variability v) const;
const AtomicType *GetAsUnsignedType() const;
const Type *GetSOAType(int width) const;
const AtomicType *GetAsConstType() const;
@@ -224,38 +251,45 @@ public:
TYPE_INT64,
TYPE_UINT64,
TYPE_DOUBLE,
NUM_BASIC_TYPES
};
const BasicType basicType;
static const AtomicType *UniformBool, *VaryingBool;
static const AtomicType *UniformInt8, *VaryingInt8;
static const AtomicType *UniformInt16, *VaryingInt16;
static const AtomicType *UniformInt32, *VaryingInt32;
static const AtomicType *UniformUInt8, *VaryingUInt8;
static const AtomicType *UniformUInt16, *VaryingUInt16;
static const AtomicType *UniformUInt32, *VaryingUInt32;
static const AtomicType *UniformFloat, *VaryingFloat;
static const AtomicType *UniformInt64, *VaryingInt64;
static const AtomicType *UniformUInt64, *VaryingUInt64;
static const AtomicType *UniformDouble, *VaryingDouble;
static const AtomicType *UniformConstBool, *VaryingConstBool;
static const AtomicType *UniformConstInt8, *VaryingConstInt8;
static const AtomicType *UniformConstInt16, *VaryingConstInt16;
static const AtomicType *UniformConstInt32, *VaryingConstInt32;
static const AtomicType *UniformConstUInt8, *VaryingConstUInt8;
static const AtomicType *UniformConstUInt16, *VaryingConstUInt16;
static const AtomicType *UniformConstUInt32, *VaryingConstUInt32;
static const AtomicType *UniformConstFloat, *VaryingConstFloat;
static const AtomicType *UniformConstInt64, *VaryingConstInt64;
static const AtomicType *UniformConstUInt64, *VaryingConstUInt64;
static const AtomicType *UniformConstDouble, *VaryingConstDouble;
static const AtomicType *UniformBool, *VaryingBool, *UnboundBool;
static const AtomicType *UniformInt8, *VaryingInt8, *UnboundInt8;
static const AtomicType *UniformInt16, *VaryingInt16, *UnboundInt16;
static const AtomicType *UniformInt32, *VaryingInt32, *UnboundInt32;
static const AtomicType *UniformUInt8, *VaryingUInt8, *UnboundUInt8;
static const AtomicType *UniformUInt16, *VaryingUInt16, *UnboundUInt16;
static const AtomicType *UniformUInt32, *VaryingUInt32, *UnboundUInt32;
static const AtomicType *UniformFloat, *VaryingFloat, *UnboundFloat;
static const AtomicType *UniformInt64, *VaryingInt64, *UnboundInt64;
static const AtomicType *UniformUInt64, *VaryingUInt64, *UnboundUInt64;
static const AtomicType *UniformDouble, *VaryingDouble, *UnboundDouble;
static const AtomicType *UniformConstBool, *VaryingConstBool, *UnboundConstBool;
static const AtomicType *UniformConstInt8, *VaryingConstInt8, *UnboundConstInt8;
static const AtomicType *UniformConstInt16, *VaryingConstInt16, *UnboundConstInt16;
static const AtomicType *UniformConstInt32, *VaryingConstInt32, *UnboundConstInt32;
static const AtomicType *UniformConstUInt8, *VaryingConstUInt8, *UnboundConstUInt8;
static const AtomicType *UniformConstUInt16, *VaryingConstUInt16, *UnboundConstUInt16;
static const AtomicType *UniformConstUInt32, *VaryingConstUInt32, *UnboundConstUInt32;
static const AtomicType *UniformConstFloat, *VaryingConstFloat, *UnboundConstFloat;
static const AtomicType *UniformConstInt64, *VaryingConstInt64, *UnboundConstInt64;
static const AtomicType *UniformConstUInt64, *VaryingConstUInt64, *UnboundConstUInt64;
static const AtomicType *UniformConstDouble, *VaryingConstDouble, *UnboundConstDouble;
static const AtomicType *Void;
/** This function must be called before any of the above static const
AtomicType values is used; in practice, we do it early in
main(). */
static void Init();
private:
const bool isUniform;
static const AtomicType *typeTable[NUM_BASIC_TYPES][3][2];
const Variability variability;
const bool isConst;
AtomicType(BasicType basicType, bool isUniform, bool isConst);
AtomicType(BasicType basicType, Variability v, bool isConst);
};
@@ -268,7 +302,8 @@ public:
/** Constructor for named enumerated types */
EnumType(const char *name, SourcePos pos);
bool IsUniformType() const;
Variability GetVariability() const;
bool IsBoolType() const;
bool IsFloatType() const;
bool IsIntType() const;
@@ -278,6 +313,8 @@ public:
const EnumType *GetBaseType() const;
const EnumType *GetAsVaryingType() const;
const EnumType *GetAsUniformType() const;
const EnumType *GetAsUnboundVariabilityType() const;
const EnumType *ResolveUnboundVariability(Variability v) const;
const Type *GetSOAType(int width) const;
const EnumType *GetAsConstType() const;
const EnumType *GetAsNonConstType() const;
@@ -300,15 +337,17 @@ public:
private:
const std::string name;
bool isUniform, isConst;
Variability variability;
bool isConst;
std::vector<Symbol *> enumerators;
};
/** @brief Type implementation for pointers to other types
*/
class PointerType : public Type {
public:
PointerType(const Type *t, bool isUniform, bool isConst);
PointerType(const Type *t, Variability v, bool isConst);
/** Helper method to return a uniform pointer to the given type. */
static PointerType *GetUniform(const Type *t);
@@ -318,7 +357,8 @@ public:
/** Returns true if the given type is a void * type. */
static bool IsVoidPointer(const Type *t);
bool IsUniformType() const;
Variability GetVariability() const;
bool IsBoolType() const;
bool IsFloatType() const;
bool IsIntType() const;
@@ -328,6 +368,8 @@ public:
const Type *GetBaseType() const;
const PointerType *GetAsVaryingType() const;
const PointerType *GetAsUniformType() const;
const PointerType *GetAsUnboundVariabilityType() const;
const PointerType *ResolveUnboundVariability(Variability v) const;
const Type *GetSOAType(int width) const;
const PointerType *GetAsConstType() const;
const PointerType *GetAsNonConstType() const;
@@ -342,7 +384,8 @@ public:
static PointerType *Void;
private:
const bool isUniform, isConst;
const Variability variability;
const bool isConst;
const Type *baseType;
};
@@ -408,7 +451,8 @@ public:
*/
ArrayType(const Type *elementType, int numElements);
bool IsUniformType() const;
Variability GetVariability() const;
bool IsBoolType() const;
bool IsFloatType() const;
bool IsIntType() const;
@@ -418,6 +462,9 @@ public:
const Type *GetBaseType() const;
const ArrayType *GetAsVaryingType() const;
const ArrayType *GetAsUniformType() const;
const ArrayType *GetAsUnboundVariabilityType() const;
const ArrayType *ResolveUnboundVariability(Variability v) const;
const ArrayType *GetAsUnsignedType() const;
const Type *GetSOAType(int width) const;
const ArrayType *GetAsConstType() const;
@@ -495,6 +542,9 @@ public:
const SOAArrayType *GetAsVaryingType() const;
const SOAArrayType *GetAsUniformType() const;
const SOAArrayType *GetAsUnboundVariabilityType() const;
const SOAArrayType *ResolveUnboundVariability(Variability v) const;
const Type *GetSOAType(int width) const;
const SOAArrayType *GetAsConstType() const;
const SOAArrayType *GetAsNonConstType() const;
@@ -536,7 +586,8 @@ class VectorType : public SequentialType {
public:
VectorType(const AtomicType *base, int size);
bool IsUniformType() const;
Variability GetVariability() const;
bool IsBoolType() const;
bool IsFloatType() const;
bool IsIntType() const;
@@ -546,6 +597,9 @@ public:
const Type *GetBaseType() const;
const VectorType *GetAsVaryingType() const;
const VectorType *GetAsUniformType() const;
const VectorType *GetAsUnboundVariabilityType() const;
const VectorType *ResolveUnboundVariability(Variability v) const;
const Type *GetSOAType(int width) const;
const VectorType *GetAsConstType() const;
const VectorType *GetAsNonConstType() const;
@@ -580,9 +634,10 @@ public:
StructType(const std::string &name, const std::vector<const Type *> &elts,
const std::vector<std::string> &eltNames,
const std::vector<SourcePos> &eltPositions, bool isConst,
bool isUniform, SourcePos pos);
Variability variability, SourcePos pos);
Variability GetVariability() const;
bool IsUniformType() const;
bool IsBoolType() const;
bool IsFloatType() const;
bool IsIntType() const;
@@ -592,6 +647,9 @@ public:
const Type *GetBaseType() const;
const StructType *GetAsVaryingType() const;
const StructType *GetAsUniformType() const;
const StructType *GetAsUnboundVariabilityType() const;
const StructType *ResolveUnboundVariability(Variability v) const;
const Type *GetSOAType(int width) const;
const StructType *GetAsConstType() const;
const StructType *GetAsNonConstType() const;
@@ -641,7 +699,7 @@ private:
/** Source file position at which each structure element declaration
appeared. */
const std::vector<SourcePos> elementPositions;
const bool isUniform;
const Variability variability;
const bool isConst;
const SourcePos pos;
};
@@ -653,7 +711,8 @@ class ReferenceType : public Type {
public:
ReferenceType(const Type *targetType);
bool IsUniformType() const;
Variability GetVariability() const;
bool IsBoolType() const;
bool IsFloatType() const;
bool IsIntType() const;
@@ -664,6 +723,9 @@ public:
const Type *GetReferenceTarget() const;
const ReferenceType *GetAsVaryingType() const;
const ReferenceType *GetAsUniformType() const;
const ReferenceType *GetAsUnboundVariabilityType() const;
const ReferenceType *ResolveUnboundVariability(Variability v) const;
const Type *GetSOAType(int width) const;
const ReferenceType *GetAsConstType() const;
const ReferenceType *GetAsNonConstType() const;
@@ -696,13 +758,14 @@ public:
FunctionType(const Type *returnType,
const std::vector<const Type *> &argTypes, SourcePos pos);
FunctionType(const Type *returnType,
const std::vector<const Type *> &argTypes, SourcePos pos,
const std::vector<const Type *> &argTypes,
const std::vector<std::string> &argNames,
const std::vector<ConstExpr *> &argDefaults,
const std::vector<SourcePos> &argPos,
bool isTask, bool isExported, bool isExternC);
bool IsUniformType() const;
Variability GetVariability() const;
bool IsBoolType() const;
bool IsFloatType() const;
bool IsIntType() const;
@@ -712,6 +775,9 @@ public:
const Type *GetBaseType() const;
const Type *GetAsVaryingType() const;
const Type *GetAsUniformType() const;
const Type *GetAsUnboundVariabilityType() const;
const FunctionType *ResolveUnboundVariability(Variability v) const;
const Type *GetSOAType(int width) const;
const Type *GetAsConstType() const;
const Type *GetAsNonConstType() const;
@@ -752,6 +818,7 @@ public:
private:
const Type * const returnType;
// The following four vectors should all have the same length (which is
// in turn the length returned by GetNumParameters()).
const std::vector<const Type *> paramTypes;

View File

@@ -39,6 +39,9 @@
#include "module.h"
#ifdef ISPC_IS_WINDOWS
#include <shlwapi.h>
#ifdef __MINGW32__
#include <malloc.h> // for alloca()
#endif
#else
#include <alloca.h>
#endif
@@ -75,7 +78,7 @@ lTerminalWidth() {
HANDLE h = GetStdHandle(STD_OUTPUT_HANDLE);
if (h == INVALID_HANDLE_VALUE || h == NULL)
return 80;
CONSOLE_SCREEN_BUFFER_INFO bufferInfo = { 0 };
CONSOLE_SCREEN_BUFFER_INFO bufferInfo = { {0} };
GetConsoleScreenBufferInfo(h, &bufferInfo);
return bufferInfo.dwSize.X;
#else
@@ -187,6 +190,32 @@ lPrintWithWordBreaks(const char *buf, int columnWidth, FILE *out) {
}
#ifdef ISPC_IS_WINDOWS
// we cover for the lack vasprintf and asprintf on windows (also covers mingw)
int
vasprintf(char **sptr, const char *fmt, va_list argv)
{
int wanted = vsnprintf(*sptr = NULL, 0, fmt, argv);
if((wanted < 0) || ((*sptr = (char*)malloc( 1 + wanted )) == NULL))
return -1;
return vsprintf(*sptr, fmt, argv);
}
int
asprintf(char **sptr, const char *fmt, ...)
{
int retval;
va_list argv;
va_start(argv, fmt);
retval = vasprintf(sptr, fmt, argv);
va_end(argv);
return retval;
}
#endif
/** Helper function for Error(), Warning(), etc.
@param type The type of message being printed (e.g. "Warning")
@@ -197,30 +226,6 @@ lPrintWithWordBreaks(const char *buf, int columnWidth, FILE *out) {
*/
static void
lPrint(const char *type, SourcePos p, const char *fmt, va_list args) {
#ifdef ISPC_IS_WINDOWS
char errorBuf[2048], formattedBuf[2048];
if (vsnprintf_s(errorBuf, sizeof(errorBuf), _TRUNCATE, fmt, args) == -1) {
fprintf(stderr, "vsnprintf_s() error!\n");
return;
}
if (p.first_line == 0) {
// We don't have a valid SourcePos, so create a message without it
if (sprintf_s(formattedBuf, sizeof(formattedBuf), "%s: %s\n",
type, errorBuf) == -1) {
fprintf(stderr, "vsnprintf_s() error!\n");
exit(1);
}
}
else {
// Create an error message that includes the file and line number
if (sprintf_s(formattedBuf, sizeof(formattedBuf), "%s(%d): %s: %s\n",
p.name, p.first_line, type, errorBuf) == -1) {
fprintf(stderr, "vsnprintf_s() error!\n");
exit(1);
}
}
#else
char *errorBuf, *formattedBuf;
if (vasprintf(&errorBuf, fmt, args) == -1) {
fprintf(stderr, "vasprintf() unable to allocate memory!\n");
@@ -241,7 +246,6 @@ lPrint(const char *type, SourcePos p, const char *fmt, va_list args) {
exit(1);
}
}
#endif
// Now that we've done all that work, see if we've already printed the
// exact same error message. If so, return, so we don't redundantly
@@ -254,10 +258,8 @@ lPrint(const char *type, SourcePos p, const char *fmt, va_list args) {
lPrintWithWordBreaks(formattedBuf, lTerminalWidth(), stderr);
lPrintFileLineContext(p);
#ifndef ISPC_IS_WINDOWS
free(errorBuf);
free(formattedBuf);
#endif // !ISPC_IS_WINDOWS
}

9
util.h
View File

@@ -40,6 +40,9 @@
#define ISPC_UTIL_H
#include "ispc.h"
#ifdef ISPC_IS_WINDOWS
#include <stdarg.h>
#endif
struct SourcePos;
@@ -62,6 +65,12 @@ inline uint32_t RoundUpPow2(uint32_t v) {
#define PRINTF_FUNC
#endif // __GNUG__
// for cross-platform compatibility
#ifdef ISPC_IS_WINDOWS
int vasprintf(char **sptr, const char *fmt, va_list argv);
int asprintf(char **sptr, const char *fmt, ...);
#endif
/** Prints a debugging message. These messages are only printed if
g->debugPrint is \c true. In addition to a program source code
position to associate with the message, a printf()-style format string