diff --git a/Makefile b/Makefile index 5a2246c3..b83714c9 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,11 @@ # ARCH_OS = $(shell uname) +ifeq ($(ARCH_OS), Darwin) + ARCH_OS2 = "OSX" +else + ARCH_OS2 = $(shell uname -o) +endif ARCH_TYPE = $(shell arch) ifeq ($(shell llvm-config --version), 3.1svn) @@ -26,7 +31,15 @@ CLANG_LIBS = -lclangFrontend -lclangDriver \ -lclangAnalysis -lclangAST -lclangLex -lclangBasic ISPC_LIBS=$(shell llvm-config --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \ - -lpthread -ldl + -lpthread + +ifeq ($(ARCH_OS),Linux) + ISPC_LIBS += -ldl +endif + +ifeq ($(ARCH_OS2),Msys) + ISPC_LIBS += -lshlwapi -limagehlp -lpsapi +endif LLVM_CXXFLAGS=$(shell llvm-config --cppflags) LLVM_VERSION=LLVM_$(shell llvm-config --version | sed s/\\./_/) @@ -58,7 +71,8 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ type.cpp util.cpp HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h -TARGETS=avx avx-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 generic-16 +TARGETS=avx1 avx1-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 \ + generic-16 BUILTINS_SRC=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) \ builtins/dispatch.ll BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC:.ll=.o))) \ @@ -129,22 +143,22 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc objs/builtins-%.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< - @m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) $< | ./bitcode2cpp.py $< > $@ + @m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) $< | python bitcode2cpp.py $< > $@ objs/builtins-c-32.cpp: builtins/builtins.c @echo Creating C++ source from builtins definition file $< - @$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py c-32 > $@ + @$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c-32 > $@ objs/builtins-c-64.cpp: builtins/builtins.c @echo Creating C++ source from builtins definition file $< - @$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py c-64 > $@ + @$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c-64 > $@ objs/stdlib_generic_ispc.cpp: stdlib.ispc @echo Creating C++ source from $< for generic @$(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \ - ./stdlib2cpp.py generic > $@ + python stdlib2cpp.py generic > $@ objs/stdlib_x86_ispc.cpp: stdlib.ispc @echo Creating C++ source from $< for x86 @$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \ - ./stdlib2cpp.py x86 > $@ + python stdlib2cpp.py x86 > $@ diff --git a/README.rst b/README.rst index c9ae1512..f9daad40 100644 --- a/README.rst +++ b/README.rst @@ -47,7 +47,7 @@ remarkable `LLVM Compiler Infrastructure `_ for back-end code generation and optimization and is `hosted on github `_. It supports Windows, Mac, and Linux, with both x86 and x86-64 targets. It currently supports the SSE2, -SSE4, and AVX instruction sets. +SSE4, AVX1, and AVX2 instruction sets. Features -------- diff --git a/ast.cpp b/ast.cpp index 023e4ba9..746bc0ec 100644 --- a/ast.cpp +++ b/ast.cpp @@ -90,7 +90,11 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc, DoStmt *dos; ForStmt *fs; ForeachStmt *fes; + CaseStmt *cs; + DefaultStmt *defs; + SwitchStmt *ss; ReturnStmt *rs; + LabeledStmt *ls; StmtList *sl; PrintStmt *ps; AssertStmt *as; @@ -130,10 +134,21 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc, postFunc, data); fes->stmts = (Stmt *)WalkAST(fes->stmts, preFunc, postFunc, data); } - else if (dynamic_cast(node) != NULL || - dynamic_cast(node) != NULL) { - // nothing + else if ((cs = dynamic_cast(node)) != NULL) + cs->stmts = (Stmt *)WalkAST(cs->stmts, preFunc, postFunc, data); + else if ((defs = dynamic_cast(node)) != NULL) + defs->stmts = (Stmt *)WalkAST(defs->stmts, preFunc, postFunc, data); + else if ((ss = dynamic_cast(node)) != NULL) { + ss->expr = (Expr *)WalkAST(ss->expr, preFunc, postFunc, data); + ss->stmts = (Stmt *)WalkAST(ss->stmts, preFunc, postFunc, data); } + else if (dynamic_cast(node) != NULL || + dynamic_cast(node) != NULL || + dynamic_cast(node) != NULL) { + // nothing + } + else if ((ls = dynamic_cast(node)) != NULL) + ls->stmt = (Stmt *)WalkAST(ls->stmt, preFunc, postFunc, data); else if ((rs = dynamic_cast(node)) != NULL) rs->val = (Expr *)WalkAST(rs->val, preFunc, postFunc, data); else if ((sl = dynamic_cast(node)) != NULL) { @@ -151,7 +166,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc, else { /////////////////////////////////////////////////////////////////////////// // Handle expressions - assert(dynamic_cast(node) != NULL); + Assert(dynamic_cast(node) != NULL); UnaryExpr *ue; BinaryExpr *be; AssignExpr *ae; @@ -289,3 +304,4 @@ EstimateCost(ASTNode *root) { WalkAST(root, lCostCallback, NULL, &cost); return cost; } + diff --git a/builtins.cpp b/builtins.cpp index dce7c9fa..76ebdfa7 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -386,6 +386,7 @@ lSetInternalFunctions(llvm::Module *module) { "__ceil_uniform_float", "__ceil_varying_double", "__ceil_varying_float", + "__clock", "__count_trailing_zeros_i32", "__count_trailing_zeros_i64", "__count_leading_zeros_i32", @@ -717,11 +718,13 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod extern int builtins_bitcode_sse4_x2_length; switch (g->target.vectorWidth) { case 4: - AddBitcodeToModule(builtins_bitcode_sse4, builtins_bitcode_sse4_length, + AddBitcodeToModule(builtins_bitcode_sse4, + builtins_bitcode_sse4_length, module, symbolTable); break; case 8: - AddBitcodeToModule(builtins_bitcode_sse4_x2, builtins_bitcode_sse4_x2_length, + AddBitcodeToModule(builtins_bitcode_sse4_x2, + builtins_bitcode_sse4_x2_length, module, symbolTable); break; default: @@ -729,18 +732,39 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod } break; case Target::AVX: - case Target::AVX2: switch (g->target.vectorWidth) { case 8: - extern unsigned char builtins_bitcode_avx[]; - extern int builtins_bitcode_avx_length; - AddBitcodeToModule(builtins_bitcode_avx, builtins_bitcode_avx_length, + extern unsigned char builtins_bitcode_avx1[]; + extern int builtins_bitcode_avx1_length; + AddBitcodeToModule(builtins_bitcode_avx1, + builtins_bitcode_avx1_length, module, symbolTable); break; case 16: - extern unsigned char builtins_bitcode_avx_x2[]; - extern int builtins_bitcode_avx_x2_length; - AddBitcodeToModule(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length, + extern unsigned char builtins_bitcode_avx1_x2[]; + extern int builtins_bitcode_avx1_x2_length; + AddBitcodeToModule(builtins_bitcode_avx1_x2, + builtins_bitcode_avx1_x2_length, + module, symbolTable); + break; + default: + FATAL("logic error in DefineStdlib"); + } + break; + case Target::AVX2: + switch (g->target.vectorWidth) { + case 8: + extern unsigned char builtins_bitcode_avx2[]; + extern int builtins_bitcode_avx2_length; + AddBitcodeToModule(builtins_bitcode_avx2, + builtins_bitcode_avx2_length, + module, symbolTable); + break; + case 16: + extern unsigned char builtins_bitcode_avx2_x2[]; + extern int builtins_bitcode_avx2_x2_length; + AddBitcodeToModule(builtins_bitcode_avx2_x2, + builtins_bitcode_avx2_x2_length, module, symbolTable); break; default: @@ -798,6 +822,9 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module, symbolTable); + lDefineConstantInt("__have_native_half", (g->target.isa == Target::AVX2), + module, symbolTable); + if (includeStdlibISPC) { // If the user wants the standard library to be included, parse the // serialized version of the stdlib.ispc file to get its diff --git a/builtins/builtins.c b/builtins/builtins.c index f1cb35dd..36498e1a 100644 --- a/builtins/builtins.c +++ b/builtins/builtins.c @@ -149,7 +149,7 @@ void __do_print(const char *format, const char *types, int width, int mask, int __num_cores() { -#ifdef _MSC_VER +#if defined(_MSC_VER) || defined(__MINGW32__) // This is quite a hack. Including all of windows.h to get this definition // pulls in a bunch of stuff that leads to undefined symbols at link time. // So we don't #include but instead have the equivalent declarations diff --git a/builtins/dispatch.ll b/builtins/dispatch.ll index 10ba05a8..e61292aa 100644 --- a/builtins/dispatch.ll +++ b/builtins/dispatch.ll @@ -48,23 +48,42 @@ declare void @abort() noreturn ;; corresponding to one of the Target::ISA enumerant values that gives the ;; most capable ISA that the curremt system can run. ;; -;; #ifdef _MSC_VER -;; extern void __stdcall __cpuid(int info[4], int infoType); -;; #else +;; Note: clang from LLVM 2.9 should be used if this is updated, for maximum +;; backwards compatibility for anyone building ispc with LLVM 2.9. +;; +;; #include +;; #include +;; ;; static void __cpuid(int info[4], int infoType) { ;; __asm__ __volatile__ ("cpuid" ;; : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3]) ;; : "0" (infoType)); ;; } -;; #endif +;; +;; /* Save %ebx in case it's the PIC register */ +;; static void __cpuid_count(int info[4], int level, int count) { +;; __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t" +;; "cpuid\n\t" +;; "xchg{l}\t{%%}ebx, %1\n\t" +;; : "=a" (info[0]), "=r" (info[1]), "=c" (info[2]), "=d" (info[3]) +;; : "0" (level), "2" (count)); +;; } ;; ;; int32_t __get_system_isa() { ;; int info[4]; ;; __cpuid(info, 1); +;; ;; /* NOTE: the values returned below must be the same as the ;; corresponding enumerant values in Target::ISA. */ -;; if ((info[2] & (1 << 28)) != 0) -;; return 2; // AVX +;; if ((info[2] & (1 << 28)) != 0) { +;; // AVX1 for sure. Do we have AVX2? +;; // Call cpuid with eax=7, ecx=0 +;; __cpuid_count(info, 7, 0); +;; if ((info[1] & (1 << 5)) != 0) +;; return 3; // AVX2 +;; else +;; return 2; // AVX1 +;; } ;; else if ((info[2] & (1 << 19)) != 0) ;; return 1; // SSE4 ;; else if ((info[3] & (1 << 26)) != 0) @@ -76,33 +95,42 @@ declare void @abort() noreturn %0 = type { i32, i32, i32, i32 } define i32 @__get_system_isa() nounwind ssp { - %1 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind - %2 = extractvalue %0 %1, 2 - %3 = extractvalue %0 %1, 3 - %4 = and i32 %2, 268435456 - %5 = icmp eq i32 %4, 0 - br i1 %5, label %6, label %13 +entry: + %0 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind + %asmresult9.i = extractvalue %0 %0, 2 + %asmresult10.i = extractvalue %0 %0, 3 + %and = and i32 %asmresult9.i, 268435456 + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %if.else7, label %if.then -;