From a552927a6a1f5765959a9c390f1113a4d3ea97eb Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Mon, 1 Aug 2011 05:58:43 +0100 Subject: [PATCH] Cleanup implementation of target builtins code. - Renamed stdlib-sse.ll to builtins-sse.ll (etc.) in an attempt to better indicate the fact that the stuff in those files has a role beyond implementing stuff for the standard library. - Moved declarations of the various __pseudo_* functions from being done with LLVM API calls in builtins.cpp to just straight up declarations in LLVM assembly language in builtins.m4. (Much less code to do it this way, and more clear what's going on.) --- Makefile | 20 +- bitcode2cpp.py | 6 +- stdlib-avx.ll => builtins-avx.ll | 0 stdlib-c.c => builtins-c.c | 2 +- stdlib-sse.ll => builtins-sse.ll | 2 +- stdlib-sse2.ll => builtins-sse2.ll | 2 +- stdlib-sse4.ll => builtins-sse4.ll | 2 +- stdlib-sse4x2.ll => builtins-sse4x2.ll | 0 builtins.cpp | 323 ++----------------------- stdlib.m4 => builtins.m4 | 99 ++++++++ doxygen.cfg | 2 +- ispc.vcxproj | 54 ++--- stmt.cpp | 2 +- 13 files changed, 163 insertions(+), 351 deletions(-) rename stdlib-avx.ll => builtins-avx.ll (100%) rename stdlib-c.c => builtins-c.c (99%) rename stdlib-sse.ll => builtins-sse.ll (99%) rename stdlib-sse2.ll => builtins-sse2.ll (99%) rename stdlib-sse4.ll => builtins-sse4.ll (99%) rename stdlib-sse4x2.ll => builtins-sse4x2.ll (100%) rename stdlib.m4 => builtins.m4 (90%) diff --git a/Makefile b/Makefile index 2017163a..46e60dad 100644 --- a/Makefile +++ b/Makefile @@ -43,17 +43,17 @@ CXX_SRC=builtins.cpp ctx.cpp decl.cpp expr.cpp ispc.cpp \ util.cpp HEADERS=builtins.h ctx.h decl.h expr.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h -STDLIB_SRC=stdlib-avx.ll stdlib-sse2.ll stdlib-sse4.ll stdlib-sse4x2.ll +BUILTINS_SRC=builtins-avx.ll builtins-sse2.ll builtins-sse4.ll builtins-sse4x2.ll BISON_SRC=parse.yy FLEX_SRC=lex.ll -OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(STDLIB_SRC:.ll=.o) stdlib-c.o stdlib_ispc.o \ 
+OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_SRC:.ll=.o) builtins-c.o stdlib_ispc.o \ $(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o)) default: ispc ispc_test .PHONY: dirs clean depend doxygen print_llvm_src -.PRECIOUS: objs/stdlib-%.cpp +.PRECIOUS: objs/builtins-%.cpp depend: $(CXX_SRC) $(HEADERS) @echo Updating dependencies @@ -103,19 +103,19 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc @echo Compiling $< @$(CXX) $(CXXFLAGS) -o $@ -c $< -objs/stdlib-%.cpp: stdlib-%.ll stdlib.m4 stdlib-sse.ll - @echo Creating C++ source from stdlib file $< - @m4 stdlib.m4 $< | ./bitcode2cpp.py $< > $@ +objs/builtins-%.cpp: builtins-%.ll builtins.m4 builtins-sse.ll + @echo Creating C++ source from builtin definitions file $< + @m4 builtins.m4 $< | ./bitcode2cpp.py $< > $@ -objs/stdlib-%.o: objs/stdlib-%.cpp +objs/builtins-%.o: objs/builtins-%.cpp @echo Compiling $< @$(CXX) $(CXXFLAGS) -o $@ -c $< -objs/stdlib-c.cpp: stdlib-c.c - @echo Creating C++ source from stdlib file $< +objs/builtins-c.cpp: builtins-c.c + @echo Creating C++ source from builtins definition file $< @$(CLANG) -I /opt/l1om/usr/include/ -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py $< > $@ -objs/stdlib-c.o: objs/stdlib-c.cpp +objs/builtins-c.o: objs/builtins-c.cpp @echo Compiling $< @$(CXX) $(CXXFLAGS) -o $@ -c $< diff --git a/bitcode2cpp.py b/bitcode2cpp.py index b61f6f8e..13272b82 100755 --- a/bitcode2cpp.py +++ b/bitcode2cpp.py @@ -9,7 +9,7 @@ length=0 src=str(sys.argv[1]) -target = re.sub(".*stdlib-", "", src) +target = re.sub(".*builtins-", "", src) target = re.sub("\.ll$", "", target) target = re.sub("\.c$", "", target) target = re.sub("-", "_", target) @@ -20,14 +20,14 @@ except IOError: print >> sys.stderr, "Couldn't open " + src sys.exit(1) -print "unsigned char stdlib_bitcode_" + target + "[] = {" +print "unsigned char builtins_bitcode_" + target + "[] = {" for line in as_out.stdout.readlines(): length = length + len(line) for c in line: print ord(c) print ", " print " 0 };\n\n" -print 
"int stdlib_bitcode_" + target + "_length = " + str(length) + ";\n" +print "int builtins_bitcode_" + target + "_length = " + str(length) + ";\n" as_out.wait() diff --git a/stdlib-avx.ll b/builtins-avx.ll similarity index 100% rename from stdlib-avx.ll rename to builtins-avx.ll diff --git a/stdlib-c.c b/builtins-c.c similarity index 99% rename from stdlib-c.c rename to builtins-c.c index a5b69129..676f5da3 100644 --- a/stdlib-c.c +++ b/builtins-c.c @@ -31,7 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -/** @file stdlib-c.c +/** @file builtins-c.c @brief Standard library function implementations written in C. This file provides C implementations of various functions that can be diff --git a/stdlib-sse.ll b/builtins-sse.ll similarity index 99% rename from stdlib-sse.ll rename to builtins-sse.ll index cb4bc712..c76ff907 100644 --- a/stdlib-sse.ll +++ b/builtins-sse.ll @@ -31,7 +31,7 @@ ;; This file declares implementations of various stdlib builtins that ;; only require SSE version 1 and 2 functionality; this file, in turn -;; is then included by stdlib-sse2.ll and stdlib-sse4.ll to provide +;; is then included by builtins-sse2.ll and builtins-sse4.ll to provide ;; those definitions for them. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/stdlib-sse2.ll b/builtins-sse2.ll similarity index 99% rename from stdlib-sse2.ll rename to builtins-sse2.ll index 99711181..de03e422 100644 --- a/stdlib-sse2.ll +++ b/builtins-sse2.ll @@ -37,7 +37,7 @@ stdlib_core(4) packed_load_and_store(4) ; Include the various definitions of things that only require SSE1 and SSE2 -include(`stdlib-sse.ll') +include(`builtins-sse.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding diff --git a/stdlib-sse4.ll b/builtins-sse4.ll similarity index 99% rename from stdlib-sse4.ll rename to builtins-sse4.ll index e0fcec4a..15a8c7ff 100644 --- a/stdlib-sse4.ll +++ b/builtins-sse4.ll @@ -37,7 +37,7 @@ stdlib_core(4) packed_load_and_store(4) ; Define the stuff that can be done with base SSE1/SSE2 instructions -include(`stdlib-sse.ll') +include(`builtins-sse.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/stdlib-sse4x2.ll b/builtins-sse4x2.ll similarity index 100% rename from stdlib-sse4x2.ll rename to builtins-sse4x2.ll diff --git a/builtins.cpp b/builtins.cpp index bd6b778f..2a02ceab 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -68,7 +68,7 @@ extern yy_buffer_state *yy_scan_string(const char *); distinguish between signed and unsigned integers in its types.) 
Because this function is only used for generating ispc declarations of - functions defined in LLVM bitcode in the stdlib-*.ll files, in practice + functions defined in LLVM bitcode in the builtins-*.ll files, in practice we can get enough of what we need for the relevant cases to make things work, partially with the help of the intAsUnsigned parameter, which indicates whether LLVM integer types should be treated as being signed @@ -271,217 +271,7 @@ lAddModuleSymbols(llvm::Module *module, SymbolTable *symbolTable) { } -static void -lDeclarePG(llvm::Module *module, LLVM_TYPE_CONST llvm::Type *vecType, - const char *name) { - SourcePos noPos; - noPos.name = "__stdlib"; - - std::vector argTypes; - argTypes.push_back(LLVMTypes::VoidPointerVectorType); - argTypes.push_back(LLVMTypes::MaskType); - - llvm::FunctionType *fType = llvm::FunctionType::get(vecType, argTypes, false); - llvm::Function *func = - llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, - name, module); - func->setOnlyReadsMemory(true); - func->setDoesNotThrow(true); -} - - -static void -lDeclarePGBO(llvm::Module *module, LLVM_TYPE_CONST llvm::Type *vecType, - const char *name) { - std::vector argTypes; - argTypes.push_back(LLVMTypes::VoidPointerType); - argTypes.push_back(LLVMTypes::Int32VectorType); - argTypes.push_back(LLVMTypes::MaskType); - - llvm::FunctionType *fType = llvm::FunctionType::get(vecType, argTypes, false); - llvm::Function *func = - llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, - name, module); - func->setOnlyReadsMemory(true); - func->setDoesNotThrow(true); -} - - -/** Declare the 'pseudo-gather' functions. 
When the ispc front-end needs - to perform a gather, it generates a call to one of these functions, - which have signatures: - - varying int8 __pseudo_gather(varying int8 *, mask) - varying int16 __pseudo_gather(varying int16 *, mask) - varying int32 __pseudo_gather(varying int32 *, mask) - varying int64 __pseudo_gather(varying int64 *, mask) - - These functions are never actually implemented; the - GatherScatterFlattenOpt optimization pass finds them and then converts - them to make calls to the following functions, which represent gathers - from a common base pointer with offsets. This approach allows the - front-end to be relatively simple in how it emits address calculation - for gathers. - - varying int8 __pseudo_gather_base_offsets_8(uniform int8 *base, - int32 offsets, mask) - varying int16 __pseudo_gather_base_offsets_16(uniform int16 *base, - int32 offsets, mask) - varying int32 __pseudo_gather_base_offsets_32(uniform int32 *base, - int32 offsets, mask) - varying int64 __pseudo_gather_base_offsets_64(uniform int64 *base, - int64 offsets, mask) - - Then, the GSImprovementsPass optimizations finds these and either - converts them to native gather functions or converts them to vector - loads, if equivalent. 
- */ -static void -lDeclarePseudoGathers(llvm::Module *module) { - lDeclarePG(module, LLVMTypes::Int8VectorType, "__pseudo_gather_8"); - lDeclarePG(module, LLVMTypes::Int16VectorType, "__pseudo_gather_16"); - lDeclarePG(module, LLVMTypes::Int32VectorType, "__pseudo_gather_32"); - lDeclarePG(module, LLVMTypes::Int64VectorType, "__pseudo_gather_64"); - - lDeclarePGBO(module, LLVMTypes::Int8VectorType, - "__pseudo_gather_base_offsets_8"); - lDeclarePGBO(module, LLVMTypes::Int16VectorType, - "__pseudo_gather_base_offsets_16"); - lDeclarePGBO(module, LLVMTypes::Int32VectorType, - "__pseudo_gather_base_offsets_32"); - lDeclarePGBO(module, LLVMTypes::Int64VectorType, - "__pseudo_gather_base_offsets_64"); -} - - -static void -lDeclarePS(llvm::Module *module, LLVM_TYPE_CONST llvm::Type *vecType, - const char *name) { - std::vector argTypes; - argTypes.push_back(LLVMTypes::VoidPointerVectorType); - argTypes.push_back(vecType); - argTypes.push_back(LLVMTypes::MaskType); - - llvm::FunctionType *fType = - llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false); - llvm::Function *func = - llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, - name, module); - func->setDoesNotThrow(true); -} - - -static void -lDeclarePSBO(llvm::Module *module, LLVM_TYPE_CONST llvm::Type *vecType, - const char *name) { - std::vector argTypes; - argTypes.push_back(LLVMTypes::VoidPointerType); - argTypes.push_back(LLVMTypes::Int32VectorType); - argTypes.push_back(vecType); - argTypes.push_back(LLVMTypes::MaskType); - - llvm::FunctionType *fType = - llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false); - llvm::Function *func = - llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, - name, module); - func->setDoesNotThrow(true); -} - - -/** Similarly to the 'pseudo-gathers' defined by lDeclarePseudoGathers(), - we also declare (but never define) pseudo-scatter instructions with - signatures: - - void __pseudo_scatter_8 (varying int8 *, varying int8 values, 
mask) - void __pseudo_scatter_16(varying int16 *, varying int16 values, mask) - void __pseudo_scatter_32(varying int32 *, varying int32 values, mask) - void __pseudo_scatter_64(varying int64 *, varying int64 values, mask) - - The GatherScatterFlattenOpt optimization pass also finds these and - transforms them to scatters like: - - void __pseudo_scatter_base_offsets_8(uniform int8 *base, - varying int32 offsets, varying int8 values, mask) - void __pseudo_scatter_base_offsets_16(uniform int16 *base, - varying int32 offsets, varying int16 values, mask) - void __pseudo_scatter_base_offsets_32(uniform int32 *base, - varying int32 offsets, varying int32 values, mask) - void __pseudo_scatter_base_offsets_64(uniform int64 *base, - varying int32 offsets, varying int64 values, mask) - - And the GSImprovementsPass in turn converts these to actual native - scatters or masked stores. -*/ -static void -lDeclarePseudoScatters(llvm::Module *module) { - SourcePos noPos; - noPos.name = "__stdlib"; - - lDeclarePS(module, LLVMTypes::Int8VectorType, "__pseudo_scatter_8"); - lDeclarePS(module, LLVMTypes::Int16VectorType, "__pseudo_scatter_16"); - lDeclarePS(module, LLVMTypes::Int32VectorType, "__pseudo_scatter_32"); - lDeclarePS(module, LLVMTypes::Int64VectorType, "__pseudo_scatter_64"); - - lDeclarePSBO(module, LLVMTypes::Int8VectorType, - "__pseudo_scatter_base_offsets_8"); - lDeclarePSBO(module, LLVMTypes::Int16VectorType, - "__pseudo_scatter_base_offsets_16"); - lDeclarePSBO(module, LLVMTypes::Int32VectorType, - "__pseudo_scatter_base_offsets_32"); - lDeclarePSBO(module, LLVMTypes::Int64VectorType, - "__pseudo_scatter_base_offsets_64"); -} - - -static void -lDeclarePMS(llvm::Module *module, LLVM_TYPE_CONST llvm::Type *lvalueType, - LLVM_TYPE_CONST llvm::Type *rvalueType, const char *name) { - SourcePos noPos; - noPos.name = "__stdlib"; - - std::vector argTypes; - argTypes.push_back(lvalueType); - argTypes.push_back(rvalueType); - argTypes.push_back(LLVMTypes::MaskType); - - 
llvm::FunctionType *fType = - llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false); - llvm::Function *func = - llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, - name, module); - func->setDoesNotThrow(true); - func->addFnAttr(llvm::Attribute::AlwaysInline); - func->setDoesNotCapture(1, true); -} - - -/** This function declares placeholder masked store functions for the - front-end to use. - - void __pseudo_masked_store_8 (uniform int8 *ptr, varying int8 values, mask) - void __pseudo_masked_store_16(uniform int16 *ptr, varying int16 values, mask) - void __pseudo_masked_store_32(uniform int32 *ptr, varying int32 values, mask) - void __pseudo_masked_store_64(uniform int64 *ptr, varying int64 values, mask) - - These in turn are converted to native masked stores or to regular - stores (if the mask is all on) by the MaskedStoreOptPass optimization - pass. - */ -static void -lDeclarePseudoMaskedStore(llvm::Module *module) { - lDeclarePMS(module, LLVMTypes::Int8VectorPointerType, - LLVMTypes::Int8VectorType, "__pseudo_masked_store_8"); - lDeclarePMS(module, LLVMTypes::Int16VectorPointerType, - LLVMTypes::Int16VectorType, "__pseudo_masked_store_16"); - lDeclarePMS(module, LLVMTypes::Int32VectorPointerType, - LLVMTypes::Int32VectorType, "__pseudo_masked_store_32"); - lDeclarePMS(module, LLVMTypes::Int64VectorPointerType, - LLVMTypes::Int64VectorType, "__pseudo_masked_store_64"); -} - - -/** In many of the stdlib-*.ll files, we have declarations of various LLVM +/** In many of the builtins-*.ll files, we have declarations of various LLVM intrinsics that are then used in the implementation of various target- specific functions. 
This function loops over all of the intrinsic declarations and makes sure that the signature we have in our .ll file @@ -579,32 +369,32 @@ lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) { void DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module, bool includeStdlibISPC) { - // Add the definitions from the compiled stdlib-c.c file - extern unsigned char stdlib_bitcode_c[]; - extern int stdlib_bitcode_c_length; - lAddBitcode(stdlib_bitcode_c, stdlib_bitcode_c_length, module, symbolTable); + // Add the definitions from the compiled builtins-c.c file + extern unsigned char builtins_bitcode_c[]; + extern int builtins_bitcode_c_length; + lAddBitcode(builtins_bitcode_c, builtins_bitcode_c_length, module, symbolTable); // Next, add the target's custom implementations of the various needed // builtin functions (e.g. __masked_store_32(), etc). switch (g->target.isa) { case Target::SSE2: - extern unsigned char stdlib_bitcode_sse2[]; - extern int stdlib_bitcode_sse2_length; - lAddBitcode(stdlib_bitcode_sse2, stdlib_bitcode_sse2_length, module, + extern unsigned char builtins_bitcode_sse2[]; + extern int builtins_bitcode_sse2_length; + lAddBitcode(builtins_bitcode_sse2, builtins_bitcode_sse2_length, module, symbolTable); break; case Target::SSE4: - extern unsigned char stdlib_bitcode_sse4[]; - extern int stdlib_bitcode_sse4_length; - extern unsigned char stdlib_bitcode_sse4x2[]; - extern int stdlib_bitcode_sse4x2_length; + extern unsigned char builtins_bitcode_sse4[]; + extern int builtins_bitcode_sse4_length; + extern unsigned char builtins_bitcode_sse4x2[]; + extern int builtins_bitcode_sse4x2_length; switch (g->target.vectorWidth) { case 4: - lAddBitcode(stdlib_bitcode_sse4, stdlib_bitcode_sse4_length, + lAddBitcode(builtins_bitcode_sse4, builtins_bitcode_sse4_length, module, symbolTable); break; case 8: - lAddBitcode(stdlib_bitcode_sse4x2, stdlib_bitcode_sse4x2_length, + lAddBitcode(builtins_bitcode_sse4x2, 
builtins_bitcode_sse4x2_length, module, symbolTable); break; default: @@ -612,92 +402,15 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod } break; case Target::AVX: - extern unsigned char stdlib_bitcode_avx[]; - extern int stdlib_bitcode_avx_length; - lAddBitcode(stdlib_bitcode_avx, stdlib_bitcode_avx_length, module, + extern unsigned char builtins_bitcode_avx[]; + extern int builtins_bitcode_avx_length; + lAddBitcode(builtins_bitcode_avx, builtins_bitcode_avx_length, module, symbolTable); break; default: FATAL("logic error"); } - // Add a declaration of void *ISPCMalloc(int64_t size, int alignment). - // The user is responsible for linking in a definition of this if it's - // needed by the compiled program. - { std::vector argTypes; - argTypes.push_back(llvm::Type::getInt64Ty(*ctx)); - argTypes.push_back(llvm::Type::getInt32Ty(*ctx)); - llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidPointerType, - argTypes, false); - llvm::Function *func = - llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage, - "ISPCMalloc", module); - func->setDoesNotThrow(true); - } - - // Add a declaration of void ISPCFree(void *). The user is - // responsible for linking in a definition of this if it's needed by - // the compiled program. - { std::vector argTypes; - argTypes.push_back(LLVMTypes::VoidPointerType); - llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidPointerType, - argTypes, false); - llvm::Function *func = - llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage, - "ISPCFree", module); - func->setDoesNotThrow(true); - } - - // Add a declaration of void ISPCLaunch(void *funcPtr, void *data). - // The user is responsible for linking in a definition of this if it's - // needed by the compiled program. 
- { std::vector argTypes; - argTypes.push_back(LLVMTypes::VoidPointerType); - argTypes.push_back(LLVMTypes::VoidPointerType); - llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType, - argTypes, false); - llvm::Function *func = - llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage, - "ISPCLaunch", module); - func->setDoesNotThrow(true); - } - - // Add a declaration of void ISPCSync(). The user is responsible for - // linking in a definition of this if it's needed by the compiled - // program. - { - std::vector argTypes; - llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType, - argTypes, false); - llvm::Function *func = - llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage, - "ISPCSync", module); - func->setDoesNotThrow(true); - } - - // Add a declaration of void ISPCInstrument(void *, void *, int, int). - // The user is responsible for linking in a definition of this if it's - // needed by the compiled program. - { - std::vector argTypes; - argTypes.push_back(LLVMTypes::VoidPointerType); - argTypes.push_back(LLVMTypes::VoidPointerType); - argTypes.push_back(LLVMTypes::Int32Type); - argTypes.push_back(LLVMTypes::Int32Type); - llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType, - argTypes, false); - llvm::Function *func = - llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage, - "ISPCInstrument", module); - func->setDoesNotThrow(true); - } - - // Declare various placeholder functions that the optimizer will later - // find and replace with something more useful. 
- lDeclarePseudoGathers(module); - lDeclarePseudoScatters(module); - lDeclarePseudoMaskedStore(module); - // define the 'programCount' builtin variable lDefineConstantInt("programCount", g->target.vectorWidth, module, symbolTable); diff --git a/stdlib.m4 b/builtins.m4 similarity index 90% rename from stdlib.m4 rename to builtins.m4 index 4540b796..28daa97c 100644 --- a/stdlib.m4 +++ b/builtins.m4 @@ -560,9 +560,108 @@ define internal <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $ define(`stdlib_core', ` +declare i8* @ISPCMalloc(i64, i32) nounwind +declare i8* @ISPCFree(i8*) nounwind +declare void @ISPCLaunch(i8*, i8*) nounwind +declare void @ISPCSync() nounwind +declare void @ISPCInstrument(i8*, i8*, i32, i32) nounwind + declare i1 @__is_compile_time_constant_mask(<$1 x i32> %mask) declare i1 @__is_compile_time_constant_varying_int32(<$1 x i32>) +; The following declarations are placeholder masked store functions for the +; front-end to use. +; +; void __pseudo_masked_store_8 (uniform int8 *ptr, varying int8 values, mask) +; void __pseudo_masked_store_16(uniform int16 *ptr, varying int16 values, mask) +; void __pseudo_masked_store_32(uniform int32 *ptr, varying int32 values, mask) +; void __pseudo_masked_store_64(uniform int64 *ptr, varying int64 values, mask) +; +; These in turn are converted to native masked stores or to regular +; stores (if the mask is all on) by the MaskedStoreOptPass optimization +; pass. + +declare void @__pseudo_masked_store_8(<$1 x i8> * nocapture, <$1 x i8>, <$1 x i32>) +declare void @__pseudo_masked_store_16(<$1 x i16> * nocapture, <$1 x i16>, <$1 x i32>) +declare void @__pseudo_masked_store_32(<$1 x i32> * nocapture, <$1 x i32>, <$1 x i32>) +declare void @__pseudo_masked_store_64(<$1 x i64> * nocapture, <$1 x i64>, <$1 x i32>) + +; Declare the pseudo-gather functions. 
When the ispc front-end needs +; to perform a gather, it generates a call to one of these functions, +; which have signatures: +; +; varying int8 __pseudo_gather_8(varying int8 *, mask) +; varying int16 __pseudo_gather_16(varying int16 *, mask) +; varying int32 __pseudo_gather_32(varying int32 *, mask) +; varying int64 __pseudo_gather_64(varying int64 *, mask) +; +; These functions are never actually implemented; the +; GatherScatterFlattenOpt optimization pass finds them and then converts +; them to make calls to the following functions, which represent gathers +; from a common base pointer with offsets. This approach allows the +; front-end to be relatively simple in how it emits address calculation +; for gathers. +; +; varying int8 __pseudo_gather_base_offsets_8(uniform int8 *base, +; int32 offsets, mask) +; varying int16 __pseudo_gather_base_offsets_16(uniform int16 *base, +; int32 offsets, mask) +; varying int32 __pseudo_gather_base_offsets_32(uniform int32 *base, +; int32 offsets, mask) +; varying int64 __pseudo_gather_base_offsets_64(uniform int64 *base, +; int32 offsets, mask) +; +; Then, the GSImprovementsPass optimization finds these and either +; converts them to native gather functions or converts them to vector +; loads, if equivalent. 
+ +declare <$1 x i8> @__pseudo_gather_8([$1 x i8 *], <$1 x i32>) nounwind readonly +declare <$1 x i16> @__pseudo_gather_16([$1 x i8 *], <$1 x i32>) nounwind readonly +declare <$1 x i32> @__pseudo_gather_32([$1 x i8 *], <$1 x i32>) nounwind readonly +declare <$1 x i64> @__pseudo_gather_64([$1 x i8 *], <$1 x i32>) nounwind readonly + +declare <$1 x i8> @__pseudo_gather_base_offsets_8(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly +declare <$1 x i16> @__pseudo_gather_base_offsets_16(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly +declare <$1 x i32> @__pseudo_gather_base_offsets_32(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly +declare <$1 x i64> @__pseudo_gather_base_offsets_64(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly + +; Similarly to the pseudo-gathers defined above, we also declare undefined +; pseudo-scatter instructions with signatures: +; +; void __pseudo_scatter_8 (varying int8 *, varying int8 values, mask) +; void __pseudo_scatter_16(varying int16 *, varying int16 values, mask) +; void __pseudo_scatter_32(varying int32 *, varying int32 values, mask) +; void __pseudo_scatter_64(varying int64 *, varying int64 values, mask) +; +; The GatherScatterFlattenOpt optimization pass also finds these and +; transforms them to scatters like: +; +; void __pseudo_scatter_base_offsets_8(uniform int8 *base, +; varying int32 offsets, varying int8 values, mask) +; void __pseudo_scatter_base_offsets_16(uniform int16 *base, +; varying int32 offsets, varying int16 values, mask) +; void __pseudo_scatter_base_offsets_32(uniform int32 *base, +; varying int32 offsets, varying int32 values, mask) +; void __pseudo_scatter_base_offsets_64(uniform int64 *base, +; varying int32 offsets, varying int64 values, mask) +; +; And the GSImprovementsPass in turn converts these to actual native +; scatters or masked stores. 
+ +declare void @__pseudo_scatter_8([$1 x i8 *], <$1 x i8>, <$1 x i32>) nounwind +declare void @__pseudo_scatter_16([$1 x i8 *], <$1 x i16>, <$1 x i32>) nounwind +declare void @__pseudo_scatter_32([$1 x i8 *], <$1 x i32>, <$1 x i32>) nounwind +declare void @__pseudo_scatter_64([$1 x i8 *], <$1 x i64>, <$1 x i32>) nounwind + +declare void @__pseudo_scatter_base_offsets_8(i8 * nocapture, <$1 x i32>, + <$1 x i8>, <$1 x i32>) nounwind +declare void @__pseudo_scatter_base_offsets_16(i8 * nocapture, <$1 x i32>, + <$1 x i16>, <$1 x i32>) nounwind +declare void @__pseudo_scatter_base_offsets_32(i8 * nocapture, <$1 x i32>, + <$1 x i32>, <$1 x i32>) nounwind +declare void @__pseudo_scatter_base_offsets_64(i8 * nocapture, <$1 x i32>, + <$1 x i64>, <$1 x i32>) nounwind + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; vector ops diff --git a/doxygen.cfg b/doxygen.cfg index db7d35b7..31d97beb 100644 --- a/doxygen.cfg +++ b/doxygen.cfg @@ -610,7 +610,7 @@ INPUT = builtins.h \ util.cpp \ parse.yy \ lex.ll \ - stdlib-c.c + builtins-c.c # This tag can be used to specify the character encoding of the source files # that doxygen parses. 
Internally doxygen uses the UTF-8 encoding, which is diff --git a/ispc.vcxproj b/ispc.vcxproj index 99a62b2c..fea47cb5 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -1,4 +1,4 @@ - + @@ -28,11 +28,11 @@ - - %LLVM_INSTALL_DIR%\bin\clang -emit-llvm stdlib-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py stdlib-c.c > gen-bitcode-c.cpp - clang stdlib-c.c - %LLVM_INSTALL_DIR%\bin\clang -emit-llvm stdlib-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py stdlib-c.c > gen-bitcode-c.cpp - clang stdlib-c.c + + %LLVM_INSTALL_DIR%\bin\clang -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c.c > gen-bitcode-c.cpp + clang builtins-c.c + %LLVM_INSTALL_DIR%\bin\clang -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c.c > gen-bitcode-c.cpp + clang builtins-c.c gen-bitcode-c.cpp gen-bitcode-c.cpp @@ -68,53 +68,53 @@ - + Document - m4 stdlib.m4 stdlib-sse4.ll | python bitcode2cpp.py stdlib-sse4.ll > gen-bitcode-sse4.cpp + m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll > gen-bitcode-sse4.cpp gen-bitcode-sse4.cpp - stdlib.m4;stdlib-sse.ll - m4 stdlib.m4 stdlib-sse4.ll | python bitcode2cpp.py stdlib-sse4.ll > gen-bitcode-sse4.cpp + builtins.m4;builtins-sse.ll + m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll > gen-bitcode-sse4.cpp gen-bitcode-sse4.cpp - stdlib.m4;stdlib-sse.ll + builtins.m4;builtins-sse.ll Building gen-bitcode-sse4.cpp Building gen-bitcode-sse4.cpp - + Document - m4 stdlib.m4 stdlib-sse4x2.ll | python bitcode2cpp.py stdlib-sse4x2.ll > gen-bitcode-sse4x2.cpp + m4 builtins.m4 builtins-sse4x2.ll | python bitcode2cpp.py builtins-sse4x2.ll > gen-bitcode-sse4x2.cpp gen-bitcode-sse4x2.cpp - stdlib.m4;stdlib-sse.ll - m4 stdlib.m4 stdlib-sse4x2.ll | python bitcode2cpp.py stdlib-sse4x2.ll > gen-bitcode-sse4x2.cpp + builtins.m4;builtins-sse.ll + m4 builtins.m4 builtins-sse4x2.ll | 
python bitcode2cpp.py builtins-sse4x2.ll > gen-bitcode-sse4x2.cpp gen-bitcode-sse4x2.cpp - stdlib.m4;stdlib-sse.ll + builtins.m4;builtins-sse.ll Building gen-bitcode-sse4x2.cpp Building gen-bitcode-sse4x2.cpp - + Document - m4 stdlib.m4 stdlib-sse2.ll | python bitcode2cpp.py stdlib-sse2.ll > gen-bitcode-sse2.cpp + m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll > gen-bitcode-sse2.cpp gen-bitcode-sse2.cpp - stdlib.m4;stdlib-sse.ll - m4 stdlib.m4 stdlib-sse2.ll | python bitcode2cpp.py stdlib-sse2.ll > gen-bitcode-sse2.cpp + builtins.m4;builtins-sse.ll + m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll > gen-bitcode-sse2.cpp gen-bitcode-sse2.cpp - stdlib.m4;stdlib-sse.ll + builtins.m4;builtins-sse.ll Building gen-bitcode-sse2.cpp Building gen-bitcode-sse2.cpp - + Document - m4 stdlib.m4 stdlib-avx.ll | python bitcode2cpp.py stdlib-avx.ll > gen-bitcode-avx.cpp + m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll > gen-bitcode-avx.cpp gen-bitcode-avx.cpp - stdlib.m4;stdlib-sse.ll - m4 stdlib.m4 stdlib-avx.ll | python bitcode2cpp.py stdlib-avx.ll > gen-bitcode-avx.cpp + builtins.m4;builtins-sse.ll + m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll > gen-bitcode-avx.cpp gen-bitcode-avx.cpp - stdlib.m4;stdlib-sse.ll + builtins.m4;builtins-sse.ll Building gen-bitcode-avx.cpp Building gen-bitcode-avx.cpp @@ -213,4 +213,4 @@ - \ No newline at end of file + diff --git a/stmt.cpp b/stmt.cpp index d83a056e..434a4b85 100644 --- a/stmt.cpp +++ b/stmt.cpp @@ -1442,7 +1442,7 @@ lProcessPrintArg(Expr *expr, FunctionEmitContext *ctx, std::string &argTypes) { /* PrintStmt works closely with the __do_print() function implemented in - the stdlib-c.c file. In particular, the EmitCode() method here needs to + the builtins-c.c file. 
In particular, the EmitCode() method here needs to take the arguments passed to it from ispc and generate a valid call to __do_print() with the information that __do_print() then needs to do the actual printing work at runtime.