diff --git a/alloy.py b/alloy.py index 21e428de..83296b46 100755 --- a/alloy.py +++ b/alloy.py @@ -88,6 +88,9 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, FOLDER_NAME=version_LLVM if version_LLVM == "trunk": SVN_PATH="trunk" + if version_LLVM == "3.4": + SVN_PATH="tags/RELEASE_34/rc2" + version_LLVM = "3_4" if version_LLVM == "3.3": SVN_PATH="tags/RELEASE_33/final" version_LLVM = "3_3" @@ -273,8 +276,10 @@ def build_ispc(version_LLVM, make): os.environ["LLVM_INSTALL_DIR"] = os.environ["LLVM_HOME"] + "\\bin-" + version_LLVM if version_LLVM == "3.3": temp = "3_3" - if version_LLVM == "trunk": + if version_LLVM == "3.4": temp = "3_4" + if version_LLVM == "trunk": + temp = "3_5" os.environ["LLVM_VERSION"] = "LLVM_" + temp try_do_LLVM("clean ISPC for building", "msbuild ispc.vcxproj /t:clean", True) try_do_LLVM("build ISPC with LLVM version " + version_LLVM + " ", "msbuild ispc.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release /t:rebuild", True) @@ -376,7 +381,7 @@ def validation_run(only, only_targets, reference_branch, number, notify, update, archs.append("x86-64") if "native" in only: sde_targets_t = [] - for i in ["3.1", "3.2", "3.3", "trunk"]: + for i in ["3.1", "3.2", "3.3", "3.4", "trunk"]: if i in only: LLVM.append(i) if "current" in only: @@ -635,6 +640,7 @@ import platform import smtplib import datetime import copy +import multiprocessing from email.MIMEMultipart import MIMEMultipart from email.MIMEBase import MIMEBase from email.mime.text import MIMEText @@ -663,18 +669,19 @@ if __name__ == '__main__': "Try to build compiler with all LLVM\n\talloy.py -r --only=build\n" + "Performance validation run with 10 runs of each test and comparing to branch 'old'\n\talloy.py -r --only=performance --compare-with=old --number=10\n" + "Validation run. 
Update fail_db.txt with new fails, send results to my@my.com\n\talloy.py -r --update-errors=F --notify='my@my.com'\n") + num_threads="%s" % multiprocessing.cpu_count() parser = MyParser(usage="Usage: alloy.py -r/-b [options]", epilog=examples) parser.add_option('-b', '--build-llvm', dest='build_llvm', help='ask to build LLVM', default=False, action="store_true") parser.add_option('-r', '--run', dest='validation_run', help='ask for validation run', default=False, action="store_true") parser.add_option('-j', dest='speed', - help='set -j for make', default="8") + help='set -j for make', default=num_threads) # options for activity "build LLVM" llvm_group = OptionGroup(parser, "Options for building LLVM", "These options must be used with -b option.") llvm_group.add_option('--version', dest='version', - help='version of llvm to build: 3.1 3.2 3.3 trunk. Default: trunk', default="trunk") + help='version of llvm to build: 3.1 3.2 3.3 3.4 trunk. Default: trunk', default="trunk") llvm_group.add_option('--revision', dest='revision', help='revision of llvm to build in format r172870', default="") llvm_group.add_option('--debug', dest='debug', @@ -709,7 +716,7 @@ if __name__ == '__main__': run_group.add_option('--only', dest='only', help='set types of tests. 
Possible values:\n' + '-O0, -O2, x86, x86-64, stability (test only stability), performance (test only performance)\n' + - 'build (only build with different LLVM), 3.1, 3.2, 3.3, trunk, native (do not use SDE), current (do not rebuild ISPC).', + 'build (only build with different LLVM), 3.1, 3.2, 3.3, 3.4, trunk, native (do not use SDE), current (do not rebuild ISPC).', default="") run_group.add_option('--perf_LLVM', dest='perf_llvm', help='compare LLVM 3.3 with "--compare-with", default trunk', default=False, action='store_true') diff --git a/cbackend.cpp b/cbackend.cpp index 8535653f..3db2d504 100644 --- a/cbackend.cpp +++ b/cbackend.cpp @@ -241,7 +241,11 @@ namespace { class CBEMCAsmInfo : public llvm::MCAsmInfo { public: CBEMCAsmInfo() { +#if defined(LLVM_3_5) + GlobalPrefix = '\0'; +#else GlobalPrefix = ""; +#endif PrivateGlobalPrefix = ""; } }; @@ -656,7 +660,7 @@ void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out, llvm::raw_ostream & CWriter::printSimpleType(llvm::raw_ostream &Out, llvm::Type *Ty, bool isSigned, const std::string &NameSoFar) { - assert((Ty->isPrimitiveType() || Ty->isIntegerTy() || Ty->isVectorTy()) && + assert((Ty->isFloatingPointTy() || Ty->isX86_MMXTy() || Ty->isIntegerTy() || Ty->isVectorTy() || Ty->isVoidTy()) && "Invalid type for printSimpleType"); switch (Ty->getTypeID()) { case llvm::Type::VoidTyID: return Out << "void " << NameSoFar; @@ -752,7 +756,7 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, #endif ) { - if (Ty->isPrimitiveType() || Ty->isIntegerTy() || Ty->isVectorTy()) { + if (Ty->isFloatingPointTy() || Ty->isX86_MMXTy() || Ty->isIntegerTy() || Ty->isVectorTy() || Ty->isVoidTy()) { printSimpleType(Out, Ty, isSigned, NameSoFar); return Out; } @@ -2733,7 +2737,7 @@ void CWriter::printModuleTypes() { void CWriter::printContainedStructs(llvm::Type *Ty, llvm::SmallPtrSet &Printed) { // Don't walk through pointers. 
- if (Ty->isPointerTy() || Ty->isPrimitiveType() || Ty->isIntegerTy()) + if (!(Ty->isStructTy() || Ty->isArrayTy())) return; // Print all contained types first. diff --git a/ctx.cpp b/ctx.cpp index dcc943a0..74a760ae 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -318,7 +318,11 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym, llvm::BasicBlock *offBB = llvm::BasicBlock::Create(*g->ctx, "entry", (llvm::Function *)offFunc, 0); - new llvm::StoreInst(LLVMMaskAllOff, globalAllOnMaskPtr, offBB); + llvm::StoreInst *inst = + new llvm::StoreInst(LLVMMaskAllOff, globalAllOnMaskPtr, offBB); + if (g->opt.forceAlignedMemory) { + inst->setAlignment(g->target->getNativeVectorAlignment()); + } llvm::ReturnInst::Create(*g->ctx, offBB); } @@ -2453,7 +2457,13 @@ FunctionEmitContext::LoadInst(llvm::Value *ptr, const char *name) { if (name == NULL) name = LLVMGetName(ptr, "_load"); - llvm::Instruction *inst = new llvm::LoadInst(ptr, name, bblock); + llvm::LoadInst *inst = new llvm::LoadInst(ptr, name, bblock); + + if (g->opt.forceAlignedMemory && + llvm::dyn_cast(pt->getElementType())) { + inst->setAlignment(g->target->getNativeVectorAlignment()); + } + AddDebugPos(inst); return inst; } @@ -2735,7 +2745,7 @@ FunctionEmitContext::AllocaInst(llvm::Type *llvmType, inst = new llvm::AllocaInst(llvmType, name ? name : "", bblock); // If no alignment was specified but we have an array of a uniform - // type, then align it to 4 * the native vector width; it's not + // type, then align it to the native vector alignment; it's not // unlikely that this array will be loaded into varying variables with // what will be aligned accesses if the uniform -> varying load is done // in regular chunks. 
@@ -2743,7 +2753,7 @@ FunctionEmitContext::AllocaInst(llvm::Type *llvmType, llvm::dyn_cast(llvmType); if (align == 0 && arrayType != NULL && !llvm::isa(arrayType->getElementType())) - align = 4 * g->target->getNativeVectorWidth(); + align = g->target->getNativeVectorAlignment(); if (align != 0) inst->setAlignment(align); @@ -3002,7 +3012,17 @@ FunctionEmitContext::StoreInst(llvm::Value *value, llvm::Value *ptr) { return; } - llvm::Instruction *inst = new llvm::StoreInst(value, ptr, bblock); + llvm::PointerType *pt = + llvm::dyn_cast(ptr->getType()); + AssertPos(currentPos, pt != NULL); + + llvm::StoreInst *inst = new llvm::StoreInst(value, ptr, bblock); + + if (g->opt.forceAlignedMemory && + llvm::dyn_cast(pt->getElementType())) { + inst->setAlignment(g->target->getNativeVectorAlignment()); + } + AddDebugPos(inst); } diff --git a/examples/aobench/ao.cpp b/examples/aobench/ao.cpp index 2286316d..b4e2833d 100644 --- a/examples/aobench/ao.cpp +++ b/examples/aobench/ao.cpp @@ -60,7 +60,7 @@ using namespace ispc; extern void ao_serial(int w, int h, int nsubsamples, float image[]); -static unsigned int test_iterations; +static unsigned int test_iterations[] = {3, 7, 1}; static unsigned int width, height; static unsigned char *img; static float *fimg; @@ -106,16 +106,20 @@ savePPM(const char *fname, int w, int h) int main(int argc, char **argv) { - if (argc != 4) { + if (argc < 3) { printf ("%s\n", argv[0]); - printf ("Usage: ao [num test iterations] [width] [height]\n"); + printf ("Usage: ao [width] [height] [ispc iterations] [tasks iterations] [serial iterations]\n"); getchar(); exit(-1); } else { - test_iterations = atoi(argv[1]); - width = atoi (argv[2]); - height = atoi (argv[3]); + if (argc == 6) { + for (int i = 0; i < 3; i++) { + test_iterations[i] = atoi(argv[3 + i]); + } + } + width = atoi (argv[1]); + height = atoi (argv[2]); } // Allocate space for output images @@ -127,13 +131,14 @@ int main(int argc, char **argv) // time for any of them. 
// double minTimeISPC = 1e30; - for (unsigned int i = 0; i < test_iterations; i++) { + for (unsigned int i = 0; i < test_iterations[0]; i++) { memset((void *)fimg, 0, sizeof(float) * width * height * 3); assert(NSUBSAMPLES == 2); reset_and_start_timer(); ao_ispc(width, height, NSUBSAMPLES, fimg); double t = get_elapsed_mcycles(); + printf("@time of ISPC run:\t\t\t[%.3f] million cycles\n", t); minTimeISPC = std::min(minTimeISPC, t); } @@ -147,13 +152,14 @@ int main(int argc, char **argv) // minimum time for any of them. // double minTimeISPCTasks = 1e30; - for (unsigned int i = 0; i < test_iterations; i++) { + for (unsigned int i = 0; i < test_iterations[1]; i++) { memset((void *)fimg, 0, sizeof(float) * width * height * 3); assert(NSUBSAMPLES == 2); reset_and_start_timer(); ao_ispc_tasks(width, height, NSUBSAMPLES, fimg); double t = get_elapsed_mcycles(); + printf("@time of ISPC + TASKS run:\t\t\t[%.3f] million cycles\n", t); minTimeISPCTasks = std::min(minTimeISPCTasks, t); } @@ -167,11 +173,12 @@ int main(int argc, char **argv) // minimum time. 
// double minTimeSerial = 1e30; - for (unsigned int i = 0; i < test_iterations; i++) { + for (unsigned int i = 0; i < test_iterations[2]; i++) { memset((void *)fimg, 0, sizeof(float) * width * height * 3); reset_and_start_timer(); ao_serial(width, height, NSUBSAMPLES, fimg); double t = get_elapsed_mcycles(); + printf("@time of serial run:\t\t\t\t[%.3f] million cycles\n", t); minTimeSerial = std::min(minTimeSerial, t); } diff --git a/examples/aobench/aobench.vcxproj b/examples/aobench/aobench.vcxproj index a5b354ce..c46ee41a 100644 --- a/examples/aobench/aobench.vcxproj +++ b/examples/aobench/aobench.vcxproj @@ -1,181 +1,16 @@  - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - + + {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB} + Win32Proj + aobench + ao + sse2,sse4,avx1-i32x8 + + - - - Document - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - 
$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - - - {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB} - Win32Proj - aobench - ispc - - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - ao - - - true - $(ExecutablePath);$(ProjectDir)..\.. - ao - - - false - $(ProjectDir)..\..;$(ExecutablePath) - ao - - - false - $(ProjectDir)..\..;$(ExecutablePath) - ao - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - diff --git a/examples/common.props b/examples/common.props new file mode 100644 index 00000000..7bf37005 --- /dev/null +++ b/examples/common.props @@ -0,0 +1,172 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + + Application + true + Unicode + + + Application + true + Unicode + + + Application + false + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + + + + + + + true + $(ProjectDir)..\..;$(ExecutablePath) + + + true + $(ProjectDir)..\..;$(ExecutablePath) + + + false + $(ProjectDir)..\..;$(ExecutablePath) + + + false + $(ProjectDir)..\..;$(ExecutablePath) + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(TargetDir) + true + Fast + + + Console + 
true + + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(TargetDir) + true + Fast + + + Console + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(TargetDir) + Fast + + + Console + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(TargetDir) + Fast + + + Console + true + true + true + + + + ispc + $(default_targets) + $(TargetDir)$(ISPC_file).obj + $(Target_out);$(TargetDir)$(ISPC_file)_sse2.obj + $(Target_out);$(TargetDir)$(ISPC_file)_sse4.obj + $(Target_out);$(TargetDir)$(ISPC_file)_avx.obj + $(Target_out);$(TargetDir)$(ISPC_file)_avx11.obj + $(Target_out);$(TargetDir)$(ISPC_file)_avx2.obj + + + + Document + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=$(Target_str) + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=$(Target_str) + $(Target_out);$(TargetDir)%(Filename)_ispc.h + $(Target_out);$(TargetDir)%(Filename)_ispc.h + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=$(Target_str) + $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=$(Target_str) + $(Target_out);$(TargetDir)%(Filename)_ispc.h + $(Target_out);$(TargetDir)%(Filename)_ispc.h + + + + + + diff --git a/examples/deferred/deferred_shading.vcxproj b/examples/deferred/deferred_shading.vcxproj index 94e38540..cd361b26 100755 --- a/examples/deferred/deferred_shading.vcxproj +++ b/examples/deferred/deferred_shading.vcxproj @@ -1,154 +1,13 @@ - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - {87f53c53-957e-4e91-878a-bc27828fb9eb} Win32Proj - mandelbrot - ispc + deferred + kernels + sse2,sse4-x2,avx1-x2 - - - Application - true - Unicode - - - 
Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - + @@ -156,24 +15,4 @@ - - - Document - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - 
$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - - - - diff --git a/examples/deferred/main.cpp b/examples/deferred/main.cpp index 4f2be879..d7f62f50 100644 --- a/examples/deferred/main.cpp +++ b/examples/deferred/main.cpp @@ -62,10 +62,16 @@ /////////////////////////////////////////////////////////////////////////// int main(int argc, char** argv) { - if (argc != 2) { - printf("usage: deferred_shading \n"); + if (argc < 2) { + printf("usage: deferred_shading [tasks iterations] [serial iterations]\n"); return 1; } + static unsigned int test_iterations[] = {5, 3, 500}; //last value is for nframes, it is scale. + if (argc == 5) { + for (int i = 0; i < 3; i++) { + test_iterations[i] = atoi(argv[2 + i]); + } + } InputData *input = CreateInputDataFromFile(argv[1]); if (!input) { @@ -81,9 +87,9 @@ int main(int argc, char** argv) { InitDynamicCilk(input); #endif // __cilk - int nframes = 5; + int nframes = test_iterations[2]; double ispcCycles = 1e30; - for (int i = 0; i < 5; ++i) { + for (int i = 0; i < test_iterations[0]; ++i) { framebuffer.clear(); reset_and_start_timer(); for (int j = 0; j < nframes; ++j) @@ -91,6 +97,7 @@ int main(int argc, char** argv) { VISUALIZE_LIGHT_COUNT, framebuffer.r, framebuffer.g, framebuffer.b); double mcycles = get_elapsed_mcycles() / nframes; + printf("@time of ISPC + TASKS run:\t\t\t[%.3f] million cycles\n", mcycles); ispcCycles = std::min(ispcCycles, mcycles); } printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render " @@ -98,14 +105,16 @@ int main(int argc, char** argv) { input->header.framebufferWidth, input->header.framebufferHeight); WriteFrame("deferred-ispc-static.ppm", input, framebuffer); + nframes = 3; #ifdef __cilk double dynamicCilkCycles = 1e30; - 
for (int i = 0; i < 5; ++i) { + for (int i = 0; i < test_iterations[1]; ++i) { framebuffer.clear(); reset_and_start_timer(); for (int j = 0; j < nframes; ++j) DispatchDynamicCilk(input, &framebuffer); double mcycles = get_elapsed_mcycles() / nframes; + printf("@time of serial run:\t\t\t[%.3f] million cycles\n", mcycles); dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles); } printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles to render image\n", @@ -114,12 +123,13 @@ int main(int argc, char** argv) { #endif // __cilk double serialCycles = 1e30; - for (int i = 0; i < 5; ++i) { + for (int i = 0; i < test_iterations[1]; ++i) { framebuffer.clear(); reset_and_start_timer(); for (int j = 0; j < nframes; ++j) DispatchDynamicC(input, &framebuffer); double mcycles = get_elapsed_mcycles() / nframes; + printf("@time of serial run:\t\t\t[%.3f] million cycles\n", mcycles); serialCycles = std::min(serialCycles, mcycles); } printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles to render image\n", diff --git a/examples/mandelbrot/mandelbrot.cpp b/examples/mandelbrot/mandelbrot.cpp index d2bebb96..fafc00d0 100644 --- a/examples/mandelbrot/mandelbrot.cpp +++ b/examples/mandelbrot/mandelbrot.cpp @@ -42,6 +42,7 @@ #include #include "../timing.h" #include "mandelbrot_ispc.h" +#include using namespace ispc; extern void mandelbrot_serial(float x0, float y0, float x1, float y1, @@ -67,7 +68,8 @@ writePPM(int *buf, int width, int height, const char *fn) { } -int main() { +int main(int argc, char *argv[]) { + static unsigned int test_iterations[] = {3, 3}; unsigned int width = 768; unsigned int height = 512; float x0 = -2; @@ -75,6 +77,19 @@ int main() { float y0 = -1; float y1 = 1; + if (argc > 1) { + if (strncmp(argv[1], "--scale=", 8) == 0) { + float scale = atof(argv[1] + 8); + width *= scale; + height *= scale; + } + } + if ((argc == 3) || (argc == 4)) { + for (int i = 0; i < 2; i++) { + test_iterations[i] = atoi(argv[argc - 2 + i]); + } + } + int maxIterations = 256; 
int *buf = new int[width*height]; @@ -83,10 +98,11 @@ int main() { // time of three runs. // double minISPC = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[0]; ++i) { reset_and_start_timer(); mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf); double dt = get_elapsed_mcycles(); + printf("@time of ISPC run:\t\t\t[%.3f] million cycles\n", dt); minISPC = std::min(minISPC, dt); } @@ -102,10 +118,11 @@ int main() { // minimum time. // double minSerial = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[1]; ++i) { reset_and_start_timer(); mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf); double dt = get_elapsed_mcycles(); + printf("@time of serial run:\t\t\t[%.3f] million cycles\n", dt); minSerial = std::min(minSerial, dt); } diff --git a/examples/mandelbrot/mandelbrot.vcxproj b/examples/mandelbrot/mandelbrot.vcxproj index 1b6f1281..e7703ad0 100644 --- a/examples/mandelbrot/mandelbrot.vcxproj +++ b/examples/mandelbrot/mandelbrot.vcxproj @@ -1,176 +1,15 @@  - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1} Win32Proj mandelbrot - ispc + mandelbrot + sse2,sse4-x2,avx1-x2 - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - 
$(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - + - - - Document - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - - - - diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.cpp b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp index 698daf0f..32db45bc 100644 --- a/examples/mandelbrot_tasks/mandelbrot_tasks.cpp +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp @@ -69,11 +69,12 @@ writePPM(int *buf, int width, int height, const char *fn) { static void usage() { - fprintf(stderr, "usage: mandelbrot [--scale=]\n"); + fprintf(stderr, "usage: mandelbrot [--scale=] [tasks iterations] [serial iterations]\n"); exit(1); } int main(int argc, char *argv[]) { + static unsigned 
int test_iterations[] = {7, 1}; unsigned int width = 1536; unsigned int height = 1024; float x0 = -2; @@ -81,9 +82,7 @@ int main(int argc, char *argv[]) { float y0 = -1; float y1 = 1; - if (argc == 1) - ; - else if (argc == 2) { + if (argc > 1) { if (strncmp(argv[1], "--scale=", 8) == 0) { float scale = atof(argv[1] + 8); if (scale == 0.f) @@ -94,11 +93,13 @@ int main(int argc, char *argv[]) { width = (width + 0xf) & ~0xf; height = (height + 0xf) & ~0xf; } - else - usage(); } - else - usage(); + if ((argc == 3) || (argc == 4)) { + for (int i = 0; i < 2; i++) { + test_iterations[i] = atoi(argv[argc - 2 + i]); + } + } + int maxIterations = 512; int *buf = new int[width*height]; @@ -108,13 +109,14 @@ int main(int argc, char *argv[]) { // time of three runs. // double minISPC = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[0]; ++i) { // Clear out the buffer for (unsigned int i = 0; i < width * height; ++i) buf[i] = 0; reset_and_start_timer(); mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf); double dt = get_elapsed_mcycles(); + printf("@time of ISPC + TASKS run:\t\t\t[%.3f] million cycles\n", dt); minISPC = std::min(minISPC, dt); } @@ -127,13 +129,14 @@ int main(int argc, char *argv[]) { // minimum time. 
// double minSerial = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[1]; ++i) { // Clear out the buffer for (unsigned int i = 0; i < width * height; ++i) buf[i] = 0; reset_and_start_timer(); mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf); double dt = get_elapsed_mcycles(); + printf("@time of serial run:\t\t\t[%.3f] million cycles\n", dt); minSerial = std::min(minSerial, dt); } diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj index fbebdc32..f8b8cfcb 100644 --- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj @@ -1,181 +1,16 @@ - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - {E80DA7D4-AB22-4648-A068-327307156BE6} Win32Proj mandelbrot_tasks - ispc + mandelbrot_tasks + sse2,sse4-x2,avx1-x2 - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot_tasks - - - true - $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot_tasks - - - false - $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot_tasks - - - false - $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot_tasks - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - + - - - Document - $(ISPC_compiler) -O2 
%(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - - - - diff --git a/examples/noise/noise.cpp b/examples/noise/noise.cpp index 123f98c7..0664bbd9 100644 --- a/examples/noise/noise.cpp +++ b/examples/noise/noise.cpp @@ -42,6 +42,7 @@ #include #include "../timing.h" #include "noise_ispc.h" +#include using namespace ispc; extern void noise_serial(float x0, float y0, float x1, float y1, @@ -65,7 +66,8 @@ writePPM(float *buf, int width, int height, const char *fn) { } -int main() { +int main(int argc, char *argv[]) { + static unsigned int test_iterations[] = {3, 1}; unsigned int width = 768; unsigned int height = 768; float x0 = -10; @@ -73,6 +75,18 @@ int main() { float y0 = -10; float y1 = 10; + if (argc > 1) { + if (strncmp(argv[1], "--scale=", 8) == 0) { + float scale = atof(argv[1] + 8); + width *= scale; + height *= scale; + } + } + if ((argc == 3) || 
(argc == 4)) { + for (int i = 0; i < 2; i++) { + test_iterations[i] = atoi(argv[argc - 2 + i]); + } + } float *buf = new float[width*height]; // @@ -80,10 +94,11 @@ int main() { // time of three runs. // double minISPC = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[0]; ++i) { reset_and_start_timer(); noise_ispc(x0, y0, x1, y1, width, height, buf); double dt = get_elapsed_mcycles(); + printf("@time of ISPC run:\t\t\t[%.3f] million cycles\n", dt); minISPC = std::min(minISPC, dt); } @@ -99,10 +114,11 @@ int main() { // minimum time. // double minSerial = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[1]; ++i) { reset_and_start_timer(); noise_serial(x0, y0, x1, y1, width, height, buf); double dt = get_elapsed_mcycles(); + printf("@time of serial run:\t\t\t[%.3f] million cycles\n", dt); minSerial = std::min(minSerial, dt); } diff --git a/examples/noise/noise.vcxproj b/examples/noise/noise.vcxproj index 01456625..7adc57f3 100644 --- a/examples/noise/noise.vcxproj +++ b/examples/noise/noise.vcxproj @@ -1,176 +1,15 @@ - - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - - - {0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD} - Win32Proj - noise - ispc - - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - 
$(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - - - - - Document - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2 - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2 - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - - - - + + + {0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD} + Win32Proj + noise + noise + sse2,sse4,avx1-x2 + + + + + + diff --git a/examples/options/options.vcxproj b/examples/options/options.vcxproj index 77fb9353..af336aa1 100644 --- a/examples/options/options.vcxproj +++ b/examples/options/options.vcxproj @@ -1,184 +1,17 @@ - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE} Win32Proj options - ispc + options + sse2,sse4-x2,avx1-x2 - - - Application - true - Unicode - - - Application - true - 
Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - 4305 - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - 4305 - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - 4305 - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - 4305 - Fast - - - Console - true - true - true - - + - - - - Document - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - 
$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - - - - - diff --git a/examples/rt/rt.cpp b/examples/rt/rt.cpp index 48bcc423..8f61656a 100644 --- a/examples/rt/rt.cpp +++ b/examples/rt/rt.cpp @@ -96,27 +96,27 @@ static void writeImage(int *idImage, float *depthImage, int width, int height, static void usage() { - fprintf(stderr, "rt [--scale=] \n"); + fprintf(stderr, "rt [--scale=] [ispc iterations] [tasks iterations] [serial iterations]\n"); exit(1); } int main(int argc, char *argv[]) { + static unsigned int test_iterations[] = {3, 7, 1}; float scale = 1.f; const char *filename = NULL; - for (int i = 1; i < argc; ++i) { - if (strncmp(argv[i], "--scale=", 8) == 0) { - scale = atof(argv[i] + 8); - if (scale == 0.f) - usage(); + if (argc < 2) usage(); + filename = argv[1]; + if (argc > 2) { + if (strncmp(argv[2], "--scale=", 8) == 0) { + scale = atof(argv[2] + 8); + } + } + if ((argc == 6) || (argc == 5)) { + for (int i = 0; i < 3; i++) { + test_iterations[i] = atoi(argv[argc - 3 + i]); } - else if (filename != NULL) - usage(); - else - filename = argv[i]; } - if (filename == NULL) - usage(); #define READ(var, n) \ if (fread(&(var), sizeof(var), n, f) != (unsigned int)n) { \ @@ -211,11 +211,12 @@ int main(int argc, char *argv[]) { // Run 3 iterations with ispc + 1 core, record the minimum time // double minTimeISPC = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[0]; ++i) { reset_and_start_timer(); raytrace_ispc(width, height, baseWidth, baseHeight, raster2camera, camera2world, image, id, nodes, triangles); double dt = get_elapsed_mcycles(); + printf("@time of ISPC run:\t\t\t[%.3f] million cycles\n", dt); minTimeISPC = std::min(dt, minTimeISPC); } printf("[rt ispc, 1 core]:\t\t[%.3f] million cycles for %d x %d image\n", @@ -230,11 +231,12 @@ int main(int argc, char *argv[]) { // Run 3 iterations with ispc + 1 core, record the 
minimum time // double minTimeISPCtasks = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[1]; ++i) { reset_and_start_timer(); raytrace_ispc_tasks(width, height, baseWidth, baseHeight, raster2camera, camera2world, image, id, nodes, triangles); double dt = get_elapsed_mcycles(); + printf("@time of ISPC + TASKS run:\t\t\t[%.3f] million cycles\n", dt); minTimeISPCtasks = std::min(dt, minTimeISPCtasks); } printf("[rt ispc + tasks]:\t\t[%.3f] million cycles for %d x %d image\n", @@ -250,11 +252,12 @@ int main(int argc, char *argv[]) { // minimum time. // double minTimeSerial = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[2]; ++i) { reset_and_start_timer(); raytrace_serial(width, height, baseWidth, baseHeight, raster2camera, camera2world, image, id, nodes, triangles); double dt = get_elapsed_mcycles(); + printf("@time of serial run:\t\t\t[%.3f] million cycles\n", dt); minTimeSerial = std::min(dt, minTimeSerial); } printf("[rt serial]:\t\t\t[%.3f] million cycles for %d x %d image\n", diff --git a/examples/rt/rt.vcxproj b/examples/rt/rt.vcxproj index 19d40192..ea34de56 100644 --- a/examples/rt/rt.vcxproj +++ b/examples/rt/rt.vcxproj @@ -1,181 +1,16 @@ - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9} Win32Proj rt - ispc + rt + sse2,sse4-x2,avx1-i32x8 - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - 
$(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Document - -$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx - - -$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - -$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx - - -$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - + - - - diff --git a/examples/sort/sort.vcxproj b/examples/sort/sort.vcxproj index b37eab1c..43f2b439 100644 --- a/examples/sort/sort.vcxproj +++ b/examples/sort/sort.vcxproj @@ -1,177 +1,16 @@  - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2} Win32Proj sort - ispc + sort + sse2,sse4-x2,avx1-x2 - - - Application - true - Unicode - - - Application - 
true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - + - - - Document - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - 
$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - - - - diff --git a/examples/stencil/stencil.cpp b/examples/stencil/stencil.cpp index 93d11b7e..33abc85c 100644 --- a/examples/stencil/stencil.cpp +++ b/examples/stencil/stencil.cpp @@ -40,6 +40,7 @@ #include #include +#include #include #include "../timing.h" #include "stencil_ispc.h" @@ -66,9 +67,25 @@ void InitData(int Nx, int Ny, int Nz, float *A[2], float *vsq) { } -int main() { +int main(int argc, char *argv[]) { + static unsigned int test_iterations[] = {3, 3, 3};//the last two numbers must be equal here int Nx = 256, Ny = 256, Nz = 256; int width = 4; + + if (argc > 1) { + if (strncmp(argv[1], "--scale=", 8) == 0) { + float scale = atof(argv[1] + 8); + Nx *= scale; + Ny *= scale; + Nz *= scale; + } + } + if ((argc == 4) || (argc == 5)) { + for (int i = 0; i < 3; i++) { + test_iterations[i] = atoi(argv[argc - 3 + i]); + } + } + float *Aserial[2], *Aispc[2]; Aserial[0] = new float [Nx * Ny * Nz]; Aserial[1] = new float [Nx * Ny * Nz]; @@ -79,24 +96,22 @@ int main() { float coeff[4] = { 0.5, -.25, .125, -.0625 }; InitData(Nx, Ny, Nz, Aispc, vsq); - // // Compute the image using the ispc implementation on one core; report // the minimum time of three runs. // double minTimeISPC = 1e30; -#if 0 - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[0]; ++i) { reset_and_start_timer(); loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width, width, Nz - width, Nx, Ny, Nz, coeff, vsq, Aispc[0], Aispc[1]); double dt = get_elapsed_mcycles(); + printf("@time of ISPC run:\t\t\t[%.3f] million cycles\n", dt); minTimeISPC = std::min(minTimeISPC, dt); } printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC); -#endif InitData(Nx, Ny, Nz, Aispc, vsq); @@ -105,12 +120,13 @@ int main() { // the minimum time of three runs. 
// double minTimeISPCTasks = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[1]; ++i) { reset_and_start_timer(); loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width, width, Nz - width, Nx, Ny, Nz, coeff, vsq, Aispc[0], Aispc[1]); double dt = get_elapsed_mcycles(); + printf("@time of ISPC + TASKS run:\t\t\t[%.3f] million cycles\n", dt); minTimeISPCTasks = std::min(minTimeISPCTasks, dt); } @@ -123,12 +139,13 @@ int main() { // minimum time. // double minTimeSerial = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[2]; ++i) { reset_and_start_timer(); loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width, width, Nz - width, Nx, Ny, Nz, coeff, vsq, Aserial[0], Aserial[1]); double dt = get_elapsed_mcycles(); + printf("@time of serial run:\t\t\t[%.3f] million cycles\n", dt); minTimeSerial = std::min(minTimeSerial, dt); } diff --git a/examples/stencil/stencil.vcxproj b/examples/stencil/stencil.vcxproj index a96a187d..b5f5bb22 100644 --- a/examples/stencil/stencil.vcxproj +++ b/examples/stencil/stencil.vcxproj @@ -1,181 +1,16 @@ - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - {2ef070a1-f62f-4e6a-944b-88d140945c3c} Win32Proj rt - ispc + stencil + sse2,sse4-x2,avx1-i32x8 - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - 
WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Document - -$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx - - -$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - -$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx - - -$(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - + - - - diff --git a/examples/volume_rendering/volume.cpp b/examples/volume_rendering/volume.cpp index 458cd407..b6eda986 100644 --- a/examples/volume_rendering/volume.cpp +++ b/examples/volume_rendering/volume.cpp @@ -135,10 +135,16 @@ loadVolume(const char *fn, int n[3]) { int main(int argc, char *argv[]) { - if (argc != 3) { - fprintf(stderr, "usage: volume \n"); + static unsigned int test_iterations[] = {3, 7, 1}; + if (argc < 3) { + fprintf(stderr, "usage: volume [ispc iterations] [tasks iterations] [serial 
iterations]\n"); return 1; } + if (argc == 6) { + for (int i = 0; i < 3; i++) { + test_iterations[i] = atoi(argv[3 + i]); + } + } // // Load viewing data and the volume density data @@ -156,11 +162,12 @@ int main(int argc, char *argv[]) { // time of three runs. // double minISPC = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[0]; ++i) { reset_and_start_timer(); volume_ispc(density, n, raster2camera, camera2world, width, height, image); double dt = get_elapsed_mcycles(); + printf("@time of ISPC run:\t\t\t[%.3f] million cycles\n", dt); minISPC = std::min(minISPC, dt); } @@ -176,11 +183,12 @@ int main(int argc, char *argv[]) { // tasks; report the minimum time of three runs. // double minISPCtasks = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[1]; ++i) { reset_and_start_timer(); volume_ispc_tasks(density, n, raster2camera, camera2world, width, height, image); double dt = get_elapsed_mcycles(); + printf("@time of ISPC + TASKS run:\t\t\t[%.3f] million cycles\n", dt); minISPCtasks = std::min(minISPCtasks, dt); } @@ -196,11 +204,12 @@ int main(int argc, char *argv[]) { // minimum time. 
// double minSerial = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[2]; ++i) { reset_and_start_timer(); volume_serial(density, n, raster2camera, camera2world, width, height, image); double dt = get_elapsed_mcycles(); + printf("@time of serial run:\t\t\t[%.3f] million cycles\n", dt); minSerial = std::min(minSerial, dt); } diff --git a/examples/volume_rendering/volume.vcxproj b/examples/volume_rendering/volume.vcxproj index d3594b98..cc738a7e 100644 --- a/examples/volume_rendering/volume.vcxproj +++ b/examples/volume_rendering/volume.vcxproj @@ -1,177 +1,16 @@ - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - {dee5733a-e93e-449d-9114-9bffcaeb4df9} Win32Proj volume - ispc + volume + sse2,sse4-x2,avx1-i32x8 - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - true - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - false - $(ProjectDir)..\..;$(ExecutablePath) - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - + - - - Document - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h 
$(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx - - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - - - - diff --git a/fail_db.txt b/fail_db.txt index 32917815..ff119d5a 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -277,13 +277,7 @@ .\tests\reduce-min-uint64.ispc runfail x86 avx1-i64x4 Windows LLVM 3.4 cl -O2 * ./tests/atomics-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/atomics-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * -./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 
Linux LLVM 3.3 clang++3.3 -O2 * ./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O2 * ./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 clang++3.3 -O2 * @@ -572,3 +566,68 @@ ./tests/reduce-equal-5.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * ./tests/reduce-equal-6.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * ./tests/reduce-equal-8.ispc compfail x86-64 generic-16 Linux LLVM 3.4 clang++3.3 -O0 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O2 * +./tests/half-1.ispc runfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-1.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-10.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-11.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-12.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-13.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-14.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-2.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-3.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-4.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-9.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-add-1.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-add-10.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-add-8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-add-9.ispc compfail x86-64 generic-4 Linux 
LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-and-1.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-and-2.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-or-1.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-1.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-10.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-12.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-13.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-2.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-3.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-4.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-5.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-6.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O2 * +./tests/half-1.ispc runfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/ptr-19.ispc runfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-1.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-10.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-11.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 
-O0 * +./tests/atomics-12.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-13.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-14.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-2.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-3.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-4.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/atomics-9.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-add-1.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-add-10.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-add-8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-add-9.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-and-1.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-and-2.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/exclusive-scan-or-1.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-1.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-10.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-12.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-13.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-2.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-3.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-4.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-5.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-6.ispc compfail x86-64 
generic-16 Linux LLVM 3.5 clang++3.3 -O0 * +./tests/reduce-equal-8.ispc compfail x86-64 generic-16 Linux LLVM 3.5 clang++3.3 -O0 * diff --git a/ispc.cpp b/ispc.cpp index a0b96f27..fabe6614 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -193,6 +193,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo m_tf_attributes(NULL), #endif m_nativeVectorWidth(-1), + m_nativeVectorAlignment(-1), m_dataTypeWidth(-1), m_vectorWidth(-1), m_generatePIC(pic), @@ -318,6 +319,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo !strcasecmp(isa, "sse2-i32x4")) { this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 32; this->m_vectorWidth = 4; this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" @@ -334,6 +336,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo !strcasecmp(isa, "sse2-i32x8")) { this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" @@ -350,6 +353,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo !strcasecmp(isa, "sse4-i32x4")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 32; this->m_vectorWidth = 4; // TODO: why not sse42 and popcnt? 
@@ -368,6 +372,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo !strcasecmp(isa, "sse4-i32x8")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" @@ -383,6 +388,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo else if (!strcasecmp(isa, "sse4-i8x16")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 16; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 8; this->m_vectorWidth = 16; this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" @@ -398,6 +404,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo else if (!strcasecmp(isa, "sse4-i16x8")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 16; this->m_vectorWidth = 8; this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" @@ -414,6 +421,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo !strcasecmp(isa, "generic-x4")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 4; + this->m_nativeVectorAlignment = 16; this->m_vectorWidth = 4; this->m_maskingIsFree = true; this->m_maskBitCount = 1; @@ -425,6 +433,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo !strcasecmp(isa, "generic-x8")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 32; this->m_vectorWidth = 8; this->m_maskingIsFree = true; this->m_maskBitCount = 1; @@ -436,6 +445,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo !strcasecmp(isa, "generic-x16")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 16; + this->m_nativeVectorAlignment = 64; this->m_vectorWidth = 16; this->m_maskingIsFree = 
true; this->m_maskBitCount = 1; @@ -447,6 +457,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo !strcasecmp(isa, "generic-x32")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 32; + this->m_nativeVectorAlignment = 64; this->m_vectorWidth = 32; this->m_maskingIsFree = true; this->m_maskBitCount = 1; @@ -458,6 +469,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo !strcasecmp(isa, "generic-x64")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 64; + this->m_nativeVectorAlignment = 64; this->m_vectorWidth = 64; this->m_maskingIsFree = true; this->m_maskBitCount = 1; @@ -469,6 +481,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo !strcasecmp(isa, "generic-x1")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 1; + this->m_nativeVectorAlignment = 16; this->m_vectorWidth = 1; this->m_maskingIsFree = false; this->m_maskBitCount = 32; @@ -476,6 +489,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo else if (!strcasecmp(isa, "avx1-i32x4")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 32; this->m_vectorWidth = 4; this->m_attributes = "+avx,+popcnt,+cmov"; @@ -487,6 +501,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo !strcasecmp(isa, "avx1-i32x8")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+avx,+popcnt,+cmov"; @@ -497,6 +512,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo !strcasecmp(isa, "avx1-i64x4")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 64; this->m_vectorWidth = 4; this->m_attributes 
= "+avx,+popcnt,+cmov"; @@ -508,6 +524,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo !strcasecmp(isa, "avx1-i32x16")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 32; this->m_vectorWidth = 16; this->m_attributes = "+avx,+popcnt,+cmov"; @@ -518,6 +535,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo !strcasecmp(isa, "avx1.1-i32x8")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+avx,+popcnt,+cmov,+f16c" @@ -539,6 +557,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo !strcasecmp(isa, "avx1.1-i32x16")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 32; this->m_vectorWidth = 16; this->m_attributes = "+avx,+popcnt,+cmov,+f16c" @@ -559,6 +578,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo else if (!strcasecmp(isa, "avx1.1-i64x4")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 64; this->m_vectorWidth = 4; this->m_attributes = "+avx,+popcnt,+cmov,+f16c" @@ -580,6 +600,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo !strcasecmp(isa, "avx2-i32x8")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" @@ -605,6 +626,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo !strcasecmp(isa, "avx2-i32x16")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 16; + this->m_nativeVectorAlignment = 32; 
this->m_dataTypeWidth = 32; this->m_vectorWidth = 16; this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" @@ -629,6 +651,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo else if (!strcasecmp(isa, "avx2-i64x4")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 64; this->m_vectorWidth = 4; this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" @@ -654,6 +677,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo else if (!strcasecmp(isa, "neon-i8x16")) { this->m_isa = Target::NEON8; this->m_nativeVectorWidth = 16; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 8; this->m_vectorWidth = 16; this->m_attributes = "+neon,+fp16"; @@ -664,6 +688,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo else if (!strcasecmp(isa, "neon-i16x8")) { this->m_isa = Target::NEON16; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 16; this->m_vectorWidth = 8; this->m_attributes = "+neon,+fp16"; @@ -675,6 +700,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo !strcasecmp(isa, "neon-i32x4")) { this->m_isa = Target::NEON32; this->m_nativeVectorWidth = 4; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 32; this->m_vectorWidth = 4; this->m_attributes = "+neon,+fp16"; @@ -950,7 +976,8 @@ Target::GetISATargetString() const { static bool lGenericTypeLayoutIndeterminate(llvm::Type *type) { - if (type->isPrimitiveType() || type->isIntegerTy()) + if (type->isFloatingPointTy() || type->isX86_MMXTy() || type->isVoidTy() || + type->isIntegerTy() || type->isLabelTy() || type->isMetadataTy()) return false; if (type == LLVMTypes::BoolVectorType || diff --git a/ispc.h b/ispc.h index 8a03023b..652390d2 100644 --- a/ispc.h +++ b/ispc.h @@ -261,6 +261,8 @@ public: int getNativeVectorWidth() 
const {return m_nativeVectorWidth;} + int getNativeVectorAlignment() const {return m_nativeVectorAlignment;} + int getDataTypeWidth() const {return m_dataTypeWidth;} int getVectorWidth() const {return m_vectorWidth;} @@ -334,6 +336,13 @@ private: SSE, 8 for AVX, etc.) */ int m_nativeVectorWidth; + /** Native vector alignment in bytes. Theoretically this may be derived + from the vector size, but it's better to manage directly the alignment. + It allows easier experimenting and better fine tuning for particular + platform. This information is primarily used when + --opt=force-aligned-memory is used. */ + int m_nativeVectorAlignment; + /** Data type with in bits. Typically it's 32, but could be 8, 16 or 64. For generic it's -1, which means undefined. */ int m_dataTypeWidth; diff --git a/ispc.vcxproj b/ispc.vcxproj index b9a3b6c5..8aee2988 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -57,17 +57,17 @@ - - - - + + + + - - - - - + + + + + 4146;4800;4996;4355;4624;4005;4003;4018 @@ -132,383 +132,215 @@ Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 32bit > $(Configuration)/gen-bitcode-sse4-32bit.cpp - $(Configuration)/gen-bitcode-sse4-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 32bit > $(Configuration)/gen-bitcode-sse4-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 64bit > $(Configuration)/gen-bitcode-sse4-64bit.cpp + $(Configuration)/gen-bitcode-sse4-32bit.cpp; $(Configuration)/gen-bitcode-sse4-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll - Building gen-bitcode-sse4-32bit.cpp + Building gen-bitcode-sse4-32bit.cpp and gen-bitcode-sse4-64bit.cpp - + Document - m4 -Ibuiltins/
-DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 64bit > $(Configuration)/gen-bitcode-sse4-64bit.cpp - $(Configuration)/gen-bitcode-sse4-64bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 32bit > $(Configuration)/gen-bitcode-sse4-8-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 64bit > $(Configuration)/gen-bitcode-sse4-8-64bit.cpp + $(Configuration)/gen-bitcode-sse4-8-32bit.cpp; $(Configuration)/gen-bitcode-sse4-8-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll - Building gen-bitcode-sse4-64bit.cpp + Building gen-bitcode-sse4-8-32bit.cpp and gen-bitcode-sse4-8-64bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 32bit > $(Configuration)/gen-bitcode-sse4-8-32bit.cpp - $(Configuration)/gen-bitcode-sse4-8-32bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll - Building gen-bitcode-sse4-8-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 64bit > $(Configuration)/gen-bitcode-sse4-8-64bit.cpp - $(Configuration)/gen-bitcode-sse4-8-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll - Building gen-bitcode-sse4-8-64bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 32bit > $(Configuration)/gen-bitcode-sse4-16-32bit.cpp - $(Configuration)/gen-bitcode-sse4-16-32bit.cpp - 
builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll - Building gen-bitcode-sse4-16-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 64bit > $(Configuration)/gen-bitcode-sse4-16-64bit.cpp - $(Configuration)/gen-bitcode-sse4-16-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll - Building gen-bitcode-sse4-16-64bit.cpp - - - + Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 32bit > $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp - $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 32bit > $(Configuration)/gen-bitcode-sse4-16-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 64bit > $(Configuration)/gen-bitcode-sse4-16-64bit.cpp + $(Configuration)/gen-bitcode-sse4-16-32bit.cpp; $(Configuration)/gen-bitcode-sse4-16-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll - Building gen-bitcode-sse4-x2-32bit.cpp + Building gen-bitcode-sse4-16-32bit.cpp and gen-bitcode-sse4-16-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 64bit > $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp - $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 32bit > $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp; + m4 -Ibuiltins/ 
-DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 64bit > $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp + $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp; $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll - Building gen-bitcode-sse4-x2-64bit.cpp + Building gen-bitcode-sse4-x2-32bit.cpp and gen-bitcode-sse4-x2-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 32bit > $(Configuration)/gen-bitcode-sse2-32bit.cpp - $(Configuration)/gen-bitcode-sse2-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 32bit > $(Configuration)/gen-bitcode-sse2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 64bit > $(Configuration)/gen-bitcode-sse2-64bit.cpp + $(Configuration)/gen-bitcode-sse2-32bit.cpp; $(Configuration)/gen-bitcode-sse2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll - Building gen-bitcode-sse2-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 64bit > $(Configuration)/gen-bitcode-sse2-64bit.cpp - $(Configuration)/gen-bitcode-sse2-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll - Building gen-bitcode-sse2-64bit.cpp + Building gen-bitcode-sse2-32bit.cpp and gen-bitcode-sse2-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 32bit > $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp - 
$(Configuration)/gen-bitcode-sse2-x2-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 32bit > $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 64bit > $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp + $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp; $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll - Building gen-bitcode-sse2-x2-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 64bit > $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp - $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll - Building gen-bitcode-sse2-x2-64bit.cpp + Building gen-bitcode-sse2-x2-32bit.cpp and gen-bitcode-sse2-x2-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp - $(Configuration)/gen-bitcode-avx1-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp + $(Configuration)/gen-bitcode-avx1-32bit.cpp; $(Configuration)/gen-bitcode-avx1-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building 
gen-bitcode-avx1-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp - $(Configuration)/gen-bitcode-avx1-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx1-64bit.cpp + Building gen-bitcode-avx1-32bit.cpp and gen-bitcode-avx1-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 32bit > $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp - $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 32bit > $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 64bit > $(Configuration)/gen-bitcode-avx1-x2-64bit.cpp + $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp; $(Configuration)/gen-bitcode-avx1-x2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll - Building gen-bitcode-avx1-x2-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 64bit > $(Configuration)/gen-bitcode-avx1-x2-64bit.cpp - $(Configuration)/gen-bitcode-avx1-x2-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll - Building gen-bitcode-avx1-x2-64bit.cpp + Building gen-bitcode-avx1-x2-32bit.cpp and gen-bitcode-avx1-x2-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 
builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp - $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp + $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp; $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll - Building gen-bitcode-avx1-i64x4-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp - $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll - Building gen-bitcode-avx1-i64x4-64bit.cpp + Building gen-bitcode-avx1-i64x4-32bit.cpp and gen-bitcode-avx1-i64x4-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 32bit > $(Configuration)/gen-bitcode-avx11-32bit.cpp - $(Configuration)/gen-bitcode-avx11-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 32bit > $(Configuration)/gen-bitcode-avx11-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 
builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 64bit > $(Configuration)/gen-bitcode-avx11-64bit.cpp + $(Configuration)/gen-bitcode-avx11-32bit.cpp; $(Configuration)/gen-bitcode-avx11-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx11-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 64bit > $(Configuration)/gen-bitcode-avx11-64bit.cpp - $(Configuration)/gen-bitcode-avx11-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx11-64bit.cpp + Building gen-bitcode-avx11-32bit.cpp and gen-bitcode-avx11-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 32bit > $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp - $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 32bit > $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 64bit > $(Configuration)/gen-bitcode-avx11-x2-64bit.cpp + $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp; $(Configuration)/gen-bitcode-avx11-x2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll - Building gen-bitcode-avx11-x2-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 64bit > $(Configuration)/gen-bitcode-avx11-x2-64bit.cpp - 
$(Configuration)/gen-bitcode-avx11-x2-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll - Building gen-bitcode-avx11-x2-64bit.cpp + Building gen-bitcode-avx11-x2-32bit.cpp and gen-bitcode-avx11-x2-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-i64x4.ll | python bitcode2cpp.py builtins\target-avx11-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx11-i64x4-32bit.cpp - $(Configuration)/gen-bitcode-avx11-i64x4-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-i64x4.ll | python bitcode2cpp.py builtins\target-avx11-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx11-i64x4-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11-i64x4.ll | python bitcode2cpp.py builtins\target-avx11-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx11-i64x4-64bit.cpp + $(Configuration)/gen-bitcode-avx11-i64x4-32bit.cpp; $(Configuration)/gen-bitcode-avx11-i64x4-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll - Building gen-bitcode-avx11-i64x4-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11-i64x4.ll | python bitcode2cpp.py builtins\target-avx11-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx11-i64x4-64bit.cpp - $(Configuration)/gen-bitcode-avx11-i64x4-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll - Building gen-bitcode-avx11-i64x4-64bit.cpp + Building gen-bitcode-avx11-i64x4-32bit.cpp and gen-bitcode-avx11-i64x4-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 32bit > 
$(Configuration)/gen-bitcode-avx2-32bit.cpp - $(Configuration)/gen-bitcode-avx2-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 32bit > $(Configuration)/gen-bitcode-avx2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 64bit > $(Configuration)/gen-bitcode-avx2-64bit.cpp + $(Configuration)/gen-bitcode-avx2-32bit.cpp; $(Configuration)/gen-bitcode-avx2-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx2-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 64bit > $(Configuration)/gen-bitcode-avx2-64bit.cpp - $(Configuration)/gen-bitcode-avx2-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx2-64bit.cpp + Building gen-bitcode-avx2-32bit.cpp and gen-bitcode-avx2-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 32bit > $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp - $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 32bit > $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 64bit > $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp + $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp; $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp 
builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll - Building gen-bitcode-avx2-x2-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 64bit > $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp - $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll - Building gen-bitcode-avx2-x2-64bit.cpp + Building gen-bitcode-avx2-x2-32bit.cpp and gen-bitcode-avx2-x2-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-i64x4.ll | python bitcode2cpp.py builtins\target-avx2-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx2-i64x4-32bit.cpp - $(Configuration)/gen-bitcode-avx2-i64x4-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-i64x4.ll | python bitcode2cpp.py builtins\target-avx2-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx2-i64x4-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2-i64x4.ll | python bitcode2cpp.py builtins\target-avx2-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx2-i64x4-64bit.cpp + $(Configuration)/gen-bitcode-avx2-i64x4-32bit.cpp; $(Configuration)/gen-bitcode-avx2-i64x4-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll - Building gen-bitcode-avx2-i64x4-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2-i64x4.ll | python bitcode2cpp.py builtins\target-avx2-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx2-i64x4-64bit.cpp - $(Configuration)/gen-bitcode-avx2-i64x4-64bit.cpp - 
builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll - Building gen-bitcode-avx2-i64x4-64bit.cpp + Building gen-bitcode-avx2-i64x4-32bit.cpp and gen-bitcode-avx2-i64x4-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 32bit > $(Configuration)/gen-bitcode-generic-1-32bit.cpp - $(Configuration)/gen-bitcode-generic-1-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 32bit > $(Configuration)/gen-bitcode-generic-1-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 64bit > $(Configuration)/gen-bitcode-generic-1-64bit.cpp + $(Configuration)/gen-bitcode-generic-1-32bit.cpp; $(Configuration)/gen-bitcode-generic-1-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-1-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 64bit > $(Configuration)/gen-bitcode-generic-1-64bit.cpp - $(Configuration)/gen-bitcode-generic-1-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-1-64bit.cpp + Building gen-bitcode-generic-1-32bit.cpp and gen-bitcode-generic-1-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 32bit > $(Configuration)/gen-bitcode-generic-4-32bit.cpp - $(Configuration)/gen-bitcode-generic-4-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS 
-DRUNTIME=32 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 32bit > $(Configuration)/gen-bitcode-generic-4-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 64bit > $(Configuration)/gen-bitcode-generic-4-64bit.cpp + $(Configuration)/gen-bitcode-generic-4-32bit.cpp; $(Configuration)/gen-bitcode-generic-4-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-4-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 64bit > $(Configuration)/gen-bitcode-generic-4-64bit.cpp - $(Configuration)/gen-bitcode-generic-4-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-4-64bit.cpp + Building gen-bitcode-generic-4-32bit.cpp and gen-bitcode-generic-4-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 32bit > $(Configuration)/gen-bitcode-generic-8-32bit.cpp - $(Configuration)/gen-bitcode-generic-8-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 32bit > $(Configuration)/gen-bitcode-generic-8-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 64bit > $(Configuration)/gen-bitcode-generic-8-64bit.cpp + $(Configuration)/gen-bitcode-generic-8-32bit.cpp; $(Configuration)/gen-bitcode-generic-8-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-8-32bit.cpp - - - - - 
Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 64bit > $(Configuration)/gen-bitcode-generic-8-64bit.cpp - $(Configuration)/gen-bitcode-generic-8-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-8-64bit.cpp + Building gen-bitcode-generic-8-32bit.cpp and gen-bitcode-generic-8-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 32bit > $(Configuration)/gen-bitcode-generic-16-32bit.cpp - $(Configuration)/gen-bitcode-generic-16-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 32bit > $(Configuration)/gen-bitcode-generic-16-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 64bit > $(Configuration)/gen-bitcode-generic-16-64bit.cpp + $(Configuration)/gen-bitcode-generic-16-32bit.cpp; $(Configuration)/gen-bitcode-generic-16-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-16-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 64bit > $(Configuration)/gen-bitcode-generic-16-64bit.cpp - $(Configuration)/gen-bitcode-generic-16-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-16-64bit.cpp + Building gen-bitcode-generic-16-32bit.cpp and gen-bitcode-generic-16-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 
builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 32bit > $(Configuration)/gen-bitcode-generic-32-32bit.cpp - $(Configuration)/gen-bitcode-generic-32-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 32bit > $(Configuration)/gen-bitcode-generic-32-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 64bit > $(Configuration)/gen-bitcode-generic-32-64bit.cpp + $(Configuration)/gen-bitcode-generic-32-32bit.cpp; $(Configuration)/gen-bitcode-generic-32-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-32-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 64bit > $(Configuration)/gen-bitcode-generic-32-64bit.cpp - $(Configuration)/gen-bitcode-generic-32-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-32-64bit.cpp + Building gen-bitcode-generic-32-32bit.cpp and gen-bitcode-generic-32-64bit.cpp Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 32bit > $(Configuration)/gen-bitcode-generic-64-32bit.cpp - $(Configuration)/gen-bitcode-generic-64-32bit.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 32bit > $(Configuration)/gen-bitcode-generic-64-32bit.cpp; + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-64.ll | python bitcode2cpp.py 
builtins\target-generic-64.ll 64bit > $(Configuration)/gen-bitcode-generic-64-64bit.cpp + $(Configuration)/gen-bitcode-generic-64-32bit.cpp; $(Configuration)/gen-bitcode-generic-64-64bit.cpp builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-64-32bit.cpp + Building gen-bitcode-generic-64-32bit.cpp and gen-bitcode-generic-64-64bit.cpp - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 64bit > $(Configuration)/gen-bitcode-generic-64-64bit.cpp - $(Configuration)/gen-bitcode-generic-64-64bit.cpp - builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll - Building gen-bitcode-generic-64-64bit.cpp - - - - + Document flex -t lex.ll > $(Configuration)\lex.cc $(Configuration)\lex.cc @@ -597,4 +429,4 @@ - + diff --git a/llvm_patches/3_3_r193261_bug17631_196261_win_vzeroupper.patch b/llvm_patches/3_3_r193261_bug17631_196261_win_vzeroupper.patch new file mode 100644 index 00000000..8f0a790b --- /dev/null +++ b/llvm_patches/3_3_r193261_bug17631_196261_win_vzeroupper.patch @@ -0,0 +1,115 @@ +From b9b016cda57d8afc26a150de7ee329b54a994c85 Mon Sep 17 00:00:00 2001 +From: Michael Liao +Date: Mon, 21 Oct 2013 17:47:58 -0700 +Subject: [PATCH] Fix PR17631 + +- Skip instructions added in prolog. For specific targets, prolog may + insert helper function calls (e.g. _chkstk will be called when + there're more than 4K bytes allocated on stack). However, these + helpers don't use/def YMM/XMM registers. + It also include second fix for the problem: r196261+r196391. 
+ +diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp +index 477f75a..0d37a7d 100644 +--- lib/Target/X86/X86VZeroUpper.cpp ++++ lib/Target/X86/X86VZeroUpper.cpp +@@ -121,7 +121,7 @@ + } + + static bool clobbersAllYmmRegs(const MachineOperand &MO) { +- for (unsigned reg = X86::YMM0; reg < X86::YMM15; ++reg) { ++ for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) { + if (!MO.clobbersPhysReg(reg)) + return false; + } +@@ -143,6 +143,21 @@ + return false; + } + ++/// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this ++/// instruction. ++static bool clobbersAnyYmmReg(MachineInstr *MI) { ++ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { ++ const MachineOperand &MO = MI->getOperand(i); ++ if (!MO.isRegMask()) ++ continue; ++ for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) { ++ if (MO.clobbersPhysReg(reg)) ++ return true; ++ } ++ } ++ return false; ++} ++ + /// runOnMachineFunction - Loop over all of the basic blocks, inserting + /// vzero upper instructions before function calls. + bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { +@@ -226,8 +241,9 @@ + bool BBHasCall = false; + + for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { ++ DebugLoc dl = I->getDebugLoc(); + MachineInstr *MI = I; +- DebugLoc dl = I->getDebugLoc(); ++ + bool isControlFlow = MI->isCall() || MI->isReturn(); + + // Shortcut: don't need to check regular instructions in dirty state. +@@ -246,6 +262,14 @@ + if (!isControlFlow) + continue; + ++ // If the call won't clobber any YMM register, skip it as well. It usually ++ // happens on helper function calls (such as '_chkstk', '_ftol2') where ++ // standard calling convention is not used (RegMask is not used to mark ++ // register clobbered and register usage (def/imp-def/use) is well-dfined ++ // and explicitly specified. 
++ if (MI->isCall() && !clobbersAnyYmmReg(MI)) ++ continue; ++ + BBHasCall = true; + + // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX +diff --git a/test/CodeGen/X86/pr17631.ll b/test/CodeGen/X86/pr17631.ll +new file mode 100644 +index 0000000..a572ff2 +--- /dev/null ++++ test/CodeGen/X86/pr17631.ll +@@ -0,0 +1,34 @@ ++; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-pc-win32 | FileCheck %s ++ ++%struct_type = type { [64 x <8 x float>], <8 x float> } ++ ++; Function Attrs: nounwind readnone ++declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) ++ ++; Function Attrs: nounwind ++define i32 @equal(<8 x i32> %A) { ++allocas: ++ %first_alloc = alloca [64 x <8 x i32>] ++ %second_alloc = alloca %struct_type ++ ++ %A1 = bitcast <8 x i32> %A to <8 x float> ++ %A2 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %A1) ++ ret i32 %A2 ++} ++ ++; CHECK: equal ++; CHECK-NOT: vzeroupper ++; CHECK: _chkstk ++; CHECK: ret ++ ++define <8 x float> @foo(<8 x float> %y, i64* %p, double %x) { ++ %i = fptoui double %x to i64 ++ store i64 %i, i64* %p ++ %ret = fadd <8 x float> %y, %y ++ ret <8 x float> %ret ++} ++ ++; CHECK: foo ++; CHECK-NOT: vzeroupper ++; CHECK: _ftol2 ++; CHECK: ret +-- +1.8.1.2 + diff --git a/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch b/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch deleted file mode 100644 index b6abb1d3..00000000 --- a/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch +++ /dev/null @@ -1,69 +0,0 @@ -From b9b016cda57d8afc26a150de7ee329b54a994c85 Mon Sep 17 00:00:00 2001 -From: Michael Liao -Date: Mon, 21 Oct 2013 17:47:58 -0700 -Subject: [PATCH] Fix PR17631 - -- Skip instructions added in prolog. For specific targets, prolog may - insert helper function calls (e.g. _chkstk will be called when - there're more than 4K bytes allocated on stack). However, these - helpers don't use/def YMM/XMM registers. 
---- - lib/Target/X86/X86VZeroUpper.cpp | 11 ++++++++++- - test/CodeGen/X86/pr17631.ll | 22 ++++++++++++++++++++++ - 2 files changed, 32 insertions(+), 1 deletion(-) - create mode 100644 test/CodeGen/X86/pr17631.ll - -diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp -index 477f75a..0d37a7d 100644 ---- lib/Target/X86/X86VZeroUpper.cpp -+++ lib/Target/X86/X86VZeroUpper.cpp -@@ -231,8 +231,17 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF, - bool BBHasCall = false; - - for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { -- MachineInstr *MI = I; - DebugLoc dl = I->getDebugLoc(); -+ MachineInstr *MI = I; -+ -+ // Don't need to check instructions added in prolog. -+ // In prolog, special function calls may be added for specific targets -+ // (e.g. on Windows, a prolog helper '_chkstk' is called when the local -+ // variables exceed 4K bytes on stack.) These helpers won't use/def YMM/XMM -+ // registers. -+ if (MI->getFlag(MachineInstr::FrameSetup)) -+ continue; -+ - bool isControlFlow = MI->isCall() || MI->isReturn(); - - // Shortcut: don't need to check regular instructions in dirty state. 
-diff --git a/test/CodeGen/X86/pr17631.ll b/test/CodeGen/X86/pr17631.ll -new file mode 100644 -index 0000000..a572ff2 ---- /dev/null -+++ test/CodeGen/X86/pr17631.ll -@@ -0,0 +1,22 @@ -+; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-pc-win32 | FileCheck %s -+ -+%struct_type = type { [64 x <8 x float>], <8 x float> } -+ -+; Function Attrs: nounwind readnone -+declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) -+ -+; Function Attrs: nounwind -+define i32 @equal(<8 x i32> %A) { -+allocas: -+ %first_alloc = alloca [64 x <8 x i32>] -+ %second_alloc = alloca %struct_type -+ -+ %A1 = bitcast <8 x i32> %A to <8 x float> -+ %A2 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %A1) -+ ret i32 %A2 -+} -+ -+; CHECK: equal -+; CHECK-NOT: vzeroupper -+; CHECK: _chkstk -+; CHECK: ret --- -1.8.1.2 - diff --git a/llvm_patches/3_3_r195476_r195779_i16_sext.patch b/llvm_patches/3_3_r195476_r195779_i16_sext.patch new file mode 100644 index 00000000..a49325c9 --- /dev/null +++ b/llvm_patches/3_3_r195476_r195779_i16_sext.patch @@ -0,0 +1,57 @@ +Two stability patches affecting sse4-i16x8 and sse4-i8x16 targets. See PR18014 and PR18054 for more details. + +Index: lib/Target/X86/X86ISelLowering.cpp +=================================================================== +--- lib/Target/X86/X86ISelLowering.cpp (revision 195862) ++++ lib/Target/X86/X86ISelLowering.cpp (working copy) +@@ -12099,19 +12099,27 @@ + // fall through + case MVT::v4i32: + case MVT::v8i16: { +- // (sext (vzext x)) -> (vsext x) + SDValue Op0 = Op.getOperand(0); + SDValue Op00 = Op0.getOperand(0); + SDValue Tmp1; + // Hopefully, this VECTOR_SHUFFLE is just a VZEXT. 
+ if (Op0.getOpcode() == ISD::BITCAST && +- Op00.getOpcode() == ISD::VECTOR_SHUFFLE) ++ Op00.getOpcode() == ISD::VECTOR_SHUFFLE) { ++ // (sext (vzext x)) -> (vsext x) + Tmp1 = LowerVectorIntExtend(Op00, DAG); +- if (Tmp1.getNode()) { +- SDValue Tmp1Op0 = Tmp1.getOperand(0); +- assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT && +- "This optimization is invalid without a VZEXT."); +- return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0)); ++ if (Tmp1.getNode()) { ++ EVT ExtraEltVT = ExtraVT.getVectorElementType(); ++ // This folding is only valid when the in-reg type is a vector of i8, ++ // i16, or i32. ++ if (ExtraEltVT == MVT::i8 || ExtraEltVT == MVT::i16 || ++ ExtraEltVT == MVT::i32) { ++ SDValue Tmp1Op0 = Tmp1.getOperand(0); ++ assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT && ++ "This optimization is invalid without a VZEXT."); ++ return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0)); ++ } ++ Op0 = Tmp1; ++ } + } + + // If the above didn't work, then just use Shift-Left + Shift-Right. +@@ -15826,6 +15834,15 @@ + if (BitWidth == 1) + return SDValue(); + ++ // Check all uses of that condition operand to check whether it will be ++ // consumed by non-BLEND instructions, which may depend on all bits are set ++ // properly. ++ for (SDNode::use_iterator I = Cond->use_begin(), ++ E = Cond->use_end(); I != E; ++I) ++ if (I->getOpcode() != ISD::VSELECT) ++ // TODO: Add other opcodes eventually lowered into BLEND. ++ return SDValue(); ++ + assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); + APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); + diff --git a/module.cpp b/module.cpp index fa9b98a4..4ca1b351 100644 --- a/module.cpp +++ b/module.cpp @@ -2134,7 +2134,7 @@ lAddExtractedGlobals(llvm::Module *module, // example, this happens with varying globals if we compile // to different vector widths. 
if (gv2->getType() != gv->getType()) - Error(rgi.pos, "Mismatch in size/layout of global " + Warning(rgi.pos, "Mismatch in size/layout of global " "variable \"%s\" with different targets. " "Globals must not include \"varying\" types or arrays " "with size based on programCount when compiling to " diff --git a/opt.cpp b/opt.cpp index 8df0f4fe..9059c746 100644 --- a/opt.cpp +++ b/opt.cpp @@ -127,6 +127,8 @@ static llvm::Pass *CreateDebugPass(char * output); static llvm::Pass *CreateReplaceStdlibShiftPass(); +static llvm::Pass *CreateFixBooleanSelectPass(); + #define DEBUG_START_PASS(NAME) \ if (g->debugPrint && \ (getenv("FUNC") == NULL || \ @@ -659,6 +661,9 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(CreateMakeInternalFuncsStaticPass()); optPM.add(llvm::createGlobalDCEPass()); optPM.add(llvm::createConstantMergePass()); + + // Should be the last + optPM.add(CreateFixBooleanSelectPass(), 400); } // Finish up by making sure we didn't mess anything up in the IR along @@ -670,6 +675,7 @@ Optimize(llvm::Module *module, int optLevel) { printf("\n*****\nFINAL OUTPUT\n*****\n"); module->dump(); } + } @@ -898,7 +904,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) { lCopyMetadata(castPtr, callInst); int align; if (g->opt.forceAlignedMemory) - align = 0; + align = g->target->getNativeVectorAlignment(); else align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8; name = LLVMGetName(callInst->getArgOperand(0), "_load"); @@ -940,7 +946,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) { new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL); int align; if (g->opt.forceAlignedMemory) - align = 0; + align = g->target->getNativeVectorAlignment(); else align = callInst->getCalledFunction() == avxMaskedStore32 ? 
4 : 8; storeInst->setAlignment(align); @@ -1022,12 +1028,12 @@ InstructionSimplifyPass::simplifyBoolVec(llvm::Value *value) { if (trunc != NULL) { // Convert trunc({sext,zext}(i1 vector)) -> (i1 vector) llvm::SExtInst *sext = llvm::dyn_cast(value); - if (sext && + if (sext && sext->getOperand(0)->getType() == LLVMTypes::Int1VectorType) return sext->getOperand(0); llvm::ZExtInst *zext = llvm::dyn_cast(value); - if (zext && + if (zext && zext->getOperand(0)->getType() == LLVMTypes::Int1VectorType) return zext->getOperand(0); } @@ -1853,7 +1859,7 @@ lIs32BitSafeHelper(llvm::Value *v) { // handle Adds, SExts, Constant Vectors if (llvm::BinaryOperator *bop = llvm::dyn_cast(v)) { if (bop->getOpcode() == llvm::Instruction::Add) { - return lIs32BitSafeHelper(bop->getOperand(0)) + return lIs32BitSafeHelper(bop->getOperand(0)) && lIs32BitSafeHelper(bop->getOperand(1)); } return false; @@ -2752,7 +2758,8 @@ lImproveMaskedStore(llvm::CallInst *callInst) { lCopyMetadata(lvalue, callInst); llvm::Instruction *store = new llvm::StoreInst(rvalue, lvalue, false /* not volatile */, - g->opt.forceAlignedMemory ? 0 : info->align); + g->opt.forceAlignedMemory ? + g->target->getNativeVectorAlignment() : info->align); lCopyMetadata(store, callInst); llvm::ReplaceInstWithInst(callInst, store); return true; @@ -2815,7 +2822,8 @@ lImproveMaskedLoad(llvm::CallInst *callInst, callInst); llvm::Instruction *load = new llvm::LoadInst(ptr, callInst->getName(), false /* not volatile */, - g->opt.forceAlignedMemory ? 0 : info->align, + g->opt.forceAlignedMemory ? 
+ g->target->getNativeVectorAlignment() : info->align, (llvm::Instruction *)NULL); lCopyMetadata(load, callInst); llvm::ReplaceInstWithInst(callInst, load); @@ -3220,6 +3228,9 @@ lEmitLoads(llvm::Value *basePtr, std::vector &loadOps, } case 4: { // 4-wide vector load + if (g->opt.forceAlignedMemory) { + align = g->target->getNativeVectorAlignment(); + } llvm::VectorType *vt = llvm::VectorType::get(LLVMTypes::Int32Type, 4); loadOps[i].load = lGEPAndLoad(basePtr, start, align, @@ -3228,6 +3239,9 @@ lEmitLoads(llvm::Value *basePtr, std::vector &loadOps, } case 8: { // 8-wide vector load + if (g->opt.forceAlignedMemory) { + align = g->target->getNativeVectorAlignment(); + } llvm::VectorType *vt = llvm::VectorType::get(LLVMTypes::Int32Type, 8); loadOps[i].load = lGEPAndLoad(basePtr, start, align, @@ -4961,7 +4975,7 @@ bool ReplaceStdlibShiftPass::runOnBasicBlock(llvm::BasicBlock &bb) { DEBUG_START_PASS("ReplaceStdlibShiftPass"); bool modifiedAny = false; - + llvm::Function *shifts[6]; shifts[0] = m->module->getFunction("__shift_i8"); shifts[1] = m->module->getFunction("__shift_i16"); @@ -4992,19 +5006,19 @@ ReplaceStdlibShiftPass::runOnBasicBlock(llvm::BasicBlock &bb) { } llvm::Value *shuffleIdxs = LLVMInt32Vector(shuffleVals); llvm::Value *zeroVec = llvm::ConstantAggregateZero::get(shiftedVec->getType()); - llvm::Value *shuffle = new llvm::ShuffleVectorInst(shiftedVec, zeroVec, + llvm::Value *shuffle = new llvm::ShuffleVectorInst(shiftedVec, zeroVec, shuffleIdxs, "vecShift", ci); ci->replaceAllUsesWith(shuffle); modifiedAny = true; delete [] shuffleVals; } else { - PerformanceWarning(SourcePos(), "Stdlib shift() called without constant shift amount."); + PerformanceWarning(SourcePos(), "Stdlib shift() called without constant shift amount."); } } } } } - + DEBUG_END_PASS("ReplaceStdlibShiftPass"); return modifiedAny; @@ -5015,3 +5029,185 @@ static llvm::Pass * CreateReplaceStdlibShiftPass() { return new ReplaceStdlibShiftPass(); } + + + 
+/////////////////////////////////////////////////////////////////////////////// +// FixBooleanSelect +// +// The problem is that in LLVM 3.3, optimizer doesn't like +// the following instruction sequence: +// %cmp = fcmp olt <8 x float> %a, %b +// %sext_cmp = sext <8 x i1> %cmp to <8 x i32> +// %new_mask = and <8 x i32> %sext_cmp, %mask +// and optimizes it to the following: +// %cmp = fcmp olt <8 x float> %a, %b +// %cond = select <8 x i1> %cmp, <8 x i32> %mask, <8 x i32> zeroinitializer +// +// It wouldn't be a problem if codegen produced good code for it. But it +// doesn't, especially for vectors larger than native vectors. +// +// This optimization reverts this pattern and should be the last one before +// code gen. +// +// Note that this problem was introduced in LLVM 3.3. But in LLVM 3.4 it was +// fixed. See commit r194542. +// +// After LLVM 3.3 this optimization should probably stay for experimental +// purposes and code should be compared with and without this optimization from +// time to time to make sure that LLVM does right thing. 
+/////////////////////////////////////////////////////////////////////////////// + +class FixBooleanSelectPass : public llvm::FunctionPass { +public: + static char ID; + FixBooleanSelectPass() :FunctionPass(ID) {} + + const char *getPassName() const { return "Resolve \"replace extract insert chains\""; } + bool runOnFunction(llvm::Function &F); + +private: + llvm::Instruction* fixSelect(llvm::SelectInst* sel, llvm::SExtInst* sext); +}; + +char FixBooleanSelectPass::ID = 0; + +llvm::Instruction* FixBooleanSelectPass::fixSelect(llvm::SelectInst* sel, llvm::SExtInst* sext) { + // Select instruction result type and its integer equivalent + llvm::VectorType *orig_type = llvm::dyn_cast(sel->getType()); + llvm::VectorType *int_type = llvm::VectorType::getInteger(orig_type); + + // Result value and optional pointer to instruction to delete + llvm::Instruction *result = 0, *optional_to_delete = 0; + + // It can be vector of integers or vector of floating point values. + if (orig_type->getElementType()->isIntegerTy()) { + // Generate sext+and, remove select. + result = llvm::BinaryOperator::CreateAnd(sext, sel->getTrueValue(), "and_mask", sel); + } else { + llvm::BitCastInst* bc = llvm::dyn_cast(sel->getTrueValue()); + + if (bc && bc->hasOneUse() && bc->getSrcTy()->isIntOrIntVectorTy() && bc->getSrcTy()->isVectorTy() && + llvm::isa(bc->getOperand(0)) && + llvm::dyn_cast(bc->getOperand(0))->getParent() == sel->getParent()) { + // Bitcast is casting form integer type, it's operand is instruction, which is located in the same basic block (otherwise it's unsafe to use it). 
+ // bitcast+select => sext+and+bicast + // Create and + llvm::BinaryOperator* and_inst = llvm::BinaryOperator::CreateAnd(sext, bc->getOperand(0), "and_mask", sel); + // Bitcast back to original type + result = new llvm::BitCastInst(and_inst, sel->getType(), "bitcast_mask_out", sel); + // Original bitcast will be removed + optional_to_delete = bc; + } else { + // General case: select => bitcast+sext+and+bitcast + // Bitcast + llvm::BitCastInst* bc_in = new llvm::BitCastInst(sel->getTrueValue(), int_type, "bitcast_mask_in", sel); + // And + llvm::BinaryOperator* and_inst = llvm::BinaryOperator::CreateAnd(sext, bc_in, "and_mask", sel); + // Bitcast back to original type + result = new llvm::BitCastInst(and_inst, sel->getType(), "bitcast_mask_out", sel); + } + } + + // Done, finalize. + sel->replaceAllUsesWith(result); + sel->eraseFromParent(); + if (optional_to_delete) { + optional_to_delete->eraseFromParent(); + } + + return result; +} + +bool +FixBooleanSelectPass::runOnFunction(llvm::Function &F) { + bool modifiedAny = false; + + // LLVM 3.3 only +#if defined(LLVM_3_3) + + for (llvm::Function::iterator I = F.begin(), E = F.end(); + I != E; ++I) { + llvm::BasicBlock* bb = &*I; + for (llvm::BasicBlock::iterator iter = bb->begin(), e = bb->end(); iter != e; ++iter) { + llvm::Instruction *inst = &*iter; + + llvm::CmpInst *cmp = llvm::dyn_cast(inst); + + if (cmp && + cmp->getType()->isVectorTy() && + cmp->getType()->getVectorElementType()->isIntegerTy(1)) { + + // Search for select instruction uses. + int selects = 0; + llvm::VectorType* sext_type = 0; + for (llvm::Instruction::use_iterator it=cmp->use_begin(); it!=cmp->use_end(); ++it ) { + llvm::SelectInst* sel = llvm::dyn_cast(*it); + if (sel && + sel->getType()->isVectorTy() && + sel->getType()->getScalarSizeInBits() > 1) { + selects++; + // We pick the first one, but typical case when all select types are the same. 
+ sext_type = llvm::dyn_cast(sel->getType()); + break; + } + } + if (selects == 0) { + continue; + } + // Get an integer equivalent, if it's not yet an integer. + sext_type = llvm::VectorType::getInteger(sext_type); + + // Do transformation + llvm::BasicBlock::iterator iter_copy=iter; + llvm::Instruction* next_inst = &*(++iter_copy); + // Create or reuse sext + llvm::SExtInst* sext = llvm::dyn_cast(next_inst); + if (sext && + sext->getOperand(0) == cmp && + sext->getDestTy() == sext_type) { + // This sext can be reused + } else { + if (next_inst) { + sext = new llvm::SExtInst(cmp, sext_type, "sext_cmp", next_inst); + } else { + sext = new llvm::SExtInst(cmp, sext_type, "sext_cmp", bb); + } + } + + // Walk and fix selects + std::vector sel_uses; + for (llvm::Instruction::use_iterator it=cmp->use_begin(); it!=cmp->use_end(); ++it) { + llvm::SelectInst* sel = llvm::dyn_cast(*it); + if (sel && + sel->getType()->getScalarSizeInBits() == sext_type->getScalarSizeInBits()) { + + // Check that second operand is zero. 
+ llvm::Constant* false_cond = llvm::dyn_cast(sel->getFalseValue()); + if (false_cond && + false_cond->isZeroValue()) { + sel_uses.push_back(sel); + modifiedAny = true; + } + } + } + + for (int i=0; i> " + perf_temp + "_ref" ex_command = "./test " + command + " >> " + perf_temp + "_test" @@ -487,8 +494,8 @@ def perf(options1, args): else: ex_command_ref = "x64\\Release\\ref.exe " + command + " >> " + perf_temp + "_ref" ex_command = "x64\\Release1\\test.exe " + command + " >> " + perf_temp + "_test" - bu_command_ref = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=ref /p:ISPC_compiler=ispc_ref /t:rebuild >> " + build_log - bu_command = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=test /p:ISPC_compiler=ispc /t:rebuild >> " + build_log + bu_command_ref = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=ref /p:ISPC_compiler=ispc_ref " + Target_out + " /t:rebuild >> " + build_log + bu_command = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=test /p:ISPC_compiler=ispc " + Target_out + " /t:rebuild >> " + build_log re_command = "msbuild /t:clean >> " + build_log commands = [ex_command, bu_command, ex_command_ref, bu_command_ref, re_command] # parsing config parameters diff --git a/run_tests.py b/run_tests.py index 506d37a5..e6429861 100755 --- a/run_tests.py +++ b/run_tests.py @@ -452,9 +452,9 @@ def verify(): f_lines = f.readlines() f.close() check = [["g++", "clang++", "cl"],["-O0", "-O2"],["x86","x86-64"], - ["Linux","Windows","Mac"],["LLVM 3.1","LLVM 3.2","LLVM 3.3","LLVM head"], + ["Linux","Windows","Mac"],["LLVM 3.1","LLVM 3.2","LLVM 3.3","LLVM 3.4","LLVM trunk"], ["sse2-i32x4", "sse2-i32x8", "sse4-i32x4", "sse4-i32x8", "sse4-i16x8", - "sse4-i8x16", "avx1-i32x8", "avx1-i32x16", "avx1-i64x4", "avx1.1-i32x8", + "sse4-i8x16", "avx1-i32x4" "avx1-i32x8", "avx1-i32x16", "avx1-i64x4", "avx1.1-i32x8", "avx1.1-i32x16", 
"avx1.1-i64x4", "avx2-i32x8", "avx2-i32x16", "avx2-i64x4", "generic-1", "generic-4", "generic-8", "generic-16", "generic-32", "generic-64"]]