python regex-based preprocessor proof of concept

Ignore llvm build directory
Merge pull request #1264 from dbabokin/attributelist
2017-04-18 22:28:48 -04:00 · 2017-04-11 14:08:00 -04:00 · 2017-03-28 17:02:28 -07:00 · 2017-03-28 16:58:49 -07:00 · 2017-03-22 13:29:06 -07:00 · 2017-03-22 13:27:26 -07:00
30 changed files with 959 additions and 318 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -4,11 +4,15 @@ depend
 ispc
 ispc_test
 ispc_ref
+llvm/
 objs
 docs/doxygen
 docs/*.html
 tests*/*cpp
 tests*/*run
+tests*/*.o
+tests_ispcpp/*.h
+tests_ispcpp/*pre*
 logs/
 notify_log.log
 alloy_results_*
--- a/alloy.py
+++ b/alloy.py
@@ -33,6 +33,8 @@

 # // Author: Filippov Ilia

+import re
+
 def tail_and_save(file_in, file_out, tail = 100):    
    with open(file_in, 'r') as f_in:
        lines = f_in.readlines()[-tail:]
@@ -91,6 +93,7 @@ def check_LLVM(which_LLVM):
    return answer

 def try_do_LLVM(text, command, from_validation):
+    print_debug("Command line: "+command+"\n", True, alloy_build)
    if from_validation == True:
        text = text + "\n"
    print_debug("Trying to " + text, from_validation, alloy_build)
@@ -108,7 +111,77 @@ def try_do_LLVM(text, command, from_validation):
        error("can't " + text, 1)
    print_debug("DONE.\n", from_validation, alloy_build)

-def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, from_validation, force, make, gcc_toolchain_path):
+def checkout_LLVM(component, use_git, version_LLVM, revision, target_dir, from_validation):
+    """Fetch the sources of one LLVM project component into target_dir.
+
+    component       -- "llvm", "clang", "libcxx", "clang-tools-extra" or "compiler-rt".
+    use_git         -- when true, clone from the git mirror; otherwise check out from svn.
+    version_LLVM    -- "trunk" or an underscore-separated release ("3_2" ... "4_0").
+    revision        -- string inserted verbatim between "svn co" and the repository URL;
+                       not used for git.
+    target_dir      -- directory the sources are checked out into.
+    from_validation -- forwarded unchanged to try_do_LLVM.
+
+    An unknown component or version is reported through error(..., 1).
+    The current working directory on return is the same as on entry.
+    """
+    # Identify the component
+    GIT_REPO_BASE="http://llvm.org/git/"
+    # Alternative mirror: https://github.com/llvm-mirror/
+    SVN_REPO_BASE="http://llvm.org/svn/llvm-project/"
+    # Component name -> directory name in the svn repository. Only clang differs
+    # (it lives under "cfe"); the git repository is always "<component>.git".
+    svn_dirs = {
+        "llvm":              "llvm",
+        "clang":             "cfe",
+        "libcxx":            "libcxx",
+        "clang-tools-extra": "clang-tools-extra",
+        "compiler-rt":       "compiler-rt",
+    }
+    if component not in svn_dirs:
+        # NOTE(review): error() with code 1 is expected not to return - confirm.
+        error("Trying to checkout unidentified component: " + component, 1)
+
+    # Identify the version: version -> (path inside the svn repository, git branch)
+    versions = {
+        "trunk": ("trunk",                      "master"),
+        "4_0":   ("branches/release_40",        "release_40"),
+        "3_9":   ("tags/RELEASE_390/final",     "release_39"),
+        "3_8":   ("tags/RELEASE_381/final",     "release_38"),
+        "3_7":   ("tags/RELEASE_370/final",     "release_37"),
+        "3_6":   ("tags/RELEASE_362/final",     "release_36"),
+        "3_5":   ("tags/RELEASE_351/final",     "release_35"),
+        "3_4":   ("tags/RELEASE_34/dot2-final", "release_34"),
+        "3_3":   ("tags/RELEASE_33/final",      "release_33"),
+        "3_2":   ("tags/RELEASE_32/final",      "release_32"),
+    }
+    if version_LLVM not in versions:
+        error("Unsupported llvm version: " + version_LLVM, 1)
+
+    SVN_REPO = SVN_REPO_BASE + svn_dirs[component] + "/"
+    GIT_REPO = GIT_REPO_BASE + component + ".git"
+    SVN_PATH, GIT_BRANCH = versions[version_LLVM]
+
+    if use_git:
+        try_do_LLVM("clone "+component+" from "+GIT_REPO+" to "+target_dir+" ",
+                    "git clone "+GIT_REPO+" "+target_dir,
+                    from_validation)
+        if GIT_BRANCH != "master":
+            # The checkout has to run inside the clone. Remember where we came from
+            # instead of assuming ".." leads back: that only holds when target_dir
+            # is a single-level relative path.
+            previous_dir = os.getcwd()
+            os.chdir(target_dir)
+            try:
+                # Create and switch to a local branch starting at the remote release branch.
+                try_do_LLVM("switch to "+GIT_BRANCH+" branch ",
+                            "git checkout -b "+GIT_BRANCH+" remotes/origin/"+GIT_BRANCH, from_validation)
+            finally:
+                os.chdir(previous_dir)
+    else:
+        try_do_LLVM("load "+component+" from "+SVN_REPO+SVN_PATH+" ",
+                    "svn co "+revision+" "+SVN_REPO+SVN_PATH+" "+target_dir,
+                    from_validation)
+
+def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, from_validation, force, make, gcc_toolchain_path, use_git):
    print_debug("Building LLVM. Version: " + version_LLVM + ". ", from_validation, alloy_build)
    if revision != "":
        print_debug("Revision: " + revision + ".\n", from_validation, alloy_build)
@@ -119,32 +192,11 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
    llvm_home = os.environ["LLVM_HOME"]
    
    make_sure_dir_exists(llvm_home)
+
+    FOLDER_NAME=version_LLVM
+    version_LLVM = re.sub('\.', '_', version_LLVM)
    
    os.chdir(llvm_home)
-    FOLDER_NAME=version_LLVM
-    if  version_LLVM == "trunk":
-        SVN_PATH="trunk"
-    if  version_LLVM == "3.8":
-        SVN_PATH="tags/RELEASE_380/final"
-        version_LLVM = "3_8"
-    if  version_LLVM == "3.7":
-        SVN_PATH="tags/RELEASE_370/final"
-        version_LLVM = "3_7"
-    if  version_LLVM == "3.6":
-        SVN_PATH="tags/RELEASE_362/final"
-        version_LLVM = "3_6"
-    if  version_LLVM == "3.5":
-        SVN_PATH="tags/RELEASE_351/final"
-        version_LLVM = "3_5"
-    if  version_LLVM == "3.4":
-        SVN_PATH="tags/RELEASE_34/dot2-final"
-        version_LLVM = "3_4"
-    if  version_LLVM == "3.3":
-        SVN_PATH="tags/RELEASE_33/final"
-        version_LLVM = "3_3"
-    if  version_LLVM == "3.2":
-        SVN_PATH="tags/RELEASE_32/final"
-        version_LLVM = "3_2"
    if revision != "":
        FOLDER_NAME = FOLDER_NAME + "_" + revision
        revision = "-" + revision
@@ -173,7 +225,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
            if os.path.exists(os.path.join(path, "xcrun")):
                found_xcrun = True
        if found_xcrun:
-            mac_system_root = " --with-default-sysroot=`xcrun --show-sdk-path`"
+            mac_system_root = "`xcrun --show-sdk-path`"
        else:
            error("Can't find XCode (xcrun tool) - it's required on MacOS 10.9 and newer", 1)

@@ -184,13 +236,9 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
        llvm_home + "\n", from_validation, alloy_build)
    # load llvm
    if tarball == "":
-        try_do_LLVM("load LLVM from http://llvm.org/svn/llvm-project/llvm/" + SVN_PATH + " ",
-                    "svn co " + revision + " http://llvm.org/svn/llvm-project/llvm/" + SVN_PATH + " " + LLVM_SRC,
-                    from_validation)
+        checkout_LLVM("llvm", options.use_git, version_LLVM, revision, LLVM_SRC, from_validation)
        os.chdir(LLVM_SRC + "/tools")
-        try_do_LLVM("load clang from http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " ",
-                    "svn co " + revision + " http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " clang",
-                    from_validation)
+        checkout_LLVM("clang", options.use_git, version_LLVM, revision, "clang", from_validation)
        os.chdir("..")
        if current_OS == "MacOS" and int(current_OS_version.split(".")[0]) >= 13:
            # Starting with MacOS 10.9 Maverics, the system doesn't contain headers for standard C++ library and
@@ -202,19 +250,13 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
            # to the linker explicitly (either through command line or environment variables). So we are not doing it
            # currently to make the build process easier.
            os.chdir("projects")
-            try_do_LLVM("load libcxx http://llvm.org/svn/llvm-project/libcxx/" + SVN_PATH + " ",
-                    "svn co " + revision + " http://llvm.org/svn/llvm-project/libcxx/" + SVN_PATH + " libcxx",
-                    from_validation)
+            checkout_LLVM("libcxx", options.use_git, version_LLVM, revision, "libcxx", from_validation)
            os.chdir("..")
        if extra == True:
            os.chdir("tools/clang/tools")
-            try_do_LLVM("load extra clang extra tools ",
-                    "svn co " + revision + " http://llvm.org/svn/llvm-project/clang-tools-extra/" + SVN_PATH + " extra",
-                    from_validation)
+            checkout_LLVM("clang-tools-extra", options.use_git, version_LLVM, revision, "extra", from_validation)
            os.chdir("../../../projects")
-            try_do_LLVM("load extra clang compiler-rt ",
-                    "svn co " + revision + " http://llvm.org/svn/llvm-project/compiler-rt/" + SVN_PATH + " compiler-rt",
-                    from_validation)
+            checkout_LLVM("compiler-rt", options.use_git, version_LLVM, revision, "compiler-rt", from_validation)
            os.chdir("..")
    else:
        tar = tarball.split(" ")
@@ -249,7 +291,6 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
        os.makedirs(LLVM_BIN_selfbuild)
        os.chdir(LLVM_BUILD_selfbuild)
        if  version_LLVM not in LLVM_configure_capable:
-            # TODO: mac_root
            try_do_LLVM("configure release version for selfbuild ",
                    "cmake -G Unix\ Makefiles" + " -DCMAKE_EXPORT_COMPILE_COMMANDS=ON" +
                    "  -DCMAKE_INSTALL_PREFIX=" + llvm_home + "/" + LLVM_BIN_selfbuild +
@@ -258,6 +299,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
                    (("  -DGCC_INSTALL_PREFIX=" + gcc_toolchain_path) if gcc_toolchain_path != "" else "") +
                    (("  -DCMAKE_C_COMPILER=" + gcc_toolchain_path+"/bin/gcc") if gcc_toolchain_path != "" else "") +
                    (("  -DCMAKE_CXX_COMPILER=" + gcc_toolchain_path+"/bin/g++") if gcc_toolchain_path != "" else "") +
+                    (("  -DDEFAULT_SYSROOT=" + mac_system_root) if mac_system_root != "" else "") +
                    "  -DLLVM_TARGETS_TO_BUILD=NVPTX\;X86" +
                    " ../" + LLVM_SRC,
                    from_validation)
@@ -269,7 +311,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
                        LLVM_BIN_selfbuild + " --enable-optimized" +
                        " --enable-targets=x86,x86_64,nvptx" +
                        ((" --with-gcc-toolchain=" + gcc_toolchain_path) if gcc_toolchain_path != "" else "") +
-                        mac_system_root,
+                        ((" --with-default-sysroot=" + mac_system_root) if mac_system_root != "" else ""),
                        from_validation)
            selfbuild_compiler = ("CC=" +llvm_home+ "/" + LLVM_BIN_selfbuild + "/bin/clang " +
                                  "CXX="+llvm_home+ "/" + LLVM_BIN_selfbuild + "/bin/clang++ ")
@@ -285,7 +327,6 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
    if debug == False:
        if current_OS != "Windows":
            if  version_LLVM not in LLVM_configure_capable:
-                # TODO: mac_root
                try_do_LLVM("configure release version ",
                        "cmake -G Unix\ Makefiles" + " -DCMAKE_EXPORT_COMPILE_COMMANDS=ON" +
                        selfbuild_compiler +
@@ -295,6 +336,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
                        (("  -DGCC_INSTALL_PREFIX=" + gcc_toolchain_path) if gcc_toolchain_path != "" else "") +
                        (("  -DCMAKE_C_COMPILER=" + gcc_toolchain_path+"/bin/gcc") if gcc_toolchain_path != "" and selfbuild_compiler == "" else "") +
                        (("  -DCMAKE_CXX_COMPILER=" + gcc_toolchain_path+"/bin/g++") if gcc_toolchain_path != "" and selfbuild_compiler == "" else "") +
+                        (("  -DDEFAULT_SYSROOT=" + mac_system_root) if mac_system_root != "" else "") +
                        "  -DLLVM_TARGETS_TO_BUILD=NVPTX\;X86" +
                        " ../" + LLVM_SRC,
                        from_validation)
@@ -304,7 +346,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
                        LLVM_BIN + " --enable-optimized" +
                        " --enable-targets=x86,x86_64,nvptx" +
                        ((" --with-gcc-toolchain=" + gcc_toolchain_path) if gcc_toolchain_path != "" else "") +
-                        mac_system_root,
+                        ((" --with-default-sysroot=" + mac_system_root) if mac_system_root != "" else ""),
                        from_validation)
        else:
            try_do_LLVM("configure release version ",
@@ -313,7 +355,6 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
                    from_validation)
    else:
        if  version_LLVM not in LLVM_configure_capable:
-            # TODO: mac_root
            try_do_LLVM("configure debug version ",
                    "cmake -G Unix\ Makefiles" + " -DCMAKE_EXPORT_COMPILE_COMMANDS=ON" +
                    selfbuild_compiler +
@@ -323,6 +364,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
                    (("  -DGCC_INSTALL_PREFIX=" + gcc_toolchain_path) if gcc_toolchain_path != "" else "") +
                    (("  -DCMAKE_C_COMPILER=" + gcc_toolchain_path+"/bin/gcc") if gcc_toolchain_path != "" and selfbuild_compiler == "" else "") +
                    (("  -DCMAKE_CXX_COMPILER=" + gcc_toolchain_path+"/bin/g++") if gcc_toolchain_path != "" and selfbuild_compiler == "" else "") +
+                    (("  -DDEFAULT_SYSROOT=" + mac_system_root) if mac_system_root != "" else "") +
                    "  -DLLVM_TARGETS_TO_BUILD=NVPTX\;X86" +
                    " ../" + LLVM_SRC,
                    from_validation)
@@ -332,7 +374,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
                        " --enable-debug-runtime --enable-debug-symbols --enable-keep-symbols" +
                        " --enable-targets=x86,x86_64,nvptx" +
                        ((" --with-gcc-toolchain=" + gcc_toolchain_path) if gcc_toolchain_path != "" else "") +
-                        mac_system_root,
+                        ((" --with-default-sysroot=" + mac_system_root) if mac_system_root != "" else ""),
                        from_validation)
    # building llvm
    if current_OS != "Windows":
@@ -352,6 +394,8 @@ def unsupported_llvm_targets(LLVM_VERSION):
                       "3.7":["avx512skx-i32x16"],
                       "3.8":[],
                       "3.9":[],
+                       "4.0":[],
+                       "5.0":[],
                       "trunk":[]}   
    return prohibited_list[LLVM_VERSION]

@@ -476,8 +520,12 @@ def build_ispc(version_LLVM, make):
            temp = "3_7"
        if version_LLVM == "3.8":
            temp = "3_8"
-        if version_LLVM == "trunk":
+        if version_LLVM == "3.9":
            temp = "3_9"
+        if version_LLVM == "4.0":
+            temp = "4_0"
+        if version_LLVM == "trunk":
+            temp = "5_0"
        os.environ["LLVM_VERSION"] = "LLVM_" + temp
        try_do_LLVM("clean ISPC for building", "msbuild ispc.vcxproj /t:clean", True)
        try_do_LLVM("build ISPC with LLVM version " + version_LLVM + " ", "msbuild ispc.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release /t:rebuild", True)
@@ -617,7 +665,7 @@ def validation_run(only, only_targets, reference_branch, number, notify, update,
            archs.append("x86-64")
        if "native" in only:
            sde_targets_t = []
-        for i in ["3.2", "3.3", "3.4", "3.5", "3.6", "3.7", "3.8", "trunk"]:
+        for i in ["3.2", "3.3", "3.4", "3.5", "3.6", "3.7", "3.8", "3.9", "4.0", "trunk"]:
            if i in only:
                LLVM.append(i)
        if "current" in only:
@@ -675,7 +723,7 @@ def validation_run(only, only_targets, reference_branch, number, notify, update,
        gen_archs = ["x86-64"]
        need_LLVM = check_LLVM(LLVM)
        for i in range(0,len(need_LLVM)):
-            build_LLVM(need_LLVM[i], "", "", "", False, False, False, True, False, make, options.gcc_toolchain_path)
+            build_LLVM(need_LLVM[i], "", "", "", False, False, False, True, False, make, options.gcc_toolchain_path, False)
 # begin validation run for stabitily
        common.remove_if_exists(stability.in_file)
        R = [[[],[]],[[],[]],[[],[]],[[],[]]]
@@ -789,7 +837,7 @@ def validation_run(only, only_targets, reference_branch, number, notify, update,
 # prepare newest LLVM
        need_LLVM = check_LLVM([newest_LLVM])
        if len(need_LLVM) != 0:
-            build_LLVM(need_LLVM[0], "", "", "", False, False, False, True, False, make, options.gcc_toolchain_path)
+            build_LLVM(need_LLVM[0], "", "", "", False, False, False, True, False, make, options.gcc_toolchain_path, options.use_git)
        if perf_llvm == False:
            # prepare reference point. build both test and reference compilers
            try_do_LLVM("apply git", "git branch", True)
@@ -903,7 +951,7 @@ def Main():
        if os.environ.get("SMTP_ISPC") == None:
            error("you have no SMTP_ISPC in your environment for option notify", 1)
    if options.only != "":
-        test_only_r = " 3.2 3.3 3.4 3.5 3.6 3.7 3.8 trunk current build stability performance x86 x86-64 x86_64 -O0 -O2 native debug nodebug "
+        test_only_r = " 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9 4.0 trunk current build stability performance x86 x86-64 x86_64 -O0 -O2 native debug nodebug "
        test_only = options.only.split(" ")
        for iterator in test_only:
            if not (" " + iterator + " " in test_only_r):
@@ -929,11 +977,14 @@ def Main():
    if options.perf_llvm == True:
        if options.branch == "master":
            options.branch = "trunk"
+    if options.use_git and options.revision != "":
+        error("--revision is not supported with --git", 1)
+
    try:
        start_time = time.time()
        if options.build_llvm:
            build_LLVM(options.version, options.revision, options.folder, options.tarball,
-                    options.debug, options.selfbuild, options.extra, False, options.force, make, options.gcc_toolchain_path)
+                    options.debug, options.selfbuild, options.extra, False, options.force, make, options.gcc_toolchain_path, options.use_git)
        if options.validation_run:
            validation_run(options.only, options.only_targets, options.branch,
                    options.number_for_performance, options.notify, options.update, int(options.speed),
@@ -1013,13 +1064,13 @@ if __name__ == '__main__':
    llvm_group = OptionGroup(parser, "Options for building LLVM",
                    "These options must be used with -b option.")
    llvm_group.add_option('--version', dest='version',
-        help='version of llvm to build: 3.2 3.3 3.4 3.5 3.6 3.7 3.8 trunk. Default: trunk', default="trunk")
+        help='version of llvm to build: 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9 4.0 trunk. Default: trunk', default="trunk")
    llvm_group.add_option('--with-gcc-toolchain', dest='gcc_toolchain_path',
         help='GCC install dir to use when building clang. It is important to set when ' +
         'you have alternative gcc installation. Note that otherwise gcc from standard ' +
         'location will be used, not from your PATH', default="")
    llvm_group.add_option('--revision', dest='revision',
-        help='revision of llvm to build in format r172870', default="")
+        help='revision of llvm to build in format r172870 (not supported with --git)', default="")
    llvm_group.add_option('--debug', dest='debug',
        help='debug build of LLVM?', default=False, action="store_true")
    llvm_group.add_option('--folder', dest='folder',
@@ -1032,6 +1083,8 @@ if __name__ == '__main__':
        help='rebuild LLVM', default=False, action='store_true')
    llvm_group.add_option('--extra', dest='extra',
        help='load extra clang tools', default=False, action='store_true')
+    llvm_group.add_option('--git', dest='use_git',
+        help='use git llvm repository instead of svn', default=False, action='store_true')
    parser.add_option_group(llvm_group)
    # options for activity "validation run"
    run_group = OptionGroup(parser, "Options for validation run",
@@ -1054,7 +1107,7 @@ if __name__ == '__main__':
    run_group.add_option('--only', dest='only',
        help='set types of tests. Possible values:\n' + 
            '-O0, -O2, x86, x86-64, stability (test only stability), performance (test only performance),\n' +
-            'build (only build with different LLVM), 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, trunk, native (do not use SDE),\n' +
+            'build (only build with different LLVM), 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, trunk, native (do not use SDE),\n' +
            'current (do not rebuild ISPC), debug (only with debug info), nodebug (only without debug info, default).',
            default="")
    run_group.add_option('--perf_LLVM', dest='perf_llvm',
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -72,7 +72,11 @@
 #include <llvm/Target/TargetMachine.h>
 #include <llvm/ADT/Triple.h>
 #include <llvm/Support/MemoryBuffer.h>
-#include <llvm/Bitcode/ReaderWriter.h>
+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
+    #include <llvm/Bitcode/ReaderWriter.h>
+#else
+    #include <llvm/Bitcode/BitcodeReader.h>
+#endif

 extern int yyparse();
 struct yy_buffer_state;
@@ -800,7 +804,13 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
    llvm::MemoryBufferRef bcBuf = llvm::MemoryBuffer::getMemBuffer(sb)->getMemBufferRef();
 #endif

-#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_7 // LLVM 3.7+
+#if ISPC_LLVM_VERSION >= ISPC_LLVM_4_0 // LLVM 4.0+
+    llvm::Expected<std::unique_ptr<llvm::Module>> ModuleOrErr = llvm::parseBitcodeFile(bcBuf, *g->ctx);
+    if (!ModuleOrErr) {
+        Error(SourcePos(), "Error parsing stdlib bitcode: %s", toString(ModuleOrErr.takeError()).c_str());
+    } else {
+        llvm::Module *bcModule = ModuleOrErr.get().release();
+#elif ISPC_LLVM_VERSION >= ISPC_LLVM_3_7 // LLVM 3.7+
    llvm::ErrorOr<std::unique_ptr<llvm::Module>> ModuleOrErr = llvm::parseBitcodeFile(bcBuf, *g->ctx);
    if (std::error_code EC = ModuleOrErr.getError())
        Error(SourcePos(), "Error parsing stdlib bitcode: %s", EC.message().c_str());
@@ -989,7 +999,7 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
                                               diType,
                                               true /* static */,
                                               sym_const_storagePtr);
-#else // LLVM 3.7+
+#elif ISPC_LLVM_VERSION >= ISPC_LLVM_3_7 && ISPC_LLVM_VERSION <= ISPC_LLVM_3_9 // LLVM 3.7 - 3.9
    llvm::Constant *sym_const_storagePtr = llvm::dyn_cast<llvm::Constant>(sym->storagePtr);
    Assert(sym_const_storagePtr);
    m->diBuilder->createGlobalVariable(
@@ -1001,6 +1011,17 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
              diType,
              true /* static */,
              sym_const_storagePtr);
+#else // LLVM 4.0+
+        llvm::GlobalVariable *sym_GV_storagePtr = llvm::dyn_cast<llvm::GlobalVariable>(sym->storagePtr);
+        llvm::DIGlobalVariableExpression *var = m->diBuilder->createGlobalVariableExpression(
+                                              file,
+                                              name,
+                                              name,
+                                              file,
+                                              0 /* line */,
+                                              diType,
+                                              true /* static */);
+        sym_GV_storagePtr->addDebugInfo(var);
 #endif
 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
        Assert(var.Verify());
@@ -1087,7 +1108,7 @@ lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
                                               diType,
                                               false /* static */,
                                               sym->storagePtr);
-#else // LLVM 3.7+
+#elif ISPC_LLVM_VERSION >= ISPC_LLVM_3_7 && ISPC_LLVM_VERSION <= ISPC_LLVM_3_9 // LLVM 3.7 - 3.9
        llvm::Constant *sym_const_storagePtr = llvm::dyn_cast<llvm::Constant>(sym->storagePtr);
        Assert(sym_const_storagePtr);
        m->diBuilder->createGlobalVariable(
@@ -1099,7 +1120,18 @@ lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
                                               diType,
                                               false /* static */,
                                               sym_const_storagePtr);
-#endif       
+#else // LLVM 4.0+
+        llvm::GlobalVariable *sym_GV_storagePtr = llvm::dyn_cast<llvm::GlobalVariable>(sym->storagePtr);
+        llvm::DIGlobalVariableExpression *var = m->diBuilder->createGlobalVariableExpression(
+                                              file,
+                                              sym->name.c_str(),
+                                              sym->name.c_str(),
+                                              file,
+                                              0 /* line */,
+                                              diType,
+                                              false /* static */);
+        sym_GV_storagePtr->addDebugInfo(var);
+#endif
 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
        Assert(var.Verify());
 #else // LLVM 3.7+
--- a/builtins/dispatch.ll
+++ b/builtins/dispatch.ll
@@ -151,6 +151,10 @@ define(`PTR_OP_ARGS',
         LLVM_VERSION, LLVM_3_8,
    ``$1 , $1 *'',
         LLVM_VERSION, LLVM_3_9,
+    ``$1 , $1 *'',
+         LLVM_VERSION, LLVM_4_0,
+    ``$1 , $1 *'',
+         LLVM_VERSION, LLVM_5_0,
    ``$1 , $1 *'',
    ``$1 *''
  )
--- a/builtins/target-avx512-common.ll
+++ b/builtins/target-avx512-common.ll
@@ -617,16 +617,16 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
 }
 ctlztz()

-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
-; or, use the macro to call the 4-wide ones twice with our 8-wide
-; vectors...
-
+;; TODO: should we use masked versions of SVML functions?
 ;; svml

 include(`svml.m4')
-svml_stubs(float,f,WIDTH)
-svml_stubs(double,d,WIDTH)
+svml_declare(float,f16,16)
+svml_define(float,f16,16,f)
+
+;; double precision
+svml_declare(double,8,8)
+svml_define_x(double,8,8,d,16)



--- a/builtins/target-knl.ll
+++ b/builtins/target-knl.ll
@@ -36,6 +36,10 @@ ifelse(LLVM_VERSION, LLVM_3_7,
         LLVM_VERSION, LLVM_3_8,
    `include(`target-avx512-common.ll')',
         LLVM_VERSION, LLVM_3_9,
+    `include(`target-avx512-common.ll')',
+         LLVM_VERSION, LLVM_4_0,
+    `include(`target-avx512-common.ll')',
+         LLVM_VERSION, LLVM_5_0,
    `include(`target-avx512-common.ll')'
  )

@@ -60,6 +64,10 @@ ifelse(LLVM_VERSION, LLVM_3_7,
         LLVM_VERSION, LLVM_3_8,
    rcp_rsqrt_varying_float_knl(),
         LLVM_VERSION, LLVM_3_9,
+    rcp_rsqrt_varying_float_knl(),
+         LLVM_VERSION, LLVM_4_0,
+    rcp_rsqrt_varying_float_knl(),
+         LLVM_VERSION, LLVM_5_0,
    rcp_rsqrt_varying_float_knl()
  )

--- a/builtins/target-neon-16.ll
+++ b/builtins/target-neon-16.ll
@@ -42,12 +42,12 @@ include(`target-neon-common.ll')
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; half conversion routines

-define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
+define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone alwaysinline {
  unary4to8conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v)
  ret <8 x float> %r
 }

-define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
+define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone alwaysinline {
  unary4to8conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v)
  ret <8 x i16> %r
 }
@@ -115,13 +115,13 @@ declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwin
 declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone

 define <WIDTH x float> @__max_varying_float(<WIDTH x float>,
-                                            <WIDTH x float>) nounwind readnone {
+                                            <WIDTH x float>) nounwind readnone alwaysinline {
  binary4to8(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1)
  ret <WIDTH x float> %r
 }

 define <WIDTH x float> @__min_varying_float(<WIDTH x float>,
-                                            <WIDTH x float>) nounwind readnone {
+                                            <WIDTH x float>) nounwind readnone alwaysinline {
  binary4to8(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1)
  ret <WIDTH x float> %r
 }
@@ -131,22 +131,22 @@ declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind read
 declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
 declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

-define <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
+define <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone alwaysinline {
  binary4to8(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1)
  ret <WIDTH x i32> %r
 }

-define <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
+define <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone alwaysinline {
  binary4to8(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1)
  ret <WIDTH x i32> %r
 }

-define <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
+define <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone alwaysinline {
  binary4to8(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1)
  ret <WIDTH x i32> %r
 }

-define <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
+define <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone alwaysinline {
  binary4to8(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1)
  ret <WIDTH x i32> %r
 }
@@ -156,7 +156,7 @@ define <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwin
 declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone
 declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone

-define <WIDTH x float> @__rcp_varying_float(<WIDTH x float> %d) nounwind readnone {
+define <WIDTH x float> @__rcp_varying_float(<WIDTH x float> %d) nounwind readnone alwaysinline {
  unary4to8(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d)
  binary4to8(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0)
  %x1 = fmul <WIDTH x float> %x0, %x0_nr
@@ -168,7 +168,7 @@ define <WIDTH x float> @__rcp_varying_float(<WIDTH x float> %d) nounwind readnon
 declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
 declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone

-define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %d) nounwind readnone {
+define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %d) nounwind readnone alwaysinline {
  unary4to8(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d)
  %x0_2 = fmul <WIDTH x float> %x0, %x0
  binary4to8(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2)
@@ -179,7 +179,7 @@ define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %d) nounwind readn
  ret <WIDTH x float> %x2
 }

-define float @__rsqrt_uniform_float(float) nounwind readnone {
+define float @__rsqrt_uniform_float(float) nounwind readnone alwaysinline {
  %v1 = bitcast float %0 to <1 x float>
  %vs = shufflevector <1 x float> %v1, <1 x float> undef,
          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
@@ -189,7 +189,7 @@ define float @__rsqrt_uniform_float(float) nounwind readnone {
  ret float %r
 }

-define float @__rcp_uniform_float(float) nounwind readnone {
+define float @__rcp_uniform_float(float) nounwind readnone alwaysinline {
  %v1 = bitcast float %0 to <1 x float>
  %vs = shufflevector <1 x float> %v1, <1 x float> undef,
          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
@@ -201,7 +201,7 @@ define float @__rcp_uniform_float(float) nounwind readnone {

 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)

-define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone {
+define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone alwaysinline {
  unary4to8(result, float, @llvm.sqrt.v4f32, %0)
 ;; this returns nan for v=0, which is undesirable..
 ;;  %rsqrt = call <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %0)
@@ -211,7 +211,7 @@ define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone

 declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)

-define <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone {
+define <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone alwaysinline {
  unary4to8(r, double, @llvm.sqrt.v4f64, %0)
  ret <WIDTH x double> %r
 }
@@ -219,7 +219,7 @@ define <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readno
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; reductions

-define i64 @__movmsk(<WIDTH x MASK>) nounwind readnone {
+define i64 @__movmsk(<WIDTH x MASK>) nounwind readnone alwaysinline {
  %and_mask = and <WIDTH x i16> %0,
    <i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128>
  %v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %and_mask)
@@ -288,48 +288,48 @@ define(`neon_reduce', `

 declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone

-define internal float @add_f32(float, float) {
+define internal float @add_f32(float, float) nounwind readnone alwaysinline {
  %r = fadd float %0, %1
  ret float %r
 }

-define internal <WIDTH x float> @__add_varying_float(<WIDTH x float>, <WIDTH x float>) {
+define internal <WIDTH x float> @__add_varying_float(<WIDTH x float>, <WIDTH x float>) nounwind readnone alwaysinline {
  %r = fadd <WIDTH x float> %0, %1
  ret <WIDTH x float> %r
 }

-define float @__reduce_add_float(<WIDTH x float>) nounwind readnone {
+define float @__reduce_add_float(<WIDTH x float>) nounwind readnone alwaysinline {
  neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32)
 }

 declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone

-define internal float @min_f32(float, float) {
+define internal float @min_f32(float, float) nounwind readnone alwaysinline {
  %cmp = fcmp olt float %0, %1
  %r = select i1 %cmp, float %0, float %1
  ret float %r
 }

-define float @__reduce_min_float(<WIDTH x float>) nounwind readnone {
+define float @__reduce_min_float(<WIDTH x float>) nounwind readnone alwaysinline {
  neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32)
 }

 declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone

-define internal float @max_f32(float, float) {
+define internal float @max_f32(float, float) nounwind readnone alwaysinline {
  %cmp = fcmp ugt float %0, %1
  %r = select i1 %cmp, float %0, float %1
  ret float %r
 }

-define float @__reduce_max_float(<WIDTH x float>) nounwind readnone {
+define float @__reduce_max_float(<WIDTH x float>) nounwind readnone alwaysinline {
  neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32)
 }

 declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone
 declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone

-define i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone {
+define i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone alwaysinline {
  %a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %0)
  %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16)
  %a0 = extractelement <2 x i32> %a32, i32 0
@@ -341,7 +341,7 @@ define i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone {

 declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<WIDTH x i16>)

-define i64 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone {
+define i64 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone alwaysinline {
  %a1 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<WIDTH x i16> %0)
  %a2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a1)
  %aa = extractelement <2 x i64> %a2, i32 0
@@ -352,7 +352,7 @@ define i64 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone {

 declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone

-define i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone {
+define i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone alwaysinline {
  v8tov4(i32, %0, %va, %vb)
  %pa = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va)
  %pb = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb)
@@ -365,53 +365,53 @@ define i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone {

 declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

-define internal i32 @min_si32(i32, i32) {
+define internal i32 @min_si32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp slt i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
 }

-define i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone {
+define i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone alwaysinline {
  neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32)
 }

 declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

-define internal i32 @max_si32(i32, i32) {
+define internal i32 @max_si32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp sgt i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
 }

-define i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone {
+define i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone alwaysinline {
  neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32)
 }

 declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

-define internal i32 @min_ui32(i32, i32) {
+define internal i32 @min_ui32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp ult i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
 }

-define i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone {
- neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpmins.v2i32, @min_ui32)
+define i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone alwaysinline {
+ neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpminu.v2i32, @min_ui32)
 }

 declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

-define internal i32 @max_ui32(i32, i32) {
+define internal i32 @max_ui32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp ugt i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
 }

-define i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone {
- neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxs.v2i32, @max_ui32)
+define i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone alwaysinline {
+ neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxu.v2i32, @max_ui32)
 }

-define double @__reduce_add_double(<WIDTH x double>) nounwind readnone {
+define double @__reduce_add_double(<WIDTH x double>) nounwind readnone alwaysinline {
  v8tov2(double, %0, %v0, %v1, %v2, %v3)
  %v01 = fadd <2 x double> %v0, %v1
  %v23 = fadd <2 x double> %v2, %v3
@@ -422,15 +422,15 @@ define double @__reduce_add_double(<WIDTH x double>) nounwind readnone {
  ret double %m
 }

-define double @__reduce_min_double(<WIDTH x double>) nounwind readnone {
+define double @__reduce_min_double(<WIDTH x double>) nounwind readnone alwaysinline {
  reduce8(double, @__min_varying_double, @__min_uniform_double)
 }

-define double @__reduce_max_double(<WIDTH x double>) nounwind readnone {
+define double @__reduce_max_double(<WIDTH x double>) nounwind readnone alwaysinline {
  reduce8(double, @__max_varying_double, @__max_uniform_double)
 }

-define i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone {
+define i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone alwaysinline {
  v8tov2(i64, %0, %v0, %v1, %v2, %v3)
  %v01 = add <2 x i64> %v0, %v1
  %v23 = add <2 x i64> %v2, %v3
@@ -441,19 +441,19 @@ define i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone {
  ret i64 %m
 }

-define i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone {
+define i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone alwaysinline {
  reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
 }

-define i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone {
+define i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone alwaysinline {
  reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
 }

-define i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone {
+define i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone alwaysinline {
  reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
 }

-define i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone {
+define i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone alwaysinline {
  reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
 }

@@ -462,56 +462,56 @@ define i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone {

 declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone

-define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) nounwind readnone {
+define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) nounwind readnone alwaysinline {
  %r = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %0, <8 x i8> %1)
  ret <8 x i8> %r
 }

 declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone

-define <8 x i8> @__avg_up_int8(<8 x i8>, <8 x i8>) nounwind readnone {
+define <8 x i8> @__avg_up_int8(<8 x i8>, <8 x i8>) nounwind readnone alwaysinline {
  %r = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %0, <8 x i8> %1)
  ret <8 x i8> %r
 }

 declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone

-define <8 x i8> @__avg_down_uint8(<8 x i8>, <8 x i8>) nounwind readnone {
+define <8 x i8> @__avg_down_uint8(<8 x i8>, <8 x i8>) nounwind readnone alwaysinline {
  %r = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %0, <8 x i8> %1)
  ret <8 x i8> %r
 }

 declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone

-define <8 x i8> @__avg_down_int8(<8 x i8>, <8 x i8>) nounwind readnone {
+define <8 x i8> @__avg_down_int8(<8 x i8>, <8 x i8>) nounwind readnone alwaysinline {
  %r = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %0, <8 x i8> %1)
  ret <8 x i8> %r
 }

 declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

-define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) nounwind readnone {
+define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) nounwind readnone alwaysinline {
  %r = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %0, <8 x i16> %1)
  ret <8 x i16> %r
 }

 declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

-define <8 x i16> @__avg_up_int16(<8 x i16>, <8 x i16>) nounwind readnone {
+define <8 x i16> @__avg_up_int16(<8 x i16>, <8 x i16>) nounwind readnone alwaysinline {
  %r = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %0, <8 x i16> %1)
  ret <8 x i16> %r
 }

 declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

-define <8 x i16> @__avg_down_uint16(<8 x i16>, <8 x i16>) nounwind readnone {
+define <8 x i16> @__avg_down_uint16(<8 x i16>, <8 x i16>) nounwind readnone alwaysinline {
  %r = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %0, <8 x i16> %1)
  ret <8 x i16> %r
 }

 declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

-define <8 x i16> @__avg_down_int16(<8 x i16>, <8 x i16>) nounwind readnone {
+define <8 x i16> @__avg_down_int16(<8 x i16>, <8 x i16>) nounwind readnone alwaysinline {
  %r = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %0, <8 x i16> %1)
  ret <8 x i16> %r
 }
--- a/builtins/target-neon-32.ll
+++ b/builtins/target-neon-32.ll
@@ -43,12 +43,12 @@ include(`target-neon-common.ll')
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; half conversion routines

-define <4 x float> @__half_to_float_varying(<4 x i16> %v) nounwind readnone {
+define <4 x float> @__half_to_float_varying(<4 x i16> %v) nounwind readnone alwaysinline {
  %r = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %v)
  ret <4 x float> %r
 }

-define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone {
+define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone alwaysinline {
  %r = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %v)
  ret <4 x i16> %r
 }
@@ -106,13 +106,13 @@ declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwin
 declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone

 define <WIDTH x float> @__max_varying_float(<WIDTH x float>,
-                                            <WIDTH x float>) nounwind readnone {
+                                            <WIDTH x float>) nounwind readnone alwaysinline {
  %r = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %0, <4 x float> %1)
  ret <WIDTH x float> %r
 }

 define <WIDTH x float> @__min_varying_float(<WIDTH x float>,
-                                            <WIDTH x float>) nounwind readnone {
+                                            <WIDTH x float>) nounwind readnone alwaysinline {
  %r = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %0, <4 x float> %1)
  ret <WIDTH x float> %r
 }
@@ -122,22 +122,22 @@ declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind read
 declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
 declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

-define <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
+define <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone alwaysinline {
  %r = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %r
 }

-define <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
+define <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone alwaysinline {
  %r = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %r
 }

-define <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
+define <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone alwaysinline {
  %r = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %r
 }

-define <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
+define <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone alwaysinline {
  %r = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %r
 }
@@ -147,7 +147,7 @@ define <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwin
 declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone
 declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone

-define <WIDTH x float> @__rcp_varying_float(<WIDTH x float> %d) nounwind readnone {
+define <WIDTH x float> @__rcp_varying_float(<WIDTH x float> %d) nounwind readnone alwaysinline {
  %x0 = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %d)
  %x0_nr = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %d, <4 x float> %x0)
  %x1 = fmul <4 x float> %x0, %x0_nr
@@ -159,7 +159,7 @@ define <WIDTH x float> @__rcp_varying_float(<WIDTH x float> %d) nounwind readnon
 declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
 declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone

-define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %d) nounwind readnone {
+define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %d) nounwind readnone alwaysinline {
  %x0 = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %d)
  %x0_2 = fmul <4 x float> %x0, %x0
  %x0_nr = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %d, <4 x float> %x0_2)
@@ -170,7 +170,7 @@ define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %d) nounwind readn
  ret <4 x float> %x2
 }

-define float @__rsqrt_uniform_float(float) nounwind readnone {
+define float @__rsqrt_uniform_float(float) nounwind readnone alwaysinline {
  %v1 = bitcast float %0 to <1 x float>
  %vs = shufflevector <1 x float> %v1, <1 x float> undef,
          <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
@@ -179,7 +179,7 @@ define float @__rsqrt_uniform_float(float) nounwind readnone {
  ret float %r
 }

-define float @__rcp_uniform_float(float) nounwind readnone {
+define float @__rcp_uniform_float(float) nounwind readnone alwaysinline {
  %v1 = bitcast float %0 to <1 x float>
  %vs = shufflevector <1 x float> %v1, <1 x float> undef,
          <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
@@ -190,7 +190,7 @@ define float @__rcp_uniform_float(float) nounwind readnone {

 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)

-define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone {
+define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone alwaysinline {
  %result = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %0)
 ;; this returns nan for v=0, which is undesirable..
 ;;  %rsqrt = call <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %0)
@@ -200,7 +200,7 @@ define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone

 declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)

-define <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone {
+define <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone alwaysinline {
  %r = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %0)
  ret <4 x double> %r
 }
@@ -208,7 +208,7 @@ define <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readno
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; reductions

-define i64 @__movmsk(<4 x MASK>) nounwind readnone {
+define i64 @__movmsk(<4 x MASK>) nounwind readnone alwaysinline {
  %and_mask = and <4 x MASK> %0, <MASK 1, MASK 2, MASK 4, MASK 8>
  %v01 = shufflevector <4 x i32> %and_mask, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %v23 = shufflevector <4 x i32> %and_mask, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -264,42 +264,42 @@ define(`neon_reduce', `

 declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone

-define internal float @add_f32(float, float) {
+define internal float @add_f32(float, float) nounwind readnone alwaysinline {
  %r = fadd float %0, %1
  ret float %r
 }

-define float @__reduce_add_float(<4 x float>) nounwind readnone {
+define float @__reduce_add_float(<4 x float>) nounwind readnone alwaysinline {
  neon_reduce(float, @llvm.arm.neon.vpadd.v2f32, @add_f32)
 }

 declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone

-define internal float @min_f32(float, float) {
+define internal float @min_f32(float, float) nounwind readnone alwaysinline {
  %cmp = fcmp olt float %0, %1
  %r = select i1 %cmp, float %0, float %1
  ret float %r
 }

-define float @__reduce_min_float(<4 x float>) nounwind readnone {
+define float @__reduce_min_float(<4 x float>) nounwind readnone alwaysinline {
  neon_reduce(float, @llvm.arm.neon.vpmins.v2f32, @min_f32)
 }

 declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone

-define internal float @max_f32(float, float) {
+define internal float @max_f32(float, float) nounwind readnone alwaysinline {
  %cmp = fcmp ugt float %0, %1
  %r = select i1 %cmp, float %0, float %1
  ret float %r
 }

-define float @__reduce_max_float(<4 x float>) nounwind readnone {
+define float @__reduce_max_float(<4 x float>) nounwind readnone alwaysinline {
  neon_reduce(float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32)
 }

 declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone

-define i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone {
+define i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone alwaysinline {
  %v8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
  %a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %v8)
@@ -313,7 +313,7 @@ define i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone {

 declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone

-define i32 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone {
+define i32 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone alwaysinline {
  %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %0)
  %a0 = extractelement <2 x i32> %a32, i32 0
  %a1 = extractelement <2 x i32> %a32, i32 1
@@ -323,7 +323,7 @@ define i32 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone {

 declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone

-define i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone {
+define i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone alwaysinline {
  %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %0)
  %a0 = extractelement <2 x i64> %a64, i32 0
  %a1 = extractelement <2 x i64> %a64, i32 1
@@ -333,53 +333,53 @@ define i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone {

 declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

-define internal i32 @min_si32(i32, i32) {
+define internal i32 @min_si32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp slt i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
 }

-define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
+define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone alwaysinline {
  neon_reduce(i32, @llvm.arm.neon.vpmins.v2i32, @min_si32)
 }

 declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

-define internal i32 @max_si32(i32, i32) {
+define internal i32 @max_si32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp sgt i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
 }

-define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
+define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone alwaysinline {
  neon_reduce(i32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32)
 }

 declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

-define internal i32 @min_ui32(i32, i32) {
+define internal i32 @min_ui32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp ult i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
 }

-define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
- neon_reduce(i32, @llvm.arm.neon.vpmins.v2i32, @min_ui32)
+define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone alwaysinline {
+ neon_reduce(i32, @llvm.arm.neon.vpminu.v2i32, @min_ui32)
 }

 declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

-define internal i32 @max_ui32(i32, i32) {
+define internal i32 @max_ui32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp ugt i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
 }

-define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
- neon_reduce(i32, @llvm.arm.neon.vpmaxs.v2i32, @max_ui32)
+define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone alwaysinline {
+ neon_reduce(i32, @llvm.arm.neon.vpmaxu.v2i32, @max_ui32)
 }

-define double @__reduce_add_double(<4 x double>) nounwind readnone {
+define double @__reduce_add_double(<4 x double>) nounwind readnone alwaysinline {
  %v0 = shufflevector <4 x double> %0, <4 x double> undef,
                      <2 x i32> <i32 0, i32 1>
  %v1 = shufflevector <4 x double> %0, <4 x double> undef,
@@ -391,15 +391,15 @@ define double @__reduce_add_double(<4 x double>) nounwind readnone {
  ret double %m
 }

-define double @__reduce_min_double(<4 x double>) nounwind readnone {
+define double @__reduce_min_double(<4 x double>) nounwind readnone alwaysinline {
  reduce4(double, @__min_varying_double, @__min_uniform_double)
 }

-define double @__reduce_max_double(<4 x double>) nounwind readnone {
+define double @__reduce_max_double(<4 x double>) nounwind readnone alwaysinline {
  reduce4(double, @__max_varying_double, @__max_uniform_double)
 }

-define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
+define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone alwaysinline {
  %v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
                      <2 x i32> <i32 0, i32 1>
  %v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
@@ -411,19 +411,19 @@ define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
  ret i64 %m
 }

-define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
+define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
 }

-define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
+define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
 }

-define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
+define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
 }

-define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
+define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
 }

@@ -432,56 +432,56 @@ define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {

 declare <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone

-define <4 x i8> @__avg_up_uint8(<4 x i8>, <4 x i8>) nounwind readnone {
+define <4 x i8> @__avg_up_uint8(<4 x i8>, <4 x i8>) nounwind readnone alwaysinline {
  %r = call <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8> %0, <4 x i8> %1)
  ret <4 x i8> %r
 }

 declare <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone

-define <4 x i8> @__avg_up_int8(<4 x i8>, <4 x i8>) nounwind readnone {
+define <4 x i8> @__avg_up_int8(<4 x i8>, <4 x i8>) nounwind readnone alwaysinline {
  %r = call <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8> %0, <4 x i8> %1)
  ret <4 x i8> %r
 }

 declare <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone

-define <4 x i8> @__avg_down_uint8(<4 x i8>, <4 x i8>) nounwind readnone {
+define <4 x i8> @__avg_down_uint8(<4 x i8>, <4 x i8>) nounwind readnone alwaysinline {
  %r = call <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8> %0, <4 x i8> %1)
  ret <4 x i8> %r
 }

 declare <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone

-define <4 x i8> @__avg_down_int8(<4 x i8>, <4 x i8>) nounwind readnone {
+define <4 x i8> @__avg_down_int8(<4 x i8>, <4 x i8>) nounwind readnone alwaysinline {
  %r = call <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8> %0, <4 x i8> %1)
  ret <4 x i8> %r
 }

 declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone

-define <4 x i16> @__avg_up_uint16(<4 x i16>, <4 x i16>) nounwind readnone {
+define <4 x i16> @__avg_up_uint16(<4 x i16>, <4 x i16>) nounwind readnone alwaysinline {
  %r = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %0, <4 x i16> %1)
  ret <4 x i16> %r
 }

 declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone

-define <4 x i16> @__avg_up_int16(<4 x i16>, <4 x i16>) nounwind readnone {
+define <4 x i16> @__avg_up_int16(<4 x i16>, <4 x i16>) nounwind readnone alwaysinline {
  %r = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %0, <4 x i16> %1)
  ret <4 x i16> %r
 }

 declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone

-define <4 x i16> @__avg_down_uint16(<4 x i16>, <4 x i16>) nounwind readnone {
+define <4 x i16> @__avg_down_uint16(<4 x i16>, <4 x i16>) nounwind readnone alwaysinline {
  %r = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %0, <4 x i16> %1)
  ret <4 x i16> %r
 }

 declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone

-define <4 x i16> @__avg_down_int16(<4 x i16>, <4 x i16>) nounwind readnone {
+define <4 x i16> @__avg_down_int16(<4 x i16>, <4 x i16>) nounwind readnone alwaysinline {
  %r = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %0, <4 x i16> %1)
  ret <4 x i16> %r
 }
--- a/builtins/target-neon-8.ll
+++ b/builtins/target-neon-8.ll
@@ -42,12 +42,12 @@ include(`target-neon-common.ll')
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; half conversion routines

-define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
+define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone alwaysinline {
  unary4to16conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v)
  ret <16 x float> %r
 }

-define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
+define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone alwaysinline {
  unary4to16conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v)
  ret <16 x i16> %r
 }
@@ -125,13 +125,13 @@ declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwin
 declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone

 define <WIDTH x float> @__max_varying_float(<WIDTH x float>,
-                                            <WIDTH x float>) nounwind readnone {
+                                            <WIDTH x float>) nounwind readnone alwaysinline {
  binary4to16(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1)
  ret <WIDTH x float> %r
 }

 define <WIDTH x float> @__min_varying_float(<WIDTH x float>,
-                                            <WIDTH x float>) nounwind readnone {
+                                            <WIDTH x float>) nounwind readnone alwaysinline {
  binary4to16(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1)
  ret <WIDTH x float> %r
 }
@@ -141,22 +141,22 @@ declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind read
 declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
 declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

-define <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
+define <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone alwaysinline {
  binary4to16(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1)
  ret <WIDTH x i32> %r
 }

-define <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
+define <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone alwaysinline {
  binary4to16(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1)
  ret <WIDTH x i32> %r
 }

-define <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
+define <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone alwaysinline {
  binary4to16(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1)
  ret <WIDTH x i32> %r
 }

-define <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone {
+define <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone alwaysinline {
  binary4to16(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1)
  ret <WIDTH x i32> %r
 }
@@ -166,7 +166,7 @@ define <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwin
 declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone
 declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone

-define <WIDTH x float> @__rcp_varying_float(<WIDTH x float> %d) nounwind readnone {
+define <WIDTH x float> @__rcp_varying_float(<WIDTH x float> %d) nounwind readnone alwaysinline {
  unary4to16(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d)
  binary4to16(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0)
  %x1 = fmul <WIDTH x float> %x0, %x0_nr
@@ -178,7 +178,7 @@ define <WIDTH x float> @__rcp_varying_float(<WIDTH x float> %d) nounwind readnon
 declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
 declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone

-define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %d) nounwind readnone {
+define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %d) nounwind readnone alwaysinline {
  unary4to16(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d)
  %x0_2 = fmul <WIDTH x float> %x0, %x0
  binary4to16(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2)
@@ -189,7 +189,7 @@ define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %d) nounwind readn
  ret <WIDTH x float> %x2
 }

-define float @__rsqrt_uniform_float(float) nounwind readnone {
+define float @__rsqrt_uniform_float(float) nounwind readnone alwaysinline {
  %v1 = bitcast float %0 to <1 x float>
  %vs = shufflevector <1 x float> %v1, <1 x float> undef,
          <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
@@ -201,7 +201,7 @@ define float @__rsqrt_uniform_float(float) nounwind readnone {
  ret float %r
 }

-define float @__rcp_uniform_float(float) nounwind readnone {
+define float @__rcp_uniform_float(float) nounwind readnone alwaysinline {
  %v1 = bitcast float %0 to <1 x float>
  %vs = shufflevector <1 x float> %v1, <1 x float> undef,
          <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
@@ -215,7 +215,7 @@ define float @__rcp_uniform_float(float) nounwind readnone {

 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)

-define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone {
+define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone alwaysinline {
  unary4to16(result, float, @llvm.sqrt.v4f32, %0)
 ;; this returns nan for v=0, which is undesirable..
 ;;  %rsqrt = call <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %0)
@@ -225,7 +225,7 @@ define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone

 declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)

-define <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone {
+define <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone alwaysinline {
  unary4to16(r, double, @llvm.sqrt.v4f64, %0)
  ret <WIDTH x double> %r
 }
@@ -233,7 +233,7 @@ define <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readno
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; reductions

-define i64 @__movmsk(<WIDTH x MASK>) nounwind readnone {
+define i64 @__movmsk(<WIDTH x MASK>) nounwind readnone alwaysinline {
  %and_mask = and <WIDTH x i8> %0,
    <i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128,
     i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128>
@@ -327,41 +327,41 @@ define(`neon_reduce', `

 declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone

-define internal float @add_f32(float, float) {
+define internal float @add_f32(float, float) nounwind readnone alwaysinline {
  %r = fadd float %0, %1
  ret float %r
 }

-define internal <WIDTH x float> @__add_varying_float(<WIDTH x float>, <WIDTH x float>) {
+define internal <WIDTH x float> @__add_varying_float(<WIDTH x float>, <WIDTH x float>) nounwind readnone alwaysinline {
  %r = fadd <WIDTH x float> %0, %1
  ret <WIDTH x float> %r
 }

-define float @__reduce_add_float(<WIDTH x float>) nounwind readnone {
+define float @__reduce_add_float(<WIDTH x float>) nounwind readnone alwaysinline {
  neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32)
 }

 declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone

-define internal float @min_f32(float, float) {
+define internal float @min_f32(float, float) nounwind readnone alwaysinline {
  %cmp = fcmp olt float %0, %1
  %r = select i1 %cmp, float %0, float %1
  ret float %r
 }

-define float @__reduce_min_float(<WIDTH x float>) nounwind readnone {
+define float @__reduce_min_float(<WIDTH x float>) nounwind readnone alwaysinline {
  neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32)
 }

 declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone

-define internal float @max_f32(float, float) {
+define internal float @max_f32(float, float) nounwind readnone alwaysinline {
  %cmp = fcmp ugt float %0, %1
  %r = select i1 %cmp, float %0, float %1
  ret float %r
 }

-define float @__reduce_max_float(<WIDTH x float>) nounwind readnone {
+define float @__reduce_max_float(<WIDTH x float>) nounwind readnone alwaysinline {
  neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32)
 }

@@ -369,7 +369,7 @@ declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnon
 declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone
 declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone

-define i64 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone {
+define i64 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone alwaysinline {
  %a16 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %0)
  %a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a16)
  %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32)
@@ -379,7 +379,7 @@ define i64 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone {
  ret i64 %r
 }

-define i64 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone {
+define i64 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone alwaysinline {
  v16tov8(i16, %0, %va, %vb)
  %a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %va)
  %b32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %vb)
@@ -392,7 +392,7 @@ define i64 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone {
  ret i64 %r
 }

-define i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone {
+define i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone alwaysinline {
  v16tov4(i32, %0, %va, %vb, %vc, %vd)
  %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va)
  %b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb)
@@ -409,101 +409,101 @@ define i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone {

 declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

-define internal i32 @min_si32(i32, i32) {
+define internal i32 @min_si32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp slt i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
 }

-define i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone {
+define i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone alwaysinline {
  neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32)
 }

 declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

-define internal i32 @max_si32(i32, i32) {
+define internal i32 @max_si32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp sgt i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
 }

-define i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone {
+define i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone alwaysinline {
  neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32)
 }

 declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

-define internal i32 @min_ui32(i32, i32) {
+define internal i32 @min_ui32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp ult i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
 }

-define i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone {
-  neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpmins.v2i32, @min_ui32)
+define i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone alwaysinline {
+  neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpminu.v2i32, @min_ui32)
 }

 declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

-define internal i32 @max_ui32(i32, i32) {
+define internal i32 @max_ui32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp ugt i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
 }

-define i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone {
-  neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxs.v2i32, @max_ui32)
+define i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone alwaysinline {
+  neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxu.v2i32, @max_ui32)
 }

-define internal double @__add_uniform_double(double, double) {
+define internal double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
  %r = fadd double %0, %1
  ret double %r
 }

-define internal <WIDTH x double> @__add_varying_double(<WIDTH x double>, <WIDTH x double>) {
+define internal <WIDTH x double> @__add_varying_double(<WIDTH x double>, <WIDTH x double>) nounwind readnone alwaysinline {
  %r = fadd <WIDTH x double> %0, %1
  ret <WIDTH x double> %r
 }

-define double @__reduce_add_double(<WIDTH x double>) nounwind readnone {
+define double @__reduce_add_double(<WIDTH x double>) nounwind readnone alwaysinline {
  reduce16(double, @__add_varying_double, @__add_uniform_double)
 }

-define double @__reduce_min_double(<WIDTH x double>) nounwind readnone {
+define double @__reduce_min_double(<WIDTH x double>) nounwind readnone alwaysinline {
  reduce16(double, @__min_varying_double, @__min_uniform_double)
 }

-define double @__reduce_max_double(<WIDTH x double>) nounwind readnone {
+define double @__reduce_max_double(<WIDTH x double>) nounwind readnone alwaysinline {
  reduce16(double, @__max_varying_double, @__max_uniform_double)
 }

-define internal i64 @__add_uniform_int64(i64, i64) {
+define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
  %r = add i64 %0, %1
  ret i64 %r
 }

-define internal <WIDTH x i64> @__add_varying_int64(<WIDTH x i64>, <WIDTH x i64>) {
+define internal <WIDTH x i64> @__add_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone alwaysinline {
  %r = add <WIDTH x i64> %0, %1
  ret <WIDTH x i64> %r
 }

-define i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone {
+define i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone alwaysinline {
  reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
 }

-define i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone {
+define i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone alwaysinline {
  reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
 }

-define i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone {
+define i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone alwaysinline {
  reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
 }

-define i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone {
+define i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone alwaysinline {
  reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
 }

-define i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone {
+define i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone alwaysinline {
  reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
 }

@@ -512,35 +512,35 @@ define i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone {

 declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

-define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone {
+define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone alwaysinline {
  %r = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %0, <16 x i8> %1)
  ret <16 x i8> %r
 }

 declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

-define <16 x i8> @__avg_up_int8(<16 x i8>, <16 x i8>) nounwind readnone {
+define <16 x i8> @__avg_up_int8(<16 x i8>, <16 x i8>) nounwind readnone alwaysinline {
  %r = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %0, <16 x i8> %1)
  ret <16 x i8> %r
 }

 declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

-define <16 x i8> @__avg_down_uint8(<16 x i8>, <16 x i8>) nounwind readnone {
+define <16 x i8> @__avg_down_uint8(<16 x i8>, <16 x i8>) nounwind readnone alwaysinline {
  %r = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %0, <16 x i8> %1)
  ret <16 x i8> %r
 }

 declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

-define <16 x i8> @__avg_down_int8(<16 x i8>, <16 x i8>) nounwind readnone {
+define <16 x i8> @__avg_down_int8(<16 x i8>, <16 x i8>) nounwind readnone alwaysinline {
  %r = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %0, <16 x i8> %1)
  ret <16 x i8> %r
 }

 declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

-define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone {
+define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone alwaysinline {
  v16tov8(i16, %0, %a0, %b0)
  v16tov8(i16, %1, %a1, %b1)
  %r0 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1)
@@ -551,7 +551,7 @@ define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone {

 declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

-define <16 x i16> @__avg_up_int16(<16 x i16>, <16 x i16>) nounwind readnone {
+define <16 x i16> @__avg_up_int16(<16 x i16>, <16 x i16>) nounwind readnone alwaysinline {
  v16tov8(i16, %0, %a0, %b0)
  v16tov8(i16, %1, %a1, %b1)
  %r0 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1)
@@ -562,7 +562,7 @@ define <16 x i16> @__avg_up_int16(<16 x i16>, <16 x i16>) nounwind readnone {

 declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

-define <16 x i16> @__avg_down_uint16(<16 x i16>, <16 x i16>) nounwind readnone {
+define <16 x i16> @__avg_down_uint16(<16 x i16>, <16 x i16>) nounwind readnone alwaysinline {
  v16tov8(i16, %0, %a0, %b0)
  v16tov8(i16, %1, %a1, %b1)
  %r0 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1)
@@ -573,7 +573,7 @@ define <16 x i16> @__avg_down_uint16(<16 x i16>, <16 x i16>) nounwind readnone {

 declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

-define <16 x i16> @__avg_down_int16(<16 x i16>, <16 x i16>) nounwind readnone {
+define <16 x i16> @__avg_down_int16(<16 x i16>, <16 x i16>) nounwind readnone alwaysinline {
  v16tov8(i16, %0, %a0, %b0)
  v16tov8(i16, %1, %a1, %b1)
  %r0 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1)
--- a/builtins/target-neon-common.ll
+++ b/builtins/target-neon-common.ll
@@ -49,7 +49,7 @@ ctlztz()
 declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone
 declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone

-define float @__half_to_float_uniform(i16 %v) nounwind readnone {
+define float @__half_to_float_uniform(i16 %v) nounwind readnone alwaysinline {
  %v1 = bitcast i16 %v to <1 x i16>
  %vec = shufflevector <1 x i16> %v1, <1 x i16> undef, 
           <4 x i32> <i32 0, i32 0, i32 0, i32 0>
@@ -58,7 +58,7 @@ define float @__half_to_float_uniform(i16 %v) nounwind readnone {
  ret float %r
 }

-define i16 @__float_to_half_uniform(float %v) nounwind readnone {
+define i16 @__float_to_half_uniform(float %v) nounwind readnone alwaysinline {
  %v1 = bitcast float %v to <1 x float>
  %vec = shufflevector <1 x float> %v1, <1 x float> undef, 
           <4 x i32> <i32 0, i32 0, i32 0, i32 0>
@@ -70,7 +70,14 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; math

-define void @__fastmath() nounwind {
+declare i32 @llvm.arm.get.fpscr() nounwind
+declare void @llvm.arm.set.fpscr(i32) nounwind
+
+define void @__fastmath() nounwind alwaysinline {
+  %x = call i32 @llvm.arm.get.fpscr()
+  ; Set FPSCR bit 24 (flush-to-zero) and bit 25 (default NaN): 50331648 = 0x03000000
+  %y = or i32 %x, 50331648
+  call void @llvm.arm.set.fpscr(i32 %y)
  ret void
 }

@@ -120,111 +127,111 @@ declare double @__ceil_uniform_double(double) nounwind readnone
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; min/max

-define float @__max_uniform_float(float, float) nounwind readnone {
+define float @__max_uniform_float(float, float) nounwind readnone alwaysinline {
  %cmp = fcmp ugt float %0, %1
  %r = select i1 %cmp, float %0, float %1
  ret float %r
 }

-define float @__min_uniform_float(float, float) nounwind readnone {
+define float @__min_uniform_float(float, float) nounwind readnone alwaysinline {
  %cmp = fcmp ult float %0, %1
  %r = select i1 %cmp, float %0, float %1
  ret float %r
 }

-define i32 @__min_uniform_int32(i32, i32) nounwind readnone {
+define i32 @__min_uniform_int32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp slt i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
 }

-define i32 @__max_uniform_int32(i32, i32) nounwind readnone {
+define i32 @__max_uniform_int32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp sgt i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
 }

-define i32 @__min_uniform_uint32(i32, i32) nounwind readnone {
+define i32 @__min_uniform_uint32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp ult i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
 }

-define i32 @__max_uniform_uint32(i32, i32) nounwind readnone {
+define i32 @__max_uniform_uint32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp ugt i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
 }

-define i64 @__min_uniform_int64(i64, i64) nounwind readnone {
+define i64 @__min_uniform_int64(i64, i64) nounwind readnone alwaysinline {
  %cmp = icmp slt i64 %0, %1
  %r = select i1 %cmp, i64 %0, i64 %1
  ret i64 %r
 }

-define i64 @__max_uniform_int64(i64, i64) nounwind readnone {
+define i64 @__max_uniform_int64(i64, i64) nounwind readnone alwaysinline {
  %cmp = icmp sgt i64 %0, %1
  %r = select i1 %cmp, i64 %0, i64 %1
  ret i64 %r
 }

-define i64 @__min_uniform_uint64(i64, i64) nounwind readnone {
+define i64 @__min_uniform_uint64(i64, i64) nounwind readnone alwaysinline {
  %cmp = icmp ult i64 %0, %1
  %r = select i1 %cmp, i64 %0, i64 %1
  ret i64 %r
 }

-define i64 @__max_uniform_uint64(i64, i64) nounwind readnone {
+define i64 @__max_uniform_uint64(i64, i64) nounwind readnone alwaysinline {
  %cmp = icmp ugt i64 %0, %1
  %r = select i1 %cmp, i64 %0, i64 %1
  ret i64 %r
 }

-define double @__min_uniform_double(double, double) nounwind readnone {
+define double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
  %cmp = fcmp olt double %0, %1
  %r = select i1 %cmp, double %0, double %1
  ret double %r
 }

-define double @__max_uniform_double(double, double) nounwind readnone {
+define double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
  %cmp = fcmp ogt double %0, %1
  %r = select i1 %cmp, double %0, double %1
  ret double %r
 }

-define <WIDTH x i64> @__min_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone {
+define <WIDTH x i64> @__min_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone alwaysinline {
  %m = icmp slt <WIDTH x i64> %0, %1
  %r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
  ret <WIDTH x i64> %r
 }

-define <WIDTH x i64> @__max_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone {
+define <WIDTH x i64> @__max_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone alwaysinline {
  %m = icmp sgt <WIDTH x i64> %0, %1
  %r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
  ret <WIDTH x i64> %r
 }

-define <WIDTH x i64> @__min_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone {
+define <WIDTH x i64> @__min_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone alwaysinline {
  %m = icmp ult <WIDTH x i64> %0, %1
  %r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
  ret <WIDTH x i64> %r
 }

-define <WIDTH x i64> @__max_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone {
+define <WIDTH x i64> @__max_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone alwaysinline {
  %m = icmp ugt <WIDTH x i64> %0, %1
  %r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
  ret <WIDTH x i64> %r
 }

 define <WIDTH x double> @__min_varying_double(<WIDTH x double>,
-                                              <WIDTH x double>) nounwind readnone {
+                                              <WIDTH x double>) nounwind readnone alwaysinline {
  %m = fcmp olt <WIDTH x double> %0, %1
  %r = select <WIDTH x i1> %m, <WIDTH x double> %0, <WIDTH x double> %1
  ret <WIDTH x double> %r
 }

 define <WIDTH x double> @__max_varying_double(<WIDTH x double>,
-                                              <WIDTH x double>) nounwind readnone {
+                                              <WIDTH x double>) nounwind readnone alwaysinline {
  %m = fcmp ogt <WIDTH x double> %0, %1
  %r = select <WIDTH x i1> %m, <WIDTH x double> %0, <WIDTH x double> %1
  ret <WIDTH x double> %r
@@ -234,14 +241,14 @@ define <WIDTH x double> @__max_varying_double(<WIDTH x double>,

 declare float @llvm.sqrt.f32(float)

-define float @__sqrt_uniform_float(float) nounwind readnone {
+define float @__sqrt_uniform_float(float) nounwind readnone alwaysinline {
  %r = call float @llvm.sqrt.f32(float %0)
  ret float %r
 }

 declare double @llvm.sqrt.f64(double)

-define double @__sqrt_uniform_double(double) nounwind readnone {
+define double @__sqrt_uniform_double(double) nounwind readnone alwaysinline {
  %r = call double @llvm.sqrt.f64(double %0)
  ret double %r
 }
@@ -251,12 +258,12 @@ define double @__sqrt_uniform_double(double) nounwind readnone {
 declare i32 @llvm.ctpop.i32(i32) nounwind readnone
 declare i64 @llvm.ctpop.i64(i64) nounwind readnone

-define i32 @__popcnt_int32(i32) nounwind readnone {
+define i32 @__popcnt_int32(i32) nounwind readnone alwaysinline {
  %v = call i32 @llvm.ctpop.i32(i32 %0)
  ret i32 %v
 }

-define i64 @__popcnt_int64(i64) nounwind readnone {
+define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
  %v = call i64 @llvm.ctpop.i64(i64 %0)
  ret i64 %v
 }
--- a/builtins/target-skx.ll
+++ b/builtins/target-skx.ll
@@ -35,6 +35,10 @@ define(`WIDTH',`16')
 ifelse(LLVM_VERSION, LLVM_3_8,
    `include(`target-avx512-common.ll')',
         LLVM_VERSION, LLVM_3_9,
+    `include(`target-avx512-common.ll')',
+         LLVM_VERSION, LLVM_4_0,
+    `include(`target-avx512-common.ll')',
+         LLVM_VERSION, LLVM_5_0,
    `include(`target-avx512-common.ll')'
  )

@@ -80,6 +84,10 @@ define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly al
 ifelse(LLVM_VERSION, LLVM_3_8,
    rcp_rsqrt_varying_float_skx(),
         LLVM_VERSION, LLVM_3_9,
+    rcp_rsqrt_varying_float_skx(),
+         LLVM_VERSION, LLVM_4_0,
+    rcp_rsqrt_varying_float_skx(),
+         LLVM_VERSION, LLVM_5_0,
    rcp_rsqrt_varying_float_skx()
  )

--- a/builtins/util-nvptx.m4
+++ b/builtins/util-nvptx.m4
@@ -54,9 +54,13 @@ define(`MASK_HIGH_BIT_ON',
 define(`PTR_OP_ARGS',
  ifelse(LLVM_VERSION, LLVM_3_7,
    ``$1 , $1 *'',
-  ifelse(LLVM_VERSION, LLVM_3_8,
+         LLVM_VERSION, LLVM_3_8,
    ``$1 , $1 *'',
-  ifelse(LLVM_VERSION, LLVM_3_9,
+         LLVM_VERSION, LLVM_3_9,
+    ``$1 , $1 *'',
+         LLVM_VERSION, LLVM_4_0,
+    ``$1 , $1 *'',
+         LLVM_VERSION, LLVM_5_0,
    ``$1 , $1 *'',
    ``$1 *''
  )
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -57,6 +57,10 @@ define(`PTR_OP_ARGS',
         LLVM_VERSION, LLVM_3_8,
    ``$1 , $1 *'',
         LLVM_VERSION, LLVM_3_9,
+    ``$1 , $1 *'',
+         LLVM_VERSION, LLVM_4_0,
+    ``$1 , $1 *'',
+         LLVM_VERSION, LLVM_5_0,
    ``$1 , $1 *'',
    ``$1 *''
  )
@@ -69,6 +73,10 @@ define(`MdORi64',
    ``i64'',
    LLVM_VERSION, LLVM_3_9,
    ``i64'',
+    LLVM_VERSION, LLVM_4_0,
+    ``i64'',
+    LLVM_VERSION, LLVM_5_0,
+    ``i64'',
    ``double''
  )
 )
@@ -78,6 +86,10 @@ define(`MfORi32',
    ``i32'',
    LLVM_VERSION, LLVM_3_9,
    ``i32'',
+    LLVM_VERSION, LLVM_4_0,
+    ``i32'',
+    LLVM_VERSION, LLVM_5_0,
+    ``i32'',
    ``float''
  )
 )
@@ -1586,6 +1598,12 @@ define <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $2> %cmp,
  ',LLVM_VERSION,LLVM_3_9,`
    %r_LANE_ID_t = cmpxchg $2 * %ptr, $2 %cmp_LANE_ID, $2 %val_LANE_ID seq_cst seq_cst
    %r_LANE_ID = extractvalue { $2, i1 } %r_LANE_ID_t, 0
+  ',LLVM_VERSION,LLVM_4_0,`
+    %r_LANE_ID_t = cmpxchg $2 * %ptr, $2 %cmp_LANE_ID, $2 %val_LANE_ID seq_cst seq_cst
+    %r_LANE_ID = extractvalue { $2, i1 } %r_LANE_ID_t, 0
+  ',LLVM_VERSION,LLVM_5_0,`
+    %r_LANE_ID_t = cmpxchg $2 * %ptr, $2 %cmp_LANE_ID, $2 %val_LANE_ID seq_cst seq_cst
+    %r_LANE_ID = extractvalue { $2, i1 } %r_LANE_ID_t, 0
  ',`
    %r_LANE_ID = cmpxchg $2 * %ptr, $2 %cmp_LANE_ID, $2 %val_LANE_ID seq_cst
  ')
@@ -1614,6 +1632,12 @@ define $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
  ',LLVM_VERSION,LLVM_3_9,`
   %r_t = cmpxchg $2 * %ptr, $2 %cmp, $2 %val seq_cst seq_cst
   %r = extractvalue { $2, i1 } %r_t, 0
+  ',LLVM_VERSION,LLVM_4_0,`
+   %r_t = cmpxchg $2 * %ptr, $2 %cmp, $2 %val seq_cst seq_cst
+   %r = extractvalue { $2, i1 } %r_t, 0
+  ',LLVM_VERSION,LLVM_5_0,`
+   %r_t = cmpxchg $2 * %ptr, $2 %cmp, $2 %val seq_cst seq_cst
+   %r = extractvalue { $2, i1 } %r_t, 0
  ',`
   %r = cmpxchg $2 * %ptr, $2 %cmp, $2 %val seq_cst
  ')
--- a/cbackend.cpp
+++ b/cbackend.cpp
@@ -133,8 +133,8 @@
 #define snprintf _snprintf
 #endif
 ///////////////////////////////////////////////////////////////////////////////
-// This part of code was in LLVM's ConstantsScanner.h, 
-// but it was removed in revision #232397 
+// This part of code was in LLVM's ConstantsScanner.h,
+// but it was removed in revision #232397

 namespace constant_scanner {
 class constant_iterator : public std::iterator<std::forward_iterator_tag,
@@ -381,8 +381,8 @@ namespace {
  };
 } // end anonymous namespace

-static void findUsedArrayAndLongIntTypes(const llvm::Module *m, std::vector<llvm::ArrayType*> &t, 
-                               std::vector<llvm::IntegerType*> &i, std::vector<bool> &IsVolatile, 
+static void findUsedArrayAndLongIntTypes(const llvm::Module *m, std::vector<llvm::ArrayType*> &t,
+                               std::vector<llvm::IntegerType*> &i, std::vector<bool> &IsVolatile,
                               std::vector<int> &Alignment) {
  TypeFinder(t, i, IsVolatile, Alignment).run(*m);
 }
@@ -390,7 +390,7 @@ static void findUsedArrayAndLongIntTypes(const llvm::Module *m, std::vector<llvm

 static bool is_vec16_i64_ty(llvm::Type *Ty) {
  llvm::VectorType *VTy = llvm::dyn_cast<llvm::VectorType>(Ty);
-  if ((VTy != NULL) && (VTy->getElementType()->isIntegerTy()) && 
+  if ((VTy != NULL) && (VTy->getElementType()->isIntegerTy()) &&
    VTy->getElementType()->getPrimitiveSizeInBits() == 64)
    return true;
  return false;
@@ -462,7 +462,11 @@ namespace {
      VectorConstantIndex = 0;
    }

+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9 // <= 3.9
    virtual const char *getPassName() const { return "C backend"; }
+#else // LLVM 4.0+
+    virtual llvm::StringRef getPassName() const { return "C backend"; }
+#endif

    void getAnalysisUsage(llvm::AnalysisUsage &AU) const {
 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6 // <= 3.6
@@ -525,8 +529,10 @@ namespace {
                           bool IgnoreName = false,
 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
                           const llvm::AttrListPtr &PAL = llvm::AttrListPtr()
-#else // LLVM 3.3+
+#elif ISPC_LLVM_VERSION <= ISPC_LLVM_4_0
                           const llvm::AttributeSet &PAL = llvm::AttributeSet()
+#else // LLVM 5.0+
+                           const llvm::AttributeList &PAL = llvm::AttributeList()
 #endif
                                 );
    llvm::raw_ostream &printSimpleType(llvm::raw_ostream &Out, llvm::Type *Ty,
@@ -536,8 +542,10 @@ namespace {
    void printStructReturnPointerFunctionType(llvm::raw_ostream &Out,
 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
                                              const llvm::AttrListPtr &PAL,
-#else // LLVM 3.3+
+#elif ISPC_LLVM_VERSION <= ISPC_LLVM_4_0
                                              const llvm::AttributeSet &PAL,
+#else // LLVM 5.0+
+                                              const llvm::AttributeList &PAL,
 #endif
                                              llvm::PointerType *Ty);

@@ -782,8 +790,10 @@ std::string CWriter::getArrayName(llvm::ArrayType *AT) {
 void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out,
 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
                                                   const llvm::AttrListPtr &PAL,
-#else // LLVM 3.3+
+#elif ISPC_LLVM_VERSION <= ISPC_LLVM_4_0
                                                   const llvm::AttributeSet &PAL,
+#else // LLVM 5.0+
+                                                   const llvm::AttributeList &PAL,
 #endif
                                                   llvm::PointerType *TheTy) {
  llvm::FunctionType *FTy = llvm::cast<llvm::FunctionType>(TheTy->getElementType());
@@ -801,8 +811,10 @@ void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out,
    llvm::Type *ArgTy = *I;
 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
    if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) {
-#else // LLVM 3.3+
+#elif ISPC_LLVM_VERSION <= ISPC_LLVM_4_0
        if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) {
+#else // LLVM 5.0+
+        if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeList::FunctionIndex, llvm::Attribute::ByVal)) {
 #endif
      assert(ArgTy->isPointerTy());
      ArgTy = llvm::cast<llvm::PointerType>(ArgTy)->getElementType();
@@ -810,8 +822,10 @@ void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out,
    printType(FunctionInnards, ArgTy,
 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
              PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt),
-#else // LLVM 3.3+
+#elif ISPC_LLVM_VERSION <= ISPC_LLVM_4_0
              PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt),
+#else // LLVM 5.0+
+              PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeList::FunctionIndex, llvm::Attribute::SExt),
 #endif
              "");
    PrintedType = true;
@@ -827,8 +841,10 @@ void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out,
  printType(Out, RetTy,
 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
            PAL.getParamAttributes(0).hasAttribute(llvm::Attributes::SExt),
-#else // LLVM 3.3+
+#elif ISPC_LLVM_VERSION <= ISPC_LLVM_4_0
            PAL.getParamAttributes(0).hasAttribute(llvm::AttributeSet::ReturnIndex, llvm::Attribute::SExt),
+#else // LLVM 5.0+
+            PAL.getParamAttributes(0).hasAttribute(llvm::AttributeList::ReturnIndex, llvm::Attribute::SExt),
 #endif
            FunctionInnards.str());
 }
@@ -925,8 +941,10 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty,
                                bool IgnoreName,
 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
                                const llvm::AttrListPtr &PAL
-#else /* LLVM 3.3+ */
+#elif ISPC_LLVM_VERSION <= ISPC_LLVM_4_0
                                const llvm::AttributeSet &PAL
+#else // LLVM 5.0+
+                                const llvm::AttributeList &PAL
 #endif
                                      ) {

@@ -947,8 +965,10 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty,
      llvm::Type *ArgTy = *I;
 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
      if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) {
-#else /* LLVM 3.3+ */
+#elif ISPC_LLVM_VERSION <= ISPC_LLVM_4_0
          if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) {
+#else // LLVM 5.0+
+          if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeList::FunctionIndex, llvm::Attribute::ByVal)) {
 #endif
        assert(ArgTy->isPointerTy());
        ArgTy = llvm::cast<llvm::PointerType>(ArgTy)->getElementType();
@@ -958,8 +978,10 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty,
      printType(FunctionInnards, ArgTy,
 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
                PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt),
-#else /* LLVM 3.3+ */
+#elif ISPC_LLVM_VERSION <= ISPC_LLVM_4_0
                PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt),
+#else // LLVM 5.0+
+                PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeList::FunctionIndex, llvm::Attribute::SExt),
 #endif
                "");
      ++Idx;
@@ -975,8 +997,10 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty,
    printType(Out, FTy->getReturnType(),
 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
              PAL.getParamAttributes(0).hasAttribute(llvm::Attributes::SExt),
-#else /* LLVM 3.3+ */
+#elif ISPC_LLVM_VERSION <= ISPC_LLVM_4_0
              PAL.getParamAttributes(0).hasAttribute(llvm::AttributeSet::ReturnIndex, llvm::Attribute::SExt),
+#else // LLVM 5.0+
+              PAL.getParamAttributes(0).hasAttribute(llvm::AttributeList::ReturnIndex, llvm::Attribute::SExt),
 #endif
              FunctionInnards.str());
    return Out;
@@ -1087,7 +1111,7 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty,

 void CWriter::printConstantArray(llvm::ConstantArray *CPA, bool Static) {
  // vec16_i64 should be handled separately
-  
+
  if (is_vec16_i64_ty(CPA->getOperand(0)->getType())) {
    Out << "/* vec16_i64 should be loaded carefully on knc */";
    Out << "\n#if defined(KNC)\n";
@@ -1180,6 +1204,7 @@ void CWriter::printConstantDataSequential(llvm::ConstantDataSequential *CDS,

 static inline std::string ftostr(const llvm::APFloat& V) {
  std::string Buf;
+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
  if (&V.getSemantics() == &llvm::APFloat::IEEEdouble) {
    llvm::raw_string_ostream(Buf) << V.convertToDouble();
    return Buf;
@@ -1187,6 +1212,15 @@ static inline std::string ftostr(const llvm::APFloat& V) {
    llvm::raw_string_ostream(Buf) << (double)V.convertToFloat();
    return Buf;
  }
+#else // LLVM 4.0+
+  if (&V.getSemantics() == &llvm::APFloat::IEEEdouble()) {
+    llvm::raw_string_ostream(Buf) << V.convertToDouble();
+    return Buf;
+  } else if (&V.getSemantics() == &llvm::APFloat::IEEEsingle()) {
+    llvm::raw_string_ostream(Buf) << (double)V.convertToFloat();
+    return Buf;
+  }
+#endif
  return "<unknown format in ftostr>"; // error
 }

@@ -1206,7 +1240,11 @@ static bool isFPCSafeToPrint(const llvm::ConstantFP *CFP) {
    return false;
  llvm::APFloat APF = llvm::APFloat(CFP->getValueAPF());  // copy
  if (CFP->getType() == llvm::Type::getFloatTy(CFP->getContext()))
+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9 // <= 3.9
    APF.convert(llvm::APFloat::IEEEdouble, llvm::APFloat::rmNearestTiesToEven, &ignored);
+#else // LLVM 4.0+
+    APF.convert(llvm::APFloat::IEEEdouble(), llvm::APFloat::rmNearestTiesToEven, &ignored);
+#endif
 #if HAVE_PRINTF_A && ENABLE_CBE_PRINTF_A
  char Buffer[100];
  sprintf(Buffer, "%a", APF.convertToDouble());
@@ -1637,7 +1675,11 @@ void CWriter::printConstant(llvm::Constant *CPV, bool Static) {
        // useful.
        llvm::APFloat Tmp = FPC->getValueAPF();
        bool LosesInfo;
+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9 // <= 3.9
        Tmp.convert(llvm::APFloat::IEEEdouble, llvm::APFloat::rmTowardZero, &LosesInfo);
+#else // LLVM 4.0+
+        Tmp.convert(llvm::APFloat::IEEEdouble(), llvm::APFloat::rmTowardZero, &LosesInfo);
+#endif
        V = Tmp.convertToDouble();
      }

@@ -1819,11 +1861,11 @@ void CWriter::printConstant(llvm::Constant *CPV, bool Static) {
        // when generating code for knl-generic in multitarget mode.
        // Short vectors are mapped to "native" vectors and cause AVX-512 code
        // generation in static block initialization (__vec16_* in ::init function).
-        bool isGenericKNL = g->target->getISA() == Target::GENERIC && 
+        bool isGenericKNL = g->target->getISA() == Target::GENERIC &&
                                                   !g->target->getTreatGenericAsSmth().empty() &&
                                                   g->mangleFunctionsWithTarget;
-        if (isGenericKNL && CPV->getOperand(0)->getType()->isVectorTy()) 
-          llvm::report_fatal_error("knl-generic-* target doesn's support short vectors");  
+        if (isGenericKNL && CPV->getOperand(0)->getType()->isVectorTy())
+          llvm::report_fatal_error("knl-generic-* target doesn's support short vectors");
        Out << ' ';
        printConstant(llvm::cast<llvm::Constant>(CPV->getOperand(0)), Static);
        for (unsigned i = 1, e = CPV->getNumOperands(); i != e; ++i) {
@@ -2024,7 +2066,7 @@ void CWriter::writeInstComputationInline(llvm::Instruction &I) {

  if (NeedBoolTrunc)
    Out << "((";
- 
+
  visit(I);

  if (NeedBoolTrunc)
@@ -2798,7 +2840,7 @@ void CWriter::printFloatingPointConstants(llvm::Function &F) {
  // the precision of the printed form, unless the printed form preserves
  // precision.
  //
-  for (constant_scanner::constant_iterator I = constant_scanner::constant_begin(&F), 
+  for (constant_scanner::constant_iterator I = constant_scanner::constant_begin(&F),
       E = constant_scanner::constant_end(&F); I != E; ++I)
    printFloatingPointConstants(*I);

@@ -2865,7 +2907,7 @@ void CWriter::printFloatingPointConstants(const llvm::Constant *C) {
 // loads to get their values, rather than tediously inserting the
 // individual values into the vector.
 void CWriter::printVectorConstants(llvm::Function &F) {
-    for (constant_scanner::constant_iterator I = constant_scanner::constant_begin(&F), 
+    for (constant_scanner::constant_iterator I = constant_scanner::constant_begin(&F),
         E = constant_scanner::constant_end(&F); I != E; ++I) {
        const llvm::ConstantDataVector *CDV = llvm::dyn_cast<llvm::ConstantDataVector>(*I);
        if (CDV == NULL)
@@ -3017,7 +3059,7 @@ void CWriter::printModuleTypes() {
      Out << "  struct " << Name << ";\n";
  }
  Out << "};\n";
-  
+
  for (unsigned i = 0, e = IntegerTypes.size(); i != e; ++i) {
     llvm::IntegerType *IT = IntegerTypes[i];
      if (IT->getIntegerBitWidth() <= 64 || Alignment[i] == 0)
@@ -3142,8 +3184,10 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) {
  llvm::FunctionType *FT = llvm::cast<llvm::FunctionType>(F->getFunctionType());
 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
  const llvm::AttrListPtr &PAL = F->getAttributes();
-#else /* LLVM 3.3+ */
+#elif ISPC_LLVM_VERSION <= ISPC_LLVM_4_0
  const llvm::AttributeSet &PAL = F->getAttributes();
+#else // LLVM 5.0+
+  const llvm::AttributeList &PAL = F->getAttributes();
 #endif

  std::string tstr;
@@ -3180,8 +3224,10 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) {
        llvm::Type *ArgTy = I->getType();
 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
        if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) {
-#else /* LLVM 3.3+ */
+#elif ISPC_LLVM_VERSION <= ISPC_LLVM_4_0
            if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) {
+#else // LLVM 5.0+
+            if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeList::FunctionIndex, llvm::Attribute::ByVal)) {
 #endif
          ArgTy = llvm::cast<llvm::PointerType>(ArgTy)->getElementType();
 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_7 /* 3.2, 3.3, 3.4, 3.5, 3.6, 3.7 */
@@ -3193,8 +3239,10 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) {
        printType(FunctionInnards, ArgTy,
 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
                  PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt),
-#else /* LLVM 3.3+ */
+#elif ISPC_LLVM_VERSION <= ISPC_LLVM_4_0
                  PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt),
+#else // LLVM 5.0+
+                  PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeList::FunctionIndex, llvm::Attribute::SExt),
 #endif
                  ArgName);
        PrintedArg = true;
@@ -3219,8 +3267,10 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) {
      llvm::Type *ArgTy = *I;
 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
      if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) {
-#else /* LLVM 3.3+ */
+#elif ISPC_LLVM_VERSION <= ISPC_LLVM_4_0
          if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) {
+#else // LLVM 5.0+
+          if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeList::FunctionIndex, llvm::Attribute::ByVal)) {
 #endif
        assert(ArgTy->isPointerTy());
        ArgTy = llvm::cast<llvm::PointerType>(ArgTy)->getElementType();
@@ -3228,8 +3278,10 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) {
      printType(FunctionInnards, ArgTy,
 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
                PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt)
-#else /* LLVM 3.3+ */
+#elif ISPC_LLVM_VERSION <= ISPC_LLVM_4_0
                PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt)
+#else // LLVM 5.0+
+                PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeList::FunctionIndex, llvm::Attribute::SExt)
 #endif
                );
      PrintedArg = true;
@@ -3265,8 +3317,10 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) {
  printType(Out, RetTy,
 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
            PAL.getParamAttributes(0).hasAttribute(llvm::Attributes::SExt),
-#else /* LLVM 3.3+ */
+#elif ISPC_LLVM_VERSION <= ISPC_LLVM_4_0
            PAL.getParamAttributes(0).hasAttribute(llvm::AttributeSet::ReturnIndex, llvm::Attribute::SExt),
+#else // LLVM 5.0+
+            PAL.getParamAttributes(0).hasAttribute(llvm::AttributeList::ReturnIndex, llvm::Attribute::SExt),
 #endif
            FunctionInnards.str());
 }
@@ -4061,17 +4115,17 @@ void CWriter::printIntrinsicDefinition(const llvm::Function &F, llvm::raw_ostrea

    printType(Out, retT);
    Out << "r;\n";
-        
+
    unsigned NumBits = llvm::cast<llvm::IntegerType>(elemT)->getBitWidth();
    std::stringstream  str_type;
-    if (NumBits <= 32) 
+    if (NumBits <= 32)
      str_type << "uint" << 2 * NumBits << "_t";
    else {
      assert(NumBits <= 64 && "Bit widths > 128 not implemented yet");
      str_type << "llvmUInt128";
    }

-    Out << "  " << str_type.str() << " result = (" << str_type.str() << ") a * (" << str_type.str() << ") b;\n"; 
+    Out << "  " << str_type.str() << " result = (" << str_type.str() << ") a * (" << str_type.str() << ") b;\n";
    Out << "  r.field0 = result;\n";
    Out << "  r.field1 = result >> " << NumBits << ";\n";
    Out << "  return r;\n}\n";
@@ -4201,8 +4255,10 @@ void CWriter::visitCallInst(llvm::CallInst &I) {
  // parameter instead of passing it to the call.
 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
  const llvm::AttrListPtr &PAL = I.getAttributes();
-#else /* LLVM 3.3+ */
+#elif ISPC_LLVM_VERSION <= ISPC_LLVM_4_0
  const llvm::AttributeSet &PAL = I.getAttributes();
+#else // LLVM 5.0+
+  const llvm::AttributeList &PAL = I.getAttributes();
 #endif

  bool hasByVal = I.hasByValArgument();
@@ -4241,7 +4297,7 @@ void CWriter::visitCallInst(llvm::CallInst &I) {
    if (Callee->getName() == "malloc" ||
        Callee->getName() == "_aligned_malloc")
        Out << "(uint8_t *)";
-  
+
    // This 'if' will fix 'soa-18.ispc' test (fails with optimizations off)
    // Yet the way the case is fixed is quite dirty and leads to many other fails

@@ -4302,7 +4358,7 @@ void CWriter::visitCallInst(llvm::CallInst &I) {

  for (; AI != AE; ++AI, ++ArgNo) {
    if (PrintedArg) Out << ", ";
-    if (ArgNo == 0 && 
+    if (ArgNo == 0 &&
        Callee->getName() == "posix_memalign") {
        // uint8_t** is incompatible with void** without explicit cast.
        // Should be do this any other functions?
@@ -4314,8 +4370,10 @@ void CWriter::visitCallInst(llvm::CallInst &I) {
      printType(Out, FTy->getParamType(ArgNo),
 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
                PAL.getParamAttributes(ArgNo+1).hasAttribute(llvm::Attributes::SExt)
-#else /* LLVM 3.3+ */
+#elif ISPC_LLVM_VERSION <= ISPC_LLVM_4_0
                PAL.getParamAttributes(ArgNo+1).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt)
+#else // LLVM 5.0+
+                PAL.getParamAttributes(ArgNo+1).hasAttribute(llvm::AttributeList::FunctionIndex, llvm::Attribute::SExt)
 #endif
                );
      Out << ')';
@@ -4377,7 +4435,7 @@ bool CWriter::visitBuiltinCall(llvm::CallInst &I, llvm::Intrinsic::ID ID,
    if (I.getParent()->getParent()->arg_empty())
      Out << "vararg_dummy_arg";
    else
-      writeOperand(&*(--I.getParent()->getParent()->arg_end()));
+      writeOperand(&*(std::prev(I.getParent()->getParent()->arg_end())));
    Out << ')';
    return true;
  case llvm::Intrinsic::vaend:
@@ -4552,7 +4610,11 @@ void CWriter::printGEPExpression(llvm::Value *Ptr, llvm::gep_type_iterator I,
  llvm::VectorType *LastIndexIsVector = 0;
  {
    for (llvm::gep_type_iterator TmpI = I; TmpI != E; ++TmpI)
+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
      LastIndexIsVector = llvm::dyn_cast<llvm::VectorType>(*TmpI);
+#else // LLVM 4.0+
+    LastIndexIsVector = llvm::dyn_cast<llvm::VectorType>(TmpI.getIndexedType());
+#endif
  }

  Out << "(";
@@ -4581,7 +4643,11 @@ void CWriter::printGEPExpression(llvm::Value *Ptr, llvm::gep_type_iterator I,
    // exposed, like a global, avoid emitting (&foo)[0], just emit foo instead.
    if (isAddressExposed(Ptr)) {
      writeOperandInternal(Ptr, Static);
+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
    } else if (I != E && (*I)->isStructTy()) {
+#else // LLVM 4.0+
+    } else if (I != E && I.isStruct()) {
+#endif
      // If we didn't already emit the first operand, see if we can print it as
      // P->f instead of "P[0].f"
      writeOperand(Ptr);
@@ -4596,13 +4662,18 @@ void CWriter::printGEPExpression(llvm::Value *Ptr, llvm::gep_type_iterator I,
  }

  for (; I != E; ++I) {
-    if ((*I)->isStructTy()) {
+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
+    llvm::Type *type = *I;
+#else // LLVM 4.0+
+    llvm::Type *type = I.getIndexedType();
+#endif
+    if (type->isStructTy()) {
      Out << ".field" << llvm::cast<llvm::ConstantInt>(I.getOperand())->getZExtValue();
-    } else if ((*I)->isArrayTy()) {
+    } else if (type->isArrayTy()) {
      Out << ".array[";
      writeOperandWithCast(I.getOperand(), llvm::Instruction::GetElementPtr);
      Out << ']';
-    } else if (!(*I)->isVectorTy()) {
+    } else if (!type->isVectorTy()) {
      Out << '[';
      writeOperandWithCast(I.getOperand(), llvm::Instruction::GetElementPtr);
      Out << ']';
@@ -4633,7 +4704,7 @@ void CWriter::writeMemoryAccess(llvm::Value *Operand, llvm::Type *OperandType,
    Out << '*';
  if (IsVolatile || IsUnaligned) {
    Out << "((";
-    if (IsUnaligned && ITy && (ITy->getBitWidth() > 64)) 
+    if (IsUnaligned && ITy && (ITy->getBitWidth() > 64))
      Out << "iN_" << ITy->getBitWidth() << "_align_" << Alignment << " *)";
    else {
      if (IsUnaligned)
@@ -4798,7 +4869,7 @@ void CWriter::visitShuffleVectorInst(llvm::ShuffleVectorInst &SVI) {
        printType(Out, llvm::PointerType::getUnqual(EltTy));
        Out << ")(&" << GetValueName(Op)
            << "))[" << SrcVal << "]";
-        Out << " \n#endif \n";        
+        Out << " \n#endif \n";
      }
    }
  }
@@ -4901,7 +4972,11 @@ public:
    SmearCleanupPass(llvm::Module *m, int width)
        : BasicBlockPass(ID) { module = m; vectorWidth = width; }

+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9 // <= 3.9
    const char *getPassName() const { return "Smear Cleanup Pass"; }
+#else // LLVM 4.0+
+    llvm::StringRef getPassName() const { return "Smear Cleanup Pass"; }
+#endif
    bool runOnBasicBlock(llvm::BasicBlock &BB);

    static char ID;
@@ -4989,7 +5064,7 @@ SmearCleanupPass::getShuffleSmearValue(llvm::Instruction* inst) const {
        llvm::dyn_cast<llvm::Constant>(shuffleInst->getOperand(2));

    // Check that the shuffle is a broadcast of the element of the first vector,
-    // i.e. mask vector is vector with equal elements of expected size.     
+    // i.e. mask vector is vector with equal elements of expected size.
    if (!(mask &&
 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
         (mask->isNullValue() || (shuffleInst->getMask()->getType()->isVectorTy() && llvm::dyn_cast<llvm::ConstantVector>(shuffleInst->getMask())->getSplatValue() != 0 ) ) &&
@@ -5014,34 +5089,34 @@ SmearCleanupPass::getShuffleSmearValue(llvm::Instruction* inst) const {
        if (operandVec && operandVec->getNumElements() == 1)
          return NULL;

-        // Insert ExtractElementInstr to get value for smear        
+        // Insert ExtractElementInstr to get value for smear

        llvm::Function *extractFunc = module->getFunction("__extract_element");
-       
+
         if (extractFunc == NULL) {
-            // Declare the __extract_element function if needed; it takes a vector and 
+            // Declare the __extract_element function if needed; it takes a vector and
            // a scalar parameter and returns a scalar of the vector parameter type.
            llvm::Constant *ef =
-                module->getOrInsertFunction("__extract_element", 
-                                            shuffleInst->getOperand(0)->getType()->getVectorElementType(), 
+                module->getOrInsertFunction("__extract_element",
+                                            shuffleInst->getOperand(0)->getType()->getVectorElementType(),
                                            shuffleInst->getOperand(0)->getType(),
                                            llvm::IntegerType::get(module->getContext(), 32), NULL);
            extractFunc = llvm::dyn_cast<llvm::Function>(ef);
            assert(extractFunc != NULL);
            extractFunc->setDoesNotThrow();
            extractFunc->setOnlyReadsMemory();
-        } 
+        }

        if (extractFunc == NULL) {
            return NULL;
        }
-        llvm::Instruction *extractCall = 
-              llvm::ExtractElementInst::Create(shuffleInst->getOperand(0), 
+        llvm::Instruction *extractCall =
+              llvm::ExtractElementInst::Create(shuffleInst->getOperand(0),
 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
                         // mask is of VectorType
-                         llvm::dyn_cast<llvm::ConstantVector>(mask)->getSplatValue(),  
+                         llvm::dyn_cast<llvm::ConstantVector>(mask)->getSplatValue(),
 #else
-                         mask->getSplatValue(),  
+                         mask->getSplatValue(),
 #endif
                         "__extract_element", inst);
        return extractCall;
@@ -5109,7 +5184,11 @@ public:
    AndCmpCleanupPass()
        : BasicBlockPass(ID) { }

+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9 // <= 3.9
    const char *getPassName() const { return "AndCmp Cleanup Pass"; }
+#else // LLVM 4.0+
+    llvm::StringRef getPassName() const { return "AndCmp Cleanup Pass"; }
+#endif
    bool runOnBasicBlock(llvm::BasicBlock &BB);

    static char ID;
@@ -5251,7 +5330,11 @@ public:
 #endif
    }

+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9 // <= 3.9
    const char *getPassName() const { return "MaskOps Cleanup Pass"; }
+#else // LLVM 4.0+
+    llvm::StringRef getPassName() const { return "MaskOps Cleanup Pass"; }
+#endif
    bool runOnBasicBlock(llvm::BasicBlock &BB);

 private:
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -385,10 +385,14 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
        llvm::DISubroutineType *diSubprogramType_n =
            llvm::cast<llvm::DISubroutineType>(getDICompositeType(diSubprogramType));
        int flags = llvm::DINode::FlagPrototyped;
-#else /* LLVM 3.8+ */
+#elif ISPC_LLVM_VERSION == ISPC_LLVM_3_8 || ISPC_LLVM_VERSION == ISPC_LLVM_3_9 /* LLVM 3.8, 3.9 */
        Assert(llvm::isa<llvm::DISubroutineType>(diSubprogramType));
        llvm::DISubroutineType *diSubprogramType_n = llvm::cast<llvm::DISubroutineType>(diSubprogramType);
        int flags = llvm::DINode::FlagPrototyped;
+#else /* LLVM 4.0+ */
+        Assert(llvm::isa<llvm::DISubroutineType>(diSubprogramType));
+        llvm::DISubroutineType *diSubprogramType_n = llvm::cast<llvm::DISubroutineType>(diSubprogramType);
+        llvm::DINode::DIFlags flags = llvm::DINode::FlagPrototyped;

 #endif

@@ -417,7 +421,16 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
                                         isStatic,           true, /* is defn */
                                         firstLine,          flags,
                                         isOptimized,        llvmFunction);
-#else /* LLVM 3.8+ */
+#elif ISPC_LLVM_VERSION == ISPC_LLVM_3_8 || ISPC_LLVM_VERSION == ISPC_LLVM_3_9 /* LLVM 3.8, 3.9 */
+        diSubprogram =
+            m->diBuilder->createFunction(diFile /* scope */, funSym->name,
+                                         mangledName,        diFile,
+                                         firstLine,          diSubprogramType_n,
+                                         isStatic,           true, /* is defn */
+                                         firstLine,          flags,
+                                         isOptimized);
+        llvmFunction->setSubprogram(diSubprogram);
+#else /* LLVM 4.0+ */
        diSubprogram =
            m->diBuilder->createFunction(diFile /* scope */, funSym->name,
                                         mangledName,        diFile,
@@ -1821,7 +1834,11 @@ FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym, int argNum) {
    if (m->diBuilder == NULL)
        return;

+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
    int flags = 0;
+#else // LLVM 4.0+
+    llvm::DINode::DIFlags flags = llvm::DINode::FlagZero;
+#endif
 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6 /* 3.2, 3.3, 3.4, 3.5, 3.6 */
    llvm::DIScope scope = diSubprogram;
    llvm::DIType diType = sym->type->GetDIType(scope);
--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -4,7 +4,7 @@ An ISPC update with new native AVX512 target for future Xeon CPUs and
 improvements for debugging, including new switch --dwarf-version to support
 debugging on old systems.

-The release is based on patched version LLVM 3.8.
+The release is based on a patched version of LLVM 3.8.

 === v1.9.0 === (12 Feb 2016)

--- a/doxygen.cfg
+++ b/doxygen.cfg
@@ -31,7 +31,7 @@ PROJECT_NAME           = "Intel SPMD Program Compiler"
 # This could be handy for archiving the generated documentation or
 # if some version control system is used.

-PROJECT_NUMBER         = 1.9.1
+PROJECT_NUMBER         = 1.9.2dev

 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
 # base path where the generated documentation will be put.
--- a/func.cpp
+++ b/func.cpp
@@ -420,8 +420,10 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
            (
 #if ISPC_LLVM_VERSION == ISPC_LLVM_3_2 // 3.2
              (function->getFnAttributes().hasAttribute(llvm::Attributes::AlwaysInline) == false)
-#else // LLVM 3.3+
+#elif ISPC_LLVM_VERSION <= ISPC_LLVM_4_0
              (function->getAttributes().getFnAttributes().hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::AlwaysInline) == false)
+#else // LLVM 5.0+
+              (function->getAttributes().getFnAttributes().hasAttribute(llvm::AttributeList::FunctionIndex, llvm::Attribute::AlwaysInline) == false)
 #endif
             &&
             costEstimate > CHECK_MASK_AT_FUNCTION_START_COST);
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -1158,11 +1158,19 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
 #endif
            attrBuilder.addAttribute("target-cpu", this->m_cpu);
            attrBuilder.addAttribute("target-features", this->m_attributes);
+#if ISPC_LLVM_VERSION <= ISPC_LLVM_4_0
            this->m_tf_attributes = new llvm::AttributeSet(
                llvm::AttributeSet::get(
                    *g->ctx,
                    llvm::AttributeSet::FunctionIndex,
                    attrBuilder));
+#else // LLVM 5.0+
+            this->m_tf_attributes = new llvm::AttributeList(
+                llvm::AttributeList::get(
+                    *g->ctx,
+                    llvm::AttributeList::FunctionIndex,
+                    attrBuilder));
+#endif
        }
 #endif

@@ -1477,7 +1485,11 @@ Target::StructOffset(llvm::Type *type, int element,
 void Target::markFuncWithTargetAttr(llvm::Function* func) {
 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_3
    if (m_tf_attributes) {
+#if ISPC_LLVM_VERSION <= ISPC_LLVM_4_0
        func->addAttributes(llvm::AttributeSet::FunctionIndex, *m_tf_attributes);
+#else // LLVM 5.0+
+        func->addAttributes(llvm::AttributeList::FunctionIndex, *m_tf_attributes);
+#endif
    }
 #endif
 }
--- a/ispc.h
+++ b/ispc.h
@@ -41,7 +41,7 @@
 #include "ispc_version.h"

 #if ISPC_LLVM_VERSION < OLDEST_SUPPORTED_LLVM || ISPC_LLVM_VERSION > LATEST_SUPPORTED_LLVM
-#error "Only LLVM 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8 and 3.9 development branch are supported"
+#error "Only LLVM 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0 and 5.0 development branch are supported"
 #endif

 #if defined(_WIN32) || defined(_WIN64)
@@ -72,7 +72,11 @@

 // Forward declarations of a number of widely-used LLVM types
 namespace llvm {
+#if ISPC_LLVM_VERSION <= ISPC_LLVM_4_0
    class AttributeSet;
+#else // LLVM 5.0+
+    class AttributeList;
+#endif
    class BasicBlock;
    class Constant;
    class ConstantValue;
@@ -86,13 +90,11 @@ namespace llvm {
    class TargetMachine;
    class Type;
    class Value;
-#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
    class DIFile;
    class DIType;
+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
    class DIDescriptor;
 #else // LLVM 3.7+
-    class DIFile;
-    class DIType;
    class DIScope;
 #endif
 }
@@ -348,7 +350,11 @@ private:
    /** Target-specific LLVM attribute, which has to be attached to every
        function to ensure that it is generated for correct target architecture.
        This is requirement was introduced in LLVM 3.3 */
+#if ISPC_LLVM_VERSION <= ISPC_LLVM_4_0
    llvm::AttributeSet* m_tf_attributes;
+#else // LLVM 5.0+
+    llvm::AttributeList* m_tf_attributes;
+#endif
 #endif

    /** Native vector width of the vector instruction set.  Note that this
--- a/ispc_version.h
+++ b/ispc_version.h
@@ -38,7 +38,7 @@
 #ifndef ISPC_VERSION_H
 #define ISPC_VERSION_H

-#define ISPC_VERSION "1.9.1"
+#define ISPC_VERSION "1.9.2dev"
 #include "llvm/Config/llvm-config.h"

 #define ISPC_LLVM_VERSION ( LLVM_VERSION_MAJOR * 10000 + LLVM_VERSION_MINOR * 100 )
@@ -51,9 +51,11 @@
 #define ISPC_LLVM_3_7 30700
 #define ISPC_LLVM_3_8 30800
 #define ISPC_LLVM_3_9 30900
+#define ISPC_LLVM_4_0 40000
+#define ISPC_LLVM_5_0 50000

 #define OLDEST_SUPPORTED_LLVM ISPC_LLVM_3_2
-#define LATEST_SUPPORTED_LLVM ISPC_LLVM_3_9
+#define LATEST_SUPPORTED_LLVM ISPC_LLVM_5_0

 #ifdef __ispc__xstr
 #undef __ispc__xstr
--- a/llvmutil.h
+++ b/llvmutil.h
@@ -51,7 +51,11 @@
  #include <llvm/IR/Constants.h>
 #endif

+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
 #define PTYPE(p) (llvm::cast<llvm::SequentialType>((p)->getType()->getScalarType())->getElementType())
+#else // LLVM 4.0+
+#define PTYPE(p) (llvm::cast<llvm::PointerType>((p)->getType()->getScalarType())->getElementType())
+#endif

 namespace llvm {
    class PHINode;
--- a/module.cpp
+++ b/module.cpp
@@ -124,10 +124,15 @@
 #include <clang/Frontend/TextDiagnosticPrinter.h>
 #include <clang/Frontend/Utils.h>
 #include <clang/Basic/TargetInfo.h>
+#include <clang/Lex/PreprocessorOptions.h>
 #include <llvm/Support/ToolOutputFile.h>
 #include <llvm/Support/Host.h>
 #include <llvm/Support/raw_ostream.h>
-#include <llvm/Bitcode/ReaderWriter.h>
+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
+    #include <llvm/Bitcode/ReaderWriter.h>
+#else
+    #include <llvm/Bitcode/BitcodeWriter.h>
+#endif

 /*! list of files encountered by the parser. this allows emitting of
    the module file's dependencies via the -MMM option */
@@ -426,9 +431,7 @@ Module::Module(const char *fn) {
            sprintf(producerString, "ispc version %s (built on %s)",
                    ISPC_VERSION, __DATE__);
 #endif
-#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_4 // LLVM 3.4+
-            diCompileUnit =
-#endif // LLVM_3_4+
+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_3
            diBuilder->createCompileUnit(llvm::dwarf::DW_LANG_C99,  /* lang */
                                         name,  /* filename */
                                         directory, /* directory */
@@ -436,6 +439,25 @@ Module::Module(const char *fn) {
                                         g->opt.level > 0 /* is optimized */,
                                         "-g", /* command line args */
                                         0 /* run time version */);
+#elif ISPC_LLVM_VERSION <= ISPC_LLVM_3_9 // LLVM 3.4-3.9
+            diCompileUnit =
+            diBuilder->createCompileUnit(llvm::dwarf::DW_LANG_C99,  /* lang */
+                                         name,  /* filename */
+                                         directory, /* directory */
+                                         producerString, /* producer */
+                                         g->opt.level > 0 /* is optimized */,
+                                         "-g", /* command line args */
+                                         0 /* run time version */);
+#elif ISPC_LLVM_VERSION >= ISPC_LLVM_4_0 // LLVM 4.0+
+            auto srcFile = diBuilder->createFile(name, directory);
+            diCompileUnit =
+            diBuilder->createCompileUnit(llvm::dwarf::DW_LANG_C99,  /* lang */
+                                         srcFile,  /* filename */
+                                         producerString, /* producer */
+                                         g->opt.level > 0 /* is optimized */,
+                                         "-g", /* command line args */
+                                         0 /* run time version */);
+#endif
        }
    }
    else
@@ -734,7 +756,7 @@ Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initE
                                            sym->type->GetDIType(file),
                                            (sym->storageClass == SC_STATIC),
                                            sym_const_storagePtr);
-#else // LLVM 3.7+
+#elif ISPC_LLVM_VERSION >= ISPC_LLVM_3_7 && ISPC_LLVM_VERSION <= ISPC_LLVM_3_9 // LLVM 3.7 - 3.9
        llvm::DIFile *file = pos.GetDIFile();
        //llvm::MDFile *file = pos.GetDIFile();
        llvm::Constant *sym_const_storagePtr = llvm::dyn_cast<llvm::Constant>(sym->storagePtr);
@@ -748,6 +770,20 @@ Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initE
                                            sym->type->GetDIType(file),
                                            (sym->storageClass == SC_STATIC),
                                            sym_const_storagePtr);
+#else // LLVM 4.0+
+        llvm::DIFile *file = pos.GetDIFile();
+        //llvm::MDFile *file = pos.GetDIFile();
+        llvm::GlobalVariable *sym_GV_storagePtr = llvm::dyn_cast<llvm::GlobalVariable>(sym->storagePtr);
+        Assert(sym_GV_storagePtr);
+        llvm::DIGlobalVariableExpression *var = diBuilder->createGlobalVariableExpression(
+                                            file,
+                                            name,
+                                            name,
+                                            file,
+                                            pos.first_line,
+                                            sym->type->GetDIType(file),
+                                            (sym->storageClass == SC_STATIC));
+        sym_GV_storagePtr->addDebugInfo(var);
 #endif
 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
        Assert(var.Verify());
--- a/opt.cpp
+++ b/opt.cpp
@@ -503,8 +503,13 @@ DebugPassManager::add(llvm::Pass * P, int stage = -1) {
        if (g->debug_stages.find(number) != g->debug_stages.end()) {
            // adding dump of LLVM IR after optimization
            char buf[100];
+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
            sprintf(buf, "\n\n*****LLVM IR after phase %d: %s*****\n\n",
                number, P->getPassName());
+#else // LLVM 4.0+
+            sprintf(buf, "\n\n*****LLVM IR after phase %d: %s*****\n\n",
+                number, P->getPassName().data());
+#endif
            PM.add(CreateDebugPass(buf));
        }

@@ -943,7 +948,11 @@ class IntrinsicsOpt : public llvm::BasicBlockPass {
 public:
    IntrinsicsOpt() : BasicBlockPass(ID) {};

+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
    const char *getPassName() const { return "Intrinsics Cleanup Optimization"; }
+#else // LLVM 4.0+
+    llvm::StringRef getPassName() const { return "Intrinsics Cleanup Optimization"; }
+#endif
    bool runOnBasicBlock(llvm::BasicBlock &BB);

    static char ID;
@@ -1257,7 +1266,11 @@ public:
    InstructionSimplifyPass()
        : BasicBlockPass(ID) { }

+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
    const char *getPassName() const { return "Vector Select Optimization"; }
+#else // LLVM 4.0+
+    llvm::StringRef getPassName() const { return "Vector Select Optimization"; }
+#endif
    bool runOnBasicBlock(llvm::BasicBlock &BB);

    static char ID;
@@ -1426,7 +1439,11 @@ public:
    static char ID;
    ImproveMemoryOpsPass() : BasicBlockPass(ID) { }

+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
    const char *getPassName() const { return "Improve Memory Ops"; }
+#else // LLVM 4.0+
+    llvm::StringRef getPassName() const { return "Improve Memory Ops"; }
+#endif
    bool runOnBasicBlock(llvm::BasicBlock &BB);
 };

@@ -3278,7 +3295,11 @@ public:
    static char ID;
    GatherCoalescePass() : BasicBlockPass(ID) { }

+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
    const char *getPassName() const { return "Gather Coalescing"; }
+#else // LLVM 4.0+
+    llvm::StringRef getPassName() const { return "Gather Coalescing"; }
+#endif
    bool runOnBasicBlock(llvm::BasicBlock &BB);
 };

@@ -4336,7 +4357,11 @@ public:
    static char ID;
    ReplacePseudoMemoryOpsPass() : BasicBlockPass(ID) { }

+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
    const char *getPassName() const { return "Replace Pseudo Memory Ops"; }
+#else // LLVM 4.0+
+    llvm::StringRef getPassName() const { return "Replace Pseudo Memory Ops"; }
+#endif
    bool runOnBasicBlock(llvm::BasicBlock &BB);
 };

@@ -4705,7 +4730,11 @@ public:
        isLastTry = last;
    }

+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
    const char *getPassName() const { return "Resolve \"is compile time constant\""; }
+#else // LLVM 4.0+
+    llvm::StringRef getPassName() const { return "Resolve \"is compile time constant\""; }
+#endif
    bool runOnBasicBlock(llvm::BasicBlock &BB);

    bool isLastTry;
@@ -4800,7 +4829,11 @@ public:
        sprintf(str_output, "%s", output);
    }

+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
    const char *getPassName() const { return "Dump LLVM IR"; }
+#else // LLVM 4.0+
+    llvm::StringRef getPassName() const { return "Dump LLVM IR"; }
+#endif
    bool runOnModule(llvm::Module &m);

 private:
@@ -4846,7 +4879,11 @@ public:
        AU.setPreservesCFG();
    }

+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
    const char *getPassName() const { return "Make internal funcs \"static\""; }
+#else // LLVM 4.0+
+    llvm::StringRef getPassName() const { return "Make internal funcs \"static\""; }
+#endif
    bool runOnModule(llvm::Module &m);
 };

@@ -4953,7 +4990,11 @@ class PeepholePass : public llvm::BasicBlockPass {
 public:
    PeepholePass();

+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
    const char *getPassName() const { return "Peephole Optimizations"; }
+#else // LLVM 4.0+
+    llvm::StringRef getPassName() const { return "Peephole Optimizations"; }
+#endif
    bool runOnBasicBlock(llvm::BasicBlock &BB);

    static char ID;
@@ -5354,7 +5395,11 @@ public:
    ReplaceStdlibShiftPass() : BasicBlockPass(ID) {
    }

+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
    const char *getPassName() const { return "Resolve \"replace extract insert chains\""; }
+#else // LLVM 4.0+
+    llvm::StringRef getPassName() const { return "Resolve \"replace extract insert chains\""; }
+#endif
    bool runOnBasicBlock(llvm::BasicBlock &BB);

 };
@@ -5453,7 +5498,11 @@ public:
    static char ID;
    FixBooleanSelectPass() :FunctionPass(ID) {}

+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
    const char *getPassName() const { return "Resolve \"replace extract insert chains\""; }
+#else // LLVM 4.0+
+    llvm::StringRef getPassName() const { return "Resolve \"replace extract insert chains\""; }
+#endif
    bool runOnFunction(llvm::Function &F);

 private:
--- a/prepro.py
+++ b/prepro.py
@@ -0,0 +1,189 @@
+import re
+import sys
+
+def floating2float(function, idx):
+    """Return `function` with every `floating<idx>` placeholder spelled as `float`."""
+    return function.replace(''.join(('floating<', str(idx), '>')), 'float')
+
+def floating2double(function, idx):
+    """Return `function` with every `floating<idx>` placeholder spelled as `double`."""
+    return function.replace(''.join(('floating<', str(idx), '>')), 'double')
+
+def number2float(function, idx):
+    """Return `function` with every `number<idx>` placeholder spelled as `float`."""
+    return function.replace(''.join(('number<', str(idx), '>')), 'float')
+
+def number2double(function, idx):
+    """Return `function` with every `number<idx>` placeholder spelled as `double`."""
+    return function.replace(''.join(('number<', str(idx), '>')), 'double')
+
+def number2int(function, idx):
+    """Return `function` with every `number<idx>` placeholder spelled as `int`."""
+    return function.replace(''.join(('number<', str(idx), '>')), 'int')
+
+def number2long(function, idx):
+    """Return `function` with every `number<idx>` placeholder spelled as `long`."""
+    return function.replace(''.join(('number<', str(idx), '>')), 'long')
+
+def number2short(function, idx):
+    """Return `function` with every `number<idx>` placeholder spelled as `short`."""
+    return function.replace(''.join(('number<', str(idx), '>')), 'short')
+
+def number2char(function, idx):
+    """Return `function` with every `number<idx>` placeholder spelled as `char`."""
+    return function.replace(''.join(('number<', str(idx), '>')), 'char')
+
+def integer2int(function, idx):
+    """Return `function` with every `integer<idx>` placeholder spelled as `int`."""
+    return function.replace(''.join(('integer<', str(idx), '>')), 'int')
+
+def integer2long(function, idx):
+    """Return `function` with every `integer<idx>` placeholder spelled as `long`."""
+    return function.replace(''.join(('integer<', str(idx), '>')), 'long')
+
+def integer2short(function, idx):
+    """Return `function` with every `integer<idx>` placeholder spelled as `short`."""
+    return function.replace(''.join(('integer<', str(idx), '>')), 'short')
+
+def integer2char(function, idx):
+    """Return `function` with every `integer<idx>` placeholder spelled as `char`."""
+    return function.replace(''.join(('integer<', str(idx), '>')), 'char')
+
+def strip_types(fun, i):
+    """Turn declaration `fun` into a call of variant `i`: drop `[]`, `*` and whole-word type or
+    qualifier keywords (so names such as `point` survive), then put `_<i>` before each `(`."""
+    keywords = r'\b(?:export|void|float|double|char|short|int|long|uniform)\b'
+    call = re.sub(keywords, '', fun).replace('[]', '').replace('*', '')
+    return call.replace('(', '_' + str(i) + '(')
+
+if __name__ == '__main__':  # Python 2 script: expands one generic .ispc file into concrete variants
+    if (len(sys.argv) != 2):  # exactly one argument is accepted: the input file
+        print 'usage:\n\tpython ',
+        print sys.argv[0],
+        print '[file.ispc]'
+        exit(1)
+    # Read the whole input; fun_def runs from 'export ' to the last ')' before the first '{'.
+    f = open(sys.argv[1], 'r')
+    function = f.read()
+    fun_def = re.findall(r'export [^{]*\)', function)[0]  # IndexError if nothing matches
+    print fun_def
+    f.close()
+    # Next free index per placeholder family: the first N for which 'family<N>' is absent.
+    floating = 0
+    number = 0
+    integer = 0
+
+    while ('floating<' + str(floating) + '>' in function):
+        floating += 1
+
+    while ('number<' + str(number) + '>' in function):
+        number += 1
+
+    while ('integer<' + str(integer) + '>' in function):
+        integer += 1
+    # Give every un-indexed 'floating ' its own fresh index, in the text and in fun_def alike.
+    for i in range(len(re.findall(r'floating[\s+]', function))):  # NOTE(review): counted with [\s+] (whitespace or '+') but replaced only before a space
+        function = function.replace('floating ',
+                                    'floating<' + str(floating) + '> ', 1)
+        fun_def = fun_def.replace('floating ',
+                                  'floating<' + str(floating) + '> ', 1)
+        floating += 1
+    # Same numbering for un-indexed 'number '.
+    for i in range(len(re.findall(r'number[\s+]', function))):
+        function = function.replace('number ',
+                                    'number<' + str(number) + '> ', 1)
+        fun_def = fun_def.replace('number ',
+                                  'number<' + str(number) + '> ', 1)
+        number += 1
+    # Same numbering for un-indexed 'integer '.
+    for i in range(len(re.findall(r'integer[\s+]', function))):
+        function = function.replace('integer ',
+                                    'integer<' + str(integer) + '> ', 1)
+        fun_def = fun_def.replace('integer ',
+                                  'integer<' + str(integer) + '> ', 1)
+        integer += 1
+    # floats[k] holds the (fun_def, text) pairs once floating<0>..floating<k-1> are concrete.
+    floats = [[(fun_def, function)]]
+
+    for i in range(floating):  # every floating<i> doubles the variants: float and double
+        floats.append([])
+        for (h, f) in floats[i]:  # h: signature text, f: full file text
+            floats[i+1].append((floating2float(h, i),
+                                floating2float(f, i)
+                               ))
+            floats[i+1].append((floating2double(h, i),
+                                floating2double(f, i)
+                               ))
+    # Seed the number expansion with the fully expanded floating variants.
+    numbers = [[]]
+
+    for f in floats[floating]:
+        numbers[0].append(f)
+    # Every number<i> multiplies the variants by six concrete types.
+    for i in range(number):
+        numbers.append([])
+        for (h, f) in numbers[i]:
+            numbers[i+1].append((number2float(h, i),
+                                 number2float(f, i)
+                               ))
+            numbers[i+1].append((number2double(h, i),
+                                 number2double(f, i)
+                               ))
+            numbers[i+1].append((number2int(h, i),
+                                 number2int(f, i)
+                               ))
+            numbers[i+1].append((number2long(h, i),
+                                 number2long(f, i)
+                               ))
+            numbers[i+1].append((number2short(h, i),
+                                 number2short(f, i)
+                               ))
+            numbers[i+1].append((number2char(h, i),
+                                 number2char(f, i)
+                               ))
+    # Seed the integer expansion with the fully expanded number variants.
+    integers = [[]]
+
+    for f in numbers[number]:
+        integers[0].append(f)
+    # Every integer<i> multiplies the variants by four concrete types.
+    for i in range(integer):
+        integers.append([])
+        for (h, f) in integers[i]:
+            integers[i+1].append((integer2int(h, i),
+                                 integer2int(f, i)
+                                ))
+            integers[i+1].append((integer2long(h, i),
+                                 integer2long(f, i)
+                                ))
+            integers[i+1].append((integer2short(h, i),
+                                 integer2short(f, i)
+                                ))
+            integers[i+1].append((integer2char(h, i),
+                                 integer2char(f, i)
+                                ))
+    # Outputs are <input>.pre.ispc (all variants) and <input>.h (wrappers for them).
+    o = open(sys.argv[1] + '.pre.ispc', 'w')
+    hdr = open(sys.argv[1] + '.h', 'w')
+    # The header includes <input minus its last extension>.h, then opens a guard and namespace ispc.
+    hdr.write('#include "' + '.'.join(sys.argv[1].split('.')[:-1]) + '.h"\n\n')
+    hdr.write('#ifndef _' + sys.argv[1].upper().replace('.', '_') + '_H_\n')
+    hdr.write('#define _' + sys.argv[1].upper().replace('.', '_') + '_H_\n')
+    hdr.write('namespace ispc {\n')
+    # fun_name: the first identifier directly followed by '(' anywhere in the text.
+    fun_name = re.findall(r'[a-zA-Z_][a-zA-Z0-9_]*\(', function)[0][:-1]
+    # Variant i is written as fun_name_i; the header gets a wrapper under the original name that calls it.
+    i = 0
+    for (h, f) in integers[integer]:
+        o.write(f.replace(fun_name, fun_name+'_'+str(i), 1))  # renames only the first occurrence
+        hdr.write(h.replace('uniform','').replace('export','') \
+                  + '\n{\n\treturn ' + strip_types(h, i) + ';\n}\n\n')
+
+        i += 1
+
+    o.close()
+    hdr.write('}\n')
+    hdr.write('#endif\n')
+    hdr.close()
+
+    exit(0)
--- a/tests/masked-scatter-struct.ispc
+++ b/tests/masked-scatter-struct.ispc
@@ -5,8 +5,8 @@ struct Foo { float x; float y; };

 export void f_fu(uniform float ret[], uniform float aa[], uniform float b) {
    float a = aa[programIndex];
-    uniform Foo foo[programCount];
-    for (uniform int i = 0; i < programCount; ++i) {
+    uniform Foo foo[programCount+1];
+    for (uniform int i = 0; i < programCount+1; ++i) {
        foo[i].x = i;
        foo[i].y = -1234 + i;
    }
--- a/tests/masked-scatter-vector.ispc
+++ b/tests/masked-scatter-vector.ispc
@@ -5,8 +5,8 @@ typedef int<3> int3;

 export void f_fu(uniform float ret[], uniform float aa[], uniform float b) {
    float a = aa[programIndex];
-    uniform int3 array[programCount];
-    for (uniform int i = 0; i < programCount + 5 - b; ++i) {
+    uniform int3 array[programCount+1];
+    for (uniform int i = 0; i < programCount + 6 - b; ++i) {
        for (uniform int j = 0; j < 3; ++j)
            array[i][j] = i+100*j;
    }
--- a/tests_ispcpp/hello.cpp
+++ b/tests_ispcpp/hello.cpp
@@ -0,0 +1,23 @@
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "hello.ispc.h"
+
+int main() {
+    const int kCount = 100;
+    float xs[kCount];
+    float ys[kCount];
+    double out[kCount];
+    // Fill the inputs: xs counts down from 100, ys holds the squares.
+    for (int k = 0; k != kCount; ++k) {
+        xs[k] = 100 - k;
+        ys[k] = k * k;
+    }
+
+    ispc::saxpy(kCount, 3.1415926535, (float*)&xs, (float*)&ys, (double*)&out);
+
+    for (int k = 0; k != kCount; ++k)
+        printf("%.6f\n", out[k]);
+
+    return 0;
+}
--- a/tests_ispcpp/hello.ispc
+++ b/tests_ispcpp/hello.ispc
@@ -0,0 +1,11 @@
+export void saxpy(uniform int N,
+                  uniform floating<0> scale,
+                  uniform floating<1> X[],
+                  uniform floating<1> Y[],
+                  uniform floating<2> result[])
+{   // result[i] = scale * X[i] + Y[i] for each i below N
+    foreach (i = 0 ... N) {
+        floating<2> tmp = scale * X[i] + Y[i];  // held in the element type of result
+        result[i] = tmp;
+    }
+}
--- a/type.cpp
+++ b/type.cpp
@@ -541,6 +541,8 @@ llvm::DIType *AtomicType::GetDIType(llvm::DIScope *scope) const {
 #else //LLVM 3.7++
            return NULL;
 #endif
+
+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
        case TYPE_BOOL:
            return m->diBuilder->createBasicType("bool", 32 /* size */, 32 /* align */,
                                                 llvm::dwarf::DW_ATE_unsigned);
@@ -585,6 +587,53 @@ llvm::DIType *AtomicType::GetDIType(llvm::DIScope *scope) const {
            return m->diBuilder->createBasicType("uint64", 64 /* size */, 64 /* align */,
                                                 llvm::dwarf::DW_ATE_unsigned);
            break;
+#else // LLVM 4.0+
+        case TYPE_BOOL:
+            return m->diBuilder->createBasicType("bool", 32 /* size */,
+                                                 llvm::dwarf::DW_ATE_unsigned);
+            break;
+        case TYPE_INT8:
+            return m->diBuilder->createBasicType("int8", 8 /* size */,
+                                                 llvm::dwarf::DW_ATE_signed);
+            break;
+        case TYPE_UINT8:
+            return m->diBuilder->createBasicType("uint8", 8 /* size */,
+                                                 llvm::dwarf::DW_ATE_unsigned);
+            break;
+        case TYPE_INT16:
+            return m->diBuilder->createBasicType("int16", 16 /* size */,
+                                                 llvm::dwarf::DW_ATE_signed);
+            break;
+        case TYPE_UINT16:
+            return m->diBuilder->createBasicType("uint16", 16 /* size */,
+                                                 llvm::dwarf::DW_ATE_unsigned);
+            break;
+        case TYPE_INT32:
+            return m->diBuilder->createBasicType("int32", 32 /* size */,
+                                                 llvm::dwarf::DW_ATE_signed);
+            break;
+        case TYPE_UINT32:
+            return m->diBuilder->createBasicType("uint32", 32 /* size */,
+                                                 llvm::dwarf::DW_ATE_unsigned);
+            break;
+        case TYPE_FLOAT:
+            return m->diBuilder->createBasicType("float", 32 /* size */,
+                                                 llvm::dwarf::DW_ATE_float);
+            break;
+        case TYPE_DOUBLE:
+            return m->diBuilder->createBasicType("double", 64 /* size */,
+                                                 llvm::dwarf::DW_ATE_float);
+            break;
+        case TYPE_INT64:
+            return m->diBuilder->createBasicType("int64", 64 /* size */,
+                                                 llvm::dwarf::DW_ATE_signed);
+            break;
+        case TYPE_UINT64:
+            return m->diBuilder->createBasicType("uint64", 64 /* size */,
+                                                 llvm::dwarf::DW_ATE_unsigned);
+            break;
+#endif
+
        default:
            FATAL("unhandled basic type in AtomicType::GetDIType()");
 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
@@ -2307,9 +2356,15 @@ llvm::DIType *StructType::GetDIType(llvm::DIScope *scope) const {
        llvm::DIFile *diFile = elementPositions[i].GetDIFile();
        llvm::DIDerivedType *fieldType =
 #endif
+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
            m->diBuilder->createMemberType(scope, elementNames[i], diFile,
                                           line, eltSize, eltAlign,
                                           currentSize, 0, eltType);
+#else // LLVM 4.0+
+            m->diBuilder->createMemberType(scope, elementNames[i], diFile,
+                                           line, eltSize, eltAlign,
+                                           currentSize, llvm::DINode::FlagZero, eltType);
+#endif
        elementLLVMTypes.push_back(fieldType);

        currentSize += eltSize;
@@ -2334,7 +2389,11 @@ llvm::DIType *StructType::GetDIType(llvm::DIScope *scope) const {
        pos.first_line, // Line number
        currentSize,    // Size in bits
        align,          // Alignment in bits
+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
        0,              // Flags
+#else // LLVM 4.0+
+        llvm::DINode::FlagZero, // Flags
+#endif
 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_3 && ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
        llvm::DIType(), // DerivedFrom
 #elif ISPC_LLVM_VERSION >= ISPC_LLVM_3_7 // LLVM 3.7++
@@ -2584,7 +2643,11 @@ llvm::DIType *UndefinedStructType::GetDIType(llvm::DIScope *scope) const {
        pos.first_line, // Line number
        0,              // Size
        0,              // Align
+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
        0,              // Flags
+#else // LLVM 4.0+
+        llvm::DINode::FlagZero, // Flags
+#endif
 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_3 && ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
        llvm::DIType(), // DerivedFrom
 #elif ISPC_LLVM_VERSION >= ISPC_LLVM_3_7 // LLVM 3.7+
Author	SHA1	Message	Date
Aaron Gutierrez	4182fa2967	python regex-based preprocessor proof of concept	2017-04-18 22:28:48 -04:00
Aaron Gutierrez	d6cf38a929	Ignore llvm build directory	2017-04-11 14:08:00 -04:00
Dmitry Babokin	8c97883317	Merge pull request #1264 from dbabokin/attributelist Renaming AttributeSet to AttributeList to follow trunk changes.	2017-03-28 17:02:28 -07:00
Dmitry Babokin	455a29c491	Renaming AttributeSet to AttributeList to follow trunk changes.	2017-03-28 16:58:49 -07:00
Dmitry Babokin	a618ad45bf	Merge pull request #1263 from dbabokin/cbackend Better fix for cbackend.	2017-03-22 13:29:06 -07:00
Dmitry Babokin	0ff8ae4596	Better fix cbackend.	2017-03-22 13:27:26 -07:00
Dmitry Babokin	a5b689439b	Merge pull request #1262 from dbabokin/trunk_fix Trunk fix	2017-03-22 12:33:39 -07:00
Dmitry Babokin	f9947541a1	Whitespace fixes	2017-03-22 12:32:26 -07:00
Dmitry Babokin	c2b2b38081	Fix for trunk. Probably it's temporary if they fix -- operator for arg_iterator in trunk.	2017-03-22 12:31:41 -07:00
Dmitry Babokin	7884c7da04	Merge pull request #1260 from dbabokin/alloy For naming folders with llvm use dot instead of underscore.	2017-03-02 13:25:21 -08:00
Dmitry Babokin	b471e97a10	For naming folders with llvm use dot instead of underscore.	2017-03-02 13:24:15 -08:00
Dmitry Babokin	611fe0bc42	Merge pull request #1259 from dbabokin/llvm50 Enabling LLVM 5.0 and making fixes to track changes in LLVM for the past couple months.	2017-03-01 11:39:53 -08:00
Dmitry Babokin	6d649e1dff	Enabling LLVM 5.0 and making fixes to track changes in LLVM for the past couple months. The changes are tested with LLVM 3.9, 4.0 and trunk on MacOS (sse4, avx2, skx).	2017-03-01 11:10:34 -08:00
Dmitry Babokin	d0bfe7738a	Merge pull request #1245 from dbabokin/git Adding support for git repository instead of svn.	2016-12-01 22:11:58 +03:00
Dmitry Babokin	95d33554db	Merge pull request #1244 from dbabokin/trunk_fix Fix for trunk - change in DIBuilder interface	2016-12-01 22:10:45 +03:00
Dmitry Babokin	4298e3d0cd	Fix for trunk - change in DIBuilder interface	2016-12-01 22:00:36 +03:00
Dmitry Babokin	a7fd70fa21	Adding support for using git repository instead of svn. This is experimental for now, but going forward this will become primary way of working with LLVM, as they are going to switch to git in not too distant future.	2016-12-01 18:10:57 +03:00
Dmitry Babokin	60dc47e0a6	Merge pull request #1242 from dbabokin/fixes SVML support for AVX512 and a couple of script fixes	2016-12-01 00:33:34 +03:00
Dmitry Babokin	ff298f21b7	Adding SVML support to AVX512 targets	2016-11-30 05:27:10 +03:00
Dmitry Babokin	f04a04a7e3	Set sysroot for CMake build on MacOS	2016-11-29 21:04:46 +03:00
Dmitry Babokin	39e7f0c2d4	3.9.0 is better choice for us. 3.9.1 has couple regressions	2016-11-29 21:03:53 +03:00
Dmitry Babokin	726b260cd5	Merge pull request #1236 from suluke/llvm_change/BitcodeWriter Support llvm 4.0: Bitcode/ReaderWriter.h -> BitCode/BitcodeWriter.h	2016-11-15 02:02:54 +03:00
Lukas Böhm	6a8ce4b412	Apply Bitcode/ReaderWriter renaming in builtins.cpp This also fixes usage of parseBitcodeFile after [r286752](https://reviews.llvm.org/D26562)	2016-11-14 23:13:08 +01:00
Lukas Böhm	32626ea9e3	Support llvm 4.0: Bitcode/ReaderWriter.h -> BitCode/BitcodeWriter.h	2016-11-14 21:38:25 +01:00
Dmitry Babokin	d4a8afd6e8	Merge pull request #1230 from Shishpan/trunkFix Trunk fix for Rev.283004	2016-10-05 14:29:21 +03:00
Andrey Shishpanov	8acfd92f92	Trunk fix for Rev.283004	2016-10-05 14:17:14 +03:00
Dmitry Babokin	7fb4188f51	Merge pull request #1229 from Shishpan/trunkFix Trunk fix for Rev.281284-281285.	2016-09-26 20:47:23 +03:00
Andrey Shishpanov	8b525bb8bc	Trunk fix for Rev.281284-281285.	2016-09-26 20:24:36 +03:00
Dmitry Babokin	a86a16600b	Merge pull request #1228 from Shishpan/trunkFix Trunk fix for Rev.280686.	2016-09-07 14:11:37 +03:00
Andrey Shishpanov	d0341754d6	Trunk fix for Rev.280686.	2016-09-07 13:08:04 +03:00
Dmitry Babokin	f968bc1b2a	Merge pull request #1227 from ned14/arm-neon-code-quality-fix Fix ARM NEON output not always being inlined. Also improved scope for ARM NEON optimisation by LLVM, gained about 2% on my code here.	2016-09-05 18:07:46 +03:00
Niall Douglas (s [underscore] sourceforge {at} nedprod [dot] com)	7af7659ac2	Fix ARM NEON output not always being inlined. Also improved scope for ARM NEON optimisation by LLVM, gained about 2% on my code here.	2016-09-05 15:56:25 +01:00
Dmitry Babokin	a6952fd651	Merge pull request #1226 from dbabokin/test-fix Fixing off by 1 access to local array.	2016-08-31 19:47:51 +03:00
Dmitry Babokin	4c7fb35f57	Fixing off by 1 access to local array.	2016-08-31 19:38:33 +03:00
Dmitry Babokin	87efb27dc5	Merge pull request #1225 from dbabokin/llvm40 Adding support for LLVM 4.0 (trunk)	2016-07-20 22:19:56 +03:00
Dmitry Babokin	45b306480e	-Adding support for LLVM 4.0 -Switching 3.9 support to branch/release_39 -Switching 3.8 support to tags/release_381	2016-07-20 22:16:50 +03:00
Dmitry Babokin	2a68fc6c48	Merge pull request #1222 from dbabokin/192dev Bumping version to 1.9.2dev	2016-07-08 22:03:49 +03:00
Dmitry Babokin	30d88e1683	Bumping version to 1.9.2dev	2016-07-08 21:44:59 +03:00
Dmitry Babokin	a97a69c96e	Typo in Release Notes	2016-07-08 19:52:22 +03:00