diff --git a/.gitignore b/.gitignore index 0469cf7d..429199bb 100644 --- a/.gitignore +++ b/.gitignore @@ -3,13 +3,20 @@ depend ispc ispc_test +ispc_ref objs docs/doxygen docs/*.html tests*/*cpp tests*/*run +logs/ +notify_log.log +alloy_results_* examples/*/*.png examples/*/*.ppm examples/*/objs/* +examples/*/ref +examples/*/test +*.swp diff --git a/Makefile b/Makefile index fab66b58..aba1cdd4 100644 --- a/Makefile +++ b/Makefile @@ -39,6 +39,10 @@ LLVM_CONFIG=$(shell which /usr/local/llvm-3.3/bin/llvm-config) CLANG_INCLUDE=$(shell $(LLVM_CONFIG) --includedir) +# Enable ARM by request +# To enable: make ARM_ENABLED=1 +ARM_ENABLED=0 + # Add llvm bin to the path so any scripts run will go to the right llvm-config LLVM_BIN= $(shell $(LLVM_CONFIG) --bindir) export PATH:=$(LLVM_BIN):$(PATH) @@ -55,12 +59,15 @@ LLVM_CXXFLAGS=$(shell $(LLVM_CONFIG) --cppflags) LLVM_VERSION=LLVM_$(shell $(LLVM_CONFIG) --version | sed -e s/\\./_/ -e s/svn//) LLVM_VERSION_DEF=-D$(LLVM_VERSION) -LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker arm nvptx +LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker nvptx # Component "option" was introduced in 3.3 and starting with 3.4 it is required for the link step. # We check if it's available before adding it (to not break 3.2 and earlier). ifeq ($(shell $(LLVM_CONFIG) --components |grep -c option), 1) LLVM_COMPONENTS+=option endif +ifneq ($(ARM_ENABLED), 0) + LLVM_COMPONENTS+=arm +endif LLVM_LIBS=$(shell $(LLVM_CONFIG) --libs $(LLVM_COMPONENTS)) CLANG=clang @@ -72,6 +79,10 @@ CLANG_LIBS = -lclangFrontend -lclangDriver \ ISPC_LIBS=$(shell $(LLVM_CONFIG) --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \ -lpthread +ifeq ($(LLVM_VERSION),LLVM_3_4) + ISPC_LIBS += -lcurses +endif + ifeq ($(ARCH_OS),Linux) ISPC_LIBS += -ldl endif @@ -102,8 +113,16 @@ CXX=g++ CPP=cpp OPT=-O2 CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE) \ - -Wall $(LLVM_VERSION_DEF) \ - -DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\"" + $(LLVM_VERSION_DEF) \ + -Wall \ + -DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\"" \ + -Wno-sign-compare +ifneq ($(LLVM_VERSION),LLVM_3_1) + CXXFLAGS+=-Werror +endif +ifneq ($(ARM_ENABLED), 0) + CXXFLAGS+=-DISPC_ARM_ENABLED +endif LDFLAGS= ifeq ($(ARCH_OS),Linux) @@ -122,8 +141,12 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ type.cpp util.cpp HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h -TARGETS=nvptx64 neon avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \ +TARGETS=nvptx64 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ + sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \ generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 +ifneq ($(ARM_ENABLED), 0) + TARGETS+=neon-32 neon-16 neon-8 +endif # These files need to be compiled in two versions - 32 and 64 bits. BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) # These are files to be compiled in single version. 
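The new ARM_ENABLED switch is off by default: only an invocation such as "make ARM_ENABLED=1" pulls the LLVM "arm" component into LLVM_COMPONENTS, appends the neon-32/neon-16/neon-8 targets, and defines ISPC_ARM_ENABLED for the C++ sources. A minimal sketch of the guard pattern this enables, assuming that invocation (the Target::NEON* names referenced in the comments are taken from the builtins.cpp hunks later in this diff, not reproduced exactly):

    // Sketch only: this translation unit sees -DISPC_ARM_ENABLED exactly when
    // the Makefile was invoked with ARM_ENABLED=1.
    #ifdef ISPC_ARM_ENABLED
        // NEON-specific target handling (Target::NEON8/NEON16/NEON32) is built in.
    #else
        // ARM support is compiled out; the LLVM "arm" component and the
        // neon-8/neon-16/neon-32 builtins are not part of this build.
    #endif
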
@@ -132,12 +155,12 @@ BUILTINS_OBJS_32=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_TARGET:.ll=-32bi BUILTINS_OBJS_64=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_TARGET:.ll=-64bit.o))) BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_COMMON:.ll=.o))) \ $(BUILTINS_OBJS_32) $(BUILTINS_OBJS_64) \ - builtins-c-32.cpp builtins-c-64.cpp + builtins-c-32.cpp builtins-c-64.cpp BISON_SRC=parse.yy FLEX_SRC=lex.ll OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \ - stdlib_generic_ispc.o stdlib_x86_ispc.o \ + stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o stdlib_mask64_ispc.o \ $(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o)) default: ispc @@ -223,15 +246,15 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc @echo Compiling $< $(CXX) $(CXXFLAGS) -o $@ -c $< -objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 $(wildcard builtins/*common.ll) +objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX $< | python bitcode2cpp.py $< > $@ -objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll) +objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< \(32 bit version\) m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=32 $< | python bitcode2cpp.py $< 32bit > $@ -objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll) +objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< \(64 bit version\) m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=64 $< | python bitcode2cpp.py $< 64bit > $@ @@ -243,12 +266,27 @@ objs/builtins-c-64.cpp: builtins/builtins.c @echo Creating C++ source from builtins definition file $< $(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c 64 > $@ -objs/stdlib_generic_ispc.cpp: stdlib.ispc - @echo Creating C++ source from $< for generic - $(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \ - python stdlib2cpp.py generic > $@ +objs/stdlib_mask1_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask1 + $(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ + python stdlib2cpp.py mask1 > $@ -objs/stdlib_x86_ispc.cpp: stdlib.ispc - @echo Creating C++ source from $< for x86 - $(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \ - python stdlib2cpp.py x86 > $@ +objs/stdlib_mask8_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask8 + $(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ + python stdlib2cpp.py mask8 > $@ + +objs/stdlib_mask16_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask16 + $(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ + python stdlib2cpp.py mask16 > $@ + +objs/stdlib_mask32_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask32 + $(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ + python stdlib2cpp.py mask32 > $@ + +objs/stdlib_mask64_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask64 + $(CLANG) -E -x c -DISPC_MASK_BITS=64 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ + 
python stdlib2cpp.py mask64 > $@ diff --git a/alloy.py b/alloy.py new file mode 100755 index 00000000..cda51d70 --- /dev/null +++ b/alloy.py @@ -0,0 +1,660 @@ +#!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# // Author: Filippov Ilia + +def attach_mail_file(msg, filename, name): + if os.path.exists(filename): + fp = open(filename, "rb") + to_attach = MIMEBase("application", "octet-stream") + to_attach.set_payload(fp.read()) + encode_base64(to_attach) + to_attach.add_header("Content-Disposition", "attachment", filename=name) + fp.close() + msg.attach(to_attach) + +def setting_paths(llvm, ispc, sde): + if llvm != "": + os.environ["LLVM_HOME"]=llvm + if ispc != "": + os.environ["ISPC_HOME"]=ispc + if sde != "": + os.environ["SDE_HOME"]=sde + +def check_LLVM(which_LLVM): + answer = [] + if which_LLVM[0] == " ": + return answer + p = os.environ["LLVM_HOME"] + for i in range(0,len(which_LLVM)): + if not os.path.exists(p + os.sep + "bin-" + which_LLVM[i] + os.sep + "bin"): + answer.append(which_LLVM[i]) + return answer + +def try_do_LLVM(text, command, from_validation): + if from_validation == True: + text = text + "\n" + print_debug("Trying to " + text, from_validation, alloy_build) + if os.system(command + " >> " + alloy_build + " 2>> " + alloy_build) != 0: + print_debug("ERROR.\n", from_validation, alloy_build) + error("can't " + text, 1) + print_debug("DONE.\n", from_validation, alloy_build) + +def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_validation, force, make): + print_debug("Building LLVM. Version: " + version_LLVM + ". 
", from_validation, alloy_build) + if revision != "": + print_debug("Revision: " + revision + ".\n", from_validation, alloy_build) + else: + print_debug("\n", from_validation, alloy_build) + # Here we understand what and where do we want to build + current_path = os.getcwd() + llvm_home = os.environ["LLVM_HOME"] + os.chdir(llvm_home) + FOLDER_NAME=version_LLVM + if version_LLVM == "trunk": + SVN_PATH="trunk" + if version_LLVM == "3.3": + SVN_PATH="tags/RELEASE_33/final" + version_LLVM = "3_3" + if version_LLVM == "3.2": + SVN_PATH="tags/RELEASE_32/final" + version_LLVM = "3_2" + if version_LLVM == "3.1": + SVN_PATH="tags/RELEASE_31/final" + version_LLVM = "3_1" + if revision != "": + FOLDER_NAME = FOLDER_NAME + "_" + revision + revision = "-" + revision + if folder == "": + folder = FOLDER_NAME + LLVM_SRC="llvm-" + folder + LLVM_BUILD="build-" + folder + LLVM_BIN="bin-" + folder + if os.path.exists(LLVM_BIN + os.sep + "bin") and not force: + error("you have folder " + LLVM_BIN + ".\nIf you want to rebuild use --force", 1) + LLVM_BUILD_selfbuild = LLVM_BUILD + "_temp" + LLVM_BIN_selfbuild = LLVM_BIN + "_temp" + common.remove_if_exists(LLVM_SRC) + common.remove_if_exists(LLVM_BUILD) + common.remove_if_exists(LLVM_BIN) + if selfbuild: + common.remove_if_exists(LLVM_BUILD_selfbuild) + common.remove_if_exists(LLVM_BIN_selfbuild) + print_debug("Using folders: " + LLVM_SRC + " " + LLVM_BUILD + " " + LLVM_BIN + " in " + + llvm_home + "\n", from_validation, alloy_build) + # load llvm + if tarball == "": + try_do_LLVM("load LLVM from http://llvm.org/svn/llvm-project/llvm/" + SVN_PATH + " ", + "svn co " + revision + " http://llvm.org/svn/llvm-project/llvm/" + SVN_PATH + " " + LLVM_SRC, + from_validation) + os.chdir(LLVM_SRC + "/tools") + try_do_LLVM("load clang from http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " ", + "svn co " + revision + " http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " clang", + from_validation) + os.chdir("../") + else: + tar = tarball.split(" ") + os.makedirs(LLVM_SRC) + os.chdir(LLVM_SRC) + try_do_LLVM("untar LLVM from " + tar[0] + " ", + "tar -xvzf " + tar[0] + " --strip-components 1", from_validation) + os.chdir("./tools") + os.makedirs("clang") + os.chdir("./clang") + try_do_LLVM("untar clang from " + tar[1] + " ", + "tar -xvzf " + tar[1] + " --strip-components 1", from_validation) + os.chdir("../../") + # paching llvm + patches = glob.glob(os.environ["ISPC_HOME"] + "/llvm_patches/*.*") + for patch in patches: + if version_LLVM in os.path.basename(patch): + try_do_LLVM("patch LLVM with patch" + patch + " ", "patch -p0 < " + patch, from_validation) + os.chdir("../") + # configuring llvm, build first part of selfbuild + os.makedirs(LLVM_BUILD) + os.makedirs(LLVM_BIN) + selfbuild_compiler = "" + if selfbuild: + print_debug("Making selfbuild and use folders " + LLVM_BUILD_selfbuild + " and " + + LLVM_BIN_selfbuild + "\n", from_validation, alloy_build) + os.makedirs(LLVM_BUILD_selfbuild) + os.makedirs(LLVM_BIN_selfbuild) + os.chdir(LLVM_BUILD_selfbuild) + try_do_LLVM("configure release version for selfbuild ", + "../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" + + LLVM_BIN_selfbuild + " --enable-optimized", + from_validation) + try_do_LLVM("build release version for selfbuild ", + make, from_validation) + try_do_LLVM("install release version for selfbuild ", + "make install", + from_validation) + os.chdir("../") + selfbuild_compiler = " CC="+llvm_home+ "/" + LLVM_BIN_selfbuild + "/bin/clang" + print_debug("Now we have compiler for selfbuild: " + 
selfbuild_compiler + "\n", from_validation, alloy_build) + os.chdir(LLVM_BUILD) + if debug == False: + try_do_LLVM("configure release version ", + "../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" + + LLVM_BIN + " --enable-optimized" + selfbuild_compiler, + from_validation) + else: + try_do_LLVM("configure debug version ", + "../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" + LLVM_BIN + + " --enable-debug-runtime --enable-debug-symbols --enable-keep-symbols" + selfbuild_compiler, + from_validation) + # building llvm + try_do_LLVM("build LLVM ", make, from_validation) + try_do_LLVM("install LLVM ", "make install", from_validation) + os.chdir(current_path) + +def check_targets(): + answer = [] + answer_sde = [] + SSE2 = False; + SSE4 = False; + AVX = False; + AVX11 = False; + AVX2 = False; + if current_OS == "Linux": + cpu = open("/proc/cpuinfo") + f_lines = cpu.readlines() + cpu.close() + # check what native targets do we have + for i in range(0,len(f_lines)): + if SSE2 == False and "sse2" in f_lines[i]: + SSE2 = True; + answer = answer + ["sse2-i32x4", "sse2-i32x8"] + if SSE4 == False and "sse4_1" in f_lines[i]: + SSE4 = True; + answer = answer + ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"] + if AVX == False and "avx" in f_lines[i]: + AVX = True; + answer = answer + ["avx1-i32x8", "avx1-i32x16", "avx1-i64x4"] + if AVX11 == False and "rdrand" in f_lines[i]: + AVX11 = True; + answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"] + if AVX2 == False and "avx2" in f_lines[i]: + AVX2 = True; + answer = answer + ["avx2-i32x8", "avx2-i32x16"] + if current_OS == "MacOS": + f_lines = take_lines("sysctl machdep.cpu.features", "first") + if "SSE2" in f_lines: + SSE2 = True; + answer = answer + ["sse2-i32x4", "sse2-i32x8"] + if "SSE4.1" in f_lines: + SSE4 = True; + answer = answer + ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"] + if "AVX1.0" in f_lines: + AVX = True; + answer = answer + ["avx1-i32x8", "avx1-i32x16", "avx1-i64x4"] + if "RDRAND" in f_lines: + AVX11 = True; + answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"] + if "AVX2.0" in f_lines: + AVX2 = True; + answer = answer + ["avx2-i32x8", "avx2-i32x16"] + + answer = answer + ["generic-4", "generic-16", "generic-8", "generic-1", "generic-32", "generic-64"] + # now check what targets we have with the help of SDE + sde_exists = "" + PATH_dir = string.split(os.getenv("PATH"), os.pathsep) + for counter in PATH_dir: + if os.path.exists(counter + os.sep + "sde") and sde_exists == "": + sde_exists = counter + os.sep + "sde" + if os.environ.get("SDE_HOME") != None: + if os.path.exists(os.environ.get("SDE_HOME") + os.sep + "sde"): + sde_exists = os.environ.get("SDE_HOME") + os.sep + "sde" + if sde_exists == "": + error("you haven't got sde neither in SDE_HOME nor in your PATH.\n" + + "To test all platforms please set SDE_HOME to path containing SDE.\n" + + "Please refer to http://www.intel.com/software/sde for SDE download information.", 2) + return [answer, answer_sde] + # here we have SDE + f_lines = take_lines(sde_exists + " -help", "all") + for i in range(0,len(f_lines)): + if SSE4 == False and "wsm" in f_lines[i]: + answer_sde = answer_sde + [["-wsm", "sse4-i32x4"], ["-wsm", "sse4-i32x8"], ["-wsm", "sse4-i16x8"], ["-wsm", "sse4-i8x16"]] + if AVX == False and "snb" in f_lines[i]: + answer_sde = answer_sde + [["-snb", "avx1-i32x8"], ["-snb", "avx1-i32x16"], ["-snb", "avx1-i64x4"]] + if AVX11 == False and "ivb" in f_lines[i]: + answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["-ivb", "avx1.1-i32x16"]] + 
if AVX2 == False and "hsw" in f_lines[i]: + answer_sde = answer_sde + [["-hsw", "avx2-i32x8"], ["-hsw", "avx2-i32x16"]] + return [answer, answer_sde] + +def build_ispc(version_LLVM, make): + current_path = os.getcwd() + os.chdir(os.environ["ISPC_HOME"]) + p_temp = os.getenv("PATH") + os.environ["PATH"] = os.environ["LLVM_HOME"] + "/bin-" + version_LLVM + "/bin:" + os.environ["PATH"] + try_do_LLVM("clean ISPC for building", "make clean", True) + try_do_LLVM("build ISPC with LLVM version " + version_LLVM + " ", make, True) + os.environ["PATH"] = p_temp + os.chdir(current_path) + +def execute_stability(stability, R, print_version): + stability1 = copy.deepcopy(stability) + temp = run_tests.run_tests(stability1, [], print_version) + for j in range(0,4): + R[j][0] = R[j][0] + temp[j] + for i in range(0,len(temp[j])): + R[j][1].append(temp[4]) + number_of_fails = temp[5] + number_of_new_fails = len(temp[0]) + len(temp[1]) + if number_of_fails == 0: + str_fails = ". No fails" + else: + str_fails = ". Fails: " + str(number_of_fails) + if number_of_new_fails == 0: + str_new_fails = ", No new fails.\n" + else: + str_new_fails = ", New fails: " + str(number_of_new_fails) + ".\n" + print_debug(temp[4][1:-3] + str_fails + str_new_fails, False, stability_log) + +def run_special_tests(): + i = 5 + +def validation_run(only, only_targets, reference_branch, number, notify, update, make): + os.chdir(os.environ["ISPC_HOME"]) + os.environ["PATH"] = os.environ["ISPC_HOME"] + ":" + os.environ["PATH"] + if options.notify != "": + common.remove_if_exists(os.environ["ISPC_HOME"] + os.sep + "notify_log.log") + smtp_server = os.environ["SMTP_ISPC"] + msg = MIMEMultipart() + msg['Subject'] = 'ISPC test system results' + msg['From'] = 'ISPC_test_system' + msg['To'] = options.notify + print_debug("Command: " + ' '.join(sys.argv) + "\n", False, "") + print_debug("Folder: " + os.environ["ISPC_HOME"] + "\n", False, "") + date = datetime.datetime.now() + print_debug("Date: " + date.strftime('%H:%M %d/%m/%Y') + "\n", False, "") + class options_for_drivers: + pass +# *** *** *** +# Stability validation run +# *** *** *** + if ((("stability" in only) == True) or ("performance" in only) == False): + print_debug("\n\nStability validation run\n\n", False, "") + stability = options_for_drivers() +# stability constant options + stability.random = False + stability.ispc_flags = "" + stability.compiler_exe = None + stability.num_jobs = 1024 + stability.verbose = False + stability.time = False + stability.non_interactive = True + stability.update = update + stability.include_file = None + stability.silent = True + stability.in_file = "." 
+ os.sep + f_date + os.sep + "run_tests_log.log" + stability.verify = False +# stability varying options + stability.target = "" + stability.arch = "" + stability.no_opt = False + stability.wrapexe = "" +# prepare parameters of run + [targets_t, sde_targets_t] = check_targets() + rebuild = True + opts = [] + archs = [] + LLVM = [] + targets = [] + sde_targets = [] +# parsing option only, update parameters of run + if "-O2" in only: + opts.append(False) + if "-O0" in only: + opts.append(True) + if "x86" in only and not ("x86-64" in only): + archs.append("x86") + if "x86-64" in only: + archs.append("x86-64") + if "native" in only: + sde_targets_t = [] + for i in ["3.1", "3.2", "3.3", "trunk"]: + if i in only: + LLVM.append(i) + if "current" in only: + LLVM = [" "] + rebuild = False + else: + common.check_tools(1) + if only_targets != "": + only_targets += " " + only_targets = only_targets.replace("generic "," generic-4 generic-16 ") + only_targets_t = only_targets.split(" ") + for i in only_targets_t: + if i == "": + continue + err = True + for j in range(0,len(targets_t)): + if i in targets_t[j]: + targets.append(targets_t[j]) + err = False + for j in range(0,len(sde_targets_t)): + if i in sde_targets_t[j][1]: + sde_targets.append(sde_targets_t[j]) + err = False + if err == True: + error("You haven't sde for target " + i, 1) + else: + targets = targets_t[:-4] + sde_targets = sde_targets_t + if "build" in only: + targets = [] + sde_targets = [] + only = only + " stability " +# finish parameters of run, prepare LLVM + if len(opts) == 0: + opts = [False] + if len(archs) == 0: + archs = ["x86", "x86-64"] + if len(LLVM) == 0: + LLVM = ["3.3", "trunk"] + gen_archs = ["x86-64"] + need_LLVM = check_LLVM(LLVM) + for i in range(0,len(need_LLVM)): + build_LLVM(need_LLVM[i], "", "", "", False, False, True, False, make) +# begin validation run for stabitily + common.remove_if_exists(stability.in_file) + R = [[[],[]],[[],[]],[[],[]],[[],[]]] + print_debug("\n_________________________STABILITY REPORT_________________________\n", False, stability_log) + for i in range(0,len(LLVM)): + print_version = 2 + if rebuild: + build_ispc(LLVM[i], make) + for j in range(0,len(targets)): + stability.target = targets[j] + stability.wrapexe = "" + if "generic" in targets[j]: + arch = gen_archs + else: + arch = archs + for i1 in range(0,len(arch)): + for i2 in range(0,len(opts)): + stability.arch = arch[i1] + stability.no_opt = opts[i2] + execute_stability(stability, R, print_version) + print_version = 0 + for j in range(0,len(sde_targets)): + stability.target = sde_targets[j][1] + stability.wrapexe = os.environ["SDE_HOME"] + "/sde " + sde_targets[j][0] + " -- " + for i1 in range(0,len(archs)): + for i2 in range(0,len(opts)): + stability.arch = archs[i1] + stability.no_opt = opts[i2] + execute_stability(stability, R, print_version) + print_version = 0 +# run special tests like embree +# + run_special_tests() + ttt = ["NEW RUNFAILS: ", "NEW COMPFAILS: ", "NEW PASSES RUNFAILS: ", "NEW PASSES COMPFAILS: "] + for j in range(0,4): + if len(R[j][0]) == 0: + print_debug("NO " + ttt[j][:-2] + "\n", False, stability_log) + else: + print_debug(ttt[j] + str(len(R[j][0])) + "\n", False, stability_log) + temp5 = [[],[]] + for i in range(0,len(R[j][0])): + er = True + for k in range(0,len(temp5[0])): + if R[j][0][i] == temp5[0][k]: + temp5[1][k].append(R[j][1][i]) + er = False + if er == True: + temp5[0].append(R[j][0][i]) + temp5[1].append([R[j][1][i]]) + for i in range(0,len(temp5[0])): + print_debug("\t" + temp5[0][i] + "\n", True, 
stability_log) + for k in range(0,len(temp5[1][i])): + print_debug("\t\t\t" + temp5[1][i][k], True, stability_log) + print_debug("__________________Watch stability.log for details_________________\n", False, stability_log) + if options.notify != "": + attach_mail_file(msg, stability.in_file, "run_tests_log.log") + attach_mail_file(msg, stability_log, "stability.log") + +# *** *** *** +# Performance validation run +# *** *** *** + if ((("performance" in only) == True) or ("stability" in only) == False): + print_debug("\n\nPerformance validation run\n\n", False, "") + common.check_tools(1) + performance = options_for_drivers() +# performance constant options + performance.number = number + performance.config = "./perf.ini" + performance.path = "./" + performance.silent = True + performance.output = "" + performance.compiler = "" + performance.ref = "ispc_ref" + performance.in_file = "." + os.sep + f_date + os.sep + "performance.log" +# prepare LLVM 3.3 as newest LLVM + need_LLVM = check_LLVM(["3.3"]) + if len(need_LLVM) != 0: + build_LLVM(need_LLVM[i], "", "", "", False, False, True, False, make) +# prepare reference point. build both test and reference compilers + try_do_LLVM("apply git", "git branch", True) + temp4 = take_lines("git branch", "all") + for line in temp4: + if "*" in line: + current_branch = line[2:-1] + stashing = True + sys.stdout.write("Please, don't interrupt script here! You can have not sync git status after interruption!\n") + if "No local changes" in take_lines("git stash", "first"): + stashing = False + #try_do_LLVM("stash current branch ", "git stash", True) + try_do_LLVM("checkout reference branch " + reference_branch + " ", "git checkout " + reference_branch, True) + sys.stdout.write(".\n") + build_ispc("3.3", make) + sys.stdout.write(".\n") + os.rename("ispc", "ispc_ref") + try_do_LLVM("checkout test branch " + current_branch + " ", "git checkout " + current_branch, True) + if stashing: + try_do_LLVM("return current branch ", "git stash pop", True) + sys.stdout.write("You can interrupt script now.\n") + build_ispc("3.3", make) +# begin validation run for performance. output is inserted into perf() + perf.perf(performance, []) + if options.notify != "": + attach_mail_file(msg, performance.in_file, "performance.log") + attach_mail_file(msg, "." 
+ os.sep + "logs" + os.sep + "perf_build.log", "perf_build.log") + +# sending e-mail with results + if options.notify != "": + fp = open(os.environ["ISPC_HOME"] + os.sep + "notify_log.log", 'rb') + f_lines = fp.readlines() + fp.close() + line = "" + for i in range(0,len(f_lines)): + line = line + f_lines[i][:-1] + line = line + ' \n' + text = MIMEText(line, "", "KOI-8") + msg.attach(text) + attach_mail_file(msg, alloy_build, "alloy_build.log") + s = smtplib.SMTP(smtp_server) + s.sendmail('ISPC_test_system', options.notify, msg.as_string()) + s.quit() + +def Main(): + global current_OS + if (platform.system() == 'Windows' or 'CYGWIN_NT' in platform.system()) == True: + current_OS = "Windows" + error("Windows isn't supported now", 1) + else: + if (platform.system() == 'Darwin'): + current_OS = "MacOS" + else: + current_OS = "Linux" + + if (options.build_llvm == False and options.validation_run == False): + parser.print_help() + exit(0) + + setting_paths(options.llvm_home, options.ispc_home, options.sde_home) + if os.environ.get("LLVM_HOME") == None: + error("you have no LLVM_HOME", 1) + if os.environ.get("ISPC_HOME") == None: + error("you have no ISPC_HOME", 1) + if options.notify != "": + if os.environ.get("SMTP_ISPC") == None: + error("you have no SMTP_ISPC in your environment for option notify", 1) + if options.only != "": + test_only_r = " 3.1 3.2 3.3 trunk current build stability performance x86 x86-64 -O0 -O2 native " + test_only = options.only.split(" ") + for iterator in test_only: + if not (" " + iterator + " " in test_only_r): + error("unknow option for only: " + iterator, 1) + + global f_date + f_date = "logs" + common.remove_if_exists(f_date) + os.makedirs(f_date) + global alloy_build + alloy_build = os.getcwd() + os.sep + f_date + os.sep + "alloy_build.log" + global stability_log + stability_log = os.getcwd() + os.sep + f_date + os.sep + "stability.log" + current_path = os.getcwd() + make = "make -j" + options.speed + try: + if options.build_llvm: + build_LLVM(options.version, options.revision, options.folder, options.tarball, + options.debug, options.selfbuild, False, options.force, make) + if options.validation_run: + validation_run(options.only, options.only_targets, options.branch, + options.number_for_performance, options.notify, options.update, make) + finally: + os.chdir(current_path) + date_name = "alloy_results_" + datetime.datetime.now().strftime('%d_%m_%Y_%H_%M_%S') + if os.path.exists(date_name): + error("It's forbidden to run alloy two times in a second, logs are in ./logs", 1) + os.rename(f_date, date_name) + print_debug("Logs are in " + date_name + "\n", False, "") + +###Main### +from optparse import OptionParser +from optparse import OptionGroup +import sys +import os +import operator +import time +import glob +import string +import platform +import smtplib +import datetime +import copy +from email.MIMEMultipart import MIMEMultipart +from email.MIMEBase import MIMEBase +from email.mime.text import MIMEText +from email.Encoders import encode_base64 +# our drivers +import run_tests +import perf +import common +error = common.error +take_lines = common.take_lines +print_debug = common.print_debug +# parsing options +class MyParser(OptionParser): + def format_epilog(self, formatter): + return self.epilog +examples = ("Examples:\n" + +"Load and build LLVM from trunk\n\talloy.py -b\n" + +"Load and build LLVM 3.3. 
Rewrite LLVM folders\n\talloy.py -b --version=3.3 --force\n" + +"Untar files llvm.tgz clang.tgz, build LLVM from them in folder bin-from_tar\n\talloy.py -b --tarball='llvm.tgz clang.tgz' --folder=from_tar\n" + +"Load LLVM from trunk, revision r172870. Build it. Do selfbuild\n\talloy.py -b --revision=r172870 --selfbuild\n" + +"Validation run with LLVM 3.3, trunk; x86, x86-64; -O2;\nall supported targets; performance\n\talloy.py -r\n" + +"Validation run with all avx targets and sse4-i8x16 without performance\n\talloy.py -r --only=stability --only-targets='avx sse4-i8x16'\n" + +"Validation run with avx2-i32x8, all sse4 and sse2 targets\nand all targets with i32x16\n\talloy.py -r --only-targets='avx2-i32x8 sse4 i32x16 sse2'\n" + +"Stability validation run with LLVM 3.2, 3.3; -O0; x86,\nupdate fail_db.txt with passes and fails\n\talloy.py -r --only='3.2 -O0 stability 3.3 x86' --update-errors=FP\n" + +"Try to build compiler with all LLVM\n\talloy.py -r --only=build\n" + +"Performance validation run with 10 runs of each test and comparing to branch 'old'\n\talloy.py -r --only=performance --compare-with=old --number=10\n" + +"Validation run. Update fail_db.txt with new fails, send results to my@my.com\n\talloy.py -r --update-errors=F --notify='my@my.com'\n") +parser = MyParser(usage="Usage: alloy.py -r/-b [options]", epilog=examples) +parser.add_option('-b', '--build-llvm', dest='build_llvm', + help='ask to build LLVM', default=False, action="store_true") +parser.add_option('-r', '--run', dest='validation_run', + help='ask for validation run', default=False, action="store_true") +parser.add_option('-j', dest='speed', + help='set -j for make', default="8") +# options for activity "build LLVM" +llvm_group = OptionGroup(parser, "Options for building LLVM", + "These options must be used with -b option.") +llvm_group.add_option('--version', dest='version', + help='version of llvm to build: 3.1 3.2 3.3 trunk. Default: trunk', default="trunk") +llvm_group.add_option('--revision', dest='revision', + help='revision of llvm to build in format r172870', default="") +llvm_group.add_option('--debug', dest='debug', + help='debug build of LLVM?', default=False, action="store_true") +llvm_group.add_option('--folder', dest='folder', + help='folder to build LLVM in', default="") +llvm_group.add_option('--tarball', dest='tarball', + help='"llvm_tarball clang_tarball"', default="") +llvm_group.add_option('--selfbuild', dest='selfbuild', + help='make selfbuild of LLVM and clang', default=False, action="store_true") +llvm_group.add_option('--force', dest='force', + help='rebuild LLVM', default=False, action='store_true') +parser.add_option_group(llvm_group) +# options for activity "validation run" +run_group = OptionGroup(parser, "Options for validation run", + "These options must be used with -r option.") +run_group.add_option('--compare-with', dest='branch', + help='set performance reference point. Dafault: master', default="master") +run_group.add_option('--number', dest='number_for_performance', + help='number of performance runs for each test. Default: 5', default=5) +run_group.add_option('--notify', dest='notify', + help='email to sent results to', default="") +run_group.add_option('--update-errors', dest='update', + help='rewrite fail_db.txt file according to received results (F or FP)', default="") +run_group.add_option('--only-targets', dest='only_targets', + help='set list of targets to test. 
Possible values - all subnames of targets.', + default="") +run_group.add_option('--only', dest='only', + help='set types of tests. Possible values:\n' + + '-O0, -O2, x86, x86-64, stability (test only stability), performance (test only performance)\n' + + 'build (only build with different LLVM), 3.1, 3.2, 3.3, trunk, native (do not use SDE), current (do not rebuild ISPC).', + default="") +parser.add_option_group(run_group) +# options for activity "setup PATHS" +setup_group = OptionGroup(parser, "Options for setup", + "These options must be use with -r or -b to setup environment variables") +setup_group.add_option('--llvm_home', dest='llvm_home',help='path to LLVM',default="") +setup_group.add_option('--ispc_home', dest='ispc_home',help='path to ISPC',default="") +setup_group.add_option('--sde_home', dest='sde_home',help='path to SDE',default="") +parser.add_option_group(setup_group) +(options, args) = parser.parse_args() +Main() diff --git a/builtins.cpp b/builtins.cpp index 4b91ba30..48ce9afb 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -112,10 +112,7 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) { return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64; // varying - if (LLVMTypes::MaskType != LLVMTypes::Int32VectorType && - t == LLVMTypes::MaskType) - return AtomicType::VaryingBool; - else if (t == LLVMTypes::Int8VectorType) + if (t == LLVMTypes::Int8VectorType) return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8; else if (t == LLVMTypes::Int16VectorType) return intAsUnsigned ? AtomicType::VaryingUInt16 : AtomicType::VaryingInt16; @@ -127,6 +124,8 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) { return AtomicType::VaryingDouble; else if (t == LLVMTypes::Int64VectorType) return intAsUnsigned ? AtomicType::VaryingUInt64 : AtomicType::VaryingInt64; + else if (t == LLVMTypes::MaskType) + return AtomicType::VaryingBool; // pointers to uniform else if (t == LLVMTypes::Int8PointerType) @@ -303,6 +302,7 @@ lCheckModuleIntrinsics(llvm::Module *module) { // check the llvm.x86.* intrinsics for now... 
if (!strncmp(funcName.c_str(), "llvm.x86.", 9)) { llvm::Intrinsic::ID id = (llvm::Intrinsic::ID)func->getIntrinsicID(); + if (id == 0) fprintf(stderr, "FATAL: intrinsic is not found: %s \n", funcName.c_str()); Assert(id != 0); llvm::Type *intrinsicType = llvm::Intrinsic::getType(*g->ctx, id); @@ -488,7 +488,6 @@ lSetInternalFunctions(llvm::Module *module) { "__num_cores", "__packed_load_active", "__packed_store_active", - "__pause", "__popcnt_int32", "__popcnt_int64", "__prefetch_read_uniform_1", @@ -502,6 +501,8 @@ lSetInternalFunctions(llvm::Module *module) { "__rdrand_i64", "__reduce_add_double", "__reduce_add_float", + "__reduce_add_int8", + "__reduce_add_int16", "__reduce_add_int32", "__reduce_add_int64", "__reduce_equal_double", @@ -576,20 +577,34 @@ lSetInternalFunctions(llvm::Module *module) { "__stdlib_pow", "__stdlib_powf", "__stdlib_sin", + "__stdlib_asin", "__stdlib_sincos", "__stdlib_sincosf", "__stdlib_sinf", "__stdlib_tan", "__stdlib_tanf", - "__svml_sin", - "__svml_cos", - "__svml_sincos", - "__svml_tan", - "__svml_atan", - "__svml_atan2", - "__svml_exp", - "__svml_log", - "__svml_pow", + "__svml_sind", + "__svml_asind", + "__svml_cosd", + "__svml_acosd", + "__svml_sincosd", + "__svml_tand", + "__svml_atand", + "__svml_atan2d", + "__svml_expd", + "__svml_logd", + "__svml_powd", + "__svml_sinf", + "__svml_asinf", + "__svml_cosf", + "__svml_acosf", + "__svml_sincosf", + "__svml_tanf", + "__svml_atanf", + "__svml_atan2f", + "__svml_expf", + "__svml_logf", + "__svml_powf", "__undef_uniform", "__undef_varying", "__vec4_add_float", @@ -640,7 +655,7 @@ AddBitcodeToModule(const unsigned char *bitcode, int length, llvm::Triple bcTriple(bcModule->getTargetTriple()); Debug(SourcePos(), "module triple: %s\nbitcode triple: %s\n", mTriple.str().c_str(), bcTriple.str().c_str()); -#ifndef __arm__ +#if defined(ISPC_ARM_ENABLED) && !defined(__arm__) // FIXME: More ugly and dangerous stuff. We really haven't set up // proper build and runtime infrastructure for ispc to do // cross-compilation, yet it's at minimum useful to be able to emit @@ -656,8 +671,12 @@ AddBitcodeToModule(const unsigned char *bitcode, int length, // the values for an ARM target. This maybe won't cause problems // in the generated code, since bulitins.c doesn't do anything too // complex w.r.t. struct layouts, etc. 
- if (g->target->getISA() != Target::NEON && + if (g->target->getISA() != Target::NEON32 && + g->target->getISA() != Target::NEON16 && + g->target->getISA() != Target::NEON8 && g->target->getISA() != Target::NVPTX64) +#else + if (g->target->getISA() != Target::NVPTX64) #endif // !__arm__ { Assert(bcTriple.getArch() == llvm::Triple::UnknownArch || @@ -831,15 +850,35 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod } break; }; - case Target::NEON: { +#ifdef ISPC_ARM_ENABLED + case Target::NEON8: { if (runtime32) { - EXPORT_MODULE(builtins_bitcode_neon_32bit); + EXPORT_MODULE(builtins_bitcode_neon_8_32bit); } else { - EXPORT_MODULE(builtins_bitcode_neon_64bit); + EXPORT_MODULE(builtins_bitcode_neon_8_64bit); } break; } + case Target::NEON16: { + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_neon_16_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_neon_16_64bit); + } + break; + } + case Target::NEON32: { + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_neon_32_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_neon_32_64bit); + } + break; + } +#endif case Target::SSE2: { switch (g->target->getVectorWidth()) { case 4: @@ -875,10 +914,31 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod break; case 8: if (runtime32) { - EXPORT_MODULE(builtins_bitcode_sse4_x2_32bit); + if (g->target->getMaskBitCount() == 16) { + EXPORT_MODULE(builtins_bitcode_sse4_16_32bit); + } + else { + Assert(g->target->getMaskBitCount() == 32); + EXPORT_MODULE(builtins_bitcode_sse4_x2_32bit); + } } else { - EXPORT_MODULE(builtins_bitcode_sse4_x2_64bit); + if (g->target->getMaskBitCount() == 16) { + EXPORT_MODULE(builtins_bitcode_sse4_16_64bit); + } + else { + Assert(g->target->getMaskBitCount() == 32); + EXPORT_MODULE(builtins_bitcode_sse4_x2_64bit); + } + } + break; + case 16: + Assert(g->target->getMaskBitCount() == 8); + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_sse4_8_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_sse4_8_64bit); } break; default: @@ -888,6 +948,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod } case Target::AVX: { switch (g->target->getVectorWidth()) { + case 4: + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_avx1_i64x4_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_avx1_i64x4_64bit); + } + break; case 8: if (runtime32) { EXPORT_MODULE(builtins_bitcode_avx1_32bit); @@ -1050,16 +1118,33 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // If the user wants the standard library to be included, parse the // serialized version of the stdlib.ispc file to get its // definitions added. 
+ extern char stdlib_mask1_code[], stdlib_mask8_code[]; + extern char stdlib_mask16_code[], stdlib_mask32_code[], stdlib_mask64_code[]; if (g->target->getISA() == Target::GENERIC && - g->target->getVectorWidth() != 1) { // 1 wide uses x86 stdlib - extern char stdlib_generic_code[]; - yy_scan_string(stdlib_generic_code); - yyparse(); + g->target->getVectorWidth() == 1) { // 1 wide uses 32 stdlib + yy_scan_string(stdlib_mask32_code); } else { - extern char stdlib_x86_code[]; - yy_scan_string(stdlib_x86_code); - yyparse(); + switch (g->target->getMaskBitCount()) { + case 1: + yy_scan_string(stdlib_mask1_code); + break; + case 8: + yy_scan_string(stdlib_mask8_code); + break; + case 16: + yy_scan_string(stdlib_mask16_code); + break; + case 32: + yy_scan_string(stdlib_mask32_code); + break; + case 64: + yy_scan_string(stdlib_mask64_code); + break; + default: + FATAL("Unhandled mask bit size for stdlib.ispc"); + } } + yyparse(); } } diff --git a/builtins/dispatch.ll b/builtins/dispatch.ll index f1d5a969..ba216df7 100644 --- a/builtins/dispatch.ll +++ b/builtins/dispatch.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2011, Intel Corporation +;; Copyright (c) 2011-2013, Intel Corporation ;; All rights reserved. ;; ;; Redistribution and use in source and binary forms, with or without @@ -41,15 +41,13 @@ @__system_best_isa = internal global i32 -1 -declare void @abort() noreturn - ;; The below is the result of running "clang -O2 -emit-llvm -c -o -" on the ;; following code... Specifically, __get_system_isa should return a value ;; corresponding to one of the Target::ISA enumerant values that gives the ;; most capable ISA that the curremt system can run. ;; -;; Note: clang from LLVM 3.0 should be used if this is updated, for maximum -;; backwards compatibility for anyone building ispc with LLVM 3.0 +;; Note: clang from LLVM 3.1 should be used if this is updated, for maximum +;; backwards compatibility for anyone building ispc with LLVM 3.1 ;; ;; #include ;; #include @@ -60,7 +58,7 @@ declare void @abort() noreturn ;; : "0" (infoType)); ;; } ;; -;; /* Save %ebx in case it's the PIC register */ +;; // Save %ebx in case it's the PIC register. ;; static void __cpuid_count(int info[4], int level, int count) { ;; __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t" ;; "cpuid\n\t" @@ -69,13 +67,23 @@ declare void @abort() noreturn ;; : "0" (level), "2" (count)); ;; } ;; +;; static int __os_has_avx_support() { +;; // Check xgetbv; this uses a .byte sequence instead of the instruction +;; // directly because older assemblers do not include support for xgetbv and +;; // there is no easy way to conditionally compile based on the assembler used. +;; int rEAX, rEDX; +;; __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0)); +;; return (rEAX & 6) == 6; +;; } +;; ;; int32_t __get_system_isa() { ;; int info[4]; ;; __cpuid(info, 1); ;; -;; /* NOTE: the values returned below must be the same as the -;; corresponding enumerant values in Target::ISA. */ -;; if ((info[2] & (1 << 28)) != 0) { +;; // NOTE: the values returned below must be the same as the +;; // corresponding enumerant values in Target::ISA. +;; if ((info[2] & (1 << 28)) != 0 && +;; __os_has_avx_support()) { ;; if ((info[2] & (1 << 29)) != 0 && // F16C ;; (info[2] & (1 << 30)) != 0) { // RDRAND ;; // So far, so good. AVX2? 
@@ -98,47 +106,56 @@ declare void @abort() noreturn ;; abort(); ;; } -define i32 @__get_system_isa() nounwind uwtable ssp { +define i32 @__get_system_isa() nounwind uwtable { entry: %0 = tail call { i32, i32, i32, i32 } asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind %asmresult5.i = extractvalue { i32, i32, i32, i32 } %0, 2 %asmresult6.i = extractvalue { i32, i32, i32, i32 } %0, 3 %and = and i32 %asmresult5.i, 268435456 %cmp = icmp eq i32 %and, 0 - br i1 %cmp, label %if.else13, label %if.then + br i1 %cmp, label %if.else14, label %land.lhs.true -if.then: ; preds = %entry - %1 = and i32 %asmresult5.i, 1610612736 - %2 = icmp eq i32 %1, 1610612736 - br i1 %2, label %if.then7, label %return +land.lhs.true: ; preds = %entry + %1 = tail call { i32, i32 } asm sideeffect ".byte 0x0f, 0x01, 0xd0", "={ax},={dx},{cx},~{dirflag},~{fpsr},~{flags}"(i32 0) nounwind + %asmresult.i25 = extractvalue { i32, i32 } %1, 0 + %and.i = and i32 %asmresult.i25, 6 + %cmp.i = icmp eq i32 %and.i, 6 + br i1 %cmp.i, label %if.then, label %if.else14 -if.then7: ; preds = %if.then - %3 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind - %asmresult4.i28 = extractvalue { i32, i32, i32, i32 } %3, 1 - %and10 = lshr i32 %asmresult4.i28, 5 - %4 = and i32 %and10, 1 - %5 = add i32 %4, 3 +if.then: ; preds = %land.lhs.true + %2 = and i32 %asmresult5.i, 1610612736 + %3 = icmp eq i32 %2, 1610612736 + br i1 %3, label %if.then8, label %return + +if.then8: ; preds = %if.then + %4 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind + %asmresult4.i30 = extractvalue { i32, i32, i32, i32 } %4, 1 + %and11 = lshr i32 %asmresult4.i30, 5 + %5 = and i32 %and11, 1 + %6 = add i32 %5, 3 br label %return -if.else13: ; preds = %entry - %and15 = and i32 %asmresult5.i, 524288 - %cmp16 = icmp eq i32 %and15, 0 - br i1 %cmp16, label %if.else18, label %return +if.else14: ; preds = %land.lhs.true, %entry + %and16 = and i32 %asmresult5.i, 524288 + %cmp17 = icmp eq i32 %and16, 0 + br i1 %cmp17, label %if.else19, label %return -if.else18: ; preds = %if.else13 - %and20 = and i32 %asmresult6.i, 67108864 - %cmp21 = icmp eq i32 %and20, 0 - br i1 %cmp21, label %if.else23, label %return +if.else19: ; preds = %if.else14 + %and21 = and i32 %asmresult6.i, 67108864 + %cmp22 = icmp eq i32 %and21, 0 + br i1 %cmp22, label %if.else24, label %return -if.else23: ; preds = %if.else18 +if.else24: ; preds = %if.else19 tail call void @abort() noreturn nounwind unreachable -return: ; preds = %if.else18, %if.else13, %if.then7, %if.then - %retval.0 = phi i32 [ %5, %if.then7 ], [ 2, %if.then ], [ 1, %if.else13 ], [ 0, %if.else18 ] +return: ; preds = %if.else19, %if.else14, %if.then8, %if.then + %retval.0 = phi i32 [ %6, %if.then8 ], [ 2, %if.then ], [ 1, %if.else14 ], [ 0, %if.else19 ] ret i32 %retval.0 } +declare void @abort() noreturn nounwind + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; This function is called by each of the dispatch functions we generate; ;; it sets @__system_best_isa if it is unset. 
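The regenerated __get_system_isa() above adds an OS-level check before reporting any AVX-class ISA: besides CPUID.1:ECX bit 28, it reads XCR0 with xgetbv (emitted as ".byte 0x0f, 0x01, 0xd0" so older assemblers accept it) and requires bits 1 and 2, i.e. both XMM and YMM state saving enabled. The constants it returns must still line up with the Target::ISA enumerants; the sketch below spells out that contract, with the numbering inferred from the IR's return values rather than copied from ispc.h, and the names chosen here purely for illustration:

    /* Illustration only; the ordering is an assumption reconstructed from the
       phi node in __get_system_isa() (returns 0, 1, 2, and 3 or 3 + 1). */
    enum DispatchISA { ISA_SSE2 = 0, ISA_SSE4 = 1, ISA_AVX = 2,
                       ISA_AVX11 = 3, ISA_AVX2 = 4 };

    /* The new gate: AVX-class results are possible only when the CPU advertises
       AVX and the OS keeps YMM state (XCR0 bits 1 and 2 set, read via xgetbv). */
    static bool avxUsable(unsigned cpuid1_ecx, unsigned xcr0) {
        return (cpuid1_ecx & (1u << 28)) != 0 && (xcr0 & 6) == 6;
    }
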
diff --git a/builtins/svml.m4 b/builtins/svml.m4 new file mode 100644 index 00000000..0a587577 --- /dev/null +++ b/builtins/svml.m4 @@ -0,0 +1,217 @@ +;; copyright stub :) +;; Copyright (c) 2013, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +;; svml macro + +;; svml_stubs : stubs for svml calls +;; $1 - type ("float" or "double") +;; $2 - svml internal function suffix ("f" for float, "d" for double) +;; $3 - vector width +define(`svml_stubs',` + declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone alwaysinline + declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone alwaysinline + declare <$3 x $1> @__svml_cos$2(<$3 x $1>) nounwind readnone alwaysinline + declare void @__svml_sincos$2(<$3 x $1>, <$3 x $1> *, <$3 x $1> *) nounwind readnone alwaysinline + declare <$3 x $1> @__svml_tan$2(<$3 x $1>) nounwind readnone alwaysinline + declare <$3 x $1> @__svml_atan$2(<$3 x $1>) nounwind readnone alwaysinline + declare <$3 x $1> @__svml_atan2$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline + declare <$3 x $1> @__svml_exp$2(<$3 x $1>) nounwind readnone alwaysinline + declare <$3 x $1> @__svml_log$2(<$3 x $1>) nounwind readnone alwaysinline + declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline +') + +;; svml_declare : declaration of __svml_* intrinsics +;; $1 - type ("float" or "double") +;; $2 - __svml_* intrinsic function suffix +;; float: "f4"(sse) "f8"(avx) "f16"(avx512) +;; double: "2"(sse) "4"(avx) "8"(avx512) +;; $3 - vector width +define(`svml_declare',` + declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_cos$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_sincos$2(<$3 x $1> *, <$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_tan$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_atan$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_atan2$2(<$3 x $1>, <$3 x $1>) nounwind readnone + declare <$3 x $1> 
@__svml_exp$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_log$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone +'); + +;; defintition of __svml_* internal functions +;; $1 - type ("float" or "double") +;; $2 - __svml_* intrinsic function suffix +;; float: "f4"(sse) "f8"(avx) "f16"(avx512) +;; double: "2"(sse) "4"(avx) "8"(avx512) +;; $3 - vector width +;; $4 - svml internal function suffix ("f" for float, "d" for double) +define(`svml_define',` + define <$3 x $1> @__svml_sin$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_sin$2(<$3 x $1> %0) + ret <$3 x $1> %ret + } + define <$3 x $1> @__svml_asin$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_asin$2(<$3 x $1> %0) + ret <$3 x $1> %ret + } + + define <$3 x $1> @__svml_cos$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_cos$2(<$3 x $1> %0) + ret <$3 x $1> %ret + } + + define void @__svml_sincos$4(<$3 x $1>, <$3 x $1> *, <$3 x $1> *) nounwind readnone alwaysinline { + %s = call <$3 x $1> @__svml_sincos$2(<$3 x $1> * %2, <$3 x $1> %0) + store <$3 x $1> %s, <$3 x $1> * %1 + ret void + } + + define <$3 x $1> @__svml_tan$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_tan$2(<$3 x $1> %0) + ret <$3 x $1> %ret + } + + define <$3 x $1> @__svml_atan$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_atan$2(<$3 x $1> %0) + ret <$3 x $1> %ret + } + + define <$3 x $1> @__svml_atan2$4(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_atan2$2(<$3 x $1> %0, <$3 x $1> %1) + ret <$3 x $1> %ret + } + + define <$3 x $1> @__svml_exp$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_exp$2(<$3 x $1> %0) + ret <$3 x $1> %ret + } + + define <$3 x $1> @__svml_log$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_log$2(<$3 x $1> %0) + ret <$3 x $1> %ret + } + + define <$3 x $1> @__svml_pow$4(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_pow$2(<$3 x $1> %0, <$3 x $1> %1) + ret <$3 x $1> %ret + } +') + + +;; svml_define_x : defintition of __svml_* internal functions operation on extended width +;; $1 - type ("float" or "double") +;; $2 - __svml_* intrinsic function suffix +;; float: "f4"(sse) "f8"(avx) "f16"(avx512) +;; double: "2"(sse) "4"(avx) "8"(avx512) +;; $3 - vector width +;; $4 - svml internal function suffix ("f" for float, "d" for double) +;; $5 - extended width, must be at least twice the native vector width +;; contigent on existing of unary$3to$5 and binary$3to$5 macros + +;; *todo*: in sincos call use __svml_sincos[f][2,4,8,16] call, e.g. 
+;;define void @__svml_sincosf(<8 x float>, <8 x float> *, +;; <8 x float> *) nounwind readnone alwaysinline { +;; ; call svml_sincosf4 two times with the two 4-wide sub-vectors +;; %a = shufflevector <8 x float> %0, <8 x float> undef, +;; <4 x i32> +;; %b = shufflevector <8 x float> %0, <8 x float> undef, +;; <4 x i32> +;; +;; %cospa = alloca <4 x float> +;; %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) +;; +;; %cospb = alloca <4 x float> +;; %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) +;; +;; %sin = shufflevector <4 x float> %sa, <4 x float> %sb, +;; <8 x i32> +;; store <8 x float> %sin, <8 x float> * %1 +;; +;; %cosa = load <4 x float> * %cospa +;; %cosb = load <4 x float> * %cospb +;; %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, +;; <8 x i32> +;; store <8 x float> %cos, <8 x float> * %2 +;; +;; ret void +;;} +define(`svml_define_x',` + define <$5 x $1> @__svml_sin$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_sin$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_asin$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_asin$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_cos$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_cos$2, %0) + ret <$5 x $1> %ret + } + define void @__svml_sincos$4(<$5 x $1>,<$5 x $1>*,<$5 x $1>*) nounwind readnone alwaysinline + { + %s = call <$5 x $1> @__svml_sin$4(<$5 x $1> %0) + %c = call <$5 x $1> @__svml_cos$4(<$5 x $1> %0) + store <$5 x $1> %s, <$5 x $1> * %1 + store <$5 x $1> %c, <$5 x $1> * %2 + ret void + } + define <$5 x $1> @__svml_tan$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_tan$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_atan$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_atan$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_atan2$4(<$5 x $1>,<$5 x $1>) nounwind readnone alwaysinline { + binary$3to$5(ret, $1, @__svml_atan2$2, %0, %1) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_exp$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_exp$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_log$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_log$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_pow$4(<$5 x $1>,<$5 x $1>) nounwind readnone alwaysinline { + binary$3to$5(ret, $1, @__svml_pow$2, %0, %1) + ret <$5 x $1> %ret + } +') + diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll index dcbe0a66..1d317713 100644 --- a/builtins/target-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -277,3 +277,9 @@ define double @__max_uniform_double(double, double) nounwind readnone alwaysinli sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1) ret double %ret } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index 8c6b7753..f8fd5cd5 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -137,19 +137,14 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; svml -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones 4x with our 16-wide -; vectors... 
+include(`svml.m4') +;; single precision +svml_declare(float,f8,8) +svml_define_x(float,f8,8,f,16) -declare <16 x float> @__svml_sin(<16 x float>) -declare <16 x float> @__svml_cos(<16 x float>) -declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) -declare <16 x float> @__svml_tan(<16 x float>) -declare <16 x float> @__svml_atan(<16 x float>) -declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) -declare <16 x float> @__svml_exp(<16 x float>) -declare <16 x float> @__svml_log(<16 x float>) -declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) +;; double precision +svml_declare(double,4,4) +svml_define_x(double,4,4,d,16) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max @@ -271,6 +266,33 @@ reduce_equal(16) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal int32 ops +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline { + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <16 x i16> @__add_varying_i16(<16 x i16>, + <16 x i16>) nounwind readnone alwaysinline { + %r = add <16 x i16> %0, %1 + ret <16 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline { + reduce16(i16, @__add_varying_i16, @__add_uniform_i16) +} + define <16 x i32> @__add_varying_int32(<16 x i32>, <16 x i32>) nounwind readnone alwaysinline { %s = add <16 x i32> %0, %1 diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index e6ab3a4b..196e5ea4 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -137,19 +137,14 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; svml -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones twice with our 8-wide -; vectors... 
+include(`svml.m4') +;; single precision +svml_declare(float,f8,8) +svml_define(float,f8,8,f) -declare <8 x float> @__svml_sin(<8 x float>) -declare <8 x float> @__svml_cos(<8 x float>) -declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) -declare <8 x float> @__svml_tan(<8 x float>) -declare <8 x float> @__svml_atan(<8 x float>) -declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) -declare <8 x float> @__svml_exp(<8 x float>) -declare <8 x float> @__svml_log(<8 x float>) -declare <8 x float> @__svml_pow(<8 x float>, <8 x float>) +;; double precision +svml_declare(double,4,4) +svml_define_x(double,4,4,d,8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max @@ -217,7 +212,6 @@ define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline { ret float %sum } - define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline { reduce8(float, @__min_varying_float, @__min_uniform_float) } @@ -229,6 +223,42 @@ define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline { reduce_equal(8) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int8 ops + +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int16 ops + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal int32 ops @@ -257,20 +287,14 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__max_varying_int32, @__max_uniform_int32) } - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;; horizontal uint32 ops - define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32) } - define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32) } - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal double ops @@ -329,9 +353,6 @@ define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline { } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;; horizontal uint64 ops - define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline { reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) } diff --git a/builtins/target-avx1-i64x4.ll b/builtins/target-avx1-i64x4.ll new file mode 100644 index 00000000..d183f1ce --- /dev/null +++ b/builtins/target-avx1-i64x4.ll @@ -0,0 +1,81 @@ +;; Copyright (c) 2013, Intel Corporation +;; All rights 
reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +include(`target-avx1-i64x4base.ll') + +rdrand_decls() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int min/max + +define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %call = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %call +} +define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %call = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1) + + ret <4 x i32> %call +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unsigned int min/max + +define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %call = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %call +} + +define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %call = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +ifelse(NO_HALF_DECLARES, `1', `', ` +declare float @__half_to_float_uniform(i16 %v) nounwind readnone +declare @__half_to_float_varying( %v) nounwind readnone +declare i16 @__float_to_half_uniform(float %v) nounwind readnone +declare @__float_to_half_varying( %v) nounwind readnone +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) diff --git a/builtins/target-avx1-i64x4base.ll b/builtins/target-avx1-i64x4base.ll new file mode 100644 index 00000000..e1832030 --- /dev/null +++ b/builtins/target-avx1-i64x4base.ll @@ -0,0 +1,513 @@ +;; Copyright (c) 2013, Intel Corporation +;; All rights reserved. 
+;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Basic 4-wide definitions + +define(`WIDTH',`4') +define(`MASK',`i64') +include(`util.m4') + +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-avx-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +;; sse intrinsic +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline { + ; float iv = __rcp_v(v); + ; return iv * (2. 
- v * iv); + + %call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0) + ; do one N-R iteration + %v_iv = fmul <4 x float> %0, %call + %two_minus = fsub <4 x float> , %v_iv + %iv_mul = fmul <4 x float> %call, %two_minus + ret <4 x float> %iv_mul +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding floats + +;; sse intrinsic +declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone + +define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline { + ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 + %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 8) + ret <4 x float> %call +} + +define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline { + ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9) + ret <4 x float> %call +} + +define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline { + ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10) + ret <4 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding doubles + +;; avx intrinsic +declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone + +define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline { + %call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 8) + ret <4 x double> %call +} + +define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline { + ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9 + %call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 9) + ret <4 x double> %call +} + + +define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline { + ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10 + %call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 10) + ret <4 x double> %call +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rsqrt + +;; sse intrinsic +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + %is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. 
- (v * is) * is); + %v_is = fmul <4 x float> %v, %is + %v_is_is = fmul <4 x float> %v_is, %is + %three_sub = fsub <4 x float> , %v_is_is + %is_mul = fmul <4 x float> %is, %three_sub + %half_scale = fmul <4 x float> , %is_mul + ret <4 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; sqrt + +;; sse intrinsic +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline { + %call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0) + ret <4 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +;; avx§ intrinsic +declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone + +define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline { + %call = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %0) + ret <4 x double> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +include(`svml.m4') +;; single precision +svml_declare(float,f4,4) +svml_define(float,f4,4,f) + +;; double precision +svml_declare(double,4,4) +svml_define(double,4,4,d) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +;; sse intrinsics +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline { + %call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1) + ret <4 x float> %call +} + +define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline { + %call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1) + ret <4 x float> %call +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; horizontal ops + +;; sse intrinsic +declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone + +define i64 @__movmsk(<4 x i64>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i64> %0 to <4 x double> + %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone + %v64 = zext i32 %v to i64 + ret i64 %v64 +} + +define i1 @__any(<4 x i64>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i64> %0 to <4 x double> + %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone + %cmp = icmp ne i32 %v, 0 + ret i1 %cmp +} + +define i1 @__all(<4 x i64>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i64> %0 to <4 x double> + %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone + %cmp = icmp eq i32 %v, 15 + ret i1 %cmp +} + +define i1 @__none(<4 x i64>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i64> %0 to <4 x double> + %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone + %cmp = icmp eq i32 %v, 0 + ret i1 %cmp +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal float ops + +;; sse intrinsic +declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone + +define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline { + %v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0) + %v2 = call 
<4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1) + %scalar = extractelement <4 x float> %v2, i32 0 + ret float %scalar +} + +define float @__reduce_min_float(<4 x float>) nounwind readnone { + reduce4(float, @__min_varying_float, @__min_uniform_float) +} + +define float @__reduce_max_float(<4 x float>) nounwind readnone { + reduce4(float, @__max_varying_float, @__max_uniform_float) +} + +reduce_equal(4) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int8 ops + +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline +{ + %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int16 ops + +define internal <4 x i16> @__add_varying_i16(<4 x i16>, + <4 x i16>) nounwind readnone alwaysinline { + %r = add <4 x i16> %0, %1 + ret <4 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline { + reduce4(i16, @__add_varying_i16, @__add_uniform_i16) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int32 ops + +define <4 x i32> @__add_varying_int32(<4 x i32>, + <4 x i32>) nounwind readnone alwaysinline { + %s = add <4 x i32> %0, %1 + ret <4 x i32> %s +} + +define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline { + %s = add i32 %0, %1 + ret i32 %s +} + +define i32 @__reduce_add_int32(<4 x i32>) nounwind readnone alwaysinline { + reduce4(i32, @__add_varying_int32, @__add_uniform_int32) +} + + +define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone alwaysinline { + reduce4(i32, @__min_varying_int32, @__min_uniform_int32) +} + + +define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone alwaysinline { + reduce4(i32, @__max_varying_int32, @__max_uniform_int32) +} + +define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone alwaysinline { + reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32) +} + +define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone alwaysinline { + reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal double ops + +declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone + +define double @__reduce_add_double(<4 x double>) nounwind readonly alwaysinline { + %v0 = shufflevector <4 x double> %0, <4 x double> undef, + <4 x i32> + %v1 = shufflevector <4 x double> , <4 x double> undef, + <4 x i32> +;; %v1 = <4 x double> + %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1) + %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0) + %final0 = extractelement <4 x double> %sum1, i32 0 + %final1 = extractelement <4 x double> %sum1, i32 2 + %sum = fadd double %final0, %final1 + + ret double %sum +} + +define double @__reduce_min_double(<4 x double>) nounwind readnone alwaysinline { + reduce4(double, @__min_varying_double, 
@__min_uniform_double) +} + + +define double @__reduce_max_double(<4 x double>) nounwind readnone alwaysinline { + reduce4(double, @__max_varying_double, @__max_uniform_double) +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int64 ops + +define <4 x i64> @__add_varying_int64(<4 x i64>, + <4 x i64>) nounwind readnone alwaysinline { + %s = add <4 x i64> %0, %1 + ret <4 x i64> %s +} + +define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline { + %s = add i64 %0, %1 + ret i64 %s +} + +define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone alwaysinline { + reduce4(i64, @__add_varying_int64, @__add_uniform_int64) +} + + +define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone alwaysinline { + reduce4(i64, @__min_varying_int64, @__min_uniform_int64) +} + + +define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone alwaysinline { + reduce4(i64, @__max_varying_int64, @__max_uniform_int64) +} + + +define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone alwaysinline { + reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + + +define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone alwaysinline { + reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + + +; no masked load instruction for i8 and i16 types?? +masked_load(i8, 1) +masked_load(i16, 2) + +;; avx intrinsics +declare <4 x float> @llvm.x86.avx.maskload.ps(i8 *, <4 x float> %mask) +declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask) + +define <4 x i32> @__masked_load_i32(i8 *, <4 x i64> %mask64) nounwind alwaysinline { + %mask = trunc <4 x i64> %mask64 to <4 x i32> + %floatmask = bitcast <4 x i32> %mask to <4 x float> + %floatval = call <4 x float> @llvm.x86.avx.maskload.ps(i8 * %0, <4 x float> %floatmask) + %retval = bitcast <4 x float> %floatval to <4 x i32> + ret <4 x i32> %retval +} + + +define <4 x i64> @__masked_load_i64(i8 *, <4 x i64> %mask) nounwind alwaysinline { + %doublemask = bitcast <4 x i64> %mask to <4 x double> + %doubleval = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %doublemask) + %retval = bitcast <4 x double> %doubleval to <4 x i64> + ret <4 x i64> %retval +} + +masked_load_float_double() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +gen_masked_store(i8) +gen_masked_store(i16) + +; note that mask is the 2nd parameter, not the 3rd one!! 
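The masked load/store wiring above relies on the AVX vmaskmov semantics: only the sign bit of each mask element matters, which is why the 64-bit ispc mask lanes can simply be truncated (for i32) or bitcast (for i64) and reinterpreted as float/double masks — and, as the comment warns, in the maskstore intrinsics the mask is the second operand and the value the third. A scalar model of the store behaviour (a sketch, assuming all-ones/all-zeros mask lanes as ispc produces):

    #include <stdint.h>

    /* Model of __masked_store_i32 on top of vmaskmovps: a lane is written
       only when the sign bit of its 32-bit mask is set.  ispc mask lanes
       are all-ones or all-zeros, so the sign-bit test is equivalent to
       testing the whole lane. */
    static void masked_store_i32_model(int32_t *dst, const int32_t val[4],
                                       const uint64_t mask[4]) {
        for (int i = 0; i < 4; ++i) {
            uint32_t m = (uint32_t)mask[i];   /* trunc <4 x i64> to <4 x i32> */
            if (m & 0x80000000u)              /* sign bit gates the store     */
                dst[i] = val[i];
        }
    }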
+;; avx intrinsics +declare void @llvm.x86.avx.maskstore.ps (i8 *, <4 x float>, <4 x float>) +declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>) + +define void @__masked_store_i32(<4 x i32>* nocapture, <4 x i32>, + <4 x i64>) nounwind alwaysinline { + %mask32 = trunc <4 x i64> %2 to <4 x i32> + + %ptr = bitcast <4 x i32> * %0 to i8 * + %val = bitcast <4 x i32> %1 to <4 x float> + %mask = bitcast <4 x i32> %mask32 to <4 x float> + call void @llvm.x86.avx.maskstore.ps(i8 * %ptr, <4 x float> %mask, <4 x float> %val) + ret void +} + +define void @__masked_store_i64(<4 x i64>* nocapture, <4 x i64>, + <4 x i64>) nounwind alwaysinline { + %ptr = bitcast <4 x i64> * %0 to i8 * + %val = bitcast <4 x i64> %1 to <4 x double> + %mask = bitcast <4 x i64> %2 to <4 x double> + call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask, <4 x double> %val) + ret void +} + + +masked_store_blend_8_16_by_4_mask64() + +;; sse intrinsic +declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, + <4 x float>) nounwind readnone + +define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>, + <4 x i64>) nounwind alwaysinline { + %mask = trunc <4 x i64> %2 to <4 x i32> + %mask_as_float = bitcast <4 x i32> %mask to <4 x float> + %oldValue = load <4 x i32>* %0, align 4 + %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float> + %newAsFloat = bitcast <4 x i32> %1 to <4 x float> + %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat, + <4 x float> %newAsFloat, + <4 x float> %mask_as_float) + %blendAsInt = bitcast <4 x float> %blend to <4 x i32> + store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4 + ret void +} + +;; avx intrinsic +declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, + <4 x double>) nounwind readnone + +define void @__masked_store_blend_i64(<4 x i64>* nocapture , <4 x i64>, + <4 x i64>) nounwind alwaysinline { + %mask_as_double = bitcast <4 x i64> %2 to <4 x double> + %oldValue = load <4 x i64>* %0, align 4 + %oldAsDouble = bitcast <4 x i64> %oldValue to <4 x double> + %newAsDouble = bitcast <4 x i64> %1 to <4 x double> + %blend = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %oldAsDouble, + <4 x double> %newAsDouble, + <4 x double> %mask_as_double) + %blendAsInt = bitcast <4 x double> %blend to <4 x i64> + store <4 x i64> %blendAsInt, <4 x i64>* %0, align 4 + ret void +} + +masked_store_float_double() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; scatter + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) + + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone +declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone + +define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline { + %call = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %0, <4 x double> %1) + ret <4 x double> %call +} + +define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline { + %call = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %0, <4 x double> %1) + ret <4 x double> %call +} + diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 9b747e2e..910565dd 100644 --- a/builtins/target-generic-1.ll +++ 
b/builtins/target-generic-1.ll @@ -310,6 +310,7 @@ declare double @round (double) nounwind readnone ;declare float @llvm.sqrt.f32(float %Val) declare double @llvm.sqrt.f64(double %Val) declare float @llvm.sin.f32(float %Val) +declare float @llvm.asin.f32(float %Val) declare float @llvm.cos.f32(float %Val) declare float @llvm.sqrt.f32(float %Val) declare float @llvm.exp.f32(float %Val) @@ -471,6 +472,15 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline { ret i64 %call } +define i8 @__reduce_add_int8(<1 x i8> %v) nounwind readonly alwaysinline { + %r = extractelement <1 x i8> %v, i32 0 + ret i8 %r +} + +define i16 @__reduce_add_int16(<1 x i16> %v) nounwind readonly alwaysinline { + %r = extractelement <1 x i16> %v, i32 0 + ret i16 %r +} define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline { %r = extractelement <1 x float> %v, i32 0 @@ -642,7 +652,18 @@ define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; svml stuff -define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline { +declare <1 x float> @__svml_sind(<1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_asind(<1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_cosd(<1 x float>) nounwind readnone alwaysinline +declare void @__svml_sincosd(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline +declare <1 x float> @__svml_tand(<1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_atand(<1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_atan2d(<1 x float>, <1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_expd(<1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_logd(<1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_powd(<1 x float>, <1 x float>) nounwind readnone alwaysinline + +define <1 x float> @__svml_sinf(<1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0) ;ret <1 x float> %ret ;%r = extractelement <1 x float> %0, i32 0 @@ -653,7 +674,18 @@ define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline { } -define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_asinf(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_asinf4(<1 x float> %0) + ;ret <1 x float> %ret + ;%r = extractelement <1 x float> %0, i32 0 + ;%s = call float @llvm.asin.f32(float %r) + ;%rv = insertelement <1 x float> undef, float %r, i32 0 + ;ret <1 x float> %rv + unary1to1(float,@llvm.asin.f32) + +} + +define <1 x float> @__svml_cosf(<1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0) ;ret <1 x float> %ret ;%r = extractelement <1 x float> %0, i32 0 @@ -664,18 +696,18 @@ define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline { } -define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline { +define void @__svml_sincosf(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline { ; %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0) ; store <1 x float> %s, <1 x float> * %1 ; ret void - %sin = call <1 x float> @__svml_sin (<1 x float> %0) - %cos = call <1 x float> @__svml_cos (<1 x float> %0) + %sin = call <1 x float> @__svml_sinf(<1 x float> %0) + %cos = 
call <1 x float> @__svml_cosf(<1 x float> %0) store <1 x float> %sin, <1 x float> * %1 store <1 x float> %cos, <1 x float> * %2 ret void } -define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_tanf(<1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0) ;ret <1 x float> %ret ;%r = extractelement <1 x float> %0, i32 0 @@ -687,7 +719,7 @@ define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline { ret <1 x float > %0 } -define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_atanf(<1 x float>) nounwind readnone alwaysinline { ; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0) ; ret <1 x float> %ret ;%r = extractelement <1 x float> %0, i32 0 @@ -700,7 +732,7 @@ define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline { } -define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_atan2f(<1 x float>, <1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1) ;ret <1 x float> %ret ;%y = extractelement <1 x float> %0, i32 0 @@ -713,19 +745,19 @@ define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone al ret <1 x float > %0 } -define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_expf(<1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0) ;ret <1 x float> %ret unary1to1(float, @llvm.exp.f32) } -define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_logf(<1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0) ;ret <1 x float> %ret unary1to1(float, @llvm.log.f32) } -define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_powf(<1 x float>, <1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1) ;ret <1 x float> %ret %r = extractelement <1 x float> %0, i32 0 @@ -953,3 +985,9 @@ declare float @__half_to_float_uniform(i16 %v) nounwind readnone declare @__half_to_float_varying( %v) nounwind readnone declare i16 @__float_to_half_uniform(float %v) nounwind readnone declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index bbf1b842..2a5d1b32 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -202,21 +202,15 @@ declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone declare i32 @__count_leading_zeros_i32(i32) nounwind readnone declare i64 @__count_leading_zeros_i64(i64) nounwind readnone -;; svml - ; FIXME: need either to wire these up to the 8-wide SVML entrypoints, ; or, use the macro to call the 4-wide ones twice with our 8-wide ; vectors... 
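One note on the generic-1 target above: its __svml_sincosf has no fused sincos entry point to call, so it simply invokes the renamed single-lane sin and cos wrappers and writes the two results through the output pointers. The equivalent plain-C fallback (a sketch only, not part of the patch):

    #include <math.h>

    /* Fallback sincos built from separate sin and cos calls, mirroring
       the structure of __svml_sincosf in target-generic-1.ll. */
    static void svml_sincosf_model(float x, float *s, float *c) {
        *s = sinf(x);
        *c = cosf(x);
    }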
-declare @__svml_sin() -declare @__svml_cos() -declare void @__svml_sincos(, *, *) -declare @__svml_tan() -declare @__svml_atan() -declare @__svml_atan2(, ) -declare @__svml_exp() -declare @__svml_log() -declare @__svml_pow(, ) +;; svml + +include(`svml.m4') +svml_stubs(float,f,WIDTH) +svml_stubs(double,d,WIDTH) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions @@ -226,14 +220,16 @@ declare i1 @__any() nounwind readnone declare i1 @__all() nounwind readnone declare i1 @__none() nounwind readnone +declare i16 @__reduce_add_int8() nounwind readnone +declare i32 @__reduce_add_int16() nounwind readnone + declare float @__reduce_add_float() nounwind readnone declare float @__reduce_min_float() nounwind readnone declare float @__reduce_max_float() nounwind readnone -declare i32 @__reduce_add_int32() nounwind readnone +declare i64 @__reduce_add_int32() nounwind readnone declare i32 @__reduce_min_int32() nounwind readnone declare i32 @__reduce_max_int32() nounwind readnone - declare i32 @__reduce_min_uint32() nounwind readnone declare i32 @__reduce_max_uint32() nounwind readnone @@ -244,7 +240,6 @@ declare double @__reduce_max_double() nounwind readnone declare i64 @__reduce_add_int64() nounwind readnone declare i64 @__reduce_min_int64() nounwind readnone declare i64 @__reduce_max_int64() nounwind readnone - declare i64 @__reduce_min_uint64() nounwind readnone declare i64 @__reduce_max_uint64() nounwind readnone @@ -379,3 +374,8 @@ declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-neon-16.ll b/builtins/target-neon-16.ll new file mode 100644 index 00000000..a0575927 --- /dev/null +++ b/builtins/target-neon-16.ll @@ -0,0 +1,517 @@ +;; +;; target-neon-16.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`8') +define(`MASK',`i16') + +include(`util.m4') +include(`target-neon-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone { + unary4to8conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v) + ret <8 x float> %r +} + +define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone { + unary4to8conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v) + ret <8 x i16> %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast <8 x float> %0 to <8 x i32> + %bitop.i.i = and <8 x i32> %float_to_int_bitcast.i.i.i.i, + + %bitop.i = xor <8 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i40.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %int_to_float_bitcast.i.i40.i, + + %binop21.i = fadd <8 x float> %binop.i, + + %float_to_int_bitcast.i.i.i = bitcast <8 x float> %binop21.i to <8 x i32> + %bitop31.i = xor <8 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop31.i to <8 x float> + ret <8 x float> %int_to_float_bitcast.i.i.i +} + +define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind + %bincmp.i = fcmp ogt <8 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32> + %bitop.i = and <8 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <8 x float> %binop.i +} + +define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind + %bincmp.i = fcmp olt <8 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32> + %bitop.i = and <8 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <8 x float> %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare @__round_varying_double() nounwind readnone +declare @__floor_varying_double() nounwind readnone +declare @__ceil_varying_double() nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> 
@llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__max_varying_float(, + ) nounwind readnone { + binary4to8(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1) + ret %r +} + +define @__min_varying_float(, + ) nounwind readnone { + binary4to8(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1) + ret %r +} + +declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +define @__min_varying_int32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1) + ret %r +} + +define @__max_varying_int32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1) + ret %r +} + +define @__min_varying_uint32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1) + ret %r +} + +define @__max_varying_uint32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1) + ret %r +} + +;; sqrt/rsqrt/rcp + +declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rcp_varying_float( %d) nounwind readnone { + unary4to8(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d) + binary4to8(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0) + %x1 = fmul %x0, %x0_nr + binary4to8(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %d) nounwind readnone { + unary4to8(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d) + %x0_2 = fmul %x0, %x0 + binary4to8(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2) + %x1 = fmul %x0, %x0_nr + %x1_2 = fmul %x1, %x1 + binary4to8(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +define float @__rsqrt_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <8 x i32> + %vr = call <8 x float> @__rsqrt_varying_float(<8 x float> %vs) + %r = extractelement <8 x float> %vr, i32 0 + ret float %r +} + +define float @__rcp_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <8 x i32> + %vr = call <8 x float> @__rcp_varying_float(<8 x float> %vs) + %r = extractelement <8 x float> %vr, i32 0 + ret float %r +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define @__sqrt_varying_float() nounwind readnone { + unary4to8(result, float, @llvm.sqrt.v4f32, %0) +;; this returns nan for v=0, which is undesirable.. 
+;; %rsqrt = call @__rsqrt_varying_float( %0) +;; %result = fmul <4 x float> %rsqrt, %0 + ret <8 x float> %result +} + +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) + +define @__sqrt_varying_double() nounwind readnone { + unary4to8(r, double, @llvm.sqrt.v4f64, %0) + ret %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reductions + +define i64 @__movmsk() nounwind readnone { + %and_mask = and %0, + + %v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %and_mask) + %v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4) + %va = extractelement <2 x i64> %v2, i32 0 + %vb = extractelement <2 x i64> %v2, i32 1 + %v = or i64 %va, %vb + ret i64 %v +} + +define i1 @__any() nounwind readnone alwaysinline { + v8tov4(MASK, %0, %v0123, %v4567) + %vor = or <4 x MASK> %v0123, %v4567 + %v0 = extractelement <4 x MASK> %vor, i32 0 + %v1 = extractelement <4 x MASK> %vor, i32 1 + %v2 = extractelement <4 x MASK> %vor, i32 2 + %v3 = extractelement <4 x MASK> %vor, i32 3 + %v01 = or MASK %v0, %v1 + %v23 = or MASK %v2, %v3 + %v = or MASK %v01, %v23 + %cmp = icmp ne MASK %v, 0 + ret i1 %cmp +} + +define i1 @__all() nounwind readnone alwaysinline { + v8tov4(MASK, %0, %v0123, %v4567) + %vand = and <4 x MASK> %v0123, %v4567 + %v0 = extractelement <4 x MASK> %vand, i32 0 + %v1 = extractelement <4 x MASK> %vand, i32 1 + %v2 = extractelement <4 x MASK> %vand, i32 2 + %v3 = extractelement <4 x MASK> %vand, i32 3 + %v01 = and MASK %v0, %v1 + %v23 = and MASK %v2, %v3 + %v = and MASK %v01, %v23 + %cmp = icmp ne MASK %v, 0 + ret i1 %cmp +} + +define i1 @__none() nounwind readnone alwaysinline { + %any = call i1 @__any( %0) + %none = icmp eq i1 %any, 0 + ret i1 %none +} + +;; $1: scalar type +;; $2: vector/vector reduce function (2 x -> ) +;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>) +;; $4: scalar reduce function + +define(`neon_reduce', ` + v8tov4($1, %0, %v0123, %v4567) + %v0123_8 = shufflevector <4 x $1> %v0123, <4 x $1> undef, + <8 x i32> + %v4567_8 = shufflevector <4 x $1> %v4567, <4 x $1> undef, + <8 x i32> + %vfirst = call <8 x $1> $2(<8 x $1> %v0123_8, <8 x $1> %v4567_8) + %vfirst_4 = shufflevector <8 x $1> %vfirst, <8 x $1> undef, + <4 x i32> + v4tov2($1, %vfirst_4, %v0, %v1) + %vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1) + %vh0 = extractelement <2 x $1> %vh, i32 0 + %vh1 = extractelement <2 x $1> %vh, i32 1 + %r = call $1 $4($1 %vh0, $1 %vh1) + ret $1 %r +') + +declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @add_f32(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define internal @__add_varying_float(, ) { + %r = fadd %0, %1 + ret %r +} + +define float @__reduce_add_float() nounwind readnone { + neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @min_f32(float, float) { + %cmp = fcmp olt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_min_float() nounwind readnone { + neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @max_f32(float, float) { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_max_float() nounwind 
readnone { + neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) +} + +declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone + +define i16 @__reduce_add_int8() nounwind readnone { + %a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %0) + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 + %r16 = trunc i32 %r to i16 + ret i16 %r16 +} + +declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16() + +define i64 @__reduce_add_int16() nounwind readnone { + %a1 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16( %0) + %a2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a1) + %aa = extractelement <2 x i64> %a2, i32 0 + %ab = extractelement <2 x i64> %a2, i32 1 + %r = add i64 %aa, %ab + ret i64 %r +} + +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone + +define i64 @__reduce_add_int32() nounwind readnone { + v8tov4(i32, %0, %va, %vb) + %pa = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va) + %pb = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb) + %psum = add <2 x i64> %pa, %pb + %a0 = extractelement <2 x i64> %psum, i32 0 + %a1 = extractelement <2 x i64> %psum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_si32(i32, i32) { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_int32() nounwind readnone { + neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_si32(i32, i32) { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_int32() nounwind readnone { + neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_ui32(i32, i32) { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_uint32() nounwind readnone { + neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpmins.v2i32, @min_ui32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_ui32(i32, i32) { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_uint32() nounwind readnone { + neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxs.v2i32, @max_ui32) +} + +define double @__reduce_add_double() nounwind readnone { + v8tov2(double, %0, %v0, %v1, %v2, %v3) + %v01 = fadd <2 x double> %v0, %v1 + %v23 = fadd <2 x double> %v2, %v3 + %sum = fadd <2 x double> %v01, %v23 + %e0 = extractelement <2 x double> %sum, i32 0 + %e1 = extractelement <2 x double> %sum, i32 1 + %m = fadd double %e0, %e1 + ret double %m +} + +define double @__reduce_min_double() nounwind readnone { + reduce8(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double() nounwind readnone { + reduce8(double, @__max_varying_double, @__max_uniform_double) +} + +define i64 
@__reduce_add_int64() nounwind readnone { + v8tov2(i64, %0, %v0, %v1, %v2, %v3) + %v01 = add <2 x i64> %v0, %v1 + %v23 = add <2 x i64> %v2, %v3 + %sum = add <2 x i64> %v01, %v23 + %e0 = extractelement <2 x i64> %sum, i32 0 + %e1 = extractelement <2 x i64> %sum, i32 1 + %m = add i64 %e0, %e1 + ret i64 %m +} + +define i64 @__reduce_min_int64() nounwind readnone { + reduce8(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64() nounwind readnone { + reduce8(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64() nounwind readnone { + reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64() nounwind readnone { + reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 + +declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_up_int8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_down_uint8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_down_int8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_up_int16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_down_uint16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_down_int16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} diff --git a/builtins/target-neon.ll b/builtins/target-neon-32.ll similarity index 60% rename from builtins/target-neon.ll rename to builtins/target-neon-32.ll index e70b774b..30b062c9 100644 --- a/builtins/target-neon.ll +++ b/builtins/target-neon-32.ll @@ -1,5 +1,5 @@ ;; -;; target-neon.ll +;; target-neon-32.ll ;; ;; Copyright(c) 2012-2013 Matt Pharr ;; Copyright(c) 2013 Google, Inc. @@ -34,52 +34,20 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
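The __avg_up_*/__avg_down_* builtins introduced for int8/int16 map directly onto NEON's rounding and truncating halving adds: both forms do the addition in a wider intermediate so it cannot overflow, and they differ only in whether 1 is added before the shift. A scalar model (a sketch; presumably the same semantics define_avgs() provides on the x86 targets, and assuming the usual arithmetic >> on signed values):

    #include <stdint.h>

    /* __avg_up_*   -> vrhadd : (a + b + 1) >> 1   (rounds halves up)
       __avg_down_* -> vhadd  : (a + b)     >> 1   (truncates)        */
    static uint8_t avg_up_uint8(uint8_t a, uint8_t b) {
        return (uint8_t)(((uint16_t)a + b + 1) >> 1);
    }
    static uint8_t avg_down_uint8(uint8_t a, uint8_t b) {
        return (uint8_t)(((uint16_t)a + b) >> 1);
    }
    static int8_t avg_up_int8(int8_t a, int8_t b) {
        return (int8_t)(((int16_t)a + b + 1) >> 1);   /* signed form: vrhadds */
    }
    static int8_t avg_down_int8(int8_t a, int8_t b) {
        return (int8_t)(((int16_t)a + b) >> 1);       /* signed form: vhadds  */
    }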
-target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32" - define(`WIDTH',`4') - define(`MASK',`i32') include(`util.m4') - -stdlib_core() -scans() -reduce_equal(WIDTH) -rdrand_decls() -define_shuffles() -aossoa() -ctlztz() +include(`target-neon-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines -declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone -declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone - -define float @__half_to_float_uniform(i16 %v) nounwind readnone { - %v1 = bitcast i16 %v to <1 x i16> - %vec = shufflevector <1 x i16> %v1, <1 x i16> undef, - <4 x i32> - %h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec) - %r = extractelement <4 x float> %h, i32 0 - ret float %r -} - define <4 x float> @__half_to_float_varying(<4 x i16> %v) nounwind readnone { %r = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %v) ret <4 x float> %r } -define i16 @__float_to_half_uniform(float %v) nounwind readnone { - %v1 = bitcast float %v to <1 x float> - %vec = shufflevector <1 x float> %v1, <1 x float> undef, - <4 x i32> - %h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec) - %r = extractelement <4 x i16> %h, i32 0 - ret i16 %r -} - - define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone { %r = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %v) ret <4 x i16> %r @@ -88,48 +56,11 @@ define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; math -define void @__fastmath() nounwind { - ret void -} - ;; round/floor/ceil ;; FIXME: grabbed these from the sse2 target, which does not have native ;; instructions for these. Is there a better approach for NEON? 
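The round/floor/ceil code carried over here relies on the add-2^23 trick visible in the (now removed) uniform versions that follow: peel off the sign bit, add and then subtract 8388608.0f (2^23) so the significand can no longer hold fraction bits and the value snaps to the nearest integer under the current rounding mode, then restore the sign; floor and ceil are derived by correcting the rounded result by -1.0 or +1.0 where needed. A scalar model (a sketch, valid for |x| < 2^23):

    #include <stdint.h>
    #include <string.h>

    /* Model of __round_uniform_float / __round_varying_float: adding and
       subtracting 2^23 rounds to the nearest integer (ties to even under
       the default FP rounding mode); the sign bit is stripped first and
       re-applied afterwards so negative inputs work too. */
    static float round_model(float x) {
        uint32_t bits, sign;
        memcpy(&bits, &x, sizeof bits);
        sign = bits & 0x80000000u;              /* and  ..., -2147483648 */
        bits ^= sign;                           /* xor: gives |x|        */
        float ax;
        memcpy(&ax, &bits, sizeof ax);
        ax = (ax + 8388608.0f) - 8388608.0f;    /* fadd 2^23, fadd -2^23 */
        memcpy(&bits, &ax, sizeof bits);
        bits |= sign;                           /* put the sign back     */
        memcpy(&x, &bits, sizeof x);
        return x;
    }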
-define float @__round_uniform_float(float) nounwind readonly alwaysinline { - %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32 - %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648 - %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i - %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06 - %binop21.i = fadd float %binop.i, -8.388608e+06 - %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32 - %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float - ret float %int_to_float_bitcast.i.i.i -} - -define float @__floor_uniform_float(float) nounwind readonly alwaysinline { - %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind - %bincmp.i = fcmp ogt float %calltmp.i, %0 - %selectexpr.i = sext i1 %bincmp.i to i32 - %bitop.i = and i32 %selectexpr.i, -1082130432 - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i - ret float %binop.i -} - -define float @__ceil_uniform_float(float) nounwind readonly alwaysinline { - %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind - %bincmp.i = fcmp olt float %calltmp.i, %0 - %selectexpr.i = sext i1 %bincmp.i to i32 - %bitop.i = and i32 %selectexpr.i, 1065353216 - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i - ret float %binop.i -} - define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline { %float_to_int_bitcast.i.i.i.i = bitcast <4 x float> %0 to <4 x i32> %bitop.i.i = and <4 x i32> %float_to_int_bitcast.i.i.i.i, @@ -164,10 +95,6 @@ define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysin } ;; FIXME: rounding doubles and double vectors needs to be implemented -declare double @__round_uniform_double(double) nounwind readnone -declare double @__floor_uniform_double(double) nounwind readnone -declare double @__ceil_uniform_double(double) nounwind readnone - declare @__round_varying_double() nounwind readnone declare @__floor_varying_double() nounwind readnone declare @__ceil_varying_double() nounwind readnone @@ -175,78 +102,6 @@ declare @__ceil_varying_double() nounwind readn ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; min/max -define float @__max_uniform_float(float, float) nounwind readnone { - %cmp = fcmp ugt float %0, %1 - %r = select i1 %cmp, float %0, float %1 - ret float %r -} - -define float @__min_uniform_float(float, float) nounwind readnone { - %cmp = fcmp ult float %0, %1 - %r = select i1 %cmp, float %0, float %1 - ret float %r -} - -define i32 @__min_uniform_int32(i32, i32) nounwind readnone { - %cmp = icmp slt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__max_uniform_int32(i32, i32) nounwind readnone { - %cmp = icmp sgt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__min_uniform_uint32(i32, i32) nounwind readnone { - %cmp = icmp ult i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__max_uniform_uint32(i32, i32) nounwind readnone { - %cmp = icmp ugt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i64 @__min_uniform_int64(i64, i64) nounwind readnone { - %cmp = icmp slt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__max_uniform_int64(i64, 
i64) nounwind readnone { - %cmp = icmp sgt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__min_uniform_uint64(i64, i64) nounwind readnone { - %cmp = icmp ult i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__max_uniform_uint64(i64, i64) nounwind readnone { - %cmp = icmp ugt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define double @__min_uniform_double(double, double) nounwind readnone { - %cmp = fcmp olt double %0, %1 - %r = select i1 %cmp, double %0, double %1 - ret double %r -} - -define double @__max_uniform_double(double, double) nounwind readnone { - %cmp = fcmp ogt double %0, %1 - %r = select i1 %cmp, double %0, double %1 - ret double %r -} - declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone @@ -287,44 +142,6 @@ define @__max_varying_uint32(, ) nounwin ret <4 x i32> %r } -define @__min_varying_int64(, ) nounwind readnone { - %m = icmp slt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_int64(, ) nounwind readnone { - %m = icmp sgt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__min_varying_uint64(, ) nounwind readnone { - %m = icmp ult %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_uint64(, ) nounwind readnone { - %m = icmp ugt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__min_varying_double(, - ) nounwind readnone { - %m = fcmp olt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_double(, - ) nounwind readnone { - %m = fcmp ogt %0, %1 - %r = select %m, %0, %1 - ret %r -} - ;; sqrt/rsqrt/rcp declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone @@ -371,13 +188,6 @@ define float @__rcp_uniform_float(float) nounwind readnone { ret float %r } -declare float @llvm.sqrt.f32(float) - -define float @__sqrt_uniform_float(float) nounwind readnone { - %r = call float @llvm.sqrt.f32(float %0) - ret float %r -} - declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) define @__sqrt_varying_float() nounwind readnone { @@ -388,13 +198,6 @@ define @__sqrt_varying_float() nounwind readnone ret <4 x float> %result } -declare double @llvm.sqrt.f64(double) - -define double @__sqrt_uniform_double(double) nounwind readnone { - %r = call double @llvm.sqrt.f64(double %0) - ret double %r -} - declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) define @__sqrt_varying_double() nounwind readnone { @@ -402,21 +205,6 @@ define @__sqrt_varying_double() nounwind readno ret <4 x double> %r } -;; bit ops - -declare i32 @llvm.ctpop.i32(i32) nounwind readnone -declare i64 @llvm.ctpop.i64(i64) nounwind readnone - -define i32 @__popcnt_int32(i32) nounwind readnone { - %v = call i32 @llvm.ctpop.i32(i32 %0) - ret i32 %v -} - -define i64 @__popcnt_int64(i64) nounwind readnone { - %v = call i64 @llvm.ctpop.i64(i64 %0) - ret i64 %v -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions @@ -509,15 +297,38 @@ define float @__reduce_max_float(<4 x float>) nounwind readnone { neon_reduce(float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) } -define internal i32 @add_i32(i32, i32) { - %r = add i32 %0, %1 +declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone + +define i16 @__reduce_add_int8() nounwind readnone { + %v8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <8 x i32> + %a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> 
%v8) + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 + %r16 = trunc i32 %r to i16 + ret i16 %r16 +} + +declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone + +define i32 @__reduce_add_int16() nounwind readnone { + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %0) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 ret i32 %r } -declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone -define i32 @__reduce_add_int32() nounwind readnone { - neon_reduce(i32, @llvm.arm.neon.vpadd.v2i32, @add_i32) +define i64 @__reduce_add_int32() nounwind readnone { + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %0) + %a0 = extractelement <2 x i64> %a64, i32 0 + %a1 = extractelement <2 x i64> %a64, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r } declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone @@ -617,90 +428,60 @@ define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone { } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; unaligned loads/loads+broadcasts +;; int8/int16 -masked_load(i8, 1) -masked_load(i16, 2) -masked_load(i32, 4) -masked_load(float, 4) -masked_load(i64, 8) -masked_load(double, 8) +declare <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone -gen_masked_store(i8) -gen_masked_store(i16) -gen_masked_store(i32) -gen_masked_store(i64) -masked_store_float_double() - -define void @__masked_store_blend_i8(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i8> %new, <4 x i8> %old - store %result, * %ptr - ret void +define <4 x i8> @__avg_up_uint8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r } -define void @__masked_store_blend_i16(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i16> %new, <4 x i16> %old - store %result, * %ptr - ret void +declare <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone + +define <4 x i8> @__avg_up_int8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r } -define void @__masked_store_blend_i32(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i32> %new, <4 x i32> %old - store %result, * %ptr - ret void +declare <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone + +define <4 x i8> @__avg_down_uint8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r } -define void @__masked_store_blend_i64(* nocapture %ptr, - %new, %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i64> %new, <4 x i64> %old - store %result, * %ptr - ret void +declare <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8>, <4 x i8>) 
nounwind readnone + +define <4 x i8> @__avg_down_int8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r } -;; yuck. We need declarations of these, even though we shouldnt ever -;; actually generate calls to them for the NEON target... +declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare @__svml_sin() -declare @__svml_cos() -declare void @__svml_sincos(, *, *) -declare @__svml_tan() -declare @__svml_atan() -declare @__svml_atan2(, ) -declare @__svml_exp() -declare @__svml_log() -declare @__svml_pow(, ) +define <4 x i16> @__avg_up_uint16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; gather +declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double) +define <4 x i16> @__avg_up_int16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} -gen_scatter(i8) -gen_scatter(i16) -gen_scatter(i32) -gen_scatter(float) -gen_scatter(i64) -gen_scatter(double) +declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -packed_load_and_store(4) +define <4 x i16> @__avg_down_uint16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; prefetch +declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -define_prefetches() +define <4 x i16> @__avg_down_int16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} diff --git a/builtins/target-neon-8.ll b/builtins/target-neon-8.ll new file mode 100644 index 00000000..2accfe53 --- /dev/null +++ b/builtins/target-neon-8.ll @@ -0,0 +1,583 @@ +;; +;; target-neon-8.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`16') +define(`MASK',`i8') + +include(`util.m4') +include(`target-neon-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone { + unary4to16conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v) + ret <16 x float> %r +} + +define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone { + unary4to16conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v) + ret <16 x i16> %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast <16 x float> %0 to <16 x i32> + %bitop.i.i = and <16 x i32> %float_to_int_bitcast.i.i.i.i, + + %bitop.i = xor <16 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i40.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %int_to_float_bitcast.i.i40.i, + + %binop21.i = fadd <16 x float> %binop.i, + + %float_to_int_bitcast.i.i.i = bitcast <16 x float> %binop21.i to <16 x i32> + %bitop31.i = xor <16 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop31.i to <16 x float> + ret <16 x float> %int_to_float_bitcast.i.i.i +} + +define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind + %bincmp.i = fcmp ogt <16 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32> + %bitop.i = and <16 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <16 x float> %binop.i +} + +define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind + %bincmp.i = fcmp olt <16 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32> + %bitop.i = and <16 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <16 x float> %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare @__round_varying_double() nounwind readnone +declare @__floor_varying_double() nounwind readnone +declare @__ceil_varying_double() nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind 
readnone +declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__max_varying_float(, + ) nounwind readnone { + binary4to16(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1) + ret %r +} + +define @__min_varying_float(, + ) nounwind readnone { + binary4to16(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1) + ret %r +} + +declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +define @__min_varying_int32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1) + ret %r +} + +define @__max_varying_int32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1) + ret %r +} + +define @__min_varying_uint32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1) + ret %r +} + +define @__max_varying_uint32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1) + ret %r +} + +;; sqrt/rsqrt/rcp + +declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rcp_varying_float( %d) nounwind readnone { + unary4to16(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d) + binary4to16(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0) + %x1 = fmul %x0, %x0_nr + binary4to16(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %d) nounwind readnone { + unary4to16(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d) + %x0_2 = fmul %x0, %x0 + binary4to16(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2) + %x1 = fmul %x0, %x0_nr + %x1_2 = fmul %x1, %x1 + binary4to16(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +define float @__rsqrt_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <16 x i32> + %vr = call <16 x float> @__rsqrt_varying_float(<16 x float> %vs) + %r = extractelement <16 x float> %vr, i32 0 + ret float %r +} + +define float @__rcp_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <16 x i32> + %vr = call <16 x float> @__rcp_varying_float(<16 x float> %vs) + %r = extractelement <16 x float> %vr, i32 0 + ret float %r +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define @__sqrt_varying_float() nounwind readnone { + unary4to16(result, float, @llvm.sqrt.v4f32, %0) +;; this returns nan for v=0, which is undesirable.. 
+;; %rsqrt = call @__rsqrt_varying_float( %0) +;; %result = fmul <4 x float> %rsqrt, %0 + ret <16 x float> %result +} + +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) + +define @__sqrt_varying_double() nounwind readnone { + unary4to16(r, double, @llvm.sqrt.v4f64, %0) + ret %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reductions + +define i64 @__movmsk() nounwind readnone { + %and_mask = and %0, + + %v8 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %and_mask) + %v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %v8) + %v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4) + %va = extractelement <2 x i64> %v2, i32 0 + %vb = extractelement <2 x i64> %v2, i32 1 + %vbshift = shl i64 %vb, 8 + %v = or i64 %va, %vbshift + ret i64 %v +} + +define i1 @__any() nounwind readnone alwaysinline { + v16tov8(MASK, %0, %v8a, %v8b) + %vor8 = or <8 x MASK> %v8a, %v8b + %v16 = sext <8 x i8> %vor8 to <8 x i16> + v8tov4(i16, %v16, %v16a, %v16b) + %vor16 = or <4 x i16> %v16a, %v16b + %v32 = sext <4 x i16> %vor16 to <4 x i32> + v4tov2(i32, %v32, %v32a, %v32b) + %vor32 = or <2 x i32> %v32a, %v32b + %v0 = extractelement <2 x i32> %vor32, i32 0 + %v1 = extractelement <2 x i32> %vor32, i32 1 + %v = or i32 %v0, %v1 + %cmp = icmp ne i32 %v, 0 + ret i1 %cmp +} + +define i1 @__all() nounwind readnone alwaysinline { + v16tov8(MASK, %0, %v8a, %v8b) + %vand8 = and <8 x MASK> %v8a, %v8b + %v16 = sext <8 x i8> %vand8 to <8 x i16> + v8tov4(i16, %v16, %v16a, %v16b) + %vand16 = and <4 x i16> %v16a, %v16b + %v32 = sext <4 x i16> %vand16 to <4 x i32> + v4tov2(i32, %v32, %v32a, %v32b) + %vand32 = and <2 x i32> %v32a, %v32b + %v0 = extractelement <2 x i32> %vand32, i32 0 + %v1 = extractelement <2 x i32> %vand32, i32 1 + %v = and i32 %v0, %v1 + %cmp = icmp ne i32 %v, 0 + ret i1 %cmp +} + +define i1 @__none() nounwind readnone alwaysinline { + %any = call i1 @__any( %0) + %none = icmp eq i1 %any, 0 + ret i1 %none +} + +;; $1: scalar type +;; $2: vector/vector reduce function (2 x -> ) +;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>) +;; $4: scalar reduce function + +define(`neon_reduce', ` + v16tov8($1, %0, %va, %vb) + %va_16 = shufflevector <8 x $1> %va, <8 x $1> undef, + <16 x i32> + %vb_16 = shufflevector <8 x $1> %vb, <8 x $1> undef, + <16 x i32> + %v8 = call <16 x $1> $2(<16 x $1> %va_16, <16 x $1> %vb_16) + + %v8a = shufflevector <16 x $1> %v8, <16 x $1> undef, + <16 x i32> + %v8b = shufflevector <16 x $1> %v8, <16 x $1> undef, + <16 x i32> + + %v4 = call <16 x $1> $2(<16 x $1> %v8a, <16 x $1> %v8b) + + %vfirst_4 = shufflevector <16 x $1> %v4, <16 x $1> undef, + <4 x i32> + v4tov2($1, %vfirst_4, %v0, %v1) + %vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1) + %vh0 = extractelement <2 x $1> %vh, i32 0 + %vh1 = extractelement <2 x $1> %vh, i32 1 + %r = call $1 $4($1 %vh0, $1 %vh1) + ret $1 %r +') + +declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @add_f32(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define internal @__add_varying_float(, ) { + %r = fadd %0, %1 + ret %r +} + +define float @__reduce_add_float() nounwind readnone { + neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @min_f32(float, float) { + %cmp = fcmp olt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret 
float %r
+}
+
+define float @__reduce_min_float(<16 x float>) nounwind readnone {
+  neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32)
+}
+
+declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone
+
+define internal float @max_f32(float, float) {
+  %cmp = fcmp ugt float %0, %1
+  %r = select i1 %cmp, float %0, float %1
+  ret float %r
+}
+
+define float @__reduce_max_float(<16 x float>) nounwind readnone {
+  neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32)
+}
+
+declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone
+
+define i64 @__reduce_add_int8(<16 x i8>) nounwind readnone {
+  %a16 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %0)
+  %a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a16)
+  %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32)
+  %a0 = extractelement <2 x i64> %a64, i32 0
+  %a1 = extractelement <2 x i64> %a64, i32 1
+  %r = add i64 %a0, %a1
+  ret i64 %r
+}
+
+define i64 @__reduce_add_int16(<16 x i16>) nounwind readnone {
+  v16tov8(i16, %0, %va, %vb)
+  %a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %va)
+  %b32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %vb)
+  %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32)
+  %b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %b32)
+  %sum = add <2 x i64> %a64, %b64
+  %a0 = extractelement <2 x i64> %sum, i32 0
+  %a1 = extractelement <2 x i64> %sum, i32 1
+  %r = add i64 %a0, %a1
+  ret i64 %r
+}
+
+define i64 @__reduce_add_int32(<16 x i32>) nounwind readnone {
+  v16tov4(i32, %0, %va, %vb, %vc, %vd)
+  %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va)
+  %b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb)
+  %c64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vc)
+  %d64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vd)
+  %ab = add <2 x i64> %a64, %b64
+  %cd = add <2 x i64> %c64, %d64
+  %sum = add <2 x i64> %ab, %cd
+  %a0 = extractelement <2 x i64> %sum, i32 0
+  %a1 = extractelement <2 x i64> %sum, i32 1
+  %r = add i64 %a0, %a1
+  ret i64 %r
+}
+
+declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+define internal i32 @min_si32(i32, i32) {
+  %cmp = icmp slt i32 %0, %1
+  %r = select i1 %cmp, i32 %0, i32 %1
+  ret i32 %r
+}
+
+define i32 @__reduce_min_int32(<16 x i32>) nounwind readnone {
+  neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32)
+}
+
+declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+define internal i32 @max_si32(i32, i32) {
+  %cmp = icmp sgt i32 %0, %1
+  %r = select i1 %cmp, i32 %0, i32 %1
+  ret i32 %r
+}
+
+define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone {
+  neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32)
+}
+
+declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+define internal i32 @min_ui32(i32, i32) {
+  %cmp = icmp ult i32 %0, %1
+  %r = select i1 %cmp, i32 %0, i32 %1
+  ret i32 %r
+}
+
+define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone {
+  neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpmins.v2i32, @min_ui32)
+}
+
+declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+define internal i32
@max_ui32(i32, i32) { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_uint32() nounwind readnone { + neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxs.v2i32, @max_ui32) +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define internal @__add_varying_double(, ) { + %r = fadd %0, %1 + ret %r +} + +define double @__reduce_add_double() nounwind readnone { + reduce16(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double() nounwind readnone { + reduce16(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double() nounwind readnone { + reduce16(double, @__max_varying_double, @__max_uniform_double) +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define internal @__add_varying_int64(, ) { + %r = add %0, %1 + ret %r +} + +define i64 @__reduce_add_int64() nounwind readnone { + reduce16(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64() nounwind readnone { + reduce16(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64() nounwind readnone { + reduce16(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64() nounwind readnone { + reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64() nounwind readnone { + reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_up_int8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_down_uint8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_down_int8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_up_int16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> 
@llvm.arm.neon.vrhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_down_uint16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_down_int16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} diff --git a/builtins/target-neon-common.ll b/builtins/target-neon-common.ll new file mode 100644 index 00000000..1c0b421f --- /dev/null +++ b/builtins/target-neon-common.ll @@ -0,0 +1,346 @@ +;; +;; target-neon-common.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
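;; The round/floor/ceil routines later in this file round by adding and then
;; subtracting 2^23 (8.388608e+06) instead of using a native instruction.
;; A minimal illustrative sketch of the idea, assuming a non-negative input
;; (the hypothetical @example_* name below is not defined by this patch):
;; once x is pushed up near 2^23, the float's ulp is 1.0, so the hardware's
;; round-to-nearest-even discards the fraction; subtracting 2^23 recovers the
;; rounded value. The actual __round_uniform_float additionally strips and
;; restores the sign bit so the same trick covers negative inputs.
define float @example_round_nonneg_float(float %x) nounwind readnone {
  ;; valid for 0 <= x < 2^23
  %up = fadd float %x, 8.388608e+06
  %r = fadd float %up, -8.388608e+06
  ret float %r
}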
+ +target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32" + +stdlib_core() +scans() +reduce_equal(WIDTH) +rdrand_decls() +define_shuffles() +aossoa() +ctlztz() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone + +define float @__half_to_float_uniform(i16 %v) nounwind readnone { + %v1 = bitcast i16 %v to <1 x i16> + %vec = shufflevector <1 x i16> %v1, <1 x i16> undef, + <4 x i32> + %h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec) + %r = extractelement <4 x float> %h, i32 0 + ret float %r +} + +define i16 @__float_to_half_uniform(float %v) nounwind readnone { + %v1 = bitcast float %v to <1 x float> + %vec = shufflevector <1 x float> %v1, <1 x float> undef, + <4 x i32> + %h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec) + %r = extractelement <4 x i16> %h, i32 0 + ret i16 %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +define void @__fastmath() nounwind { + ret void +} + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define float @__round_uniform_float(float) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32 + %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648 + %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i + %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06 + %binop21.i = fadd float %binop.i, -8.388608e+06 + %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32 + %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float + ret float %int_to_float_bitcast.i.i.i +} + +define float @__floor_uniform_float(float) nounwind readonly alwaysinline { + %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind + %bincmp.i = fcmp ogt float %calltmp.i, %0 + %selectexpr.i = sext i1 %bincmp.i to i32 + %bitop.i = and i32 %selectexpr.i, -1082130432 + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i + ret float %binop.i +} + +define float @__ceil_uniform_float(float) nounwind readonly alwaysinline { + %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind + %bincmp.i = fcmp olt float %calltmp.i, %0 + %selectexpr.i = sext i1 %bincmp.i to i32 + %bitop.i = and i32 %selectexpr.i, 1065353216 + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i + ret float %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare double @__round_uniform_double(double) nounwind readnone +declare double @__floor_uniform_double(double) nounwind readnone +declare double @__ceil_uniform_double(double) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +define float @__max_uniform_float(float, float) nounwind readnone { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__min_uniform_float(float, 
float) nounwind readnone { + %cmp = fcmp ult float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define i32 @__min_uniform_int32(i32, i32) nounwind readnone { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__max_uniform_int32(i32, i32) nounwind readnone { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__min_uniform_uint32(i32, i32) nounwind readnone { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__max_uniform_uint32(i32, i32) nounwind readnone { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i64 @__min_uniform_int64(i64, i64) nounwind readnone { + %cmp = icmp slt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__max_uniform_int64(i64, i64) nounwind readnone { + %cmp = icmp sgt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__min_uniform_uint64(i64, i64) nounwind readnone { + %cmp = icmp ult i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__max_uniform_uint64(i64, i64) nounwind readnone { + %cmp = icmp ugt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define double @__min_uniform_double(double, double) nounwind readnone { + %cmp = fcmp olt double %0, %1 + %r = select i1 %cmp, double %0, double %1 + ret double %r +} + +define double @__max_uniform_double(double, double) nounwind readnone { + %cmp = fcmp ogt double %0, %1 + %r = select i1 %cmp, double %0, double %1 + ret double %r +} + +define @__min_varying_int64(, ) nounwind readnone { + %m = icmp slt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_int64(, ) nounwind readnone { + %m = icmp sgt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__min_varying_uint64(, ) nounwind readnone { + %m = icmp ult %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_uint64(, ) nounwind readnone { + %m = icmp ugt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__min_varying_double(, + ) nounwind readnone { + %m = fcmp olt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_double(, + ) nounwind readnone { + %m = fcmp ogt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +;; sqrt/rsqrt/rcp + +declare float @llvm.sqrt.f32(float) + +define float @__sqrt_uniform_float(float) nounwind readnone { + %r = call float @llvm.sqrt.f32(float %0) + ret float %r +} + +declare double @llvm.sqrt.f64(double) + +define double @__sqrt_uniform_double(double) nounwind readnone { + %r = call double @llvm.sqrt.f64(double %0) + ret double %r +} + +;; bit ops + +declare i32 @llvm.ctpop.i32(i32) nounwind readnone +declare i64 @llvm.ctpop.i64(i64) nounwind readnone + +define i32 @__popcnt_int32(i32) nounwind readnone { + %v = call i32 @llvm.ctpop.i32(i32 %0) + ret i32 %v +} + +define i64 @__popcnt_int64(i64) nounwind readnone { + %v = call i64 @llvm.ctpop.i64(i64 %0) + ret i64 %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) +masked_store_float_double() + +define void @__masked_store_blend_i8(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc 
%mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i16(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i32(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i64(* nocapture %ptr, + %new, %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +;; yuck. We need declarations of these, even though we shouldnt ever +;; actually generate calls to them for the NEON target... + + +include(`svml.m4') +svml_stubs(float,f,WIDTH) +svml_stubs(double,d,WIDTH) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) + +packed_load_and_store(4) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; prefetch + +define_prefetches() diff --git a/builtins/target-nvptx64.ll b/builtins/target-nvptx64.ll index f3f9bfd9..a728803f 100644 --- a/builtins/target-nvptx64.ll +++ b/builtins/target-nvptx64.ll @@ -5,6 +5,10 @@ define(`WIDTH',`1') include(`util.m4') +include(`svml.m4') +svml_stubs(float,f,WIDTH) +svml_stubs(double,d,WIDTH) + ; Define some basics for a 1-wide target stdlib_core() packed_load_and_store() @@ -467,6 +471,9 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline { declare i32 @llvm.ctpop.i32(i32) nounwind readnone +declare i16 @__reduce_add_int8() nounwind readnone +declare i32 @__reduce_add_int16() nounwind readnone + define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline { %call = call i32 @llvm.ctpop.i32(i32 %0) ret i32 %call @@ -643,103 +650,6 @@ define <1 x double> @__rsqrt_varying_double(<1 x double> %v) nounwind readonly } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0) - ;ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm.sin.f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - unary1to1(float,@llvm.sin.f32) - -} - -define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0) - ;ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm.cos.f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - unary1to1(float, @llvm.cos.f32) - -} - -define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline { -; %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0) -; store <1 x float> %s, <1 x float> * %1 -; ret void - %sin = call <1 x float> @__svml_sin (<1 x float> %0) - %cos = call <1 x float> @__svml_cos (<1 x float> %0) - store <1 x float> %sin, <1 x float> * %1 - store <1 x 
float> %cos, <1 x float> * %2 - ret void -} - -define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0) - ;ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm_tan_f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - ;unasry1to1(float, @llvm.tan.f32) - ; UNSUPPORTED! - ret <1 x float > %0 -} - -define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline { -; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0) -; ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm_atan_f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - ;unsary1to1(float,@llvm.atan.f32) - ;UNSUPPORTED! - ret <1 x float > %0 - -} - -define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1) - ;ret <1 x float> %ret - ;%y = extractelement <1 x float> %0, i32 0 - ;%x = extractelement <1 x float> %1, i32 0 - ;%q = fdiv float %y, %x - ;%a = call float @llvm.atan.f32 (float %q) - ;%rv = insertelement <1 x float> undef, float %a, i32 0 - ;ret <1 x float> %rv - ; UNSUPPORTED! - ret <1 x float > %0 -} - -define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0) - ;ret <1 x float> %ret - unary1to1(float, @llvm.exp.f32) -} - -define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0) - ;ret <1 x float> %ret - unary1to1(float, @llvm.log.f32) -} - -define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1) - ;ret <1 x float> %ret - %r = extractelement <1 x float> %0, i32 0 - %e = extractelement <1 x float> %1, i32 0 - %s = call float @llvm.pow.f32(float %r,float %e) - %rv = insertelement <1 x float> undef, float %s, i32 0 - ret <1 x float> %rv - -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max @@ -957,3 +867,8 @@ declare float @__half_to_float_uniform(i16 %v) nounwind readnone declare @__half_to_float_varying( %v) nounwind readnone declare i16 @__float_to_half_uniform(float %v) nounwind readnone declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index c6a3afe2..ad1d88bc 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -269,4 +269,8 @@ define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline { ret i64 %val } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 73361720..77bf1a9d 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -105,87 +105,14 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; svml stuff -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> 
@__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone +include(`svml.m4') +;; single precision +svml_declare(float,f4,4) +svml_define_x(float,f4,4,f,8) - -define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_sinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_cosf4, %0) - ret <8 x float> %ret -} - -define void @__svml_sincos(<8 x float>, <8 x float> *, - <8 x float> *) nounwind readnone alwaysinline { - ; call svml_sincosf4 two times with the two 4-wide sub-vectors - %a = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - %b = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - - %cospa = alloca <4 x float> - %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) - - %cospb = alloca <4 x float> - %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) - - %sin = shufflevector <4 x float> %sa, <4 x float> %sb, - <8 x i32> - store <8 x float> %sin, <8 x float> * %1 - - %cosa = load <4 x float> * %cospa - %cosb = load <4 x float> * %cospb - %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, - <8 x i32> - store <8 x float> %cos, <8 x float> * %2 - - ret void -} - -define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_tanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_atanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan2(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_atan2f4, %0, %1) - ret <8 x float> %ret -} - -define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_expf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_logf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_pow(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_powf4, %0, %1) - ret <8 x float> %ret -} +;; double precision +svml_declare(double,2,2) +svml_define_x(double,2,2,d,8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -367,6 +294,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret 
<8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + define <4 x float> @__vec4_add_float(<4 x float> %v0, <4 x float> %v1) nounwind readnone alwaysinline { %v = fadd <4 x float> %v0, %v1 diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index 2bb06391..e42d4990 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -267,6 +267,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <4 x i16> @__add_varying_i16(<4 x i16>, + <4 x i16>) nounwind readnone alwaysinline { + %r = add <4 x i16> %0, %1 + ret <4 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline { + reduce4(i16, @__add_varying_i16, @__add_uniform_i16) +} + define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline { %v1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> @@ -466,62 +496,15 @@ define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysin ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; svml stuff -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone +include(`svml.m4') +;; single precision +svml_declare(float,f4,4) +svml_define(float,f4,4,f) +;; double precision +svml_declare(double,2,2) +svml_define_x(double,2,2,d,4) -define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) - ret <4 x float> %ret -} - -define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { - %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) - store <4 x float> %s, <4 x float> * %1 - ret void -} - -define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> 
@__svml_atanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - -define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_expf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_logf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll new file mode 100644 index 00000000..72b81ff0 --- /dev/null +++ b/builtins/target-sse4-16.ll @@ -0,0 +1,490 @@ +;; Copyright (c) 2013, Google, Inc. +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Google, Inc. nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
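;; The rcp/rsqrt routines below start from the approximate rcpps/rsqrtps
;; hardware results and sharpen them with a Newton-Raphson step. A minimal
;; illustrative sketch of the reciprocal update iv' = iv * (2 - v * iv)
;; (the hypothetical @example_* name is not defined by this patch); each
;; such step roughly doubles the number of correct mantissa bits:
define float @example_rcp_newton_step(float %v, float %iv) nounwind readnone {
  ;; %iv is an initial estimate of 1/%v
  %v_iv = fmul float %v, %iv
  %two_minus = fsub float 2.000000e+00, %v_iv
  %iv_refined = fmul float %iv, %two_minus
  ret float %iv_refined
}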
+ + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Define common 4-wide stuff +define(`WIDTH',`8') +define(`MASK',`i16') +include(`util.m4') + +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare float @__half_to_float_uniform(i16 %v) nounwind readnone +declare @__half_to_float_varying( %v) nounwind readnone +declare i16 @__float_to_half_uniform(float %v) nounwind readnone +declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +define @__rcp_varying_float() nounwind readonly alwaysinline { + unary4to8(call, float, @llvm.x86.sse.rcp.ps, %0) + ; do one N-R iteration to improve precision + ; float iv = __rcp_v(v); + ; return iv * (2. - v * iv); + %v_iv = fmul <8 x float> %0, %call + %two_minus = fsub <8 x float> , %v_iv + %iv_mul = fmul <8 x float> %call, %two_minus + ret <8 x float> %iv_mul +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; rsqrt + +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. - (v * is) * is); + %v_is = fmul <8 x float> %v, %is + %v_is_is = fmul <8 x float> %v_is, %is + %three_sub = fsub <8 x float> , %v_is_is + %is_mul = fmul <8 x float> %is, %three_sub + %half_scale = fmul <8 x float> , %is_mul + ret <8 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; sqrt + +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline { + unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0) + ret <8 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone + +define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind +alwaysinline { + unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0) + ret <8 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding floats + +declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone + +define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 + round4to8(%0, 8) +} + +define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + round4to8(%0, 9) +} + +define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + round4to8(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding doubles + +declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone + +define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline { + 
round2to8double(%0, 8) +} + +define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline { + round2to8double(%0, 9) +} + +define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline { + round2to8double(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline { + binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1) + ret <8 x float> %call +} + +define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline { + binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1) + ret <8 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 min/max + +define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pminsd, %0, %1) + ret <8 x i32> %call +} + +define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1) + ret <8 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unsigned int min/max + +define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1) + ret <8 x i32> %call +} + +define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1) + ret <8 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone { + binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1) + ret <8 x double> %ret +} + +define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone { + binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1) + ret <8 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME +include(`svml.m4') +svml_stubs(float,f,WIDTH) +svml_stubs(double,d,WIDTH) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; horizontal ops / reductions + +declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone + +define i64 @__movmsk(<8 x MASK>) nounwind readnone alwaysinline { + %m8 = trunc <8 x MASK> %0 to <8 x i8> + %mask8 = shufflevector <8 x i8> %m8, <8 x i8> zeroinitializer, + <16 x i32> + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %mask8) + %m64 = zext i32 %m to i64 + ret i64 %m64 +} + +define i1 @__any(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %mne = icmp ne i64 %m, 0 + ret i1 %mne +} + +define i1 @__all(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %meq = icmp eq i64 %m, ALL_ON_MASK + ret i1 %meq +} + +define i1 @__none(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %meq = icmp eq i64 
%m, 0 + ret i1 %meq +} + +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + +define internal <8 x float> @__add_varying_float(<8 x float>, <8 x float>) { + %r = fadd <8 x float> %0, %1 + ret <8 x float> %r +} + +define internal float @__add_uniform_float(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline { + reduce8(float, @__add_varying_float, @__add_uniform_float) +} + +define float @__reduce_min_float(<8 x float>) nounwind readnone { + reduce8(float, @__min_varying_float, @__min_uniform_float) +} + +define float @__reduce_max_float(<8 x float>) nounwind readnone { + reduce8(float, @__max_varying_float, @__max_uniform_float) +} + +define internal <8 x i32> @__add_varying_int32(<8 x i32>, <8 x i32>) { + %r = add <8 x i32> %0, %1 + ret <8 x i32> %r +} + +define internal i32 @__add_uniform_int32(i32, i32) { + %r = add i32 %0, %1 + ret i32 %r +} + +define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__add_varying_int32, @__add_uniform_int32) +} + +define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__min_varying_int32, @__min_uniform_int32) +} + +define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__max_varying_int32, @__max_uniform_int32) +} + +define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone { + reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32) +} + +define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone { + reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32) +} + +define internal <8 x double> @__add_varying_double(<8 x double>, <8 x double>) { + %r = fadd <8 x double> %0, %1 + ret <8 x double> %r +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define double @__reduce_add_double(<8 x double>) nounwind readnone { + reduce8(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double(<8 x double>) nounwind readnone { + reduce8(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double(<8 x double>) nounwind readnone { + reduce8(double, @__max_varying_double, @__max_uniform_double) +} + +define internal <8 x i64> @__add_varying_int64(<8 x i64>, <8 x i64>) { + %r = add <8 x i64> %0, %1 + ret <8 x i64> %r +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__min_varying_int64, 
@__min_uniform_int64) +} + +define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone { + reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone { + reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +reduce_equal(8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +define void @__masked_store_blend_i64(<8 x i64>* nocapture, <8 x i64>, + <8 x MASK> %mask) nounwind + alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i64>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i64> %1, <8 x i64> %old + store <8 x i64> %blend, <8 x i64>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i32>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i32> %1, <8 x i32> %old + store <8 x i32> %blend, <8 x i32>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i16(<8 x i16>* nocapture, <8 x i16>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i16>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i16> %1, <8 x i16> %old + store <8 x i16> %blend, <8 x i16>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i8>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i8> %1, <8 x i8> %old + store <8 x i8> %blend, <8 x i8>* %0, align 4 + ret void +} + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) + +masked_store_float_double() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather/scatter + +; define these with the macros from stdlib.m4 + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret <8 x i8> %r +} + +declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) { + %r = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +define_avg_up_int8() +define_avg_up_int16() +define_down_avgs() diff --git a/builtins/target-sse4-8.ll 
b/builtins/target-sse4-8.ll new file mode 100644 index 00000000..69b355e3 --- /dev/null +++ b/builtins/target-sse4-8.ll @@ -0,0 +1,492 @@ +;; Copyright (c) 2013, Google, Inc. +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Google, Inc. nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Define common 4-wide stuff +define(`WIDTH',`16') +define(`MASK',`i8') +include(`util.m4') + +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare float @__half_to_float_uniform(i16 %v) nounwind readnone +declare @__half_to_float_varying( %v) nounwind readnone +declare i16 @__float_to_half_uniform(float %v) nounwind readnone +declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +define @__rcp_varying_float() nounwind readonly alwaysinline { + unary4to16(call, float, @llvm.x86.sse.rcp.ps, %0) + ; do one N-R iteration to improve precision + ; float iv = __rcp_v(v); + ; return iv * (2. - v * iv); + %v_iv = fmul <16 x float> %0, %call + %two_minus = fsub <16 x float> , %v_iv + %iv_mul = fmul <16 x float> %call, %two_minus + ret <16 x float> %iv_mul +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; rsqrt + +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + unary4to16(is, float, @llvm.x86.sse.rsqrt.ps, %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. 
- (v * is) * is); + %v_is = fmul <16 x float> %v, %is + %v_is_is = fmul <16 x float> %v_is, %is + %three_sub = fsub <16 x float> , %v_is_is + %is_mul = fmul <16 x float> %is, %three_sub + %half_scale = fmul <16 x float> , %is_mul + ret <16 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; sqrt + +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline { + unary4to16(call, float, @llvm.x86.sse.sqrt.ps, %0) + ret <16 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone + +define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind +alwaysinline { + unary2to16(ret, double, @llvm.x86.sse2.sqrt.pd, %0) + ret <16 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding floats + +declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone + +define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 + round4to16(%0, 8) +} + +define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + round4to16(%0, 9) +} + +define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + round4to16(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding doubles + +declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone + +define <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline { +; XXXround2to4double(%0, 8) + ; FIXME: need round2to16double in util.m4... 
+ ret <16 x double> undef +} + +define <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline { + ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 +; XXXround2to4double(%0, 9) + ret <16 x double> undef +} + +define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline { + ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10 +; XXXround2to4double(%0, 10) + ret <16 x double> undef +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define <16 x float> @__max_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline { + binary4to16(call, float, @llvm.x86.sse.max.ps, %0, %1) + ret <16 x float> %call +} + +define <16 x float> @__min_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline { + binary4to16(call, float, @llvm.x86.sse.min.ps, %0, %1) + ret <16 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 min/max + +define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pminsd, %0, %1) + ret <16 x i32> %call +} + +define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1) + ret <16 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unsigned int min/max + +define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pminud, %0, %1) + ret <16 x i32> %call +} + +define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pmaxud, %0, %1) + ret <16 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone { + binary2to16(ret, double, @llvm.x86.sse2.min.pd, %0, %1) + ret <16 x double> %ret +} + +define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone { + binary2to16(ret, double, @llvm.x86.sse2.max.pd, %0, %1) + ret <16 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME + +include(`svml.m4') +svml_stubs(float,f,WIDTH) +svml_stubs(double,d,WIDTH) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; horizontal ops / reductions + +declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone + +define i64 @__movmsk(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %m64 = zext i32 %m to i64 + ret i64 %m64 +} + +define i1 @__any(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %mne = icmp ne i32 %m, 0 + ret i1 %mne +} + +define i1 @__all(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %meq = icmp 
eq i32 %m, ALL_ON_MASK + ret i1 %meq +} + +define i1 @__none(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %meq = icmp eq i32 %m, 0 + ret i1 %meq +} + +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline { + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <16 x i16> @__add_varying_i16(<16 x i16>, + <16 x i16>) nounwind readnone alwaysinline { + %r = add <16 x i16> %0, %1 + ret <16 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline { + reduce16(i16, @__add_varying_i16, @__add_uniform_i16) +} + +define internal <16 x float> @__add_varying_float(<16 x float>, <16 x float>) { + %r = fadd <16 x float> %0, %1 + ret <16 x float> %r +} + +define internal float @__add_uniform_float(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline { + reduce16(float, @__add_varying_float, @__add_uniform_float) +} + +define float @__reduce_min_float(<16 x float>) nounwind readnone { + reduce16(float, @__min_varying_float, @__min_uniform_float) +} + +define float @__reduce_max_float(<16 x float>) nounwind readnone { + reduce16(float, @__max_varying_float, @__max_uniform_float) +} + +define internal <16 x i32> @__add_varying_int32(<16 x i32>, <16 x i32>) { + %r = add <16 x i32> %0, %1 + ret <16 x i32> %r +} + +define internal i32 @__add_uniform_int32(i32, i32) { + %r = add i32 %0, %1 + ret i32 %r +} + +define i32 @__reduce_add_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__add_varying_int32, @__add_uniform_int32) +} + +define i32 @__reduce_min_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__min_varying_int32, @__min_uniform_int32) +} + +define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__max_varying_int32, @__max_uniform_int32) +} + +define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone { + reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32) +} + +define i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone { + reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32) +} + +define internal <16 x double> @__add_varying_double(<16 x double>, <16 x double>) { + %r = fadd <16 x double> %0, %1 + ret <16 x double> %r +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define double @__reduce_add_double(<16 x double>) nounwind readnone { + reduce16(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double(<16 x double>) nounwind readnone { + reduce16(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double(<16 x double>) nounwind readnone { + reduce16(double, @__max_varying_double, @__max_uniform_double) +} + +define internal <16 x i64> @__add_varying_int64(<16 x i64>, <16 x i64>) { + %r = add <16 x i64> %0, %1 + ret <16 x i64> %r +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define i64 @__reduce_add_int64(<16 x i64>) nounwind readnone { + 
reduce16(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64(<16 x i64>) nounwind readnone { + reduce16(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone { + reduce16(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone { + reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone { + reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +reduce_equal(16) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +define void @__masked_store_blend_i64(<16 x i64>* nocapture, <16 x i64>, + <16 x i8> %mask) nounwind + alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i64>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i64> %1, <16 x i64> %old + store <16 x i64> %blend, <16 x i64>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i32(<16 x i32>* nocapture, <16 x i32>, + <16 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i32>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i32> %1, <16 x i32> %old + store <16 x i32> %blend, <16 x i32>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, + <16 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i16>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i16> %1, <16 x i16> %old + store <16 x i16> %blend, <16 x i16>* %0, align 4 + ret void +} + +declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone + +define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>, + <16 x MASK> %mask) nounwind alwaysinline { + %old = load <16 x i8>* %0, align 4 + %blend = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %old, <16 x i8> %1, + <16 x i8> %mask) + store <16 x i8> %blend, <16 x i8>* %0, align 4 + ret void +} + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) + +masked_store_float_double() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather/scatter + +; define these with the macros from stdlib.m4 + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x 
i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +define_avg_up_int8() +define_avg_up_int16() +define_down_avgs() diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index ccae4d51..842db53f 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -105,87 +105,14 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; svml stuff -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone +include(`svml.m4') +;; single precision +svml_declare(float,f4,4) +svml_define_x(float,f4,4,f,8) - -define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_sinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_cosf4, %0) - ret <8 x float> %ret -} - -define void @__svml_sincos(<8 x float>, <8 x float> *, - <8 x float> *) nounwind readnone alwaysinline { - ; call svml_sincosf4 two times with the two 4-wide sub-vectors - %a = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - %b = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - - %cospa = alloca <4 x float> - %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) - - %cospb = alloca <4 x float> - %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) - - %sin = shufflevector <4 x float> %sa, <4 x float> %sb, - <8 x i32> - store <8 x float> %sin, <8 x float> * %1 - - %cosa = load <4 x float> * %cospa - %cosb = load <4 x float> * %cospb - %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, - <8 x i32> - store <8 x float> %cos, <8 x float> * %2 - - ret void -} - -define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_tanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_atanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan2(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_atan2f4, %0, %1) - ret <8 x float> %ret -} - -define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_expf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_logf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_pow(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_powf4, %0, %1) - ret <8 x float> %ret -} +;; double precision +svml_declare(double,2,2) +svml_define_x(double,2,2,d,8) 
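The hand-written SVML wrappers deleted above are replaced by the svml_declare/svml_define_x macros, which generate the same split-and-recombine glue: a wide __svml_* entry point calls the 4-wide (or 2-wide, for double) library routine on each sub-vector and stitches the results back together. A minimal C sketch of that wrapping; vec4_sin is a stand-in for a 4-wide library entry point such as __svml_sinf4 and is not the real SVML interface:

#include <math.h>

typedef struct { float v[4]; } vec4f;

/* Stand-in for the 4-wide vector primitive supplied by the math library. */
static vec4f vec4_sin(vec4f x) {
    vec4f r;
    for (int i = 0; i < 4; ++i) r.v[i] = sinf(x.v[i]);
    return r;
}

/* 8-wide wrapper: run the 4-wide primitive on each half of the input,
 * the shape the svml_define_x macro expands to in IR. */
static void sin8(const float in[8], float out[8]) {
    vec4f lo, hi;
    for (int i = 0; i < 4; ++i) { lo.v[i] = in[i]; hi.v[i] = in[i + 4]; }
    lo = vec4_sin(lo);
    hi = vec4_sin(hi);
    for (int i = 0; i < 4; ++i) { out[i] = lo.v[i]; out[i + 4] = hi.v[i]; }
}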
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -309,6 +236,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline { reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float) } @@ -629,3 +586,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1) ret <8 x double> %ret } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index f622b839..88be6c59 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -209,62 +209,14 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; svml stuff -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone +include(`svml.m4') +;; single precision +svml_declare(float,f4,4) +svml_define(float,f4,4,f) - -define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) - ret <4 x float> %ret -} - -define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { - %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) - store <4 x float> %s, <4 x float> * %1 - ret void -} - -define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) 
nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - -define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_expf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_logf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} +;; double precision +svml_declare(double,2,2) +svml_define_x(double,2,2,d,4) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions @@ -299,6 +251,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <4 x i16> @__add_varying_i16(<4 x i16>, + <4 x i16>) nounwind readnone alwaysinline { + %r = add <4 x i16> %0, %1 + ret <4 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline { + reduce4(i16, @__add_varying_i16, @__add_uniform_i16) +} + declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline { @@ -503,3 +485,9 @@ gen_scatter(i32) gen_scatter(float) gen_scatter(i64) gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/util.m4 b/builtins/util.m4 index c19d4930..68fa818b 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -49,6 +49,63 @@ define(`MASK_HIGH_BIT_ON', ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; vector deconstruction utilities +;; split 8-wide vector into 2 4-wide vectors +;; +;; $1: vector element type +;; $2: 8-wide vector +;; $3: first 4-wide vector +;; $4: second 4-wide vector + +define(`v8tov4', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> +') + +define(`v16tov8', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> +') + +define(`v4tov2', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> + $4 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> +') + +define(`v8tov2', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $5 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $6 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> +') + +define(`v16tov4', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $5 = shufflevector <16 x $1> $2, 
<16 x $1> undef, <4 x i32> + $6 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; vector assembly: wider vector from two narrower vectors +;; +;; $1: vector element type +;; $2: first n-wide vector +;; $3: second n-wide vector +;; $4: result 2*n-wide vector +define(`v8tov16', ` + $4 = shufflevector <8 x $1> $2, <8 x $1> $3, + <16 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Helper macro for calling various SSE instructions for scalar values ;; but where the instruction takes a vector parameter. ;; $1 : name of variable to put the final value in @@ -156,10 +213,7 @@ define(`reduce16', ` ;; the final reduction define(`reduce8by4', ` - %v1 = shufflevector <8 x $1> %0, <8 x $1> undef, - <4 x i32> - %v2 = shufflevector <8 x $1> %0, <8 x $1> undef, - <4 x i32> + v8tov4($1, %0, %v1, %v2) %m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %v2) %v3 = shufflevector <4 x $1> %m1, <4 x $1> undef, <4 x i32> @@ -266,30 +320,66 @@ define(`binary2to4', ` ;; $4: 8-wide operand value define(`unary4to8', ` - %$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> - %v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0) - %$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> - %v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1) - %$1 = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, + %__$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> + %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0) + %__$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> + %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) + %$1 = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, + <8 x i32> +' +) + +;; $1: name of variable into which the final result should go +;; $2: scalar type of the input vector elements +;; $3: scalar type of the result vector elements +;; $4: 4-wide unary vector function to apply +;; $5: 8-wide operand value + +define(`unary4to8conv', ` + %$1_0 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> + %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) + %$1_1 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> + %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) + %$1 = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, <8 x i32> ' ) define(`unary4to16', ` - %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0) - %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1) - %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_2 = call <4 x $2> $3(<4 x $2> %$1_2) - %$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_3 = call <4 x $2> $3(<4 x $2> %$1_3) + %__$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0) + %__$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) + %__$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_2 = call <4 x $2> $3(<4 x $2> %__$1_2) + %__$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_3 = call <4 x $2> $3(<4 x $2> %__$1_3) - %$1a = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, + %__$1a = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, <8 x i32> - %$1b = shufflevector <4 x $2> %v$1_2, <4 x $2> %v$1_3, + %__$1b = shufflevector <4 x $2> %__v$1_2, <4 x $2> %__v$1_3, <8 x i32> - %$1 = shufflevector <8 x $2> %$1a, <8 x $2> %$1b, + %$1 = shufflevector <8 x $2> %__$1a, <8 x $2> 
%__$1b, + <16 x i32> +' +) + +define(`unary4to16conv', ` + %$1_0 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) + %$1_2 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_2 = call <4 x $3> $4(<4 x $2> %$1_2) + %$1_3 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_3 = call <4 x $3> $4(<4 x $2> %$1_3) + + %$1a = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, + <8 x i32> + %$1b = shufflevector <4 x $3> %v$1_2, <4 x $3> %v$1_3, + <8 x i32> + %$1 = shufflevector <8 x $3> %$1a, <8 x $3> %$1b, <16 x i32> ' @@ -411,6 +501,42 @@ define(`unary2to8', ` ' ) +define(`unary2to16', ` + %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1) + %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2) + %$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3) + %$1_4 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4) + %$1_5 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5) + %$1_6 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6) + %$1_7 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> + %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, + <4 x i32> + %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, + <4 x i32> + %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, + <8 x i32> + + %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, + <16 x i32> +' +) + ;; Maps an 2-wide binary function to two 8-wide vector operands ;; $1: name of variable into which the final result should go ;; $2: scalar type of the vector elements @@ -432,12 +558,58 @@ define(`binary2to8', ` %$1_3b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> +' +) + +define(`binary2to16', ` + %$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b) + %$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b) + %$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2a, <2 x $2> %$1_2b) + %$1_3a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) + %$1_4a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_4b = 
shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4a, <2 x $2> %$1_4b) + %$1_5a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_5b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5a, <2 x $2> %$1_5b) + %$1_6a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_6b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6a, <2 x $2> %$1_6b) + %$1_7a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_7b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7a, <2 x $2> %$1_7b) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, <4 x i32> %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, <4 x i32> - %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, <8 x i32> + + %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, + <4 x i32> + %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, + <4 x i32> + %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, + <8 x i32> + + %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, + <16 x i32> ' ) @@ -460,6 +632,26 @@ ret <8 x float> %ret ' ) +define(`round4to16', ` +%v0 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v1 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v2 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v3 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%r0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v0, i32 $2) +%r1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v1, i32 $2) +%r2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v2, i32 $2) +%r3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v3, i32 $2) +%ret01 = shufflevector <4 x float> %r0, <4 x float> %r1, + <8 x i32> +%ret23 = shufflevector <4 x float> %r2, <4 x float> %r3, + <8 x i32> +%ret = shufflevector <8 x float> %ret01, <8 x float> %ret23, + <16 x i32> +ret <16 x float> %ret +' +) + define(`round8to16', ` %v0 = shufflevector <16 x float> $1, <16 x float> undef, <8 x i32> @@ -690,6 +882,91 @@ shuffles(i64, 8) ;; $4: return type of the LLVM atomic type, in ispc naming paralance (e.g. int32) ;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...) 
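The mask_converts helpers that follow exist so the atomic-reduction code can turn the execution mask into an all-ones/all-zeros value of the element's own width (sign-extending narrow masks, truncating wide ones); lanes can then be switched between their value and the operator's identity with plain bitwise ops, which is exactly what global_atomic_associative does before reducing. A minimal scalar C sketch of that idea, with illustrative names (mask1_to_mask32 and select_active are not in the patch):

#include <stdint.h>

/* i1 -> i32 style conversion: an "on" lane becomes 0xFFFFFFFF, "off" becomes 0. */
static inline uint32_t mask1_to_mask32(int lane_on) {
    return (uint32_t)-(int32_t)(lane_on != 0);
}

/* With an all-ones/all-zeros mask of element width, blending a lane between
 * its value and the operator's identity is just AND/OR, no branches needed. */
static inline uint32_t select_active(uint32_t value, uint32_t identity,
                                     uint32_t mask) {
    return (value & mask) | (identity & ~mask);
}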
+define(`mask_converts', ` +define internal <$1 x i8> @convertmask_i1_i8_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i1_i16_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i1_i32_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i1_i64_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i8_i8_$1(<$1 x i8>) { + ret <$1 x i8> %0 +} +define internal <$1 x i16> @convertmask_i8_i86_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i8_i32_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i8_i64_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i16_i8_$1(<$1 x i16>) { + %r = trunc <$1 x i16> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i16_i16_$1(<$1 x i16>) { + ret <$1 x i16> %0 +} +define internal <$1 x i32> @convertmask_i16_i32_$1(<$1 x i16>) { + %r = sext <$1 x i16> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i16_i64_$1(<$1 x i16>) { + %r = sext <$1 x i16> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i32_i8_$1(<$1 x i32>) { + %r = trunc <$1 x i32> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i32_i16_$1(<$1 x i32>) { + %r = trunc <$1 x i32> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i32_i32_$1(<$1 x i32>) { + ret <$1 x i32> %0 +} +define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) { + %r = sext <$1 x i32> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i64_i8_$1(<$1 x i64>) { + %r = trunc <$1 x i64> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i64_i16_$1(<$1 x i64>) { + %r = trunc <$1 x i64> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i64_i32_$1(<$1 x i64>) { + %r = trunc <$1 x i64> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i64_i64_$1(<$1 x i64>) { + ret <$1 x i64> %0 +} +') + +mask_converts(WIDTH) + define(`global_atomic_associative', ` define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val, @@ -697,17 +974,10 @@ define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val, ; first, for any lanes where the mask is off, compute a vector where those lanes ; hold the identity value.. - ; for the bit tricks below, we need the mask to be sign extended to be - ; the size of the element type. - ifelse( - MASK,i1,`%mask = sext <$1 x MASK> %m to <$1 x $3>', - $3,i64, `%mask = sext <$1 x MASK> %m to <$1 x i64>', - $3,i32, ` - ; silly workaround to do %mask = %m, which is not possible directly.. - %maskmem = alloca <$1 x i32> - store <$1 x i32> %m, <$1 x i32> * %maskmem - %mask = load <$1 x i32> * %maskmem' - ) + ; for the bit tricks below, we need the mask to have the + ; the same element size as the element type. 
+ %mask = call <$1 x $3> @convertmask_`'MASK`'_$3_$1(<$1 x MASK> %m) + ; zero out any lanes that are off %valoff = and <$1 x $3> %val, %mask @@ -1551,11 +1821,6 @@ declare i1 @__is_compile_time_constant_mask( %mask) declare i1 @__is_compile_time_constant_uniform_int32(i32) declare i1 @__is_compile_time_constant_varying_int32() -define void @__pause() nounwind readnone { - call void asm sideeffect "pause", "~{dirflag},~{fpsr},~{flags}"() nounwind - ret void -} - ; This function declares placeholder masked store functions for the ; front-end to use. ; @@ -2440,13 +2705,16 @@ define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline { } define @__sext_varying_bool() nounwind readnone alwaysinline { - ifelse(MASK,i1, ` - %se = sext %0 to +;; ifelse(MASK,i32, `ret %0', +;; `%se = sext %0 to +;; ret %se') + ifelse(MASK,i32, `%se = bitcast %0 to ', + MASK,i64, `%se = trunc %0 to ', + `%se = sext %0 to ') ret %se - ', ` - ret %0') } + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; memcpy/memmove/memset @@ -2830,17 +3098,11 @@ m4exit(`1') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; read hw clock +declare i64 @llvm.readcyclecounter() + define i64 @__clock() nounwind { -entry: - tail call void asm sideeffect "xorl %eax,%eax \0A cpuid", "~{rax},~{rbx},~{rcx},~{rdx},~{dirflag},~{fpsr},~{flags}"() nounwind - %0 = tail call { i32, i32 } asm sideeffect "rdtsc", "={ax},={dx},~{dirflag},~{fpsr},~{flags}"() nounwind - %asmresult = extractvalue { i32, i32 } %0, 0 - %asmresult1 = extractvalue { i32, i32 } %0, 1 - %conv = zext i32 %asmresult1 to i64 - %shl = shl nuw i64 %conv, 32 - %conv2 = zext i32 %asmresult to i64 - %or = or i64 %shl, %conv2 - ret i64 %or + %r = call i64 @llvm.readcyclecounter() + ret i64 %r } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2918,6 +3180,7 @@ define float @__stdlib_powf(float, float) nounwind readnone alwaysinline { } declare double @sin(double) nounwind readnone +declare double @asin(double) nounwind readnone declare double @cos(double) nounwind readnone declare void @sincos(double, double *, double *) nounwind readnone declare double @tan(double) nounwind readnone @@ -2932,6 +3195,11 @@ define double @__stdlib_sin(double) nounwind readnone alwaysinline { ret double %r } +define double @__stdlib_asin(double) nounwind readnone alwaysinline { + %r = call double @asin(double %0) + ret double %r +} + define double @__stdlib_cos(double) nounwind readnone alwaysinline { %r = call double @cos(double %0) ret double %r @@ -3201,8 +3469,8 @@ return: ;; $1: llvm type of elements (and suffix for function name) define(`gen_masked_store', ` -define void @__masked_store_$1(* nocapture, , ) nounwind alwaysinline { - per_lane(WIDTH, %2, ` +define void @__masked_store_$1(* nocapture, , ) nounwind alwaysinline { + per_lane(WIDTH, %2, ` %ptr_LANE_ID = getelementptr * %0, i32 0, i32 LANE %storeval_LANE_ID = extractelement %1, i32 LANE store $1 %storeval_LANE_ID, $1 * %ptr_LANE_ID') @@ -3260,6 +3528,56 @@ define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>, } ') +define(`masked_store_blend_8_16_by_4_mask64', ` +define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>, + <4 x i64>) nounwind alwaysinline { + %old = load <4 x i8> * %0, align 1 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old32 = bitcast <4 x i8> %old to i32 + %new32 = bitcast <4 x i8> %1 to i32 + + %mask8 = trunc <4 x i64> %2 to <4 x i8> + %mask32 = bitcast <4 x i8> %mask8 to i32 + %notmask32 = xor i32 
%mask32, -1 + + %newmasked = and i32 %new32, %mask32 + %oldmasked = and i32 %old32, %notmask32 + %result = or i32 %newmasked, %oldmasked + + %resultvec = bitcast i32 %result to <4 x i8> + ',` + %m = trunc <4 x i64> %2 to <4 x i1> + %resultvec = select <4 x i1> %m, <4 x i8> %1, <4 x i8> %old + ') + store <4 x i8> %resultvec, <4 x i8> * %0, align 1 + ret void +} + +define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>, + <4 x i64>) nounwind alwaysinline { + %old = load <4 x i16> * %0, align 2 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old64 = bitcast <4 x i16> %old to i64 + %new64 = bitcast <4 x i16> %1 to i64 + + %mask16 = trunc <4 x i64> %2 to <4 x i16> + %mask64 = bitcast <4 x i16> %mask16 to i64 + %notmask64 = xor i64 %mask64, -1 + + %newmasked = and i64 %new64, %mask64 + %oldmasked = and i64 %old64, %notmask64 + %result = or i64 %newmasked, %oldmasked + + %resultvec = bitcast i64 %result to <4 x i16> + ',` + %m = trunc <4 x i64> %2 to <4 x i1> + %resultvec = select <4 x i1> %m, <4 x i16> %1, <4 x i16> %old + ') + store <4 x i16> %resultvec, <4 x i16> * %0, align 2 + ret void +} +') + define(`masked_store_blend_8_16_by_8', ` define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>, <8 x i32>) nounwind alwaysinline { @@ -3378,10 +3696,10 @@ define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, define(`packed_load_and_store', ` define i32 @__packed_load_active(i32 * %startptr, * %val_ptr, - %full_mask) nounwind alwaysinline { + %full_mask) nounwind alwaysinline { entry: - %mask = call i64 @__movmsk( %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) + %mask = call i64 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: @@ -3432,10 +3750,10 @@ done: } define i32 @__packed_store_active(i32 * %startptr, %vals, - %full_mask) nounwind alwaysinline { + %full_mask) nounwind alwaysinline { entry: - %mask = call i64 @__movmsk( %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) + %mask = call i64 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: @@ -3544,10 +3862,10 @@ check_neighbors: %castvr = call <$1 x $4> @__rotate_i$6(<$1 x $4> %castvec, i32 1) %vr = bitcast <$1 x $4> %castvr to <$1 x $2> %eq = $5 $7 <$1 x $2> %vec, %vr - ifelse(MASK,i32, ` - %eq32 = sext <$1 x i1> %eq to <$1 x i32> - %eqmm = call i64 @__movmsk(<$1 x i32> %eq32)', ` - %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)') + ifelse(MASK,i1, ` + %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)', + `%eqm = sext <$1 x i1> %eq to <$1 x MASK> + %eqmm = call i64 @__movmsk(<$1 x MASK> %eqm)') %alleq = icmp eq i64 %eqmm, ALL_ON_MASK br i1 %alleq, label %all_equal, label %not_all_equal ', ` @@ -3722,9 +4040,9 @@ pl_done: define(`gen_gather_general', ` ; fully general 32-bit gather, takes array of pointers encoded as vector of i32s define @__gather32_$1( %ptrs, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %ret_ptr = alloca - per_lane(WIDTH, %vecmask, ` + per_lane(WIDTH, %vecmask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * %val_LANE_ID = load $1 * %ptr_LANE_ID @@ -3738,9 +4056,9 @@ define @__gather32_$1( %ptrs, ; fully general 64-bit gather, takes array of pointers encoded as vector of i32s define @__gather64_$1( %ptrs, - %vecmask) 
nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %ret_ptr = alloca - per_lane(WIDTH, %vecmask, ` + per_lane(WIDTH, %vecmask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * %val_LANE_ID = load $1 * %ptr_LANE_ID @@ -3804,7 +4122,7 @@ define @__gather_elt64_$1(i8 * %ptr, %offsets, i32 %o define @__gather_factored_base_offsets32_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always ; legal to read from (and we do indeed require that, given the benefits!) @@ -3813,13 +4131,13 @@ define @__gather_factored_base_offsets32_$1(i8 * %ptr, store zeroinitializer, * %offsetsPtr call void @__masked_store_blend_i32( * %offsetsPtr, %offsets, - %vecmask) + %vecmask) %newOffsets = load * %offsetsPtr %deltaPtr = alloca store zeroinitializer, * %deltaPtr call void @__masked_store_blend_i32( * %deltaPtr, %offset_delta, - %vecmask) + %vecmask) %newDelta = load * %deltaPtr %ret0 = call @__gather_elt32_$1(i8 * %ptr, %newOffsets, @@ -3835,7 +4153,7 @@ define @__gather_factored_base_offsets32_$1(i8 * %ptr, @__gather_factored_base_offsets64_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always ; legal to read from (and we do indeed require that, given the benefits!) @@ -3844,13 +4162,13 @@ define @__gather_factored_base_offsets64_$1(i8 * %ptr, store zeroinitializer, * %offsetsPtr call void @__masked_store_blend_i64( * %offsetsPtr, %offsets, - %vecmask) + %vecmask) %newOffsets = load * %offsetsPtr %deltaPtr = alloca store zeroinitializer, * %deltaPtr call void @__masked_store_blend_i64( * %deltaPtr, %offset_delta, - %vecmask) + %vecmask) %newDelta = load * %deltaPtr %ret0 = call @__gather_elt64_$1(i8 * %ptr, %newOffsets, @@ -3876,27 +4194,27 @@ gen_gather_factored($1) define @__gather_base_offsets32_$1(i8 * %ptr, i32 %offset_scale, %offsets, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %scale_vec = bitcast i32 %offset_scale to <1 x i32> %smear_scale = shufflevector <1 x i32> %scale_vec, <1 x i32> undef, < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > %scaled_offsets = mul %smear_scale, %offsets %v = call @__gather_factored_base_offsets32_$1(i8 * %ptr, %scaled_offsets, i32 1, - zeroinitializer, %vecmask) + zeroinitializer, %vecmask) ret %v } define @__gather_base_offsets64_$1(i8 * %ptr, i32 %offset_scale, %offsets, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %scale64 = zext i32 %offset_scale to i64 %scale_vec = bitcast i64 %scale64 to <1 x i64> %smear_scale = shufflevector <1 x i64> %scale_vec, <1 x i64> undef, < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > %scaled_offsets = mul %smear_scale, %offsets %v = call @__gather_factored_base_offsets64_$1(i8 * %ptr, %scaled_offsets, - i32 1, zeroinitializer, %vecmask) + i32 1, zeroinitializer, %vecmask) ret %v } @@ -3955,9 +4273,9 @@ define void @__scatter_elt64_$1(i8 * %ptr, %offsets, i32 %offset_s define void @__scatter_factored_base_offsets32_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, - 
%mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... - per_lane(WIDTH, %mask, ` + per_lane(WIDTH, %mask, ` call void @__scatter_elt32_$1(i8 * %base, %offsets, i32 %offset_scale, %offset_delta, %values, i32 LANE)') ret void @@ -3965,9 +4283,9 @@ define void @__scatter_factored_base_offsets32_$1(i8* %base, %offs define void @__scatter_factored_base_offsets64_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, - %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... - per_lane(WIDTH, %mask, ` + per_lane(WIDTH, %mask, ` call void @__scatter_elt64_$1(i8 * %base, %offsets, i32 %offset_scale, %offset_delta, %values, i32 LANE)') ret void @@ -3975,8 +4293,8 @@ define void @__scatter_factored_base_offsets64_$1(i8* %base, %offs ; fully general 32-bit scatter, takes array of pointers encoded as vector of i32s define void @__scatter32_$1( %ptrs, %values, - %mask) nounwind alwaysinline { - per_lane(WIDTH, %mask, ` + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * %val_LANE_ID = extractelement %values, i32 LANE @@ -3987,8 +4305,8 @@ define void @__scatter32_$1( %ptrs, %values, ; fully general 64-bit scatter, takes array of pointers encoded as vector of i64s define void @__scatter64_$1( %ptrs, %values, - %mask) nounwind alwaysinline { - per_lane(WIDTH, %mask, ` + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * %val_LANE_ID = extractelement %values, i32 LANE @@ -4044,3 +4362,109 @@ define i1 @__rdrand_i64(i64 * %ptr) { ret i1 %good } ') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define(`define_avg_up_uint8', ` +define @__avg_up_uint8(, ) { + %a16 = zext %0 to + %b16 = zext %1 to + %sum1 = add %a16, %b16 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_int8', ` +define @__avg_up_int8(, ) { + %a16 = sext %0 to + %b16 = sext %1 to + %sum1 = add %a16, %b16 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_uint16', ` +define @__avg_up_uint16(, ) { + %a32 = zext %0 to + %b32 = zext %1 to + %sum1 = add %a32, %b32 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_int16', ` +define @__avg_up_int16(, ) { + %a32 = sext %0 to + %b32 = sext %1 to + %sum1 = add %a32, %b32 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_uint8', ` +define @__avg_down_uint8(, ) { + %a16 = zext %0 to + %b16 = zext %1 to + %sum = add %a16, %b16 + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_int8', ` +define @__avg_down_int8(, ) { + %a16 = sext %0 to + %b16 = sext %1 to + %sum = add %a16, %b16 + %avg = sdiv 
%sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_uint16', ` +define @__avg_down_uint16(, ) { + %a32 = zext %0 to + %b32 = zext %1 to + %sum = add %a32, %b32 + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_int16', ` +define @__avg_down_int16(, ) { + %a32 = sext %0 to + %b32 = sext %1 to + %sum = add %a32, %b32 + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_up_avgs', ` +define_avg_up_uint8() +define_avg_up_int8() +define_avg_up_uint16() +define_avg_up_int16() +') + +define(`define_down_avgs', ` +define_avg_down_uint8() +define_avg_down_int8() +define_avg_down_uint16() +define_avg_down_int16() +') + +define(`define_avgs', ` +define_up_avgs() +define_down_avgs() +') diff --git a/cbackend.cpp b/cbackend.cpp index d23bcc20..7d4b4cfc 100644 --- a/cbackend.cpp +++ b/cbackend.cpp @@ -3704,6 +3704,7 @@ void CWriter::lowerIntrinsics(llvm::Function &F) { case llvm::Intrinsic::sadd_with_overflow: case llvm::Intrinsic::trap: case llvm::Intrinsic::objectsize: + case llvm::Intrinsic::readcyclecounter: // We directly implement these intrinsics break; default: @@ -4056,6 +4057,9 @@ bool CWriter::visitBuiltinCall(llvm::CallInst &I, llvm::Intrinsic::ID ID, return true; case llvm::Intrinsic::objectsize: return true; + case llvm::Intrinsic::readcyclecounter: + Out << "__clock()"; + return true; } } diff --git a/check_env.py b/check_env.py new file mode 100755 index 00000000..8c90d895 --- /dev/null +++ b/check_env.py @@ -0,0 +1,102 @@ +#!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +# // Author: Filippov Ilia + +import common +import sys +import os +import string +print_debug = common.print_debug +error = common.error +take_lines = common.take_lines + +exists = [False, False, False, False, False, False, False, False] +names = ["m4", "bison", "flex", "sde", "ispc", "clang", "gcc", "icc"] + +PATH_dir = string.split(os.getenv("PATH"), os.pathsep) +for counter in PATH_dir: + for i in range(0,8): + if os.path.exists(counter + os.sep + names[i]): + exists[i] = True + +print_debug("=== in PATH: ===\n", False, "") +print_debug("Tools:\n", False, "") +for i in range(0,3): + if exists[i]: + print_debug(take_lines(names[i] + " --version", "first"), False, "") + else: + error("you don't have " + names[i], 0) +if exists[0] and exists[1] and exists[2]: + if common.check_tools(2): + print_debug("Tools' versions are ok\n", False, "") +print_debug("\nSDE:\n", False, "") +if exists[3]: + print_debug(take_lines(names[3] + " --version", "first"), False, "") +else: + error("you don't have " + names[3], 2) +print_debug("\nISPC:\n", False, "") +if exists[4]: + print_debug(take_lines(names[4] + " --version", "first"), False, "") +else: + error("you don't have " + names[4], 2) +print_debug("\nC/C++ compilers:\n", False, "") +for i in range(5,8): + if exists[i]: + print_debug(take_lines(names[i] + " --version", "first"), False, "") + else: + error("you don't have " + names[i], 2) + +print_debug("\n=== in ISPC specific environment variables: ===\n", False, "") +if os.environ.get("LLVM_HOME") == None: + error("you have no LLVM_HOME", 2) +else: + print_debug("Your LLVM_HOME:" + os.environ.get("LLVM_HOME") + "\n", False, "") +if os.environ.get("ISPC_HOME") == None: + error("you have no ISPC_HOME", 2) +else: + print_debug("Your ISPC_HOME:" + os.environ.get("ISPC_HOME") + "\n", False, "") + if os.path.exists(os.environ.get("ISPC_HOME") + os.sep + "ispc"): + print_debug("You have ISPC in your ISPC_HOME: " + + take_lines(os.environ.get("ISPC_HOME") + os.sep + "ispc" + " --version", "first"), False, "") + else: + error("you don't have ISPC in your ISPC_HOME", 2) +if os.environ.get("SDE_HOME") == None: + error("You have no SDE_HOME", 2) +else: + print_debug("Your SDE_HOME:" + os.environ.get("SDE_HOME") + "\n", False, "") + if os.path.exists(os.environ.get("SDE_HOME") + os.sep + "sde"): + print_debug("You have sde in your SDE_HOME: " + + take_lines(os.environ.get("SDE_HOME") + os.sep + "sde" + " --version", "first"), False, "") + else: + error("you don't have any SDE in your ISPC_HOME", 2) diff --git a/common.py b/common.py new file mode 100644 index 00000000..be3e9526 --- /dev/null +++ b/common.py @@ -0,0 +1,124 @@ +#!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# // Author: Filippov Ilia +import sys +import os +import shutil + +def write_to_file(filename, line): + f = open(filename, 'a') + f.writelines(line) + f.close() + +#remove file if it exists +def remove_if_exists(filename): + if os.path.exists(filename): + if os.path.isdir(filename): + shutil.rmtree(filename) + else: + os.remove(filename) + +# detect version which is printed after command +def take_lines(command, which): + os.system(command + " > " + "temp_detect_version") + version = open("temp_detect_version") + if which == "first": + answer = version.readline() + if which == "all": + answer = version.readlines() + version.close() + remove_if_exists("temp_detect_version") + return answer + +# print versions of compilers +def print_version(ispc_test, ispc_ref, ref_compiler, s, perf_log, is_windows): + print_debug("\nUsing test compiler: " + take_lines(ispc_test + " --version", "first"), s, perf_log) + if ispc_ref != "": + print_debug("Using ref compiler: " + take_lines(ispc_ref + " --version", "first"), s, perf_log) + if is_windows == False: + temp1 = take_lines(ref_compiler + " --version", "first") + else: + os.system(ref_compiler + " 2>&1" + " 2> temp_detect_version > temp_detect_version1" ) + version = open("temp_detect_version") + temp1 = version.readline() + version.close() + remove_if_exists("temp_detect_version") + remove_if_exists("temp_detect_version1") + print_debug("Using C/C++ compiler: " + temp1 + "\n", s, perf_log) + +# print everything from scripts instead errors +def print_debug(line, silent, filename): + if silent == False: + sys.stdout.write(line) + sys.stdout.flush() + if os.environ.get("ISPC_HOME") != None: + if os.path.exists(os.environ.get("ISPC_HOME")): + write_to_file(os.environ["ISPC_HOME"] + os.sep + "notify_log.log", line) + if filename != "": + write_to_file(filename, line) + +# print errors from scripts +# type 1 for error in environment +# type 2 for warning +# type 3 for error of compiler or test which isn't the goal of script +def error(line, error_type): + line = line + "\n" + if error_type == 1: + sys.stderr.write("Fatal error: " + line) + sys.exit(1) + if error_type == 2: + sys.stderr.write("Warning: " + line) + if error_type == 0: + print_debug("FIND ERROR: " + line, False, "") + +def check_tools(m): + input_tools=[[[1,4],"m4 --version", "bad m4 version"], + [[2,4],"bison --version", "bad bison version"], + [[2,5], "flex --version", "bad flex version"]] + ret = 1 + for t in range(0,len(input_tools)): + t1 = ((take_lines(input_tools[t][1], "first"))[:-1].split(" ")) + for i in range(0,len(t1)): + t11 = t1[i].split(".") + f = True + for j in range(0,len(t11)): + if not t11[j].isdigit(): + f = False + if f == True: + for j in range(0,len(t11)): + if j < 
len(input_tools[t][0]): + if int(t11[j])" +"double precision floating point number, starting with dot, optional exponent +syn match cFloat display contained ".\d*d[-+]\=\d*\>" +"double precision floating point number, without dot, with exponent +syn match cFloat display contained "\d\+d[-+]\=\d\+\>" + " Default highlighting command -nargs=+ HiLink hi def link HiLink ispcStatement Statement diff --git a/ctx.cpp b/ctx.cpp index 1e79c97b..c50d22f9 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -1456,13 +1456,13 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) { for (unsigned int i = 0; i < at->getNumElements(); ++i) { llvm::Value *elt = ExtractInst(b, i); llvm::Value *sext = SExtInst(elt, LLVMTypes::BoolVectorType, - LLVMGetName(elt, "_to_boolvec32")); + LLVMGetName(elt, "_to_boolvec")); ret = InsertInst(ret, sext, i); } return ret; } else - return SExtInst(b, LLVMTypes::BoolVectorType, LLVMGetName(b, "_to_i32")); + return SExtInst(b, LLVMTypes::BoolVectorType, LLVMGetName(b, "_to_boolvec")); } @@ -2781,6 +2781,7 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, // Figure out if we need a 8, 16, 32 or 64-bit masked store. llvm::Function *maskedStoreFunc = NULL; + llvm::Type *llvmValueType = value->getType(); const PointerType *pt = CastType(valueType); if (pt != NULL) { @@ -2809,8 +2810,7 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, else maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64"); } - else if (Type::Equal(valueType, AtomicType::VaryingBool) && - g->target->getMaskBitCount() == 1) { + else if (llvmValueType == LLVMTypes::Int1VectorType) { llvm::Value *notMask = BinaryOperator(llvm::Instruction::Xor, mask, LLVMMaskAllOn, "~mask"); llvm::Value *old = LoadInst(ptr); @@ -2823,28 +2823,22 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, StoreInst(final, ptr); return; } - else if (Type::Equal(valueType, AtomicType::VaryingDouble)) { + else if (llvmValueType == LLVMTypes::DoubleVectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_double"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt64) || - Type::Equal(valueType, AtomicType::VaryingUInt64)) { + else if (llvmValueType == LLVMTypes::Int64VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64"); } - else if (Type::Equal(valueType, AtomicType::VaryingFloat)) { + else if (llvmValueType == LLVMTypes::FloatVectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_float"); } - else if (Type::Equal(valueType, AtomicType::VaryingBool) || - Type::Equal(valueType, AtomicType::VaryingInt32) || - Type::Equal(valueType, AtomicType::VaryingUInt32) || - CastType(valueType) != NULL) { + else if (llvmValueType == LLVMTypes::Int32VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i32"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt16) || - Type::Equal(valueType, AtomicType::VaryingUInt16)) { + else if (llvmValueType == LLVMTypes::Int16VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i16"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt8) || - Type::Equal(valueType, AtomicType::VaryingUInt8)) { + else if (llvmValueType == LLVMTypes::Int8VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i8"); } AssertPos(currentPos, maskedStoreFunc != NULL); diff --git a/decl.cpp b/decl.cpp index e7b3cdef..8a10543b 100644 --- a/decl.cpp +++ b/decl.cpp @@ -69,8 +69,15 @@ lApplyTypeQualifiers(int 
typeQualifiers, const Type *type, SourcePos pos) {
     if (type == NULL)
         return NULL;
 
-    if ((typeQualifiers & TYPEQUAL_CONST) != 0)
+    if ((typeQualifiers & TYPEQUAL_CONST) != 0) {
         type = type->GetAsConstType();
+    }
+
+    if ( ((typeQualifiers & TYPEQUAL_UNIFORM) != 0)
+         && ((typeQualifiers & TYPEQUAL_VARYING) != 0) ) {
+        Error(pos, "Type \"%s\" cannot be qualified with both uniform and varying.",
+              type->GetString().c_str());
+    }
 
     if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0) {
         if (Type::Equal(type, AtomicType::Void))
@@ -84,9 +91,10 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
         else
             type = type->GetAsVaryingType();
     }
-    else
+    else {
         if (Type::Equal(type, AtomicType::Void) == false)
             type = type->GetAsUnboundVariabilityType();
+    }
 
     if ((typeQualifiers & TYPEQUAL_UNSIGNED) != 0) {
         if ((typeQualifiers & TYPEQUAL_SIGNED) != 0)
@@ -124,6 +132,17 @@ DeclSpecs::DeclSpecs(const Type *t, StorageClass sc, int tq) {
     typeQualifiers = tq;
     soaWidth = 0;
     vectorSize = 0;
+    if (t != NULL) {
+        if (m->symbolTable->ContainsType(t)) {
+            // Typedefs might have uniform/varying qualifiers inside.
+            if (t->IsVaryingType()) {
+                typeQualifiers |= TYPEQUAL_VARYING;
+            }
+            else if (t->IsUniformType()) {
+                typeQualifiers |= TYPEQUAL_UNIFORM;
+            }
+        }
+    }
 }
@@ -229,6 +248,7 @@ Declarator::Declarator(DeclaratorKind dk, SourcePos p)
 void
 Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
     const Type *baseType = ds->GetBaseType(pos);
+
     InitFromType(baseType, ds);
 
     if (type == NULL) {
@@ -591,6 +611,7 @@ Declaration::Declaration(DeclSpecs *ds, Declarator *d) {
 }
 
+
 std::vector
 Declaration::GetVariableDeclarations() const {
     Assert(declSpecs->storageClass != SC_TYPEDEF);
diff --git a/docs/ReleaseNotes.txt b/docs/ReleaseNotes.txt
index 007f283e..a8575ea0 100644
--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -1,3 +1,63 @@
+=== v1.5.0 === (27 September 2013)
+
+A major new version of ISPC with several new targets and important bug fixes.
+Here's a list of the most important changes, if you are using pre-built
+binaries (which are based on a patched version of LLVM 3.3):
+
+* The naming of targets was changed to explicitly include the data type width
+  and the number of threads in the gang. For example, avx2-i32x8 is an AVX2
+  target, which uses 32 bit types as a base and has 8 threads in a gang. The
+  old naming scheme is still supported, but deprecated.
+
+* New SSE4 targets for calculations based on 8 bit and 16 bit data types:
+  sse4-i8x16 and sse4-i16x8.
+
+* New AVX1 target for calculations based on 64 bit data types: avx1-i64x4.
+
+* SVML support was extended and improved.
+
+* The behavior of the -g switch was changed to not affect the optimization
+  level.
+
+* The ISPC debug infrastructure was redesigned. See --help-dev for more info
+  and enjoy the capabilities of the new --debug-phase= and --off-phase=
+  switches.
+
+* Fixed an auto-dispatch bug, which caused AVX code execution when the OS
+  doesn't support AVX (but the hardware does).
+
+* Fixed a bug that discarded the uniform/varying keyword in typedefs.
+
+* Several performance regressions were fixed.
+
+If you are building ISPC yourself, then the following changes are also
+available to you:
+
+* --cpu=slm for targeting Intel Atom codename Silvermont (if LLVM 3.4 is used).
+
+* ARM NEON targets are available (if enabled in the build system).
+
+* --debug-ir= is available to generate debug information based on LLVM
+  IR (if LLVM 3.4 is used). In the debugger you'll see LLVM IR instead of
+  source code.
+ +* A redesigned and improved test and configuration management system is + available to facilitate the process of building LLVM and testing ISPC + compiler. + +Standard library changes/fixes: + +* __pause() function was removed from standard library. + +* Fixed reduce_[min|max]_[float|double] intrinsics, which were producing + incorrect code under some conditions. + +Language changes: + +* By default a floating point constant without a suffix is a single precision + constant (32 bit). A new suffix "d" was introduced to allow double precision + constant (64 bit). Please refer to tests/double-consts.ispc for syntax + examples. + === v1.4.4 === (19 July 2013) A minor version update with several stability fixes requested by the customers. diff --git a/docs/build.sh b/docs/build.sh index a13f3231..4f4fbfe4 100755 --- a/docs/build.sh +++ b/docs/build.sh @@ -1,14 +1,16 @@ #!/bin/bash +rst2html=rst2html.py + for i in ispc perfguide faq; do - rst2html --template=template.txt --link-stylesheet \ + $rst2html --template=template.txt --link-stylesheet \ --stylesheet-path=css/style.css $i.rst > $i.html done -rst2html --template=template-news.txt --link-stylesheet \ +$rst2html --template=template-news.txt --link-stylesheet \ --stylesheet-path=css/style.css news.rst > news.html -rst2html --template=template-perf.txt --link-stylesheet \ +$rst2html --template=template-perf.txt --link-stylesheet \ --stylesheet-path=css/style.css perf.rst > perf.html #rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex diff --git a/docs/ispc.rst b/docs/ispc.rst old mode 100755 new mode 100644 index c6c63172..eac9b24e --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -270,6 +270,14 @@ new reserved words: ``unmasked``, ``foreach_unique``, ``foreach_active``, and ``in``. Any program that happens to have a variable or function with one of these names must be modified to rename that symbol. +Updating ISPC Programs For Changes In ISPC 1.5.0 +------------------------------------------------ + +This release adds support for double precision floating point constants. +Double precision floating point constants are floating point number with +``d`` suffix and optional exponent part. Here are some examples: 3.14d, +31.4d-1, 1.d, 1.0d, 1d-2. Note that floating point number without suffix is +treated as single precision constant. Getting Started with ISPC ========================= @@ -467,45 +475,100 @@ There are three options that affect the compilation target: ``--arch``, which sets the target architecture, ``--cpu``, which sets the target CPU, and ``--target``, which sets the target instruction set. -By default, the ``ispc`` compiler generates code for the 64-bit x86-64 -architecture (i.e. ``--arch=x86-64``.) To compile to a 32-bit x86 target, -supply ``--arch=x86`` on the command line: +If none of these options is specified, ``ispc`` generates code for the +architecture of the system the compiler is running on (i.e. 64-bit x86-64 +(``--arch=x86-64``) on x86 systems and ARM NEON on ARM systems. + +To compile to a 32-bit x86 target, for example, supply ``--arch=x86`` on +the command line: :: ispc foo.ispc -o foo.obj --arch=x86 -No other architectures are currently supported. +Currently-supported architectures are ``x86-64``, ``x86``, and ``arm``. The target CPU determines both the default instruction set used as well as which CPU architecture the code is tuned for. ``ispc --help`` provides a -list of a number of the supported CPUs. 
By default, the CPU type of the -system on which you're running ``ispc`` is used to determine the target -CPU. +list of all of the supported CPUs. By default, the CPU type of the system +on which you're running ``ispc`` is used to determine the target CPU. :: ispc foo.ispc -o foo.obj --cpu=corei7-avx -Finally, ``--target`` selects between the SSE2, SSE4, and AVX, and AVX2 -instruction sets. (As general context, SSE2 was first introduced in -processors that shipped in 2001, SSE4 was introduced in 2007, and -processors with AVX were introduced in 2010. AVX2 will be supported on -future CPUs based on Intel's "Haswell" architecture. Consult your CPU's -manual for specifics on which vector instruction set it supports.) +Finally, ``--target`` selects the target instruction set. The target +string is of the form ``[ISA]-i[mask size]x[gang size]``. For example, +``--target=avx2-i32x16`` specifies a target with the AVX2 instruction set, +a mask size of 32 bits, and a gang size of 16. + +The following target ISAs are supported: + +============ ========================================== +Target Description +------------ ------------------------------------------ +avx, avx1 AVX (2010-2011 era Intel CPUs) +avx1.1 AVX 1.1 (2012 era "Ivybridge" Intel CPUs) +avx2 AVX 2 target (2013- Intel "Haswell" CPUs) +neon ARM NEON +sse2 SSE2 (early 2000s era x86 CPUs) +sse4 SSE4 (generally 2008-2010 Intel CPUs) +============ ========================================== + +Consult your CPU's manual for specifics on which vector instruction set it +supports. + +The mask size may be 8, 16, or 32 bits, though not all combinations of ISAs +and mask sizes are supported. For best performance, the best general +approach is to choose a mask size equal to the size of the most common +datatype in your programs. For example, if most of your computation is on +32-bit floating-point values, an ``i32`` target is appropriate. However, +if you're mostly doing computation on 8-bit images, ``i8`` is a better choice. + +See `Basic Concepts: Program Instances and Gangs of Program Instances`_ for +more discussion of the "gang size" and its implications for program +execution. + +Running ``ispc --help`` and looking at the output for the ``--target`` +option gives the most up-to-date documentation about which targets your +compiler binary supports. + +The naming scheme for compilation targets changed in August 2013; the +following table shows the relationship between names in the old scheme and +in the new scheme: + +============= =========== +Target Former Name +------------- ----------- +avx1-i32x8 avx, avx1 +avx1-i32x16 avx-x2 +avx1.1-i32x8 avx1.1 +avx1.1-i32x16 avx1.1-x2 +avx2-i32x8 avx2 +avx2-i32x16 avx2-x2 +neon-8 n/a +neon-16 n/a +neon-32 n/a +sse2-i32x4 sse2 +sse2-i32x8 sse2-x2 +sse4-i32x4 sse4 +sse4-i32x8 sse4-x2 +sse4-i8x16 n/a +sse4-i16x8 n/a +============= =========== By default, the target instruction set is chosen based on the most capable one supported by the system on which you're running ``ispc``. You can override this choice with the ``--target`` flag; for example, to select -Intel® SSE2, use ``--target=sse2``. (As with the other options in this -section, see the output of ``ispc --help`` for a full list of supported -targets.) +Intel® SSE2 with a 32-bit mask and 4 program instances in a gang, use +``--target=sse2-i32x4``. (As with the other options in this section, see +the output of ``ispc --help`` for a full list of supported targets.) 
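As a quick illustration of the new target naming scheme (an editorial sketch that uses only targets from the table above and the same example file names already used in this document), selecting an AVX2 target with a 32-bit mask and a 16-wide gang, or an SSE4 target specialized for 8-bit data, looks like:

::

    ispc foo.ispc -o foo.obj --target=avx2-i32x16
    ispc foo.ispc -o foo.obj --target=sse4-i8x16

The old spellings (``avx2-x2``, ``sse4``, and so on) continue to be accepted but are deprecated; the table above shows how they map onto the new names.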
Generating Generic C++ Output ----------------------------- In addition to generating object files or assembly output for specific -targets like SSE2, SSE4, and AVX, ``ispc`` provides an option to generate +targets like NEON, SSE2, SSE4, and AVX, ``ispc`` provides an option to generate "generic" C++ output. This As an example, consider the following simple ``ispc`` program: @@ -659,7 +722,7 @@ preprocessor runs: * - ISPC - 1 - Detecting that the ``ispc`` compiler is processing the file - * - ISPC_TARGET_{SSE2,SSE4,AVX,AVX2} + * - ISPC_TARGET_{NEON_8,NEON_16,NEON_32,SSE2,SSE4,AVX,AVX11,AVX2,GENERIC} - 1 - One of these will be set, depending on the compilation target. * - ISPC_POINTER_SIZE @@ -1294,7 +1357,8 @@ but are likely to be supported in future releases: * Bitfield members of ``struct`` types * Variable numbers of arguments to functions * Literal floating-point constants (even without a ``f`` suffix) are - currently treated as being ``float`` type, not ``double`` + currently treated as being ``float`` type, not ``double``. To have a double + precision floating point constant use ``d`` suffix. * The ``volatile`` qualifier * The ``register`` storage class for variables. (Will be ignored). @@ -3365,6 +3429,31 @@ The ``isnan()`` functions test whether the given value is a floating-point uniform bool isnan(uniform double v) +A number of functions are also available for performing operations on 8- and +16-bit quantities; these map to specialized instructions that perform these +operations on targets that support them. ``avg_up()`` computes the average +of the two values, rounding up if their average is halfway between two +integers (i.e., it computes ``(a+b+1)/2``). + +:: + + int8 avg_up(int8 a, int8 b) + unsigned int8 avg_up(unsigned int8 a, unsigned int8 b) + int16 avg_up(int16 a, int16 b) + unsigned int16 avg_up(unsigned int16 a, unsigned int16 b) + + +``avg_down()`` computes the average of the two values, rounding down (i.e., +it computes ``(a+b)/2``). + +:: + + int8 avg_down(int8 a, int8 b) + unsigned int8 avg_down(unsigned int8 a, unsigned int8 b) + int16 avg_down(int16 a, int16 b) + unsigned int16 avg_down(unsigned int16 a, unsigned int16 b) + + Transcendental Functions ------------------------ @@ -3582,7 +3671,7 @@ command-line argument. Cross-Program Instance Operations --------------------------------- -``ispc`` programs are often used to expresses independently-executing +``ispc`` programs are often used to express independently-executing programs performing computation on separate data elements. (i.e. pure data-parallelism). However, it's often the case where it's useful for the program instances to be able to cooperate in computing results. The @@ -3613,7 +3702,7 @@ the running program instances. The ``rotate()`` function allows each program instance to find the value of the given value that their neighbor ``offset`` steps away has. For -example, on an 8-wide target, if ``offset`` has the value (1, 2, 3, 4, 5, +example, on an 8-wide target, if ``value`` has the value (1, 2, 3, 4, 5, 6, 7, 8) across the gang of running program instances, then ``rotate(value, -1)`` causes the first program instance to get the value 8, the second program instance to get the value 1, the third 2, and so forth. The @@ -3692,7 +3781,7 @@ where the ``i`` th element of ``x`` has been replaced with the value ``v`` Reductions ---------- -A number routines are available to evaluate conditions across the +A number of routines are available to evaluate conditions across the running program instances. 
For example, ``any()`` returns ``true`` if the
 given value ``v`` is ``true`` for any of the SPMD program instances
 currently running, ``all()`` returns ``true`` if it true
@@ -3711,29 +3800,44 @@ instances are added together by the ``reduce_add()`` function.
 
 ::
 
-    uniform float reduce_add(float x)
-    uniform int reduce_add(int x)
-    uniform unsigned int reduce_add(unsigned int x)
+    uniform int16 reduce_add(int8 x)
+    uniform unsigned int16 reduce_add(unsigned int8 x)
+    uniform int32 reduce_add(int16 x)
+    uniform unsigned int32 reduce_add(unsigned int16 x)
+    uniform int64 reduce_add(int32 x)
+    uniform unsigned int64 reduce_add(unsigned int32 x)
+    uniform int64 reduce_add(int64 x)
+    uniform unsigned int64 reduce_add(unsigned int64 x)
 
-You can also use functions to compute the minimum and maximum value of the
-given value across all of the currently-executing program instances.
+    uniform float reduce_add(float x)
+    uniform double reduce_add(double x)
+
+You can also use functions to compute the minimum value of the given value
+across all of the currently-executing program instances.
 
 ::
 
-    uniform float reduce_min(float a)
     uniform int32 reduce_min(int32 a)
     uniform unsigned int32 reduce_min(unsigned int32 a)
-    uniform double reduce_min(double a)
     uniform int64 reduce_min(int64 a)
     uniform unsigned int64 reduce_min(unsigned int64 a)
 
-    uniform float reduce_max(float a)
+    uniform float reduce_min(float a)
+    uniform double reduce_min(double a)
+
+Equivalent functions are available to compute the maximum of the given
+varying variable over the active program instances.
+
+::
+
     uniform int32 reduce_max(int32 a)
     uniform unsigned int32 reduce_max(unsigned int32 a)
-    uniform double reduce_max(double a)
     uniform int64 reduce_max(int64 a)
     uniform unsigned int64 reduce_max(unsigned int64 a)
 
+    uniform float reduce_max(float a)
+    uniform double reduce_max(double a)
+
 Finally, you can check to see if a particular value has the same value in
 all of the currently-running program instances:
@@ -3741,9 +3845,10 @@ all of the currently-running program instances:
 
     uniform bool reduce_equal(int32 v)
     uniform bool reduce_equal(unsigned int32 v)
-    uniform bool reduce_equal(float v)
     uniform bool reduce_equal(int64 v)
     uniform bool reduce_equal(unsigned int64 v)
+
+    uniform bool reduce_equal(float v)
     uniform bool reduce_equal(double)
 
 There are also variants of these functions that return the value as a
@@ -3758,10 +3863,11 @@ performance in the `Performance Guide`_.
 
     uniform bool reduce_equal(int32 v, uniform int32 * uniform sameval)
     uniform bool reduce_equal(unsigned int32 v, uniform unsigned int32 * uniform sameval)
-    uniform bool reduce_equal(float v, uniform float * uniform sameval)
     uniform bool reduce_equal(int64 v, uniform int64 * uniform sameval)
     uniform bool reduce_equal(unsigned int64 v, uniform unsigned int64 * uniform sameval)
+
+    uniform bool reduce_equal(float v, uniform float * uniform sameval)
    uniform bool reduce_equal(double, uniform double * uniform sameval)
 
 If called when none of the program instances are running,
diff --git a/docs/news.rst b/docs/news.rst
index c1c35de3..7d78a662 100644
--- a/docs/news.rst
+++ b/docs/news.rst
@@ -2,6 +2,14 @@
 ispc News
 =========
 
+ispc 1.5.0 is Released
+----------------------
+
+A major update of ``ispc`` has been released with several new targets available
+and a bunch of performance and stability fixes. The released binaries are built
+with a patched version of LLVM 3.3. Please refer to the Release Notes for the
+complete set of changes.
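To make the new 8-bit standard-library routines documented in the ``ispc.rst`` hunk above more concrete, here is a minimal ``ispc`` sketch. It is illustrative only: the function and buffer names are invented for the example, and it simply combines ``avg_up()``, which computes ``(a+b+1)/2`` without overflowing the 8-bit type, with the widened ``reduce_add()`` overload, which per the documentation above returns a ``uniform unsigned int16`` for ``unsigned int8`` input.

::

    // Illustrative sketch only -- names are hypothetical, not part of the patch.
    export void blend_and_sum(uniform unsigned int8 a[], uniform unsigned int8 b[],
                              uniform unsigned int8 out[], uniform int count,
                              uniform unsigned int32 * uniform sum) {
        uniform unsigned int32 total = 0;
        foreach (i = 0 ... count) {
            // avg_up() rounds the halfway case up, i.e. it computes (a + b + 1) / 2.
            unsigned int8 blended = avg_up(a[i], b[i]);
            out[i] = blended;
            // reduce_add() over the running program instances yields a uniform
            // unsigned int16, accumulated here into a wider uniform counter.
            total += reduce_add(blended);
        }
        *sum = total;
    }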
+ ispc 1.4.4 is Released ---------------------- diff --git a/docs/template-news.txt b/docs/template-news.txt index 9a41fbdb..d5eebdd1 100644 --- a/docs/template-news.txt +++ b/docs/template-news.txt @@ -57,7 +57,7 @@ %(body)s
- diff --git a/docs/template-perf.txt b/docs/template-perf.txt index 4932e332..9537a836 100644 --- a/docs/template-perf.txt +++ b/docs/template-perf.txt @@ -57,7 +57,7 @@ %(body)s
- diff --git a/docs/template.txt b/docs/template.txt index 8cb4f5ab..b9041f19 100644 --- a/docs/template.txt +++ b/docs/template.txt @@ -57,7 +57,7 @@ %(body)s
- diff --git a/doxygen.cfg b/doxygen.cfg index 480d9331..a0ad3176 100644 --- a/doxygen.cfg +++ b/doxygen.cfg @@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler" # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = 1.4.5dev +PROJECT_NUMBER = 1.5.1dev # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. diff --git a/examples/README.txt b/examples/README.txt index 5b47df44..b67529c1 100644 --- a/examples/README.txt +++ b/examples/README.txt @@ -146,6 +146,11 @@ This is a simple "hello world" type program that shows a ~10 line application program calling out to a ~5 line ispc program to do a simple computation. +Sort +==== +This is a bucket sort of 32 bit unsigned integers. +By default 1000000 random elements get sorted. +Call ./sort N in order to sort N elements instead. Volume ====== diff --git a/examples/aobench/ao.cpp b/examples/aobench/ao.cpp index cbe75a0b..2286316d 100644 --- a/examples/aobench/ao.cpp +++ b/examples/aobench/ao.cpp @@ -138,7 +138,7 @@ int main(int argc, char **argv) } // Report results and save image - printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n", + printf("[aobench ispc]:\t\t\t[%.3f] million cycles (%d x %d image)\n", minTimeISPC, width, height); savePPM("ao-ispc.ppm", width, height); @@ -158,7 +158,7 @@ int main(int argc, char **argv) } // Report results and save image - printf("[aobench ispc + tasks]:\t\t[%.3f] M cycles (%d x %d image)\n", + printf("[aobench ispc + tasks]:\t\t[%.3f] million cycles (%d x %d image)\n", minTimeISPCTasks, width, height); savePPM("ao-ispc-tasks.ppm", width, height); @@ -176,7 +176,7 @@ int main(int argc, char **argv) } // Report more results, save another image... 
- printf("[aobench serial]:\t\t[%.3f] M cycles (%d x %d image)\n", minTimeSerial, + printf("[aobench serial]:\t\t[%.3f] million cycles (%d x %d image)\n", minTimeSerial, width, height); printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks); diff --git a/examples/common.mk b/examples/common.mk index cdfc4c6a..367d3eb3 100644 --- a/examples/common.mk +++ b/examples/common.mk @@ -9,18 +9,26 @@ CC=gcc CCFLAGS=-Iobjs/ -O2 LIBS=-lm $(TASK_LIB) -lstdc++ -ISPC=ispc -O2 $(ISPC_FLAGS) +ISPC=ispc +ISPC_FLAGS=-O2 ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h) -ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/arm.*/arm/ -e s/sa110/arm/) +ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/) ifeq ($(ARCH),x86) ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o $(ISPC_SRC:.ispc=)_ispc_sse2.o \ $(ISPC_SRC:.ispc=)_ispc_sse4.o $(ISPC_SRC:.ispc=)_ispc_avx.o) ISPC_TARGETS=$(ISPC_IA_TARGETS) - ISPC_FLAGS += --arch=x86-64 - CXXFLAGS += -m64 - CCFLAGS += -m64 + ARCH_BIT:=$(shell getconf LONG_BIT) + ifeq ($(ARCH_BIT),32) + ISPC_FLAGS += --arch=x86 + CXXFLAGS += -m32 + CCFLAGS += -m32 + else + ISPC_FLAGS += --arch=x86-64 + CXXFLAGS += -m64 + CCFLAGS += -m64 + endif else ifeq ($(ARCH),arm) ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=_ispc.o)) ISPC_TARGETS=$(ISPC_ARM_TARGETS) @@ -44,7 +52,7 @@ dirs: objs/%.cpp objs/%.o objs/%.h: dirs clean: - /bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 + /bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 ref test $(EXAMPLE): $(OBJS) $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) @@ -61,10 +69,10 @@ objs/%.o: ../%.cpp dirs objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc - $(ISPC) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h + $(ISPC) $(ISPC_FLAGS) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h objs/$(ISPC_SRC:.ispc=)_sse4.cpp: $(ISPC_SRC) - $(ISPC) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h + $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h objs/$(ISPC_SRC:.ispc=)_sse4.o: objs/$(ISPC_SRC:.ispc=)_sse4.cpp $(CXX) -I../intrinsics -msse4.2 $< $(CXXFLAGS) -c -o $@ @@ -73,7 +81,7 @@ $(EXAMPLE)-sse4: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_sse4.o $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) objs/$(ISPC_SRC:.ispc=)_generic16.cpp: $(ISPC_SRC) - $(ISPC) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h + $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h objs/$(ISPC_SRC:.ispc=)_generic16.o: objs/$(ISPC_SRC:.ispc=)_generic16.cpp $(CXX) -I../intrinsics $< $(CXXFLAGS) -c -o $@ @@ -82,7 +90,7 @@ $(EXAMPLE)-generic16: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_generic16.o $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) objs/$(ISPC_SRC:.ispc=)_scalar.o: $(ISPC_SRC) - $(ISPC) $< -o $@ --target=generic-1 + $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-1 $(EXAMPLE)-scalar: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_scalar.o $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) diff --git a/examples/deferred/main.cpp b/examples/deferred/main.cpp index 17bd3f42..4f2be879 100644 --- a/examples/deferred/main.cpp +++ b/examples/deferred/main.cpp @@ -130,7 +130,7 @@ int main(int argc, char** argv) { printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n", serialCycles/ispcCycles, serialCycles/dynamicCilkCycles); #else - printf("\t\t\t\t(%.2fx 
speedup from ISPC)\n", serialCycles/ispcCycles); + printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", serialCycles/ispcCycles); #endif // __cilk DeleteInputData(input); diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index 828c1ab4..d81101f7 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1162,19 +1162,20 @@ REDUCE_ADD(double, __vec16_d, __reduce_add_double) REDUCE_MINMAX(double, __vec16_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec16_d, __reduce_max_double, >) -REDUCE_ADD(uint32_t, __vec16_i32, __reduce_add_int32) +REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec16_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec16_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec16_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >) @@ -1758,3 +1759,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} + +#endif // !WIN32 + diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 64b82cb1..531ed215 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -408,15 +408,15 @@ static FORCEINLINE uint64_t __movmsk(__vec32_i1 mask) { return (uint64_t)mask.v; } -static FORCEINLINE __vec32_i1 __any(__vec32_i1 mask) { +static FORCEINLINE bool __any(__vec32_i1 mask) { return (mask.v!=0); } -static FORCEINLINE __vec32_i1 __all(__vec32_i1 mask) { - return (mask.v==0xFFFFFFFF); +static FORCEINLINE bool __all(__vec32_i1 mask) { + return (mask.v==0xFFFFFFFFul); } -static FORCEINLINE __vec32_i1 __none(__vec32_i1 mask) { +static FORCEINLINE bool __none(__vec32_i1 mask) { return (mask.v==0); } @@ -1231,19 +1231,20 @@ REDUCE_ADD(double, __vec32_d, __reduce_add_double) REDUCE_MINMAX(double, __vec32_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec32_d, __reduce_max_double, >) -REDUCE_ADD(uint32_t, __vec32_i32, __reduce_add_int32) +//REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +//REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec32_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec32_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec32_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec32_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, 
__vec32_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec32_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec32_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec32_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec32_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec32_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec32_i64, __reduce_max_uint64, >) @@ -1826,3 +1827,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // WIN32 + +#undef FORCEINLINE diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index 7869faa5..bbeb007a 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -533,15 +533,15 @@ static FORCEINLINE uint64_t __movmsk(__vec64_i1 mask) { return (uint64_t)mask.v; } -static FORCEINLINE __vec64_i1 __any(__vec64_i1 mask) { +static FORCEINLINE bool __any(__vec64_i1 mask) { return (mask.v!=0); } -static FORCEINLINE __vec64_i1 __all(__vec64_i1 mask) { - return (mask.v==0xFFFFFFFFFFFFFFFF); +static FORCEINLINE bool __all(__vec64_i1 mask) { + return (mask.v==0xFFFFFFFFFFFFFFFFull); } -static FORCEINLINE __vec64_i1 __none(__vec64_i1 mask) { +static FORCEINLINE bool __none(__vec64_i1 mask) { return (mask.v==0); } @@ -1364,19 +1364,20 @@ REDUCE_ADD(double, __vec64_d, __reduce_add_double) REDUCE_MINMAX(double, __vec64_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec64_d, __reduce_max_double, >) -REDUCE_ADD(uint32_t, __vec64_i32, __reduce_add_int32) +//REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +//REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec64_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec64_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec64_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec64_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, __vec64_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec64_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec64_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec64_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec64_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec64_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec64_i64, __reduce_max_uint64, >) @@ -1959,3 +1960,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif + +#undef FORCEINLINE diff --git 
a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h new file mode 100644 index 00000000..78d35ddc --- /dev/null +++ b/examples/intrinsics/knc-i1x16.h @@ -0,0 +1,2760 @@ +/** + Copyright (c) 2010-2013, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +#define FORCEINLINE __forceinline +#define PRE_ALIGN(x) /*__declspec(align(x))*/ +#define POST_ALIGN(x) +#define roundf(x) (floorf(x + .5f)) +#define round(x) (floor(x + .5)) +#else +#define FORCEINLINE __forceinline +#define PRE_ALIGN(x) +#define POST_ALIGN(x) __attribute__ ((aligned(x))) +#endif + +#define KNC 1 +#if 0 +extern "C" +{ + int printf(const unsigned char *, ...); + int puts(unsigned char *); + unsigned int putchar(unsigned int); + int fflush(void *); + uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t); + uint8_t *memset(uint8_t *, uint8_t, uint64_t); + void memset_pattern16(void *, const void *, uint64_t); +} +#endif + +typedef float __vec1_f; +typedef double __vec1_d; +typedef int8_t __vec1_i8; +typedef int16_t __vec1_i16; +typedef int32_t __vec1_i32; +typedef int64_t __vec1_i64; + +/************ mask **************/ + +struct __vec16_i1 +{ + __mmask16 v; + + FORCEINLINE __vec16_i1() { } + FORCEINLINE __vec16_i1(const __mmask16 &vv) : v(vv) { } + FORCEINLINE __vec16_i1(bool v0, bool v1, bool v2, bool v3, + bool v4, bool v5, bool v6, bool v7, + bool v8, bool v9, bool v10, bool v11, + bool v12, bool v13, bool v14, bool v15) { + v = ((v0 & 1) | + ((v1 & 1) << 1) | + ((v2 & 1) << 2) | + ((v3 & 1) << 3) | + ((v4 & 1) << 4) | + ((v5 & 1) << 5) | + ((v6 & 1) << 6) | + ((v7 & 1) << 7) | + ((v8 & 1) << 8) | + ((v9 & 1) << 9) | + ((v10 & 1) << 10) | + ((v11 & 1) << 11) | + ((v12 & 1) << 12) | + ((v13 & 1) << 13) | + ((v14 & 1) << 14) | + ((v15 & 1) << 15)); + } + + FORCEINLINE operator __mmask16() const { return v; } +}; + +/************ vector **************/ + +struct PRE_ALIGN(64) __vec16_i32 +{ + __m512i v; + FORCEINLINE operator __m512i() const { return v; } + FORCEINLINE __vec16_i32() : 
v(_mm512_undefined_epi32()) {} + FORCEINLINE __vec16_i32(const int32_t &in) : v(_mm512_set1_epi32(in)) {} + FORCEINLINE __vec16_i32(const __m512i &in) : v(in) {} + FORCEINLINE __vec16_i32(const __vec16_i32 &o) : v(o.v) {} + FORCEINLINE __vec16_i32& operator =(const __vec16_i32 &o) { v=o.v; return *this; } + FORCEINLINE __vec16_i32(int32_t v00, int32_t v01, int32_t v02, int32_t v03, + int32_t v04, int32_t v05, int32_t v06, int32_t v07, + int32_t v08, int32_t v09, int32_t v10, int32_t v11, + int32_t v12, int32_t v13, int32_t v14, int32_t v15) : + v ( _mm512_set_16to16_pi(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00) ) {} + FORCEINLINE const int32_t& operator[](const int i) const { return ((int32_t*)this)[i]; } + FORCEINLINE int32_t& operator[](const int i) { return ((int32_t*)this)[i]; } +} POST_ALIGN(64); + +PRE_ALIGN(64) struct __vec16_f +{ + __m512 v; + FORCEINLINE operator __m512() const { return v; } + FORCEINLINE __vec16_f() : v(_mm512_undefined_ps()) { } + FORCEINLINE __vec16_f(const __m512 &in) : v(in) {} + FORCEINLINE __vec16_f(const __vec16_f &o) : v(o.v) {} + FORCEINLINE __vec16_f& operator =(const __vec16_f &o) { v=o.v; return *this; } + FORCEINLINE __vec16_f(float v00, float v01, float v02, float v03, + float v04, float v05, float v06, float v07, + float v08, float v09, float v10, float v11, + float v12, float v13, float v14, float v15) : + v ( _mm512_set_16to16_ps(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00) ) {} + FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } + FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } +} POST_ALIGN(64); + +static void zmm2hilo(const __m512i v1, const __m512i v2, __m512i &_hi, __m512i &_lo) +{ + _hi = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v2); + _hi = _mm512_mask_permutevar_epi32(_hi, 0x00FF, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v1); + _lo = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v2); + _lo = _mm512_mask_permutevar_epi32(_lo, 0x00FF, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v1); +} +static void hilo2zmm(const __m512i v_hi, const __m512i v_lo, __m512i &_v1, __m512i &_v2) +{ + _v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v_hi); + _v2 = _mm512_mask_permutevar_epi32(_v2, 0x5555, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v_lo); + _v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v_hi); + _v1 = _mm512_mask_permutevar_epi32(_v1, 0x5555, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v_lo); +} + +struct PRE_ALIGN(128) __vec16_d +{ + union { + __m512d v1; + __m512d v_hi; + }; + union { + __m512d v2; + __m512d v_lo; + }; + FORCEINLINE __vec16_d() : v1(_mm512_undefined_pd()), v2(_mm512_undefined_pd()) {} + FORCEINLINE __vec16_d(const __m512d _v1, const __m512d _v2) : v1(_v1), v2(_v2) {} + FORCEINLINE __vec16_d(const __vec16_d &o) : v1(o.v1), v2(o.v2) {} + FORCEINLINE __vec16_d& operator =(const __vec16_d &o) { v1=o.v1; v2=o.v2; return *this; } + FORCEINLINE __vec16_d(double v00, double v01, double v02, double v03, + double v04, double v05, double v06, double v07, + double v08, double v09, double 
v10, double v11, + double v12, double v13, double v14, double v15) { + v1 = _mm512_set_8to8_pd(v15, v14, v13, v12, v11, v10, v09, v08); + v2 = _mm512_set_8to8_pd(v07, v06, v05, v04, v03, v02, v01, v00); + } + FORCEINLINE const double& operator[](const int i) const { return ((double*)this)[i]; } + FORCEINLINE double& operator[](const int i) { return ((double*)this)[i]; } + FORCEINLINE __vec16_d cvt2hilo() const + { + const __m512i _v1 = _mm512_castpd_si512(v1); + const __m512i _v2 = _mm512_castpd_si512(v2); + __m512i _hi, _lo; + zmm2hilo(_v1, _v2, _hi, _lo); + return __vec16_d(_mm512_castsi512_pd(_hi), _mm512_castsi512_pd(_lo)); + } + FORCEINLINE __vec16_d cvt2zmm() const + { + const __m512i _hi = _mm512_castpd_si512(v_hi); + const __m512i _lo = _mm512_castpd_si512(v_lo); + __m512i _v1, _v2; + hilo2zmm(_hi,_lo, _v1,_v2); + return __vec16_d(_mm512_castsi512_pd(_v1), _mm512_castsi512_pd(_v2)); + } +} POST_ALIGN(128); + +struct PRE_ALIGN(128) __vec16_i64 +{ + union { + __m512i v1; + __m512i v_hi; + }; + union + { + __m512i v2; + __m512i v_lo; + }; + FORCEINLINE __vec16_i64() : v1(_mm512_undefined_epi32()), v2(_mm512_undefined_epi32()) {} + FORCEINLINE __vec16_i64(const __m512i _v1, const __m512i _v2) : v1(_v1), v2(_v2) {} + FORCEINLINE __vec16_i64(const __vec16_i64 &o) : v1(o.v1), v2(o.v2) {} + FORCEINLINE __vec16_i64& operator =(const __vec16_i64 &o) { v1=o.v1; v2=o.v2; return *this; } + FORCEINLINE __vec16_i64(int64_t v00, int64_t v01, int64_t v02, int64_t v03, + int64_t v04, int64_t v05, int64_t v06, int64_t v07, + int64_t v08, int64_t v09, int64_t v10, int64_t v11, + int64_t v12, int64_t v13, int64_t v14, int64_t v15) { + v2 = _mm512_set_8to8_epi64(v15, v14, v13, v12, v11, v10, v09, v08); + v1 = _mm512_set_8to8_epi64(v07, v06, v05, v04, v03, v02, v01, v00); + } + FORCEINLINE const int64_t& operator[](const int i) const { return ((int64_t*)this)[i]; } + FORCEINLINE int64_t& operator[](const int i) { return ((int64_t*)this)[i]; } + FORCEINLINE __vec16_i64 cvt2hilo() const + { + __vec16_i64 ret; + zmm2hilo(v1,v2,ret.v_hi,ret.v_lo); + return ret; + } + FORCEINLINE __vec16_i64 cvt2zmm() const + { + __vec16_i64 ret; + hilo2zmm(v_hi,v_lo, ret.v1, ret.v2); + return ret; + } +} POST_ALIGN(128); + +/************ scalar **************/ + +template +struct vec16 +{ + FORCEINLINE vec16() { } + FORCEINLINE vec16(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) { + data[0] = v0; data[1] = v1; data[2] = v2; data[3] = v3; + data[4] = v4; data[5] = v5; data[6] = v6; data[7] = v7; + data[8] = v8; data[9] = v9; data[10] = v10; data[11] = v11; + data[12] = v12; data[13] = v13; data[14] = v14; data[15] = v15; + } + T data[16]; + FORCEINLINE const T& operator[](const int i) const { return data[i]; } + FORCEINLINE T& operator[](const int i) { return data[i]; } +}; + +PRE_ALIGN(16) struct __vec16_i8 : public vec16 { + __vec16_i8() { } + __vec16_i8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, + int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } +} POST_ALIGN(16); + +PRE_ALIGN(32) struct __vec16_i16 : public vec16 { + __vec16_i16() { } + __vec16_i16(int16_t v0, int16_t v1, int16_t v2, int16_t v3, + int16_t v4, int16_t v5, int16_t v6, int16_t v7, + int16_t v8, int16_t v9, int16_t v10, int16_t v11, + int16_t v12, int16_t v13, int16_t v14, int16_t v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, 
v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } +} POST_ALIGN(32); + +static inline int32_t __extract_element(__vec16_i32, int); + + +/////////////////////////////////////////////////////////////////////////// +// macros... + +/* knc::macro::used */ +#define BINARY_OP(TYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = a[i] OP b[i]; \ + return ret; \ +} + +/* knc::macro::used */ +#define BINARY_OP_CAST(TYPE, CAST, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = (CAST)(a[i]) OP (CAST)(b[i]); \ + return ret; \ +} + +/* knc::macro::used */ +#define BINARY_OP_FUNC(TYPE, NAME, FUNC) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = FUNC(a[i], b[i]); \ + return ret; \ +} + +/* knc::macro::used */ +#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \ +static FORCEINLINE __vec16_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \ + __vec16_i1 ret; \ + ret.v = 0; \ + for (int i = 0; i < 16; ++i) \ + ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i; \ + return ret; \ +} \ +static FORCEINLINE __vec16_i1 NAME##_##SUFFIX##_and_mask(TYPE a, TYPE b, \ + __vec16_i1 mask) { \ + __vec16_i1 ret; \ + ret.v = 0; \ + for (int i = 0; i < 16; ++i) \ + ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i; \ + ret.v &= mask.v; \ + return ret; \ +} + +/* knc::macro::used */ +#define INSERT_EXTRACT(VTYPE, STYPE) \ +static FORCEINLINE STYPE __extract_element(VTYPE v, int index) { \ + return ((STYPE *)&v)[index]; \ +} \ +static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \ + ((STYPE *)v)[index] = val; \ +} + +/* knc::macro::used */ +#define LOAD_STORE(VTYPE, STYPE) \ +template \ +static FORCEINLINE VTYPE __load(const VTYPE *p) { \ + STYPE *ptr = (STYPE *)p; \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = ptr[i]; \ + return ret; \ +} \ +template \ +static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \ + STYPE *ptr = (STYPE *)p; \ + for (int i = 0; i < 16; ++i) \ + ptr[i] = v[i]; \ +} + +/* knc::macro::used */ +#define REDUCE_ADD(TYPE, VTYPE, NAME) \ +static FORCEINLINE TYPE NAME(VTYPE v) { \ + TYPE ret = v[0]; \ + for (int i = 1; i < 16; ++i) \ + ret = ret + v[i]; \ + return ret; \ +} + +/* knc::macro::used */ +#define REDUCE_MINMAX(TYPE, VTYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(VTYPE v) { \ + TYPE ret = v[0]; \ + for (int i = 1; i < 16; ++i) \ + ret = (ret OP (TYPE)v[i]) ? 
ret : (TYPE)v[i]; \
+    return ret; \
+}
+
+/* knc::macro::used */
+#define SELECT(TYPE) \
+static FORCEINLINE TYPE __select(__vec16_i1 mask, TYPE a, TYPE b) { \
+    TYPE ret; \
+    for (int i = 0; i < 16; ++i) \
+        ret[i] = (mask.v & (1 << i)) ? a[i] : b[i]; \
+    return ret; \
+} \
+static FORCEINLINE TYPE __select(bool cond, TYPE a, TYPE b) { \
+    return cond ? a : b; \
+}
+
+/* knc::macro::used */
+#define SHIFT_UNIFORM(TYPE, CAST, NAME, OP) \
+static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \
+    TYPE ret; \
+    for (int i = 0; i < 16; ++i) \
+        ret[i] = (CAST)(a[i]) OP b; \
+    return ret; \
+}
+
+/* knc::macro::used */
+#define SMEAR(VTYPE, NAME, STYPE) \
+template <class RetVecType> VTYPE __smear_##NAME(STYPE); \
+template <> FORCEINLINE VTYPE __smear_##NAME<VTYPE>(STYPE v) { \
+    VTYPE ret; \
+    for (int i = 0; i < 16; ++i) \
+        ret[i] = v; \
+    return ret; \
+}
+
+/* knc::macro::used */
+#define SETZERO(VTYPE, NAME) \
+template <class RetVecType> VTYPE __setzero_##NAME(); \
+template <> FORCEINLINE VTYPE __setzero_##NAME<VTYPE>() { \
+    VTYPE ret; \
+    for (int i = 0; i < 16; ++i) \
+        ret[i] = 0; \
+    return ret; \
+}
+
+/* knc::macro::used */
+#define UNDEF(VTYPE, NAME) \
+template <class RetVecType> VTYPE __undef_##NAME(); \
+template <> FORCEINLINE VTYPE __undef_##NAME<VTYPE>() { \
+    return VTYPE(); \
+}
+
+/* knc::macro::used */
+#define BROADCAST(VTYPE, NAME, STYPE) \
+static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \
+    VTYPE ret; \
+    for (int i = 0; i < 16; ++i) \
+        ret[i] = v[index & 0xf]; \
+    return ret; \
+} \
+
+/* knc::macro::used */
+#define ROTATE(VTYPE, NAME, STYPE) \
+static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) { \
+    VTYPE ret; \
+    for (int i = 0; i < 16; ++i) \
+        ret[i] = v[(i+index) & 0xf]; \
+    return ret; \
+} \
+
+/* knc::macro::used */
+#define SHUFFLES(VTYPE, NAME, STYPE) \
+static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec16_i32 index) { \
+    VTYPE ret; \
+    for (int i = 0; i < 16; ++i) \
+        ret[i] = v[__extract_element(index, i) & 0xf]; \
+    return ret; \
+} \
+static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec16_i32 index) { \
+    VTYPE ret; \
+    for (int i = 0; i < 16; ++i) { \
+        int ii = __extract_element(index, i) & 0x1f; \
+        ret[i] = (ii < 16) ? v0[ii] : v1[ii-16]; \
+    } \
+    return ret; \
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+INSERT_EXTRACT(__vec1_i8, int8_t)
+INSERT_EXTRACT(__vec1_i16, int16_t)
+INSERT_EXTRACT(__vec1_i32, int32_t)
+INSERT_EXTRACT(__vec1_i64, int64_t)
+INSERT_EXTRACT(__vec1_f, float)
+INSERT_EXTRACT(__vec1_d, double)
+
+///////////////////////////////////////////////////////////////////////////
+// mask
+///////////////////////////////////////////////////////////////////////////
+
+static FORCEINLINE __vec16_i1 __movmsk(__vec16_i1 mask) { return _mm512_kmov (mask); }
+static FORCEINLINE bool __any (__vec16_i1 mask) { return !_mm512_kortestz(mask, mask); }
+static FORCEINLINE bool __all (__vec16_i1 mask) { return _mm512_kortestc(mask, mask); }
+static FORCEINLINE bool __none (__vec16_i1 mask) { return _mm512_kortestz(mask, mask); }
+static FORCEINLINE __vec16_i1 __not (__vec16_i1 mask) { return _mm512_knot (mask); }
+
+static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { return _mm512_kxnor (a,b); }
+static FORCEINLINE __vec16_i1 __and (__vec16_i1 a, __vec16_i1 b) { return _mm512_kand (a,b); }
+static FORCEINLINE __vec16_i1 __xor (__vec16_i1 a, __vec16_i1 b) { return _mm512_kxor (a,b); }
+static FORCEINLINE __vec16_i1 __or (__vec16_i1 a, __vec16_i1 b) { return _mm512_kor (a,b); }
+static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) { return _mm512_kandn (a,b); }
+static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { return _mm512_kandnr(a,b); }
+
+static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a, __vec16_i1 b) { return __or(__and(a, mask), __and_not2(b, mask)); }
+static FORCEINLINE __vec16_i1 __select( bool cond, __vec16_i1 a, __vec16_i1 b) { return cond ?
a : b; } + +static FORCEINLINE bool __extract_element(__vec16_i1 vec, int index) { return (vec.v & (1 << index)) ? true : false; } +static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index, bool val) +{ + if (val == false) vec->v &= ~(1 << index); + else vec->v |= (1 << index); +} + +template static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) +{ + return *p; +} + +template static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v) +{ + *p = v; +} + +template RetVecType __smear_i1(int i); +template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { return i?0xFFFF:0x0; } + +template RetVecType __setzero_i1(); +template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { return 0; } + +template __vec16_i1 __undef_i1(); +template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { return __vec16_i1(); } + +/////////////////////////////////////////////////////////////////////////// +// int8 +/////////////////////////////////////////////////////////////////////////// + +BINARY_OP(__vec16_i8, __add, +) +BINARY_OP(__vec16_i8, __sub, -) +BINARY_OP(__vec16_i8, __mul, *) + +BINARY_OP(__vec16_i8, __or, |) +BINARY_OP(__vec16_i8, __and, &) +BINARY_OP(__vec16_i8, __xor, ^) +BINARY_OP(__vec16_i8, __shl, <<) + +BINARY_OP_CAST(__vec16_i8, uint8_t, __udiv, /) +BINARY_OP_CAST(__vec16_i8, int8_t, __sdiv, /) + +BINARY_OP_CAST(__vec16_i8, uint8_t, __urem, %) +BINARY_OP_CAST(__vec16_i8, int8_t, __srem, %) +BINARY_OP_CAST(__vec16_i8, uint8_t, __lshr, >>) +BINARY_OP_CAST(__vec16_i8, int8_t, __ashr, >>) + +SHIFT_UNIFORM(__vec16_i8, uint8_t, __lshr, >>) +SHIFT_UNIFORM(__vec16_i8, int8_t, __ashr, >>) +SHIFT_UNIFORM(__vec16_i8, int8_t, __shl, <<) + +CMP_OP(__vec16_i8, i8, int8_t, __equal, ==) +CMP_OP(__vec16_i8, i8, int8_t, __not_equal, !=) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i8, i8, int8_t, __signed_less_equal, <=) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_than, <) +CMP_OP(__vec16_i8, i8, int8_t, __signed_less_than, <) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_than, >) + +SELECT(__vec16_i8) +INSERT_EXTRACT(__vec16_i8, int8_t) +SMEAR(__vec16_i8, i8, int8_t) +SETZERO(__vec16_i8, i8) +UNDEF(__vec16_i8, i8) +BROADCAST(__vec16_i8, i8, int8_t) +ROTATE(__vec16_i8, i8, int8_t) +SHUFFLES(__vec16_i8, i8, int8_t) +LOAD_STORE(__vec16_i8, int8_t) + +/////////////////////////////////////////////////////////////////////////// +// int16 +/////////////////////////////////////////////////////////////////////////// + +BINARY_OP(__vec16_i16, __add, +) +BINARY_OP(__vec16_i16, __sub, -) +BINARY_OP(__vec16_i16, __mul, *) + +BINARY_OP(__vec16_i16, __or, |) +BINARY_OP(__vec16_i16, __and, &) +BINARY_OP(__vec16_i16, __xor, ^) +BINARY_OP(__vec16_i16, __shl, <<) + +BINARY_OP_CAST(__vec16_i16, uint16_t, __udiv, /) +BINARY_OP_CAST(__vec16_i16, int16_t, __sdiv, /) + +BINARY_OP_CAST(__vec16_i16, uint16_t, __urem, %) +BINARY_OP_CAST(__vec16_i16, int16_t, __srem, %) +BINARY_OP_CAST(__vec16_i16, uint16_t, __lshr, >>) +BINARY_OP_CAST(__vec16_i16, int16_t, __ashr, >>) + +SHIFT_UNIFORM(__vec16_i16, uint16_t, __lshr, >>) +SHIFT_UNIFORM(__vec16_i16, int16_t, __ashr, >>) +SHIFT_UNIFORM(__vec16_i16, int16_t, __shl, <<) + +CMP_OP(__vec16_i16, i16, int16_t, __equal, ==) +CMP_OP(__vec16_i16, i16, int16_t, __not_equal, !=) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_equal, <=) 
+CMP_OP(__vec16_i16, i16, int16_t, __signed_less_equal, <=) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_than, <) +CMP_OP(__vec16_i16, i16, int16_t, __signed_less_than, <) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_than, >) + +SELECT(__vec16_i16) +INSERT_EXTRACT(__vec16_i16, int16_t) +SMEAR(__vec16_i16, i16, int16_t) +SETZERO(__vec16_i16, i16) +UNDEF(__vec16_i16, i16) +BROADCAST(__vec16_i16, i16, int16_t) +ROTATE(__vec16_i16, i16, int16_t) +SHUFFLES(__vec16_i16, i16, int16_t) +LOAD_STORE(__vec16_i16, int16_t) + +/////////////////////////////////////////////////////////////////////////// +// int32 +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_i32 __add (__vec16_i32 a, __vec16_i32 b) { return _mm512_add_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __sub (__vec16_i32 a, __vec16_i32 b) { return _mm512_sub_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __mul (__vec16_i32 a, __vec16_i32 b) { return _mm512_mullo_epi32(a,b); } +static FORCEINLINE __vec16_i32 __udiv(__vec16_i32 a, __vec16_i32 b) { return _mm512_div_epu32 (a,b); } +static FORCEINLINE __vec16_i32 __sdiv(__vec16_i32 a, __vec16_i32 b) { return _mm512_div_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __urem(__vec16_i32 a, __vec16_i32 b) { return _mm512_rem_epu32 (a,b); } +static FORCEINLINE __vec16_i32 __srem(__vec16_i32 a, __vec16_i32 b) { return _mm512_rem_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __or (__vec16_i32 a, __vec16_i32 b) { return _mm512_or_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __and (__vec16_i32 a, __vec16_i32 b) { return _mm512_and_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __xor (__vec16_i32 a, __vec16_i32 b) { return _mm512_xor_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __shl (__vec16_i32 a, __vec16_i32 b) { return _mm512_sllv_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, __vec16_i32 b) { return _mm512_srlv_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, __vec16_i32 b) { return _mm512_srav_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __shl (__vec16_i32 a, int32_t n) { return _mm512_slli_epi32 (a,n); } +static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, int32_t n) { return _mm512_srli_epi32 (a,n); } +static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) { return _mm512_srai_epi32 (a,n); } + +static FORCEINLINE __vec16_i1 __equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpeq_epi32_mask (a,b); } +static FORCEINLINE __vec16_i1 __not_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpneq_epi32_mask(a,b); } +static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmple_epu32_mask (a,b); } +static FORCEINLINE __vec16_i1 __signed_less_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmple_epi32_mask (a,b); } +static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpge_epu32_mask (a,b); } +static FORCEINLINE __vec16_i1 __signed_greater_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpge_epi32_mask (a,b); } +static FORCEINLINE __vec16_i1 __unsigned_less_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmplt_epu32_mask (a,b); } +static FORCEINLINE __vec16_i1 __signed_less_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmplt_epi32_mask 
(a,b); } +static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpgt_epu32_mask (a,b); } +static FORCEINLINE __vec16_i1 __signed_greater_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpgt_epi32_mask (a,b); } + +static FORCEINLINE __vec16_i1 __equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpeq_epi32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __not_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpneq_epi32_mask(m,a,b); } +static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmple_epu32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __signed_less_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmple_epi32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpge_epu32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __signed_greater_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpge_epi32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __unsigned_less_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmplt_epu32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __signed_less_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmplt_epi32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpgt_epu32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __signed_greater_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpgt_epi32_mask (m,a,b); } + +static FORCEINLINE __vec16_i32 __select(__vec16_i1 mask, __vec16_i32 a, __vec16_i32 b) { return _mm512_mask_mov_epi32(b, mask, a); } +static FORCEINLINE __vec16_i32 __select( bool cond, __vec16_i32 a, __vec16_i32 b) { return cond ? 
a : b; } + +static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int32_t index) { return v[index]; } +static FORCEINLINE void __insert_element (__vec16_i32 *v, uint32_t index, int32_t val) { (*v)[index] = val; } + +template RetVecType __smear_i32(int32_t i); +template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); } + +static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1); +static const __vec16_i32 __ispc_zero = __smear_i32<__vec16_i32>(0); +static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32); +static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1); +static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + +template RetVecType __setzero_i32(); +template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { return _mm512_setzero_epi32(); } + +template RetVecType __undef_i32(); +template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { return __vec16_i32(); } + +static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) { return _mm512_mask_permutevar_epi32(v, 0xFFFF, _mm512_set1_epi32(index), v); } + +static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index) +{ + __vec16_i32 idx = __smear_i32<__vec16_i32>(index); + __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0xF)); + return _mm512_mask_permutevar_epi32(v, 0xFFFF, shuffle, v); +} + +static FORCEINLINE __vec16_i32 __shuffle_i32 (__vec16_i32 v, __vec16_i32 index) +{ + return _mm512_mask_permutevar_epi32(v, 0xFFFF, __and(index, __smear_i32<__vec16_i32>(0xF)), v); +} +static FORCEINLINE __vec16_i32 __shuffle2_i32(__vec16_i32 v0, __vec16_i32 v1, __vec16_i32 index) +{ + const __vec16_i1 mask = __signed_less_than_i32(index, __smear_i32<__vec16_i32>(0x10)); + index = __and(index, __smear_i32<__vec16_i32>(0xF)); + __vec16_i32 ret = __undef_i32<__vec16_i32>(); + ret = _mm512_mask_permutevar_epi32(ret, mask, index, v0); + ret = _mm512_mask_permutevar_epi32(ret, __not(mask), index, v1); + return ret; +} + +template static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED + return __load<64>(p); +#else + __vec16_i32 v; + v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_extloadunpackhi_epi32(v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + return v; +#endif +} + +template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 v) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED + __store<64>(p,v); +#else + _mm512_extpackstorelo_epi32( p, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +#endif +} + +#if 0 /* knc::fails ./tests/foreach-25.ispc ./tests/forach-26.ispc ./tests/foreach-27.ispc */ +template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) +{ + return _mm512_load_epi32(p); +} +template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) +{ + _mm512_store_epi32(p, v); +} +#endif + +/////////////////////////////////////////////////////////////////////////// +// int64 +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_i64 __add(__vec16_i64 a, __vec16_i64 b) +{ + return __vec16_i64(_mm512_add_epi64(a.v1, b.v1), _mm512_add_epi64(a.v2,b.v2)); +} + +static FORCEINLINE __vec16_i64 
__sub(__vec16_i64 _a, __vec16_i64 _b) +{ +#if __ICC >= 99999 /* compiler gate, icc >= 99999 will hopefully support _mm512_sub_epi64 */ + return __vec16_i64(_mm512_sub_epi64(_a.v1, _b.v1), _mm512_sub_epi64(_a.v2,_b.v2)); +#else + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + __vec16_i64 ret; + __mmask16 borrow = 0; + ret.v_lo = _mm512_subsetb_epi32(a.v_lo, b.v_lo, &borrow); + ret.v_hi = _mm512_sbb_epi32 (a.v_hi, borrow, b.v_hi, &borrow); + return ret.cvt2zmm(); +#endif +} + +static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &_b) +{ + const __vec16_i64 b = _b.cvt2hilo(); + return __vec16_i64(_mm512_mullo_epi32(a.v,b.v_lo), + _mm512_add_epi32(_mm512_mullo_epi32(a.v, b.v_hi), + _mm512_mulhi_epi32(a.v, b.v_lo))).cvt2zmm(); +} + +static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask, __vec16_i64 a, __vec16_i64 b) +{ + __vec16_i64 ret; + ret.v1 = _mm512_mask_mov_epi64(b.v1, mask, a.v1); + ret.v2 = _mm512_mask_mov_epi64(b.v2, mask >> 8, a.v2); + return ret; +} + +#if __ICC >= 1400 /* compiler gate, icc >= 14.0.0 support _mm512_mullox_epi64 */ +static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b) +{ + return __vec16_i64(_mm512_mullox_epi64(a.v1,b.v1), _mm512_mullox_epi64(a.v2,b.v2)); +} +#else /* __ICC >= 1400 */ +static FORCEINLINE void __abs_i32i64(__m512i &_hi, __m512i &_lo) +{ + /* abs(x) : + * mask = x >> 32; + * abs(x) = (x^mask) - mask + */ + const __vec16_i32 mask = __ashr(_hi, __ispc_thirty_two); + __vec16_i32 hi = __xor(_hi, mask); + __vec16_i32 lo = __xor(_lo, mask); + __mmask16 borrow = 0; + _lo = _mm512_subsetb_epi32(lo, mask, &borrow); + _hi = _mm512_sbb_epi32 (hi, borrow, mask, &borrow); +} +static FORCEINLINE __vec16_i64 __mul(__vec16_i64 _a, __vec16_i64 _b) +{ + __vec16_i64 a = _a.cvt2hilo(); + __vec16_i64 b = _b.cvt2hilo(); + /* sign = (a^b) >> 32, if sign == 0 then a*b >= 0, otherwise a*b < 0 */ + const __vec16_i1 sign = __not_equal_i32(__ashr(__xor(a.v_hi, b.v_hi), __ispc_thirty_two), __ispc_zero); + __abs_i32i64(a.v_hi, a.v_lo); /* abs(a) */ + __abs_i32i64(b.v_hi, b.v_lo); /* abs(b) */ + const __vec16_i32 lo_m1 = _mm512_mullo_epi32(a.v_lo, b.v_lo); + const __vec16_i32 hi_m1 = _mm512_mulhi_epu32(a.v_lo, b.v_lo); + const __vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo); + const __vec16_i32 hi_m3 = _mm512_mullo_epi32(a.v_lo, b.v_hi); + __mmask16 carry; + const __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m3, &carry); + const __vec16_i32 hi = _mm512_adc_epi32(hi_p23, carry, hi_m1, &carry); + const __vec16_i32 lo = lo_m1; + const __vec16_i64 ret_abs = __vec16_i64(hi,lo).cvt2zmm(); + /* if sign != 0, means either a or b is negative, then negate the result */ + return __select(sign, __sub(__vec16_i64(__ispc_zero, __ispc_zero), ret_abs), ret_abs); +} +#endif /* __ICC >= 1400 */ + + +static FORCEINLINE __vec16_i64 __or (__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_or_epi64 (a.v1, b.v1), _mm512_or_epi64 (a.v2, b.v2)); } +static FORCEINLINE __vec16_i64 __and(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_and_epi64(a.v1, b.v1), _mm512_and_epi64(a.v2, b.v2)); } +static FORCEINLINE __vec16_i64 __xor(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_xor_epi64(a.v1, b.v1), _mm512_xor_epi64(a.v2, b.v2)); } + +static FORCEINLINE __vec16_i64 __udiv(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_div_epu64(a.v1,b.v1), _mm512_div_epu64(a.v2,b.v2)); } +static FORCEINLINE __vec16_i64 __sdiv(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_div_epi64(a.v1,b.v1), 
_mm512_div_epi64(a.v2,b.v2)); } + +static FORCEINLINE __vec16_i64 __urem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epu64(a.v1,b.v1), _mm512_rem_epu64(a.v2,b.v2)); } +static FORCEINLINE __vec16_i64 __srem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epi64(a.v1,b.v1), _mm512_rem_epi64(a.v2,b.v2)); } + + +static FORCEINLINE __vec16_i64 __shl(__vec16_i64 _a, __vec16_i64 _b) +{ + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + /* this is a safety gate in case b-shift >= 32 */ + const __vec16_i32 xfer = __select( + __signed_less_than_i32(b.v_lo, __ispc_thirty_two), + __lshr(a.v_lo, __sub(__ispc_thirty_two, b.v_lo)), + __shl (a.v_lo, __sub(b.v_lo, __ispc_thirty_two)) + ); + const __vec16_i32 hi = __or(__shl(a.v_hi, b.v_lo), xfer); + const __vec16_i32 lo = __shl(a.v_lo, b.v_lo); + return __vec16_i64(hi,lo).cvt2zmm(); +} +static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b) +{ + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + /* this is a safety gate in case b-shift >= 32 */ + const __vec16_i32 xfer = __select( + __signed_less_than_i32(b.v_lo, __ispc_thirty_two), + __shl (a.v_hi, __sub(__ispc_thirty_two, b.v_lo)), + __lshr(a.v_hi, __sub(b.v_lo, __ispc_thirty_two)) + ); + const __vec16_i32 lo = __or(__lshr(a.v_lo, b.v_lo), xfer); + const __vec16_i32 hi = __lshr(a.v_hi, b.v_lo); + return __vec16_i64(hi,lo).cvt2zmm(); +} +static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 _a, __vec16_i64 _b) +{ + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + /* this is a safety gate in case b-shift >= 32 */ + const __vec16_i32 xfer = __select( + __signed_less_than_i32(b.v_lo, __ispc_thirty_two), + __shl (a.v_hi, __sub(__ispc_thirty_two, b.v_lo)), + __ashr(a.v_hi, __sub(b.v_lo, __ispc_thirty_two)) + ); + const __vec16_i32 lo = __or(__lshr(a.v_lo, b.v_lo), xfer); + const __vec16_i32 hi = __ashr(a.v_hi, b.v_lo); + return __vec16_i64(hi,lo).cvt2zmm(); +} + +template RetVecType __smear_i64(const int64_t &l); +template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) { return __vec16_i64(_mm512_set1_epi64(l), _mm512_set1_epi64(l)); } + +template RetVecType __setzero_i64(); +template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { return __vec16_i64(_mm512_setzero_epi32(), _mm512_setzero_epi32()); } + +template RetVecType __undef_i64(); +template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { return __vec16_i64(_mm512_undefined_epi32(), _mm512_undefined_epi32()); } + +static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 a, uint64_t shift) { return __lshr(a, __smear_i64<__vec16_i64>(shift)); } +static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 a, int64_t shift) { return __ashr(a, __smear_i64<__vec16_i64>(shift)); } +static FORCEINLINE __vec16_i64 __shl (__vec16_i64 a, int64_t shift) { return __shl (a, __smear_i64<__vec16_i64>(shift)); } + +static FORCEINLINE __vec16_i1 __equal_i64(__vec16_i64 _a, __vec16_i64 _b) +{ + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + const __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo); + return _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); +} +static FORCEINLINE __vec16_i1 __equal_i64_and_mask(__vec16_i64 _a, __vec16_i64 _b, __vec16_i1 mask) +{ + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo); + __mmask16 full_match = _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); + 
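/* A 64-bit lane compares equal only when both of its 32-bit halves match: the low-half equality mask gates the masked high-half compare above, and the combined result is then ANDed with the caller's execution mask below. */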
return _mm512_kand(full_match, (__mmask16)mask); +} +static FORCEINLINE __vec16_i1 __not_equal_i64(__vec16_i64 a, __vec16_i64 b) +{ + return __not(__equal_i64(a,b)); +} +static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(__vec16_i64 a, __vec16_i64 b, __vec16_i1 mask) +{ + return __and(__not(__equal_i64(a,b)), mask); +} +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i64, i64, int64_t, __signed_less_equal, <=) +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_than, <) +CMP_OP(__vec16_i64, i64, int64_t, __signed_less_than, <) +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >) + + +INSERT_EXTRACT(__vec16_i64, int64_t) + + +#define CASTL2I(_v_, _v_hi_, _v_lo_) \ + __vec16_i32 _v_hi_, _v_lo_; \ + { \ + const __vec16_i64 v = _v_.cvt2hilo(); \ + _v_hi_ = v.v_hi; \ + _v_lo_ = v.v_lo; } +#define CASTI2L(_ret_hi_, _ret_lo_) \ + __vec16_i64(_ret_hi_, _ret_lo_).cvt2zmm() +static FORCEINLINE __vec16_i64 __broadcast_i64(__vec16_i64 _v, int index) +{ + CASTL2I(_v, v_hi, v_lo); + const __vec16_i32 ret_hi = __broadcast_i32(v_hi, index); + const __vec16_i32 ret_lo = __broadcast_i32(v_lo, index); + return CASTI2L(ret_hi, ret_lo); +} +static FORCEINLINE __vec16_i64 __rotate_i64(const __vec16_i64 _v, const int index) +{ + CASTL2I(_v, v_hi, v_lo); + const __vec16_i32 ret_hi = __rotate_i32(v_hi, index); + const __vec16_i32 ret_lo = __rotate_i32(v_lo, index); + return CASTI2L(ret_hi, ret_lo); +} +static FORCEINLINE __vec16_i64 __shuffle_double(__vec16_i64 _v, const __vec16_i32 index) +{ + CASTL2I(_v, v_hi, v_lo); + const __vec16_i32 ret_hi = __shuffle_i32(v_hi, index); + const __vec16_i32 ret_lo = __shuffle_i32(v_lo, index); + return CASTI2L(ret_hi, ret_lo); +} +static FORCEINLINE __vec16_i64 __shuffle2_double(__vec16_i64 _v0, __vec16_i64 _v1, const __vec16_i32 index) +{ + CASTL2I(_v0, v0_hi, v0_lo); + CASTL2I(_v1, v1_hi, v1_lo); + const __vec16_i32 ret_hi = __shuffle2_i32(v0_hi, v1_hi, index); + const __vec16_i32 ret_lo = __shuffle2_i32(v0_lo, v1_lo, index); + return CASTI2L(ret_hi, ret_lo); +} +#undef CASTI2L +#undef CASTL2I + +template static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED + return __load<128>(p); +#else + __vec16_i32 v1; + __vec16_i32 v2; + v2 = _mm512_extloadunpacklo_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v2 = _mm512_extloadunpackhi_epi32(v2, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v1 = _mm512_extloadunpacklo_epi32(v1, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v1 = _mm512_extloadunpackhi_epi32(v1, (uint8_t*)p+128, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + return __vec16_i64(v2,v1); +#endif +} + + +template static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64 v) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED + return __store<128>(p,v); +#else + __m512i v1 = v.v2; + __m512i v2 = v.v1; + _mm512_extpackstorelo_epi32(p, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_epi32((uint8_t*)p+64, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+128, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +#endif +} + +#if 0 /* knc::fails as with _i32 this may generate fails ... 
so commetining it out */ +template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) +{ + __m512i v2 = _mm512_load_epi32(p); + __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64); + return __vec16_i64(v2,v1); +} +template <> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); } +template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) +{ + __m512i v1 = v.v2; + __m512i v2 = v.v1; + _mm512_store_epi64(p, v2); + _mm512_store_epi64(((uint8_t*)p)+64, v1); +} +template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); } +#endif + + +/////////////////////////////////////////////////////////////////////////// +// float +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_f __add(__vec16_f a, __vec16_f b) { return _mm512_add_ps(a,b); } +static FORCEINLINE __vec16_f __sub(__vec16_f a, __vec16_f b) { return _mm512_sub_ps(a,b); } +static FORCEINLINE __vec16_f __mul(__vec16_f a, __vec16_f b) { return _mm512_mul_ps(a,b); } +static FORCEINLINE __vec16_f __div(__vec16_f a, __vec16_f b) { return _mm512_div_ps(a,b); } + +static FORCEINLINE __vec16_i1 __equal_float (__vec16_f a, __vec16_f b) { return _mm512_cmpeq_ps_mask (a,b); } +static FORCEINLINE __vec16_i1 __not_equal_float (__vec16_f a, __vec16_f b) { return _mm512_cmpneq_ps_mask(a,b); } +static FORCEINLINE __vec16_i1 __less_than_float (__vec16_f a, __vec16_f b) { return _mm512_cmplt_ps_mask (a,b); } +static FORCEINLINE __vec16_i1 __less_equal_float (__vec16_f a, __vec16_f b) { return _mm512_cmple_ps_mask (a,b); } +static FORCEINLINE __vec16_i1 __greater_than_float (__vec16_f a, __vec16_f b) { return _mm512_cmp_ps_mask (a,b,_CMP_GT_OS); } +static FORCEINLINE __vec16_i1 __greater_equal_float(__vec16_f a, __vec16_f b) { return _mm512_cmp_ps_mask (a,b,_CMP_GE_OS); } + +static FORCEINLINE __vec16_i1 __equal_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmpeq_ps_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __not_equal_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmpneq_ps_mask(m,a,b); } +static FORCEINLINE __vec16_i1 __less_than_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmplt_ps_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __less_equal_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmple_ps_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __greater_than_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmp_ps_mask (m,a,b,_CMP_GT_OS); } +static FORCEINLINE __vec16_i1 __greater_equal_float_and_mask(__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmp_ps_mask (m,a,b,_CMP_GE_OS); } + +static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) { return _mm512_cmpord_ps_mask (a,b); } +static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) { return _mm512_cmpunord_ps_mask(a,b); } + +static FORCEINLINE __vec16_f __select(__vec16_i1 mask, __vec16_f a, __vec16_f b) { return _mm512_mask_mov_ps(b, mask, a); } +static FORCEINLINE __vec16_f __select( bool cond, __vec16_f a, __vec16_f b) { return cond ? 
a : b; } + +static FORCEINLINE float __extract_element(__vec16_f v, uint32_t index) { return v[index]; } +static FORCEINLINE void __insert_element(__vec16_f *v, uint32_t index, float val) { (*v)[index] = val; } + +template RetVecType __smear_float(float f); +template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { return _mm512_set_1to16_ps(f); } + +template RetVecType __setzero_float(); +template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { return _mm512_setzero_ps(); } + +template RetVecType __undef_float(); +template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() { return __vec16_f(); } + +static FORCEINLINE __vec16_f __broadcast_float(__vec16_f _v, int index) +{ + const __vec16_i32 v = _mm512_castps_si512(_v); + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(v, 0xFFFF, _mm512_set1_epi32(index), v)); +} + +static FORCEINLINE __vec16_f __rotate_float(__vec16_f _v, int index) +{ + const __vec16_i32 v = _mm512_castps_si512(_v); + const __vec16_i32 idx = __smear_i32<__vec16_i32>(index); + const __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0xF)); + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(v, 0xFFFF, shuffle, v)); +} +static FORCEINLINE __vec16_f __shuffle_float(__vec16_f v, __vec16_i32 index) +{ + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v))); +} +static FORCEINLINE __vec16_f __shuffle2_float(__vec16_f _v0, __vec16_f _v1, __vec16_i32 index) +{ + const __vec16_i32 v0 = _mm512_castps_si512(_v0); + const __vec16_i32 v1 = _mm512_castps_si512(_v1); + const __vec16_i1 mask = __signed_less_than_i32(index, __smear_i32<__vec16_i32>(0x10)); + index = __and(index, __smear_i32<__vec16_i32>(0xF)); + __vec16_i32 ret = __undef_i32<__vec16_i32>(); + ret = _mm512_mask_permutevar_epi32(ret, mask, index, v0); + ret = _mm512_mask_permutevar_epi32(ret, __not(mask), index, v1); + return _mm512_castsi512_ps(ret); +} + +template static FORCEINLINE __vec16_f __load(const __vec16_f *p) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED + return __load<64>(p); +#else + __vec16_f v; + v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + v = _mm512_extloadunpackhi_ps(v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + return v; +#endif +} + +template static FORCEINLINE void __store(__vec16_f *p, __vec16_f v) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED + __store<64>(p,v); +#else + _mm512_extpackstorelo_ps( p, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_ps((uint8_t*)p+64, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); +#endif +} + +#if 0 /* knc::fails ./tests/gs-improve-progindex.ispc with segfault */ +template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) +{ + return _mm512_load_ps(p); +} +/* this one doesn't fail but it is commented out for completeness, no aligned load/stores */ +template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) +{ + _mm512_store_ps(p, v); +} +#endif + +/******** math ******/ + +/*** float ***/ +static FORCEINLINE float __exp_uniform_float(float v) { return expf(v);} +static FORCEINLINE __vec16_f __exp_varying_float(__vec16_f v) { return _mm512_exp_ps(v); } + +static FORCEINLINE float __log_uniform_float(float v) { return logf(v);} +static FORCEINLINE __vec16_f __log_varying_float(__vec16_f v) { return _mm512_log_ps(v); } + +static FORCEINLINE float __pow_uniform_float(float a, float b) { 
return powf(a, b);} +static FORCEINLINE __vec16_f __pow_varying_float(__vec16_f a, __vec16_f b) { return _mm512_pow_ps(a,b); } + +/*** double ***/ +static FORCEINLINE double __exp_uniform_double(double v) { return exp(v);} +static FORCEINLINE __vec16_d __exp_varying_double(__vec16_d v) { return __vec16_d(_mm512_exp_pd(v.v1),_mm512_exp_pd(v.v2)); } + +static FORCEINLINE double __log_uniform_double(double v) { return log(v);} +static FORCEINLINE __vec16_d __log_varying_double(__vec16_d v) { return __vec16_d(_mm512_log_pd(v.v1),_mm512_log_pd(v.v2)); } + +static FORCEINLINE double __pow_uniform_double(double a, double b) { return pow(a,b);} +static FORCEINLINE __vec16_d __pow_varying_double(__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_pow_pd(a.v1,b.v1),_mm512_pow_pd(a.v2,b.v2)); } + +/******** bitcast ******/ + +static FORCEINLINE int __intbits(float v) { + union { + float f; + int i; + } u; + u.f = v; + return u.i; +} + +static FORCEINLINE float __floatbits(int v) { + union { + float f; + int i; + } u; + u.i = v; + return u.f; +} + +/////////////////////////////////////////////////////////////////////////// +// half<->float : this one passes the tests +// source : +// http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion +/////////////////////////////////////////////////////////////////////////// +class Float16Compressor +{ + union Bits + { + float f; + int32_t si; + uint32_t ui; + }; + + static int const shift = 13; + static int const shiftSign = 16; + + static int32_t const infN = 0x7F800000; // flt32 infinity + static int32_t const maxN = 0x477FE000; // max flt16 normal as a flt32 + static int32_t const minN = 0x38800000; // min flt16 normal as a flt32 + static int32_t const signN = 0x80000000; // flt32 sign bit + + static int32_t const infC = infN >> shift; + static int32_t const nanN = (infC + 1) << shift; // minimum flt16 nan as a flt32 + static int32_t const maxC = maxN >> shift; + static int32_t const minC = minN >> shift; + static int32_t const signC = signN >> shiftSign; // flt16 sign bit + + static int32_t const mulN = 0x52000000; // (1 << 23) / minN + static int32_t const mulC = 0x33800000; // minN / (1 << (23 - shift)) + + static int32_t const subC = 0x003FF; // max flt32 subnormal down shifted + static int32_t const norC = 0x00400; // min flt32 normal down shifted + + static int32_t const maxD = infC - maxC - 1; + static int32_t const minD = minC - subC - 1; + + public: + + static uint16_t compress(float value) + { + Bits v, s; + v.f = value; + uint32_t sign = v.si & signN; + v.si ^= sign; + sign >>= shiftSign; // logical shift + s.si = mulN; + s.si = s.f * v.f; // correct subnormals + v.si ^= (s.si ^ v.si) & -(minN > v.si); + v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); + v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); + v.ui >>= shift; // logical shift + v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); + v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC); + return v.ui | sign; + } + + static float decompress(uint16_t value) + { + Bits v; + v.ui = value; + int32_t sign = v.si & signC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + } +}; + +static FORCEINLINE float __half_to_float_uniform(int16_t h) +{ + return Float16Compressor::decompress(h); +} +static FORCEINLINE __vec16_f 
__half_to_float_varying(__vec16_i16 v) +{ + __vec16_f ret; + for (int i = 0; i < 16; ++i) + ret[i] = __half_to_float_uniform(v[i]); + return ret; +} + + +static FORCEINLINE int16_t __float_to_half_uniform(float f) +{ + return Float16Compressor::compress(f); +} +static FORCEINLINE __vec16_i16 __float_to_half_varying(__vec16_f v) +{ + __vec16_i16 ret; + for (int i = 0; i < 16; ++i) + ret[i] = __float_to_half_uniform(v[i]); + return ret; +} + + +/////////////////////////////////////////////////////////////////////////// +// double +/////////////////////////////////////////////////////////////////////////// + +#define VECOP(OP) __vec16_d(_mm512_##OP(a.v1,b.v1),_mm512_##OP(a.v2,b.v2)) +static FORCEINLINE __vec16_d __add(__vec16_d a, __vec16_d b) { return VECOP(add_pd); } +static FORCEINLINE __vec16_d __sub(__vec16_d a, __vec16_d b) { return VECOP(sub_pd); } +static FORCEINLINE __vec16_d __mul(__vec16_d a, __vec16_d b) { return VECOP(mul_pd); } +static FORCEINLINE __vec16_d __div(__vec16_d a, __vec16_d b) { return VECOP(div_pd); } +#undef VECOP + +#define CMPOP(OP) _mm512_kmovlhb(_mm512_##OP(a.v1,b.v1),_mm512_##OP(a.v2,b.v2)) +static FORCEINLINE __vec16_i1 __equal_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpeq_pd_mask); } +static FORCEINLINE __vec16_i1 __not_equal_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpneq_pd_mask); } +static FORCEINLINE __vec16_i1 __less_than_double (__vec16_d a, __vec16_d b) { return CMPOP(cmplt_pd_mask); } +static FORCEINLINE __vec16_i1 __less_equal_double (__vec16_d a, __vec16_d b) { return CMPOP(cmple_pd_mask); } +static FORCEINLINE __vec16_i1 __greater_than_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpnle_pd_mask); } +static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) { return CMPOP(cmpnlt_pd_mask); } +static FORCEINLINE __vec16_i1 __ordered_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpord_pd_mask); } +static FORCEINLINE __vec16_i1 __unordered_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpunord_pd_mask); } +#undef CMPOP + +#define CMPOPMASK(OP) _mm512_kmovlhb(_mm512_mask_##OP(m,a.v1,b.v1),_mm512_mask_##OP(_mm512_kswapb(m,m),a.v2,b.v2)) +static FORCEINLINE __vec16_i1 __equal_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmpeq_pd_mask); } +static FORCEINLINE __vec16_i1 __not_equal_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmpneq_pd_mask); } +static FORCEINLINE __vec16_i1 __less_than_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmplt_pd_mask); } +static FORCEINLINE __vec16_i1 __less_equal_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmple_pd_mask); } +static FORCEINLINE __vec16_i1 __greater_than_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmpnle_pd_mask); } +static FORCEINLINE __vec16_i1 __greater_equal_double_and_mask(__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmpnlt_pd_mask); } +#undef CMOPMASK + + +static FORCEINLINE __vec16_d __select(__vec16_i1 m, __vec16_d a, __vec16_d b) +{ + return __vec16_d(_mm512_mask_mov_pd(b.v1, m, a.v1), _mm512_mask_mov_pd(b.v2, _mm512_kswapb(m, m), a.v2)); +} +static FORCEINLINE __vec16_d __select(bool cond, __vec16_d a, __vec16_d b) +{ + return cond ? 
a : b; +} + +static FORCEINLINE double __extract_element(__vec16_d v, uint32_t index) { return v[index]; } +static FORCEINLINE void __insert_element(__vec16_d *v, uint32_t index, double val) { (*v)[index] = val; } + +template RetVecType __smear_double(double d); +template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { return __vec16_d(_mm512_set1_pd(d), _mm512_set1_pd(d)); } + +template RetVecType __setzero_double(); +template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { return __vec16_d(_mm512_setzero_pd(), _mm512_setzero_pd()); } + +template RetVecType __undef_double(); +template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() { return __vec16_d(); } + +#define CASTD2F(_v_, _v_hi_, _v_lo_) \ + __vec16_f _v_hi_, _v_lo_; \ + { \ + const __vec16_d v = _v_.cvt2hilo(); \ + _v_hi_ = _mm512_castpd_ps(v.v_hi); \ + _v_lo_ = _mm512_castpd_ps(v.v_lo); } +#define CASTF2D(_ret_hi_, _ret_lo_) \ + __vec16_d(_mm512_castps_pd(_ret_hi_), _mm512_castps_pd(_ret_lo_)).cvt2zmm() +static FORCEINLINE __vec16_d __broadcast_double(__vec16_d _v, int index) +{ + CASTD2F(_v, v_hi, v_lo); + const __vec16_f ret_hi = __broadcast_float(v_hi, index); + const __vec16_f ret_lo = __broadcast_float(v_lo, index); + return CASTF2D(ret_hi, ret_lo); +} +static FORCEINLINE __vec16_d __rotate_double(const __vec16_d _v, const int index) +{ + CASTD2F(_v, v_hi, v_lo); + const __vec16_f ret_hi = __rotate_float(v_hi, index); + const __vec16_f ret_lo = __rotate_float(v_lo, index); + return CASTF2D(ret_hi, ret_lo); +} +static FORCEINLINE __vec16_d __shuffle_double(__vec16_d _v, const __vec16_i32 index) +{ + CASTD2F(_v, v_hi, v_lo); + const __vec16_f ret_hi = __shuffle_float(v_hi, index); + const __vec16_f ret_lo = __shuffle_float(v_lo, index); + return CASTF2D(ret_hi, ret_lo); +} +static FORCEINLINE __vec16_d __shuffle2_double(__vec16_d _v0, __vec16_d _v1, const __vec16_i32 index) +{ + CASTD2F(_v0, v0_hi, v0_lo); + CASTD2F(_v1, v1_hi, v1_lo); + const __vec16_f ret_hi = __shuffle2_float(v0_hi, v1_hi, index); + const __vec16_f ret_lo = __shuffle2_float(v0_lo, v1_lo, index); + return CASTF2D(ret_hi, ret_lo); +} +#undef CASTF2D +#undef CASTD2F + +template static FORCEINLINE __vec16_d __load(const __vec16_d *p) \ +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED + return __load<128>(p); +#else + __vec16_d ret; + ret.v1 = _mm512_extloadunpacklo_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v1 = _mm512_extloadunpackhi_pd(ret.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v2 = _mm512_extloadunpacklo_pd(ret.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v2 = _mm512_extloadunpackhi_pd(ret.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + return ret; +#endif +} + +template static FORCEINLINE void __store(__vec16_d *p, __vec16_d v) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED + return __store<128>(p,v); +#else + _mm512_extpackstorelo_pd(p, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_pd((uint8_t*)p+64, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+128, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); +#endif +} + + +#if 0 /* knc::fails as with _f this may generate fails ... 
so commetining it out */ +template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) +{ + return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64)); +} +template <> static FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) +{ + _mm512_store_pd(p, v.v1); + _mm512_store_pd(((uint8_t*)p)+64, v.v2); +} +template <> static FORCEINLINE __vec16_d __load <128>(const __vec16_d *p) { return __load<64>(p); } +template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { __store<64>(p, v); } +#endif + +/////////////////////////////////////////////////////////////////////////// +// casts +/////////////////////////////////////////////////////////////////////////// + + +/* knc::macro::used */ +#define CAST(TO, STO, FROM, SFROM, FUNC) \ +static FORCEINLINE TO FUNC(TO, FROM val) { \ + TO ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = (STO)((SFROM)(val[i])); \ + return ret; \ +} + +// sign extension conversions + +// CAST(__vec16_i64, int64_t, __vec16_i32, int32_t, __cast_sext) +static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val) +{ + return __vec16_i64(_mm512_srai_epi32(val.v,31), val.v).cvt2zmm(); +} +CAST(__vec16_i64, int64_t, __vec16_i16, int16_t, __cast_sext) +CAST(__vec16_i64, int64_t, __vec16_i8, int8_t, __cast_sext) +CAST(__vec16_i32, int32_t, __vec16_i16, int16_t, __cast_sext) +CAST(__vec16_i32, int32_t, __vec16_i8, int8_t, __cast_sext) +CAST(__vec16_i16, int16_t, __vec16_i8, int8_t, __cast_sext) + +/* knc::macro::used */ +#define CAST_SEXT_I1(TYPE) \ +static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) { \ + ret[i] = 0; \ + if (v.v & (1 << i)) \ + ret[i] = ~ret[i]; \ + } \ + return ret; \ +} + +CAST_SEXT_I1(__vec16_i8) +CAST_SEXT_I1(__vec16_i16) + +//CAST_SEXT_I1(__vec16_i32) +static FORCEINLINE __vec16_i32 __cast_sext(const __vec16_i32 &, const __vec16_i1 &val) +{ + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(-1); + return _mm512_mask_mov_epi32(ret, val, one); +} + +CAST_SEXT_I1(__vec16_i64) + +// zero extension +// CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext) +static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i32 &val) +{ + return __vec16_i64(_mm512_setzero_epi32(), val.v).cvt2zmm(); +} + +CAST(__vec16_i64, uint64_t, __vec16_i16, uint16_t, __cast_zext) +CAST(__vec16_i64, uint64_t, __vec16_i8, uint8_t, __cast_zext) +CAST(__vec16_i32, uint32_t, __vec16_i16, uint16_t, __cast_zext) +CAST(__vec16_i32, uint32_t, __vec16_i8, uint8_t, __cast_zext) +CAST(__vec16_i16, uint16_t, __vec16_i8, uint8_t, __cast_zext) + +/* knc::macro::used */ +#define CAST_ZEXT_I1(TYPE) \ +static FORCEINLINE TYPE __cast_zext(TYPE, __vec16_i1 v) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = (v.v & (1 << i)) ? 
1 : 0; \ + return ret; \ +} + +CAST_ZEXT_I1(__vec16_i8) +CAST_ZEXT_I1(__vec16_i16) + +//CAST_ZEXT_I1(__vec16_i32) +static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val) +{ + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(1); + return _mm512_mask_mov_epi32(ret, val, one); +} + +CAST_ZEXT_I1(__vec16_i64) + +// truncations +CAST(__vec16_i32, int32_t, __vec16_i64, int64_t, __cast_trunc) +CAST(__vec16_i16, int16_t, __vec16_i64, int64_t, __cast_trunc) +CAST(__vec16_i8, int8_t, __vec16_i64, int64_t, __cast_trunc) +CAST(__vec16_i16, int16_t, __vec16_i32, int32_t, __cast_trunc) +CAST(__vec16_i8, int8_t, __vec16_i32, int32_t, __cast_trunc) +CAST(__vec16_i8, int8_t, __vec16_i16, int16_t, __cast_trunc) + +// signed int to float/double + +//CAST(__vec16_f, float, __vec16_i8, int8_t, __cast_sitofp) +static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i8 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +//CAST(__vec16_f, float, __vec16_i16, int16_t, __cast_sitofp) +static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i16 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +//CAST(__vec16_f, float, __vec16_i32, int32_t, __cast_sitofp) +static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i32 val) {return _mm512_cvtfxpnt_round_adjustepi32_ps(val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} + +CAST(__vec16_f, float, __vec16_i64, int64_t, __cast_sitofp) + +//CAST(__vec16_d, double, __vec16_i8, int8_t, __cast_sitofp) +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i8 val) +{ + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; +} + +// CAST(__vec16_d, double, __vec16_i16, int16_t, __cast_sitofp) +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i16 val) +{ + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; +} + +// CAST(__vec16_d, double, __vec16_i32, int32_t, __cast_sitofp) +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i32 val) +{ + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(val); + __vec16_i32 other8 = _mm512_permute4f128_epi32(val, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; +} + +CAST(__vec16_d, double, __vec16_i64, int64_t, __cast_sitofp) + +// unsigned int to float/double + +// CAST(__vec16_f, float, __vec16_i8, uint8_t, __cast_uitofp) +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i8 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +//CAST(__vec16_f, float, __vec16_i16, uint16_t, __cast_uitofp) +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i16 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +//CAST(__vec16_f, float, __vec16_i32, uint32_t, __cast_uitofp) +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i32 val) {return _mm512_cvtfxpnt_round_adjustepu32_ps(val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} + +CAST(__vec16_f, float, __vec16_i64, 
uint64_t, __cast_uitofp) + +// CAST(__vec16_d, double, __vec16_i8, uint8_t, __cast_uitofp) +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i8 val) +{ + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; +} + +// CAST(__vec16_d, double, __vec16_i16, uint16_t, __cast_uitofp) +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i16 val) +{ + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; +} + +// CAST(__vec16_d, double, __vec16_i32, uint32_t, __cast_uitofp) +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i32 val) +{ + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(val); + __vec16_i32 other8 = _mm512_permute4f128_epi32(val, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; +} + +CAST(__vec16_d, double, __vec16_i64, uint64_t, __cast_uitofp) + +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i1 v) +{ + const __m512 ret = _mm512_setzero_ps(); + const __m512 one = _mm512_set1_ps(1.0); + return _mm512_mask_mov_ps(ret, v, one); +} + +// float/double to signed int +CAST(__vec16_i8, int8_t, __vec16_f, float, __cast_fptosi) +CAST(__vec16_i16, int16_t, __vec16_f, float, __cast_fptosi) + +// CAST(__vec16_i32, int32_t, __vec16_f, float, __cast_fptosi) +static FORCEINLINE __vec16_i32 __cast_fptosi(__vec16_i32, __vec16_f val) +{ + return _mm512_cvtfxpnt_round_adjustps_epi32(val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); +} + +CAST(__vec16_i64, int64_t, __vec16_f, float, __cast_fptosi) +CAST(__vec16_i8, int8_t, __vec16_d, double, __cast_fptosi) +CAST(__vec16_i16, int16_t, __vec16_d, double, __cast_fptosi) +#if 0 /* knc::2implement */ +#else +CAST(__vec16_i32, int32_t, __vec16_d, double, __cast_fptosi) +#endif +CAST(__vec16_i64, int64_t, __vec16_d, double, __cast_fptosi) + +// float/double to unsigned int +CAST(__vec16_i8, uint8_t, __vec16_f, float, __cast_fptoui) +CAST(__vec16_i16, uint16_t, __vec16_f, float, __cast_fptoui) + +// CAST(__vec16_i32, uint32_t, __vec16_f, float, __cast_fptoui) +static FORCEINLINE __vec16_i32 __cast_fptoui(__vec16_i32, __vec16_f val) +{ + return _mm512_cvtfxpnt_round_adjustps_epu32(val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); +} + +CAST(__vec16_i64, uint64_t, __vec16_f, float, __cast_fptoui) +CAST(__vec16_i8, uint8_t, __vec16_d, double, __cast_fptoui) +CAST(__vec16_i16, uint16_t, __vec16_d, double, __cast_fptoui) +#if 0 /* knc::2implement */ +#else +CAST(__vec16_i32, uint32_t, __vec16_d, double, __cast_fptoui) +#endif +CAST(__vec16_i64, uint64_t, __vec16_d, double, __cast_fptoui) + +// float/double conversions + +// CAST(__vec16_f, float, __vec16_d, double, __cast_fptrunc) +static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) +{ + __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1)); + __m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2)); + + return _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(r0i, 0xFF00, r1i, _MM_PERM_BABA)); +} + +// CAST(__vec16_d, double, __vec16_f, float, __cast_fpext) +static FORCEINLINE __vec16_d __cast_fpext(__vec16_d, __vec16_f val) +{ + __vec16_d ret; + ret.v1 = 
_mm512_cvtpslo_pd(val.v);
+    __vec16_f other8 = _mm512_castsi512_ps(_mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC));
+    ret.v2 = _mm512_cvtpslo_pd(other8);
+    return ret;
+}
+
+typedef union {
+    int32_t i32;
+    float f;
+    int64_t i64;
+    double d;
+} BitcastUnion;
+
+/* knc::macro::not used */
+#define CAST_BITS(TO, TO_ELT, FROM, FROM_ELT) \
+static FORCEINLINE TO __cast_bits(TO, FROM val) { \
+    TO r; \
+    for (int i = 0; i < 16; ++i) { \
+        BitcastUnion u; \
+        u.FROM_ELT = val[i]; \
+        r[i] = u.TO_ELT; \
+    } \
+    return r; \
+}
+
+// CAST_BITS(__vec16_f, f, __vec16_i32, i32)
+static FORCEINLINE __vec16_f __cast_bits(__vec16_f, __vec16_i32 val) { return _mm512_castsi512_ps(val); }
+// CAST_BITS(__vec16_i32, i32, __vec16_f, f)
+static FORCEINLINE __vec16_i32 __cast_bits(__vec16_i32, __vec16_f val) { return _mm512_castps_si512(val); }
+
+// CAST_BITS(__vec16_d, d, __vec16_i64, i64)
+static FORCEINLINE __vec16_i64 __cast_bits(__vec16_i64, __vec16_d val) { return *(__vec16_i64*)&val; }
+// CAST_BITS(__vec16_i64, i64, __vec16_d, d)
+static FORCEINLINE __vec16_d __cast_bits(__vec16_d, __vec16_i64 val) { return *(__vec16_d*)&val; }
+
+/* knc::macro::used */
+#define CAST_BITS_SCALAR(TO, FROM) \
+static FORCEINLINE TO __cast_bits(TO, FROM v) { \
+    union { \
+    TO to; \
+    FROM from; \
+    } u; \
+    u.from = v; \
+    return u.to; \
+}
+
+CAST_BITS_SCALAR(uint32_t, float)
+CAST_BITS_SCALAR(int32_t, float)
+CAST_BITS_SCALAR(float, uint32_t)
+CAST_BITS_SCALAR(float, int32_t)
+CAST_BITS_SCALAR(uint64_t, double)
+CAST_BITS_SCALAR(int64_t, double)
+CAST_BITS_SCALAR(double, uint64_t)
+CAST_BITS_SCALAR(double, int64_t)
+
+///////////////////////////////////////////////////////////////////////////
+// various math functions
+///////////////////////////////////////////////////////////////////////////
+
+static FORCEINLINE void __fastmath() {
+}
+
+static FORCEINLINE float __round_uniform_float(float v) {
+    return roundf(v);
+}
+
+static FORCEINLINE float __floor_uniform_float(float v) {
+    return floorf(v);
+}
+
+static FORCEINLINE float __ceil_uniform_float(float v) {
+    return ceilf(v);
+}
+
+static FORCEINLINE double __round_uniform_double(double v) {
+    return round(v);
+}
+
+static FORCEINLINE double __floor_uniform_double(double v) {
+    return floor(v);
+}
+
+static FORCEINLINE double __ceil_uniform_double(double v) {
+    return ceil(v);
+}
+
+static FORCEINLINE __vec16_f __round_varying_float(__vec16_f v) { return _mm512_round_ps(v, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE); }
+static FORCEINLINE __vec16_f __floor_varying_float(__vec16_f v) { return _mm512_floor_ps(v); }
+static FORCEINLINE __vec16_f __ceil_varying_float(__vec16_f v) { return _mm512_ceil_ps(v); }
+
+static FORCEINLINE __vec16_d __round_varying_float(__vec16_d v) { return __vec16_d(_mm512_svml_round_pd(v.v1), _mm512_svml_round_pd(v.v2)); }
+static FORCEINLINE __vec16_d __floor_varying_float(__vec16_d v) { return __vec16_d(_mm512_floor_pd(v.v1), _mm512_floor_pd(v.v2)); }
+static FORCEINLINE __vec16_d __ceil_varying_float(__vec16_d v) { return __vec16_d(_mm512_ceil_pd(v.v1), _mm512_ceil_pd(v.v2)); }
+
+// min/max
+
+static FORCEINLINE float  __min_uniform_float (float a, float b) { return (a<b) ? a : b; }
+static FORCEINLINE float  __max_uniform_float (float a, float b) { return (a>b) ? a : b; }
+static FORCEINLINE double __min_uniform_double(double a, double b) { return (a<b) ? a : b; }
+static FORCEINLINE double __max_uniform_double(double a, double b) { return (a>b) ? a : b; }
+
+static FORCEINLINE int32_t __min_uniform_int32 ( int32_t a,  int32_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int32_t __max_uniform_int32 ( int32_t a,  int32_t b) { return (a>b) ? a : b; }
+static FORCEINLINE int32_t __min_uniform_uint32(uint32_t a, uint32_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int32_t __max_uniform_uint32(uint32_t a, uint32_t b) { return (a>b) ? a : b; }
+
+static FORCEINLINE int64_t __min_uniform_int64 ( int64_t a,  int64_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int64_t __max_uniform_int64 ( int64_t a,  int64_t b) { return (a>b) ? a : b; }
+static FORCEINLINE int64_t __min_uniform_uint64(uint64_t a, uint64_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int64_t __max_uniform_uint64(uint64_t a, uint64_t b) { return (a>b) ? a : b; }
+
+static FORCEINLINE __vec16_f __max_varying_float (__vec16_f v1, __vec16_f v2) { return _mm512_gmax_ps(v1, v2);}
+static FORCEINLINE __vec16_f __min_varying_float (__vec16_f v1, __vec16_f v2) { return _mm512_gmin_ps(v1, v2);}
+static FORCEINLINE __vec16_d __max_varying_double(__vec16_d v1, __vec16_d v2) { return __vec16_d(_mm512_gmax_pd(v1.v1, v2.v1),_mm512_gmax_pd(v1.v2,v2.v2));}
+static FORCEINLINE __vec16_d __min_varying_double(__vec16_d v1, __vec16_d v2) { return __vec16_d(_mm512_gmin_pd(v1.v1, v2.v1),_mm512_gmin_pd(v1.v2,v2.v2));}
+
+static FORCEINLINE __vec16_i32 __max_varying_int32 (__vec16_i32 v1, __vec16_i32 v2) { return _mm512_max_epi32(v1, v2);}
+static FORCEINLINE __vec16_i32 __min_varying_int32 (__vec16_i32 v1, __vec16_i32 v2) { return _mm512_min_epi32(v1, v2);}
+static FORCEINLINE __vec16_i32 __max_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) { return _mm512_max_epu32(v1, v2);}
+static FORCEINLINE __vec16_i32 __min_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) { return _mm512_min_epu32(v1, v2);}
+
+BINARY_OP_FUNC(__vec16_i64, __max_varying_int64, __max_uniform_int64)
+BINARY_OP_FUNC(__vec16_i64, __min_varying_int64, __min_uniform_int64)
+BINARY_OP_FUNC(__vec16_i64, __max_varying_uint64, __max_uniform_uint64)
+BINARY_OP_FUNC(__vec16_i64, __min_varying_uint64, __min_uniform_uint64)
+
+// sqrt/rsqrt/rcp
+
+static FORCEINLINE float  __rsqrt_uniform_float(float v) { return 1.f / sqrtf(v); }
+static FORCEINLINE float  __rcp_uniform_float  (float v) { return 1.f / v; }
+static FORCEINLINE float  __sqrt_uniform_float (float v) { return sqrtf(v); }
+static FORCEINLINE double __sqrt_uniform_double(double v) { return sqrt (v); }
+
+static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v)
+{
+#ifdef ISPC_FAST_MATH
+    return _mm512_rcp23_ps(v); // Approximation with 23 bits of accuracy.
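+    // (Here and in __rsqrt_varying_float below, the *23 forms are the fast,
+    // reduced-accuracy hardware approximations; without ISPC_FAST_MATH the
+    // code falls back to the slower _mm512_recip_ps / _mm512_invsqrt_ps calls.)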
+#else + return _mm512_recip_ps(v); +#endif +} + +static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) +{ +#ifdef ISPC_FAST_MATH + return _mm512_rsqrt23_ps(v); // Approximation with 0.775ULP accuracy +#else + return _mm512_invsqrt_ps(v); +#endif +} +static FORCEINLINE __vec16_f __sqrt_varying_float (__vec16_f v) { return _mm512_sqrt_ps(v);} +static FORCEINLINE __vec16_d __sqrt_varying_double(__vec16_d v) { return __vec16_d(_mm512_sqrt_pd(v.v1),_mm512_sqrt_pd(v.v2));} + +/////////////////////////////////////////////////////////////////////////// +// svml +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_f __svml_sinf (__vec16_f v) { return _mm512_sin_ps(v); } +static FORCEINLINE __vec16_f __svml_asinf (__vec16_f v) { return _mm512_asin_ps(v); } +static FORCEINLINE __vec16_f __svml_cosf (__vec16_f v) { return _mm512_cos_ps(v); } +static FORCEINLINE __vec16_f __svml_tanf (__vec16_f v) { return _mm512_tan_ps(v); } +static FORCEINLINE __vec16_f __svml_atanf (__vec16_f v) { return _mm512_atan_ps(v); } +static FORCEINLINE __vec16_f __svml_atan2f(__vec16_f a, __vec16_f b) { return _mm512_atan2_ps(a,b); } +static FORCEINLINE __vec16_f __svml_expf (__vec16_f v) { return _mm512_exp_ps(v); } +static FORCEINLINE __vec16_f __svml_logf (__vec16_f v) { return _mm512_log_ps(v); } +static FORCEINLINE __vec16_f __svml_powf (__vec16_f a, __vec16_f b) { return _mm512_pow_ps(a,b); } + +static FORCEINLINE __vec16_d __svml_sind (__vec16_d v) { return __vec16_d(_mm512_sin_pd(v.v1), _mm512_sin_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_asind (__vec16_d v) { return __vec16_d(_mm512_asin_pd(v.v1), _mm512_asin_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_cosd (__vec16_d v) { return __vec16_d(_mm512_cos_pd(v.v1), _mm512_cos_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_tand (__vec16_d v) { return __vec16_d(_mm512_tan_pd(v.v1), _mm512_tan_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_atand (__vec16_d v) { return __vec16_d(_mm512_atan_pd(v.v1), _mm512_atan_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_atan2d(__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_atan2_pd(a.v1,b.v1), _mm512_atan2_pd(a.v2,b.v2)); } +static FORCEINLINE __vec16_d __svml_expd (__vec16_d v) { return __vec16_d(_mm512_exp_pd(v.v1), _mm512_exp_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_logd (__vec16_d v) { return __vec16_d(_mm512_log_pd(v.v1), _mm512_log_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_powd (__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_pow_pd(a.v1,b.v1), _mm512_pow_pd(a.v2,b.v2)); } + +/////////////////////////////////////////////////////////////////////////// +// bit ops +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE int32_t __popcnt_int32(uint32_t v) { + int count = 0; + for (; v != 0; v >>= 1) + count += (v & 1); + return count; +} + +static FORCEINLINE int32_t __popcnt_int64(uint64_t v) { + int count = 0; + for (; v != 0; v >>= 1) + count += (v & 1); + return count; +} + +static FORCEINLINE int32_t __count_trailing_zeros_i32(uint32_t v) { + if (v == 0) + return 32; + + int count = 0; + while ((v & 1) == 0) { + ++count; + v >>= 1; + } + return count; +} + +static FORCEINLINE int64_t __count_trailing_zeros_i64(uint64_t v) { + if (v == 0) + return 64; + + int count = 0; + while ((v & 1) == 0) { + ++count; + v >>= 1; + } + return count; +} + +static FORCEINLINE int32_t __count_leading_zeros_i32(uint32_t v) { + if (v == 0) + return 32; + + int count = 0; + while ((v & (1<<31)) 
== 0) { + ++count; + v <<= 1; + } + return count; +} + +static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { + if (v == 0) + return 64; + + int count = 0; + while ((v & (1ull<<63)) == 0) { + ++count; + v <<= 1; + } + return count; +} + +/////////////////////////////////////////////////////////////////////////// +// reductions +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE float __reduce_add_float(__vec16_f v) { return _mm512_reduce_add_ps(v); } +static FORCEINLINE float __reduce_min_float(__vec16_f v) { return _mm512_reduce_min_ps(v); } +static FORCEINLINE float __reduce_max_float(__vec16_f v) { return _mm512_reduce_max_ps(v); } + +static FORCEINLINE float __reduce_add_double(__vec16_d v) { return _mm512_reduce_add_pd(v.v1) + _mm512_reduce_add_pd(v.v2); } +static FORCEINLINE float __reduce_min_double(__vec16_d v) { return std::min(_mm512_reduce_min_pd(v.v1), _mm512_reduce_min_pd(v.v2)); } +static FORCEINLINE float __reduce_max_double(__vec16_d v) { return std::max(_mm512_reduce_max_pd(v.v1), _mm512_reduce_max_pd(v.v2)); } + + + +static FORCEINLINE int64_t __reduce_add_int32 (__vec16_i32 v) { return _mm512_reduce_add_epi32(v);} +static FORCEINLINE int32_t __reduce_min_int32 (__vec16_i32 v) { return _mm512_reduce_min_epi32(v);} +static FORCEINLINE int32_t __reduce_max_int32 (__vec16_i32 v) { return _mm512_reduce_max_epi32(v);} +static FORCEINLINE uint32_t __reduce_min_uint32 (__vec16_i32 v) { return _mm512_reduce_min_epu32(v);} +static FORCEINLINE uint32_t __reduce_max_uint32 (__vec16_i32 v) { return _mm512_reduce_max_epu32(v);} + +REDUCE_ADD ( int16_t, __vec16_i8, __reduce_add_int8) +REDUCE_ADD ( int32_t, __vec16_i16, __reduce_add_int16) +REDUCE_ADD ( int64_t, __vec16_i64, __reduce_add_int64) +REDUCE_MINMAX( int64_t, __vec16_i64, __reduce_min_int64, <) +REDUCE_MINMAX( int64_t, __vec16_i64, __reduce_max_int64, >) +REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_min_uint64, <) +REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >) + +/////////////////////////////////////////////////////////////////////////// +// masked load/store +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_i8 __masked_load_i8(void *p, + __vec16_i1 mask) { + __vec16_i8 ret; + int8_t *ptr = (int8_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +static FORCEINLINE __vec16_i16 __masked_load_i16(void *p, + __vec16_i1 mask) { + __vec16_i16 ret; + int16_t *ptr = (int16_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +static FORCEINLINE __vec16_i32 __masked_load_i32(void *p, __vec16_i1 mask) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_mask_load_epi32(__vec16_i32(), mask, p); +#else + __vec16_i32 tmp; + tmp.v = _mm512_mask_extloadunpacklo_epi32(tmp.v, 0xFFFF, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_extloadunpackhi_epi32(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __vec16_i32 ret; + return _mm512_mask_mov_epi32(ret.v, mask, tmp.v); +#endif +} + +static FORCEINLINE __vec16_f __masked_load_float(void *p, __vec16_i1 mask) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_mask_load_ps(_mm512_undefined_ps(), mask,p); +#else + __vec16_f tmp; + tmp.v = _mm512_mask_extloadunpacklo_ps(tmp.v, 0xFFFF, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_extloadunpackhi_ps(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, 
_MM_HINT_NONE); + __vec16_f ret; + return _mm512_mask_mov_ps(ret.v, mask, tmp.v); +#endif +} + +static FORCEINLINE __vec16_i64 __masked_load_i64(void *p, + __vec16_i1 mask) { + __vec16_i64 ret; + int64_t *ptr = (int64_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +static FORCEINLINE __vec16_d __masked_load_double(void *p, __vec16_i1 mask) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + __vec16_d ret; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + ret.v1 = _mm512_mask_load_pd(ret.v1, mask, p); + ret.v2 = _mm512_mask_load_pd(ret.v2, tmp_m, (uint8_t*)p+64); + return ret; +#else + __vec16_d tmp; + tmp.v1 = _mm512_mask_extloadunpacklo_pd(tmp.v1, 0xFF, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_mask_extloadunpackhi_pd(tmp.v1, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_mask_extloadunpacklo_pd(tmp.v2, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_mask_extloadunpackhi_pd(tmp.v2, 0xFF, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + __vec16_d ret; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + ret.v1 = _mm512_mask_mov_pd(ret.v1, mask, tmp.v1); + ret.v2 = _mm512_mask_mov_pd(ret.v2, tmp_m, tmp.v2); + return ret; +#endif +} + + +static FORCEINLINE void __masked_store_i8(void *p, __vec16_i8 val, + __vec16_i1 mask) { + int8_t *ptr = (int8_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +static FORCEINLINE void __masked_store_i16(void *p, __vec16_i16 val, + __vec16_i1 mask) { + int16_t *ptr = (int16_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1 mask) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_mask_store_epi32(p, mask, val.v); +#else + __vec16_i32 tmp; + tmp.v = _mm512_extloadunpacklo_epi32(tmp.v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_epi32(tmp.v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_epi32(tmp.v, mask, val.v); + _mm512_extpackstorelo_epi32(p, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +#endif +} + +static FORCEINLINE void __masked_store_float(void *p, __vec16_f val, __vec16_i1 mask) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_mask_store_ps(p, mask, val.v); +#else + __vec16_f tmp; + tmp.v = _mm512_extloadunpacklo_ps(tmp.v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_ps(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_ps(tmp.v, mask, val.v); + _mm512_extpackstorelo_ps(p, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_ps((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); +#endif +} + +static FORCEINLINE void __masked_store_i64(void *p, __vec16_i64 val, + __vec16_i1 mask) { + int64_t *ptr = (int64_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, __vec16_i1 mask) +{ +#ifdef ISPC_FORCE_ALIGNED_MEMORY + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + _mm512_mask_store_pd(p, mask, val.v1); + _mm512_mask_store_pd((uint8_t*)p+64, tmp_m, val.v2); +#else + __vec16_d tmp; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + tmp.v1 = 
_mm512_extloadunpacklo_pd(tmp.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_extloadunpackhi_pd(tmp.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_extloadunpacklo_pd(tmp.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_extloadunpackhi_pd(tmp.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_mask_mov_pd(tmp.v1, mask, val.v1); + tmp.v2 = _mm512_mask_mov_pd(tmp.v2, tmp_m, val.v2); + _mm512_extpackstorelo_pd(p, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_pd((uint8_t*)p+64, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+128, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); +#endif +} + +static FORCEINLINE void __masked_store_blend_i8(void *p, __vec16_i8 val, + __vec16_i1 mask) { + __masked_store_i8(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i16(void *p, __vec16_i16 val, + __vec16_i1 mask) { + __masked_store_i16(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i32(void *p, __vec16_i32 val, + __vec16_i1 mask) { + __masked_store_i32(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val, + __vec16_i1 mask) { + __masked_store_float(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i64(void *p, __vec16_i64 val, + __vec16_i1 mask) { + __masked_store_i64(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val, + __vec16_i1 mask) { + __masked_store_double(p, val, mask); +} + +/////////////////////////////////////////////////////////////////////////// +// gather/scatter +/////////////////////////////////////////////////////////////////////////// + +// offsets * offsetScale is in bytes (for all of these) + +/* knc::macro::used */ +#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, __vec16_i1 mask) { \ + VTYPE ret; \ + int8_t *base = (int8_t *)b; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset[i]); \ + ret[i] = *ptr; \ + } \ + return ret; \ +} + + +/****************/ +// GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) +static FORCEINLINE __vec16_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) +{ + // (iw): need to temporarily store as int because gathers can only return ints. 
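+    // The i32extgather with _MM_UPCONV_EPI32_SINT8 reads one byte per active lane
+    // and sign-extends it into a 32-bit lane; the extstore below then down-converts
+    // those 16 ints back to 8-bit values in ret.data.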
+ __vec16_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, base, + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + // now, downconverting to chars into temporary char vector + __vec16_i8 ret; + _mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; +} +// GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) +static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + __vec16_i1 still_to_do = mask; + __vec16_i32 tmp; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + tmp = _mm512_mask_i32extgather_epi32(tmp, match, offsets.v_lo, base, + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match,still_to_do); + } + __vec16_i8 ret; + _mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; +} +/****************/ +GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) +GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) +/****************/ +// GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32) +static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) +{ + return _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, + base, _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); +} +// GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) +static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + // There is no gather instruction with 64-bit offsets in KNC. 
+ // We have to manually iterate over the upper 32 bits ;-) + __vec16_i1 still_to_do = mask; + __vec16_i32 ret; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + ret = _mm512_mask_i32extgather_epi32(ret, match, offsets.v_lo, base, + _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match, still_to_do); + } + + return ret; +} +/****************/ +// GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float) +static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) +{ + return _mm512_mask_i32extgather_ps(_mm512_undefined_ps(), mask, offsets, + base, _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); +} +// GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float) +static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + // There is no gather instruction with 64-bit offsets in KNC. + // We have to manually iterate over the upper 32 bits ;-) + __vec16_i1 still_to_do = mask; + __vec16_f ret; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + ret = _mm512_mask_i32extgather_ps(ret, match, offsets.v_lo, base, + _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match, still_to_do); + } + + return ret; +} +/****************/ +GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) +GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) +/****************/ +// GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double) +static FORCEINLINE __vec16_d __gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) +{ + __vec16_d ret; + ret.v1 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + const __mmask8 mask8 = 0x00FF & (mask >> 8); /* knc::testme */ + ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask8, shuffled_offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); + return ret; +} +GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_base_offsets64_double) + +/* knc::macro::used */ +#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs[i]; \ + ret[i] = *ptr; \ + } \ + return ret; \ +} +/* knc::macro::used */ +#define GATHER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ +static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ + return FUNC1(0, 1, ptrs, mask); \ +} + + +/***********/ +GATHER_GENERALF(__vec16_i8, 
int8_t, __vec16_i32, __gather32_i8, __gather_base_offsets32_i8) +GATHER_GENERALF(__vec16_i16, int16_t, __vec16_i32, __gather32_i16, __gather_base_offsets32_i16) +GATHER_GENERALF(__vec16_i32, int32_t, __vec16_i32, __gather32_i32, __gather_base_offsets32_i32) +GATHER_GENERALF(__vec16_i64, int64_t, __vec16_i32, __gather32_i64, __gather_base_offsets32_i64) +GATHER_GENERALF(__vec16_f, float, __vec16_i32, __gather32_float, __gather_base_offsets32_float) +GATHER_GENERALF(__vec16_d, double, __vec16_i32, __gather32_double, __gather_base_offsets32_double) +/***********/ +GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __gather64_i8); +GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __gather64_i16); +GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __gather64_i32); +GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64); +GATHER_GENERAL(__vec16_f, float, __vec16_i64, __gather64_float); +GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double); +/***********/ + +// scatter + +/* knc::macro::used */ +#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, VTYPE val, \ + __vec16_i1 mask) { \ + int8_t *base = (int8_t *)b; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset[i]); \ + *ptr = val[i]; \ + } \ +} + + +/*****************/ +SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8) +SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8) +/*****************/ +SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16) +SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16) +/*****************/ +// SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32) +static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec16_i32 offsets, __vec16_i32 val, __vec16_i1 mask) +{ + _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); +} +// SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) +static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i32 value, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + + __vec16_i1 still_to_do = mask; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + _mm512_mask_i32extscatter_epi32(base, match, offsets.v_lo, + value, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match,still_to_do); + } +} +/*****************/ +// SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float) +static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets, + __vec16_f val, __vec16_i1 mask) +{ + _mm512_mask_i32extscatter_ps(base, mask, offsets, val, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); +} +//SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float) +static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 
_offsets, __vec16_f value, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + + __vec16_i1 still_to_do = mask; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + _mm512_mask_i32extscatter_ps(base, match, offsets.v_lo, + value, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match,still_to_do); + } +} +/*****************/ +SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) +SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) +/*****************/ +// SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double) +static FORCEINLINE void __scatter_base_offsets32_double(void *base, uint32_t scale, __vec16_i32 offsets, + __vec16_d val, __vec16_i1 mask) +{ + _mm512_mask_i32loextscatter_pd(base, mask, offsets, val.v1, + _MM_DOWNCONV_PD_NONE, scale, + _MM_HINT_NONE); + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + const __mmask8 mask8 = 0x00FF & (mask >> 8); /* knc::testme */ + _mm512_mask_i32loextscatter_pd(base, mask8, shuffled_offsets, val.v2, + _MM_DOWNCONV_PD_NONE, scale, + _MM_HINT_NONE); +} +SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_base_offsets64_double) + +/* knc::macro::used */ +#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ +static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs[i]; \ + *ptr = val[i]; \ + } \ +} +/* knc::macro::used */ +#define SCATTER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ +static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ + return FUNC1(0, 1, ptrs, val, mask); \ +} + +/***********/ +SCATTER_GENERALF(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8, __scatter_base_offsets32_i8) +SCATTER_GENERALF(__vec16_i16, int16_t, __vec16_i32, __scatter32_i16, __scatter_base_offsets32_i16) +SCATTER_GENERALF(__vec16_i32, int32_t, __vec16_i32, __scatter32_i32, __scatter_base_offsets32_i32) +SCATTER_GENERALF(__vec16_i64, int64_t, __vec16_i32, __scatter32_i64, __scatter_base_offsets32_i64) +SCATTER_GENERALF(__vec16_f, float, __vec16_i32, __scatter32_float, __scatter_base_offsets32_float) +SCATTER_GENERALF(__vec16_d, double, __vec16_i32, __scatter32_double, __scatter_base_offsets32_double) +/***********/ +SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __scatter64_i8) +SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __scatter64_i16) +SCATTER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __scatter64_i32) +SCATTER_GENERAL(__vec16_f, float, __vec16_i64, __scatter64_float) +SCATTER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __scatter64_i64) +SCATTER_GENERAL(__vec16_d, double, __vec16_i64, __scatter64_double) +/***********/ + +/////////////////////////////////////////////////////////////////////////// +// packed load/store +/////////////////////////////////////////////////////////////////////////// + + +static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec16_i32 *val, __vec16_i1 mask) +{ + __vec16_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, 
_MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(mask)); +} + +static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, __vec16_i1 mask) +{ + _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(mask)); +} + +static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec16_i32 *val, __vec16_i1 mask) +{ + __vec16_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(mask)); +} + +static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec16_i32 val, __vec16_i1 mask) +{ + _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(mask)); +} + +/////////////////////////////////////////////////////////////////////////// +// aos/soa +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE void __soa_to_aos3_float(__vec16_f v0, __vec16_f v1, __vec16_f v2, + float *ptr) { + for (int i = 0; i < 16; ++i) { + *ptr++ = __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + } +} + +static FORCEINLINE void __aos_to_soa3_float(float *ptr, __vec16_f *out0, __vec16_f *out1, + __vec16_f *out2) { + for (int i = 0; i < 16; ++i) { + __insert_element(out0, i, *ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + } +} + +static FORCEINLINE void __soa_to_aos4_float(__vec16_f v0, __vec16_f v1, __vec16_f v2, + __vec16_f v3, float *ptr) { + for (int i = 0; i < 16; ++i) { + *ptr++ = __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + *ptr++ = __extract_element(v3, i); + } +} + +static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec16_f *out0, __vec16_f *out1, + __vec16_f *out2, __vec16_f *out3) { + for (int i = 0; i < 16; ++i) { + __insert_element(out0, i, *ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + __insert_element(out3, i, *ptr++); + } +} + +/////////////////////////////////////////////////////////////////////////// +// prefetch +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE void __prefetch_read_uniform_1(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T0); // prefetch into L1$ +} + +static FORCEINLINE void __prefetch_read_uniform_2(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T1); // prefetch into L2$ +} + +static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *p) { + // There is no L3$ on KNC, don't want to pollute L2$ unecessarily +} + +static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T2); // prefetch into L2$ with non-temporal hint + // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint +} + +/////////////////////////////////////////////////////////////////////////// +// atomics 
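+// These are scalar (uniform) atomics: on Windows they map onto the Interlocked*
+// intrinsics and elsewhere onto the GCC __sync builtins; the min/max variants
+// are built from compare-and-swap retry loops.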
+/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE uint32_t __atomic_add(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAdd((LONG volatile *)p, v) - v; +#else + return __sync_fetch_and_add(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_sub(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAdd((LONG volatile *)p, -v) + v; +#else + return __sync_fetch_and_sub(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_and(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAnd((LONG volatile *)p, v); +#else + return __sync_fetch_and_and(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_or(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedOr((LONG volatile *)p, v); +#else + return __sync_fetch_and_or(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_xor(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedXor((LONG volatile *)p, v); +#else + return __sync_fetch_and_xor(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_min(uint32_t *p, uint32_t v) { + int32_t old, min; + do { + old = *((volatile int32_t *)p); + min = (old < (int32_t)v) ? old : (int32_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_max(uint32_t *p, uint32_t v) { + int32_t old, max; + do { + old = *((volatile int32_t *)p); + max = (old > (int32_t)v) ? old : (int32_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_umin(uint32_t *p, uint32_t v) { + uint32_t old, min; + do { + old = *((volatile uint32_t *)p); + min = (old < v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_umax(uint32_t *p, uint32_t v) { + uint32_t old, max; + do { + old = *((volatile uint32_t *)p); + max = (old > v) ? 
old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_xchg(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedExchange((LONG volatile *)p, v); +#else + return __sync_lock_test_and_set(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_cmpxchg(uint32_t *p, uint32_t cmpval, + uint32_t newval) { +#ifdef _MSC_VER + return InterlockedCompareExchange((LONG volatile *)p, newval, cmpval); +#else + return __sync_val_compare_and_swap(p, cmpval, newval); +#endif +} + +static FORCEINLINE uint64_t __atomic_add(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAdd64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_add(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_sub(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAdd64((LONGLONG volatile *)p, -v) + v; +#else + return __sync_fetch_and_sub(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_and(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAnd64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_and(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_or(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedOr64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_or(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_xor(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedXor64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_xor(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_min(uint64_t *p, uint64_t v) { + int64_t old, min; + do { + old = *((volatile int64_t *)p); + min = (old < (int64_t)v) ? old : (int64_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_max(uint64_t *p, uint64_t v) { + int64_t old, max; + do { + old = *((volatile int64_t *)p); + max = (old > (int64_t)v) ? old : (int64_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_umin(uint64_t *p, uint64_t v) { + uint64_t old, min; + do { + old = *((volatile uint64_t *)p); + min = (old < v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_umax(uint64_t *p, uint64_t v) { + uint64_t old, max; + do { + old = *((volatile uint64_t *)p); + max = (old > v) ? 
old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_xchg(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedExchange64((LONGLONG volatile *)p, v); +#else + return __sync_lock_test_and_set(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, + uint64_t newval) { +#ifdef _MSC_VER + return InterlockedCompareExchange64((LONGLONG volatile *)p, newval, cmpval); +#else + return __sync_val_compare_and_swap(p, cmpval, newval); +#endif +} + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} + +#endif // !WIN32 + +#undef FORCEINLINE +#undef PRE_ALIGN +#undef POST_ALIGN diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h new file mode 100644 index 00000000..d7696117 --- /dev/null +++ b/examples/intrinsics/knc-i1x8.h @@ -0,0 +1,2818 @@ +/** + Copyright (c) 2010-2012, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +#define FORCEINLINE __forceinline +#define PRE_ALIGN(x) /*__declspec(align(x))*/ +#define POST_ALIGN(x) +#define roundf(x) (floorf(x + .5f)) +#define round(x) (floor(x + .5)) +#else +#define FORCEINLINE __forceinline +#define PRE_ALIGN(x) +#define POST_ALIGN(x) __attribute__ ((aligned(x))) +#endif + +#define KNC 1 +#if 0 +extern "C" +{ + int printf(const unsigned char *, ...); + int puts(unsigned char *); + unsigned int putchar(unsigned int); + int fflush(void *); + uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t); + uint8_t *memset(uint8_t *, uint8_t, uint64_t); + void memset_pattern16(void *, const void *, uint64_t); +} +#endif + +typedef float __vec1_f; +typedef double __vec1_d; +typedef int8_t __vec1_i8; +typedef int16_t __vec1_i16; +typedef int32_t __vec1_i32; +typedef int64_t __vec1_i64; + +struct __vec8_i1 { + __vec8_i1() { } + __vec8_i1(const __mmask8 &vv) : v(vv) { } + __vec8_i1(bool v0, bool v1, bool v2, bool v3, + bool v4, bool v5, bool v6, bool v7) { + v = ((v0 & 1) | + ((v1 & 1) << 1) | + ((v2 & 1) << 2) | + ((v3 & 1) << 3) | + ((v4 & 1) << 4) | + ((v5 & 1) << 5) | + ((v6 & 1) << 6) | + ((v7 & 1) << 7) ); + } + + __mmask8 v; + FORCEINLINE operator __mmask8() const { return v; }//0xFF & v; } +}; + + +template +struct vec8 { + vec8() { } + vec8(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) { + data[0] = v0; data[1] = v1; data[2] = v2; data[3] = v3; + data[4] = v4; data[5] = v5; data[6] = v6; data[7] = v7; + } + T data[8]; + FORCEINLINE const T& operator[](const int i) const { return data[i]; } + FORCEINLINE T& operator[](const int i) { return data[i]; } +}; + +/****************/ + +struct PRE_ALIGN(32) __vec8_i32 +{ +#ifdef __ZMM64BIT__ + __m512i _data; + FORCEINLINE __vec8_i32(const __m512i &in) : _data(in) {} + FORCEINLINE operator __m512i() const { return _data; } +#else /* __ZMM64BIT__ */ + typedef int32_t _v8si __attribute__((vector_size(32))); + _v8si _data; + FORCEINLINE __vec8_i32(const __m512i &in) + { + _mm512_mask_extpackstorelo_epi32((__m512i*)&_data, 0xFF, in, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + } + FORCEINLINE operator __m512i() const + { + return _mm512_extloadunpacklo_epi32(_mm512_setzero_epi32(), (uint8_t*)&_data, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + } +#endif /* __ZMM64BIT__ */ + + __vec8_i32() { } + FORCEINLINE __vec8_i32(int32_t v0, int32_t v1, int32_t v2, int32_t v3, + int32_t v4, int32_t v5, int32_t v6, int32_t v7) + { + const __m512i v = _mm512_set_16to16_pi(0,0,0,0,0,0,0,0, v7, v6, v5, v4, v3, v2, v1, v0); + *this = __vec8_i32(v); + } + + FORCEINLINE const int32_t& operator[](const int i) const { return ((int32_t*)this)[i]; } + FORCEINLINE int32_t& operator[](const int i) { return ((int32_t*)this)[i]; } +} POST_ALIGN(32); + +PRE_ALIGN(32) struct __vec8_f +{ +#ifdef __ZMM64BIT__ + __m512 _data; + FORCEINLINE __vec8_f(const __m512 &in) : _data(in) {} + FORCEINLINE operator __m512() const { return _data; } +#else /* __ZMM64BIT__ */ + typedef float _v8sf __attribute__((vector_size(32))); + _v8sf _data; + FORCEINLINE __vec8_f(const __m512 &in) + { + _mm512_mask_extpackstorelo_ps((__m512*)&_data, 0xFF, in, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + } + FORCEINLINE operator __m512() const + { + return _mm512_extloadunpacklo_ps(_mm512_setzero_ps(), (uint8_t*)&_data, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + } +#endif /* __ZMM64BIT__ */ + FORCEINLINE __vec8_f() { } + FORCEINLINE __vec8_f(float v0, float v1, float v2, float v3, + float v4, float v5, float v6, float v7) + { + 
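+        // Only the low eight lanes of a __vec8_f are meaningful: build a full
+        // 16-lane value with the upper half zeroed and hand it to the __m512
+        // constructor, which (in the non-__ZMM64BIT__ build) packs the low
+        // eight floats back into the 256-bit storage.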
const __m512 v = _mm512_set_16to16_ps(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, v7, v6, v5, v4, v3, v2, v1, v0); + *this = __vec8_f(v); + } + + FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } + FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } +} POST_ALIGN(32); + +struct PRE_ALIGN(64) __vec8_d +{ + __m512d v; + FORCEINLINE __vec8_d() : v(_mm512_undefined_pd()) {} + FORCEINLINE __vec8_d(const __m512d _v) : v(_v) {} + FORCEINLINE __vec8_d(const __vec8_d &o) : v(o.v) {} + FORCEINLINE __vec8_d& operator =(const __vec8_d &o) { v=o.v; return *this; } + FORCEINLINE operator __m512d() const { return v; } + FORCEINLINE __vec8_d(double v00, double v01, double v02, double v03, + double v04, double v05, double v06, double v07) : + v ( _mm512_set_8to8_pd(v07, v06, v05, v04, v03, v02, v01, v00) ) {} + FORCEINLINE const double& operator[](const int i) const { return ((double*)this)[i]; } + FORCEINLINE double& operator[](const int i) { return ((double*)this)[i]; } +} POST_ALIGN(64); + +/****************/ + +PRE_ALIGN(64) struct __vec8_i64 : public vec8 { + __vec8_i64() { } + __vec8_i64(int64_t v0, int64_t v1, int64_t v2, int64_t v3, + int64_t v4, int64_t v5, int64_t v6, int64_t v7) + : vec8(v0, v1, v2, v3, v4, v5, v6, v7) { } +} POST_ALIGN(64); + +PRE_ALIGN(16) struct __vec8_i8 : public vec8 { + __vec8_i8() { } + __vec8_i8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, + int8_t v4, int8_t v5, int8_t v6, int8_t v7) + : vec8(v0, v1, v2, v3, v4, v5, v6, v7) { } +} POST_ALIGN(16); + +PRE_ALIGN(32) struct __vec8_i16 : public vec8 { + __vec8_i16() { } + __vec8_i16(int16_t v0, int16_t v1, int16_t v2, int16_t v3, + int16_t v4, int16_t v5, int16_t v6, int16_t v7) + : vec8(v0, v1, v2, v3, v4, v5, v6, v7) { } +} POST_ALIGN(32); + +static inline int32_t __extract_element(__vec8_i32, int); + + +/////////////////////////////////////////////////////////////////////////// +// macros... 
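+// These macros stamp out simple per-lane loop implementations for the vector
+// types that have no dedicated 512-bit path in this 8-wide layout (i8, i16 and,
+// in the disabled branch further down, i32). As a rough illustration,
+// BINARY_OP(__vec8_i8, __add, +) expands to:
+//
+//     static FORCEINLINE __vec8_i8 __add(__vec8_i8 a, __vec8_i8 b) {
+//         __vec8_i8 ret;
+//         for (int i = 0; i < 8; ++i)
+//             ret[i] = a[i] + b[i];
+//         return ret;
+//     }
+//
+// The int32 operations below use masked _mm512_* intrinsics instead, with
+// mask 0xFF selecting the eight live lanes.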
+ +#define UNARY_OP(TYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE v) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = OP(v[i]); \ + return ret; \ +} + +#define BINARY_OP(TYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = a[i] OP b[i]; \ + return ret; \ +} + +#define BINARY_OP_CAST(TYPE, CAST, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = (CAST)(a[i]) OP (CAST)(b[i]); \ + return ret; \ +} + +#define BINARY_OP_FUNC(TYPE, NAME, FUNC) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = FUNC(a[i], b[i]); \ + return ret; \ +} + +#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \ +static FORCEINLINE __vec8_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \ + __vec8_i1 ret; \ + ret.v = 0; \ + for (int i = 0; i < 8; ++i) \ + ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i; \ + return ret; \ +} \ +static FORCEINLINE __vec8_i1 NAME##_##SUFFIX##_and_mask(TYPE a, TYPE b, \ + __vec8_i1 mask) { \ + __vec8_i1 ret; \ + ret.v = 0; \ + for (int i = 0; i < 8; ++i) \ + ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i; \ + ret.v &= mask.v; \ + return ret; \ +} + +#define INSERT_EXTRACT(VTYPE, STYPE) \ +static FORCEINLINE STYPE __extract_element(VTYPE v, int index) { \ + return ((STYPE *)&v)[index]; \ +} \ +static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \ + ((STYPE *)v)[index] = val; \ +} + +#define LOAD_STORE(VTYPE, STYPE) \ +template \ +static FORCEINLINE VTYPE __load(const VTYPE *p) { \ + STYPE *ptr = (STYPE *)p; \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = ptr[i]; \ + return ret; \ +} \ +template \ +static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \ + STYPE *ptr = (STYPE *)p; \ + for (int i = 0; i < 8; ++i) \ + ptr[i] = v[i]; \ +} + +#define LOADS(VTYPE, STYPE) \ +template \ +static FORCEINLINE VTYPE __load(const VTYPE *p) { \ + STYPE *ptr = (STYPE *)p; \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = ptr[i]; \ + return ret; \ +} \ + +#define STORES(VTYPE, STYPE) \ +template \ +static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \ + STYPE *ptr = (STYPE *)p; \ + for (int i = 0; i < 8; ++i) \ + ptr[i] = v[i]; \ +} + +#define REDUCE_ADD(TYPE, VTYPE, NAME) \ +static FORCEINLINE TYPE NAME(VTYPE v) { \ + TYPE ret = v[0]; \ + for (int i = 1; i < 8; ++i) \ + ret = ret + v[i]; \ + return ret; \ +} + +#define REDUCE_MINMAX(TYPE, VTYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(VTYPE v) { \ + TYPE ret = v[0]; \ + for (int i = 1; i < 8; ++i) \ + ret = (ret OP (TYPE)v[i]) ? 
ret : (TYPE)v[i]; \
+    return ret; \
+}
+
+#define SELECT(TYPE) \
+static FORCEINLINE TYPE __select(__vec8_i1 mask, TYPE a, TYPE b) { \
+    TYPE ret; \
+    for (int i = 0; i < 8; ++i) \
+        ret[i] = (mask.v & (1<<i)) ? a[i] : b[i]; \
+    return ret; \
+} \
+static FORCEINLINE TYPE __select(bool cond, TYPE a, TYPE b) { \
+    return cond ? a : b; \
+}
+
+#define SHIFT_UNIFORM(TYPE, CAST, NAME, OP) \
+static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \
+    TYPE ret; \
+    for (int i = 0; i < 8; ++i) \
+        ret[i] = (CAST)(a[i]) OP b; \
+    return ret; \
+}
+
+#define SMEAR(VTYPE, NAME, STYPE) \
+template <class RetVecType> VTYPE __smear_##NAME(STYPE); \
+template <> FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \
+    VTYPE ret; \
+    for (int i = 0; i < 8; ++i) \
+        ret[i] = v; \
+    return ret; \
+}
+
+#define SETZERO(VTYPE, NAME) \
+template <class RetVecType> VTYPE __setzero_##NAME(); \
+template <> FORCEINLINE VTYPE __setzero_##NAME() { \
+    VTYPE ret; \
+    for (int i = 0; i < 8; ++i) \
+        ret[i] = 0; \
+    return ret; \
+}
+
+#define UNDEF(VTYPE, NAME) \
+template <class RetVecType> VTYPE __undef_##NAME(); \
+template <> FORCEINLINE VTYPE __undef_##NAME() { \
+    return VTYPE(); \
+}
+
+#define BROADCAST(VTYPE, NAME, STYPE) \
+static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \
+    VTYPE ret; \
+    for (int i = 0; i < 8; ++i) \
+        ret[i] = v[index & 0x7]; \
+    return ret; \
+} \
+
+#define ROTATE(VTYPE, NAME, STYPE) \
+static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) { \
+    VTYPE ret; \
+    for (int i = 0; i < 8; ++i) \
+        ret[i] = v[(i+index) & 0x7]; \
+    return ret; \
+} \
+
+#define SHUFFLES(VTYPE, NAME, STYPE) \
+static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec8_i32 index) { \
+    VTYPE ret; \
+    for (int i = 0; i < 8; ++i) \
+        ret[i] = v[__extract_element(index, i) & 0x7]; \
+    return ret; \
+} \
+static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec8_i32 index) { \
+    VTYPE ret; \
+    for (int i = 0; i < 8; ++i) { \
+        int ii = __extract_element(index, i) & 0xf; \
+        ret[i] = (ii < 8) ? v0[ii] : v1[ii-8]; \
+    } \
+    return ret; \
+}
+
+#define SHUFFLE2(VTYPE, NAME, STYPE) \
+static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec8_i32 index) { \
+    VTYPE ret; \
+    for (int i = 0; i < 8; ++i) { \
+        int ii = __extract_element(index, i) & 0xf; \
+        ret[i] = (ii < 8) ? v0[ii] : v1[ii-8]; \
+    } \
+    return ret; \
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+INSERT_EXTRACT(__vec1_i8, int8_t)
+INSERT_EXTRACT(__vec1_i16, int16_t)
+INSERT_EXTRACT(__vec1_i32, int32_t)
+INSERT_EXTRACT(__vec1_i64, int64_t)
+INSERT_EXTRACT(__vec1_f, float)
+INSERT_EXTRACT(__vec1_d, double)
+
+///////////////////////////////////////////////////////////////////////////
+// mask ops
+
+static FORCEINLINE __vec8_i1 __movmsk(__vec8_i1 mask) {
+    return mask.v;
+}
+
+static FORCEINLINE bool __any(__vec8_i1 mask) {
+    return (mask.v!=0);
+}
+
+static FORCEINLINE bool __all(__vec8_i1 mask) {
+    return (mask.v==0xFF);
+}
+
+static FORCEINLINE bool __none(__vec8_i1 mask) {
+    return (mask.v==0);
+}
+
+static FORCEINLINE __vec8_i1 __equal_i1(__vec8_i1 a, __vec8_i1 b) {
+    return (a.v & b.v) | (~a.v & ~b.v);
+}
+
+static FORCEINLINE __vec8_i1 __and(__vec8_i1 a, __vec8_i1 b) {
+    return a.v & b.v;
+}
+
+static FORCEINLINE __vec8_i1 __xor(__vec8_i1 a, __vec8_i1 b) {
+    return a.v ^ b.v;
+}
+
+static FORCEINLINE __vec8_i1 __or(__vec8_i1 a, __vec8_i1 b) {
+    return a.v | b.v;
+}
+
+static FORCEINLINE __vec8_i1 __not(__vec8_i1 v) {
+    return ~v;
+}
+
+static FORCEINLINE __vec8_i1 __and_not1(__vec8_i1 a, __vec8_i1 b) {
+    return ~a.v & b.v;
+}
+
+static FORCEINLINE __vec8_i1 __and_not2(__vec8_i1 a, __vec8_i1 b) {
+    return a.v & ~b.v;
+}
+
+static FORCEINLINE __vec8_i1 __select(__vec8_i1 mask, __vec8_i1 a,
+                                      __vec8_i1 b) {
+    return (a.v & mask.v) | (b.v & ~mask.v);
+}
+
+static FORCEINLINE __vec8_i1 __select(bool cond, __vec8_i1 a, __vec8_i1 b) {
+    return cond ?
a : b; +} + +static FORCEINLINE bool __extract_element(__vec8_i1 vec, int index) { + return (vec.v & (1 << index)) ? true : false; +} + +static FORCEINLINE void __insert_element(__vec8_i1 *vec, int index, + bool val) { + if (val == false) + vec->v &= ~(1 << index); + else + vec->v |= (1 << index); +} + +template static FORCEINLINE __vec8_i1 __load(const __vec8_i1 *p) { + uint8_t *ptr = (uint8_t *)p; + __vec8_i1 r; + r.v = *ptr; + return r; +} + +template static FORCEINLINE void __store(__vec8_i1 *p, __vec8_i1 v) { + uint8_t *ptr = (uint8_t *)p; + *ptr = v.v; +} + +template RetVecType __smear_i1(int i); +template <> static FORCEINLINE __vec8_i1 __smear_i1<__vec8_i1>(int i) { + return i?0xFF:0x0; +} + +template RetVecType __setzero_i1(); +template <> static FORCEINLINE __vec8_i1 __setzero_i1<__vec8_i1>() { + return 0; +} + +template __vec8_i1 __undef_i1(); +template <> FORCEINLINE __vec8_i1 __undef_i1<__vec8_i1>() { + return __vec8_i1(); +} + + +/////////////////////////////////////////////////////////////////////////// +// int8 + +BINARY_OP(__vec8_i8, __add, +) +BINARY_OP(__vec8_i8, __sub, -) +BINARY_OP(__vec8_i8, __mul, *) + +BINARY_OP(__vec8_i8, __or, |) +BINARY_OP(__vec8_i8, __and, &) +BINARY_OP(__vec8_i8, __xor, ^) +BINARY_OP(__vec8_i8, __shl, <<) + +BINARY_OP_CAST(__vec8_i8, uint8_t, __udiv, /) +BINARY_OP_CAST(__vec8_i8, int8_t, __sdiv, /) + +BINARY_OP_CAST(__vec8_i8, uint8_t, __urem, %) +BINARY_OP_CAST(__vec8_i8, int8_t, __srem, %) +BINARY_OP_CAST(__vec8_i8, uint8_t, __lshr, >>) +BINARY_OP_CAST(__vec8_i8, int8_t, __ashr, >>) + +SHIFT_UNIFORM(__vec8_i8, uint8_t, __lshr, >>) +SHIFT_UNIFORM(__vec8_i8, int8_t, __ashr, >>) +SHIFT_UNIFORM(__vec8_i8, int8_t, __shl, <<) + +CMP_OP(__vec8_i8, i8, int8_t, __equal, ==) +CMP_OP(__vec8_i8, i8, int8_t, __not_equal, !=) +CMP_OP(__vec8_i8, i8, uint8_t, __unsigned_less_equal, <=) +CMP_OP(__vec8_i8, i8, int8_t, __signed_less_equal, <=) +CMP_OP(__vec8_i8, i8, uint8_t, __unsigned_greater_equal, >=) +CMP_OP(__vec8_i8, i8, int8_t, __signed_greater_equal, >=) +CMP_OP(__vec8_i8, i8, uint8_t, __unsigned_less_than, <) +CMP_OP(__vec8_i8, i8, int8_t, __signed_less_than, <) +CMP_OP(__vec8_i8, i8, uint8_t, __unsigned_greater_than, >) +CMP_OP(__vec8_i8, i8, int8_t, __signed_greater_than, >) + +SELECT(__vec8_i8) +INSERT_EXTRACT(__vec8_i8, int8_t) +SMEAR(__vec8_i8, i8, int8_t) +SETZERO(__vec8_i8, i8) +UNDEF(__vec8_i8, i8) +BROADCAST(__vec8_i8, i8, int8_t) +ROTATE(__vec8_i8, i8, int8_t) +SHUFFLES(__vec8_i8, i8, int8_t) +LOAD_STORE(__vec8_i8, int8_t) + +/////////////////////////////////////////////////////////////////////////// +// int16 + +BINARY_OP(__vec8_i16, __add, +) +BINARY_OP(__vec8_i16, __sub, -) +BINARY_OP(__vec8_i16, __mul, *) + +BINARY_OP(__vec8_i16, __or, |) +BINARY_OP(__vec8_i16, __and, &) +BINARY_OP(__vec8_i16, __xor, ^) +BINARY_OP(__vec8_i16, __shl, <<) + +BINARY_OP_CAST(__vec8_i16, uint16_t, __udiv, /) +BINARY_OP_CAST(__vec8_i16, int16_t, __sdiv, /) + +BINARY_OP_CAST(__vec8_i16, uint16_t, __urem, %) +BINARY_OP_CAST(__vec8_i16, int16_t, __srem, %) +BINARY_OP_CAST(__vec8_i16, uint16_t, __lshr, >>) +BINARY_OP_CAST(__vec8_i16, int16_t, __ashr, >>) + +SHIFT_UNIFORM(__vec8_i16, uint16_t, __lshr, >>) +SHIFT_UNIFORM(__vec8_i16, int16_t, __ashr, >>) +SHIFT_UNIFORM(__vec8_i16, int16_t, __shl, <<) + +CMP_OP(__vec8_i16, i16, int16_t, __equal, ==) +CMP_OP(__vec8_i16, i16, int16_t, __not_equal, !=) +CMP_OP(__vec8_i16, i16, uint16_t, __unsigned_less_equal, <=) +CMP_OP(__vec8_i16, i16, int16_t, __signed_less_equal, <=) +CMP_OP(__vec8_i16, i16, uint16_t, 
__unsigned_greater_equal, >=) +CMP_OP(__vec8_i16, i16, int16_t, __signed_greater_equal, >=) +CMP_OP(__vec8_i16, i16, uint16_t, __unsigned_less_than, <) +CMP_OP(__vec8_i16, i16, int16_t, __signed_less_than, <) +CMP_OP(__vec8_i16, i16, uint16_t, __unsigned_greater_than, >) +CMP_OP(__vec8_i16, i16, int16_t, __signed_greater_than, >) + +SELECT(__vec8_i16) +INSERT_EXTRACT(__vec8_i16, int16_t) +SMEAR(__vec8_i16, i16, int16_t) +SETZERO(__vec8_i16, i16) +UNDEF(__vec8_i16, i16) +BROADCAST(__vec8_i16, i16, int16_t) +ROTATE(__vec8_i16, i16, int16_t) +SHUFFLES(__vec8_i16, i16, int16_t) +LOAD_STORE(__vec8_i16, int16_t) + +#if 0 /* evghenii::int32 */ +/////////////////////////////////////////////////////////////////////////// +// int32 + +BINARY_OP(__vec8_i32, __add, +) +BINARY_OP(__vec8_i32, __sub, -) +BINARY_OP(__vec8_i32, __mul, *) + +BINARY_OP(__vec8_i32, __or, |) +BINARY_OP(__vec8_i32, __and, &) +BINARY_OP(__vec8_i32, __xor, ^) +BINARY_OP(__vec8_i32, __shl, <<) + +BINARY_OP_CAST(__vec8_i32, uint32_t, __udiv, /) +BINARY_OP_CAST(__vec8_i32, int32_t, __sdiv, /) + +BINARY_OP_CAST(__vec8_i32, uint32_t, __urem, %) +BINARY_OP_CAST(__vec8_i32, int32_t, __srem, %) +BINARY_OP_CAST(__vec8_i32, uint32_t, __lshr, >>) +BINARY_OP_CAST(__vec8_i32, int32_t, __ashr, >>) + +SHIFT_UNIFORM(__vec8_i32, uint32_t, __lshr, >>) +SHIFT_UNIFORM(__vec8_i32, int32_t, __ashr, >>) +SHIFT_UNIFORM(__vec8_i32, int32_t, __shl, <<) + +CMP_OP(__vec8_i32, i32, int32_t, __equal, ==) +CMP_OP(__vec8_i32, i32, int32_t, __not_equal, !=) +CMP_OP(__vec8_i32, i32, uint32_t, __unsigned_less_equal, <=) +CMP_OP(__vec8_i32, i32, int32_t, __signed_less_equal, <=) +CMP_OP(__vec8_i32, i32, uint32_t, __unsigned_greater_equal, >=) +CMP_OP(__vec8_i32, i32, int32_t, __signed_greater_equal, >=) +CMP_OP(__vec8_i32, i32, uint32_t, __unsigned_less_than, <) +CMP_OP(__vec8_i32, i32, int32_t, __signed_less_than, <) +CMP_OP(__vec8_i32, i32, uint32_t, __unsigned_greater_than, >) +CMP_OP(__vec8_i32, i32, int32_t, __signed_greater_than, >) + +SELECT(__vec8_i32) +INSERT_EXTRACT(__vec8_i32, int32_t) +SMEAR(__vec8_i32, i32, int32_t) +SETZERO(__vec8_i32, i32) +UNDEF(__vec8_i32, i32) +BROADCAST(__vec8_i32, i32, int32_t) +ROTATE(__vec8_i32, i32, int32_t) +SHUFFLES(__vec8_i32, i32, int32_t) +LOAD_STORE(__vec8_i32, int32_t) + +#else /* evghenii::int32 */ +/////////////////////////////////////////////////////////////////////////// +// int32 +/////////////////////////////////////////////////////////////////////////// + +#define IZERO _mm512_setzero_epi32() +static FORCEINLINE __vec8_i32 __add(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_add_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __sub(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_sub_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __mul(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_mullo_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __udiv(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_div_epu32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __sdiv(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_div_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __urem(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_rem_epu32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __srem(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_rem_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __or(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_or_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __and(__vec8_i32 a, 
__vec8_i32 b) { + return _mm512_mask_and_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __xor(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_xor_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __shl(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_sllv_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __lshr(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_srlv_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __ashr(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_srav_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __shl(__vec8_i32 a, int32_t n) { + return _mm512_mask_slli_epi32(IZERO,0xFF, a, n); +} + +static FORCEINLINE __vec8_i32 __lshr(__vec8_i32 a, int32_t n) { + return _mm512_mask_srli_epi32(IZERO,0xFF, a, n); +} + +static FORCEINLINE __vec8_i32 __ashr(__vec8_i32 a, int32_t n) { + return _mm512_mask_srai_epi32(IZERO,0xFF, a, n); +} + +static FORCEINLINE __vec8_i1 __equal_i32(const __vec8_i32 &a, const __vec8_i32 &b) { + return _mm512_mask_cmpeq_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __equal_i32_and_mask(const __vec8_i32 &a, const __vec8_i32 &b, + __vec8_i1 m) { + return _mm512_mask_cmpeq_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpneq_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmpneq_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_less_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmple_epu32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_less_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmple_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __signed_less_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmple_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __signed_less_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmple_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_greater_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpge_epu32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_greater_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmpge_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __signed_greater_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpge_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __signed_greater_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmpge_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_less_than_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmplt_epu32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_less_than_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmplt_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __signed_less_than_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmplt_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __signed_less_than_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmplt_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_greater_than_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpgt_epu32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 
__unsigned_greater_than_i32_and_mask(__vec8_i32 a, __vec8_i32 b,
+ __vec8_i1 m) {
+ return _mm512_mask_cmpgt_epu32_mask(m, a, b);
+}
+
+static FORCEINLINE __vec8_i1 __signed_greater_than_i32(__vec8_i32 a, __vec8_i32 b) {
+ return _mm512_mask_cmpgt_epi32_mask(0xFF,a, b);
+}
+
+static FORCEINLINE __vec8_i1 __signed_greater_than_i32_and_mask(__vec8_i32 a, __vec8_i32 b,
+ __vec8_i1 m) {
+ return _mm512_mask_cmpgt_epi32_mask(m, a, b);
+}
+
+static FORCEINLINE __vec8_i32 __select(__vec8_i1 mask,
+ __vec8_i32 a, __vec8_i32 b) {
+ return _mm512_mask_mov_epi32(b, mask, a);
+}
+
+static FORCEINLINE __vec8_i32 __select(bool cond, __vec8_i32 a, __vec8_i32 b) {
+ return cond ? a : b;
+}
+
+static FORCEINLINE int32_t __extract_element(__vec8_i32 v, int index) { //uint32_t index) {
+ return ((int32_t *)&v)[index];
+}
+
+static FORCEINLINE void __insert_element(__vec8_i32 *v, uint32_t index, int32_t val) {
+ ((int32_t *)v)[index] = val;
+}
+
+template <class RetVecType> RetVecType __smear_i32(int32_t i);
+template <> static FORCEINLINE __vec8_i32 __smear_i32<__vec8_i32>(int32_t i) {
+ return _mm512_set_16to16_epi32(0,0,0,0,0,0,0,0, i,i,i,i,i,i,i,i);
+}
+
+static const __vec8_i32 __ispc_one = __smear_i32<__vec8_i32>(1);
+static const __vec8_i32 __ispc_thirty_two = __smear_i32<__vec8_i32>(32);
+static const __vec8_i32 __ispc_ffffffff = __smear_i32<__vec8_i32>(-1);
+static const __vec8_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7);
+
+template <class RetVecType> RetVecType __setzero_i32();
+template <> static FORCEINLINE __vec8_i32 __setzero_i32<__vec8_i32>() {
+ return _mm512_setzero_epi32();
+}
+
+template <class RetVecType> RetVecType __undef_i32();
+template <> static FORCEINLINE __vec8_i32 __undef_i32<__vec8_i32>() {
+ return __vec8_i32();
+}
+
+static FORCEINLINE __vec8_i32 __broadcast_i32(__vec8_i32 v, int index) {
+ int32_t val = __extract_element(v, index & 0xf);
+ return _mm512_set1_epi32(val);
+}
+
+#if 0 /* evghenii::doesn't work */
+static FORCEINLINE __vec8_i32 __rotate_i32(__vec8_i32 v, int index) {
+ __vec8_i32 idx = __smear_i32<__vec8_i32>(index);
+ __vec8_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec8_i32>(0x7));
+ return _mm512_mask_permutevar_epi32(v, 0xffff, shuffle, v);
+}
+#else
+ROTATE(__vec8_i32, i32, int32_t)
+#endif
+
+static FORCEINLINE __vec8_i32 __shuffle_i32(__vec8_i32 v, __vec8_i32 index) {
+ return _mm512_mask_permutevar_epi32(v, 0xffff, index, v);
+}
+SHUFFLE2(__vec8_i32, i32, int32_t) /* evghenii::to implement */
+
+template <int ALIGN> static FORCEINLINE __vec8_i32 __load(const __vec8_i32 *p) {
+ __vec8_i32 v;
+ v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+ v = _mm512_extloadunpackhi_epi32(v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+ return __select(0xFF,v,IZERO);
+}
+
+
+template <int ALIGN> static FORCEINLINE void __store(__vec8_i32 *p, __vec8_i32 v) {
+ _mm512_mask_extpackstorelo_epi32( p, 0xFF, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
+ _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
+}
+
+#if 0
+template <> static FORCEINLINE __vec8_i32 __load<64>(const __vec8_i32 *p) {
+ return _mm512_load_epi32(p);
+}
+template <> static FORCEINLINE void __store<64>(__vec8_i32 *p, __vec8_i32 v) {
+ _mm512_store_epi32(p, v);
+}
+#endif
+#endif /* evghenii::int32 */
+
+///////////////////////////////////////////////////////////////////////////
+// int64
+
+BINARY_OP(__vec8_i64, __add, +)
+BINARY_OP(__vec8_i64, __sub, -)
+BINARY_OP(__vec8_i64, __mul, *)
+
+BINARY_OP(__vec8_i64, __or, |)
+BINARY_OP(__vec8_i64, __and, &)
+BINARY_OP(__vec8_i64, __xor, ^) +BINARY_OP(__vec8_i64, __shl, <<) + +BINARY_OP_CAST(__vec8_i64, uint64_t, __udiv, /) +BINARY_OP_CAST(__vec8_i64, int64_t, __sdiv, /) + +BINARY_OP_CAST(__vec8_i64, uint64_t, __urem, %) +BINARY_OP_CAST(__vec8_i64, int64_t, __srem, %) +BINARY_OP_CAST(__vec8_i64, uint64_t, __lshr, >>) +BINARY_OP_CAST(__vec8_i64, int64_t, __ashr, >>) + +SHIFT_UNIFORM(__vec8_i64, uint64_t, __lshr, >>) +SHIFT_UNIFORM(__vec8_i64, int64_t, __ashr, >>) +SHIFT_UNIFORM(__vec8_i64, int64_t, __shl, <<) + +CMP_OP(__vec8_i64, i64, int64_t, __equal, ==) +CMP_OP(__vec8_i64, i64, int64_t, __not_equal, !=) +CMP_OP(__vec8_i64, i64, uint64_t, __unsigned_less_equal, <=) +CMP_OP(__vec8_i64, i64, int64_t, __signed_less_equal, <=) +CMP_OP(__vec8_i64, i64, uint64_t, __unsigned_greater_equal, >=) +CMP_OP(__vec8_i64, i64, int64_t, __signed_greater_equal, >=) +CMP_OP(__vec8_i64, i64, uint64_t, __unsigned_less_than, <) +CMP_OP(__vec8_i64, i64, int64_t, __signed_less_than, <) +CMP_OP(__vec8_i64, i64, uint64_t, __unsigned_greater_than, >) +CMP_OP(__vec8_i64, i64, int64_t, __signed_greater_than, >) + +SELECT(__vec8_i64) +INSERT_EXTRACT(__vec8_i64, int64_t) +SMEAR(__vec8_i64, i64, int64_t) +SETZERO(__vec8_i64, i64) +UNDEF(__vec8_i64, i64) +BROADCAST(__vec8_i64, i64, int64_t) +ROTATE(__vec8_i64, i64, int64_t) +SHUFFLES(__vec8_i64, i64, int64_t) +LOAD_STORE(__vec8_i64, int64_t) + + +#if 0 /* evghenii::float */ +/////////////////////////////////////////////////////////////////////////// +// float + +BINARY_OP(__vec8_f, __add, +) +BINARY_OP(__vec8_f, __sub, -) +BINARY_OP(__vec8_f, __mul, *) +BINARY_OP(__vec8_f, __div, /) + +CMP_OP(__vec8_f, float, float, __equal, ==) +CMP_OP(__vec8_f, float, float, __not_equal, !=) +CMP_OP(__vec8_f, float, float, __less_than, <) +CMP_OP(__vec8_f, float, float, __less_equal, <=) +CMP_OP(__vec8_f, float, float, __greater_than, >) +CMP_OP(__vec8_f, float, float, __greater_equal, >=) + +static FORCEINLINE __vec8_i1 __ordered_float(__vec8_f a, __vec8_f b) { + __vec8_i1 ret; + ret.v = 0; + for (int i = 0; i < 8; ++i) + ret.v |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1 << i) : 0; + return ret; +} + +static FORCEINLINE __vec8_i1 __unordered_float(__vec8_f a, __vec8_f b) { + __vec8_i1 ret; + ret.v = 0; + for (int i = 0; i < 8; ++i) + ret.v |= ((a[i] != a[i]) || (b[i] != b[i])) ? 
(1 << i) : 0; + return ret; +} + +#if 0 + case Instruction::FRem: intrinsic = "__frem"; break; +#endif + +SELECT(__vec8_f) +INSERT_EXTRACT(__vec8_f, float) +SMEAR(__vec8_f, float, float) +SETZERO(__vec8_f, float) +UNDEF(__vec8_f, float) +BROADCAST(__vec8_f, float, float) +ROTATE(__vec8_f, float, float) +SHUFFLES(__vec8_f, float, float) +LOAD_STORE(__vec8_f, float) +#else /* evghenii::float */ + +/////////////////////////////////////////////////////////////////////////// +// float +/////////////////////////////////////////////////////////////////////////// + +#define FZERO _mm512_setzero_ps() +static FORCEINLINE __vec8_f __add(__vec8_f a, __vec8_f b) { + return _mm512_mask_add_ps(FZERO, 0xFF, a, b); +} + +static FORCEINLINE __vec8_f __sub(__vec8_f a, __vec8_f b) { + return _mm512_mask_sub_ps(FZERO, 0xFF, a, b); +} + +static FORCEINLINE __vec8_f __mul(__vec8_f a, __vec8_f b) { + return _mm512_mask_mul_ps(FZERO, 0xFF, a, b); +} + +static FORCEINLINE __vec8_f __div(__vec8_f a, __vec8_f b) { + return _mm512_mask_div_ps(FZERO, 0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __equal_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmpeq_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __equal_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmpeq_ps_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmpneq_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmpneq_ps_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __less_than_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmplt_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __less_than_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmplt_ps_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __less_equal_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmple_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __less_equal_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmple_ps_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __greater_than_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmp_ps_mask(0xFF, a, b,_CMP_GT_OS); +} + +static FORCEINLINE __vec8_i1 __greater_than_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmp_ps_mask(m,a, b,_CMP_GT_OS); +} + +static FORCEINLINE __vec8_i1 __greater_equal_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmp_ps_mask(0xFF, a, b,_CMP_GE_OS); +} + +static FORCEINLINE __vec8_i1 __greater_equal_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmp_ps_mask(m,a, b,_CMP_GE_OS); +} + +static FORCEINLINE __vec8_i1 __ordered_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmpord_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __unordered_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmpunord_ps_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_f __select(__vec8_i1 mask, __vec8_f a, __vec8_f b) { + return _mm512_mask_mov_ps(b, mask & 0xFF, a); +} + +static FORCEINLINE __vec8_f __select(bool cond, __vec8_f a, __vec8_f b) { + return cond ? 
a : b;
+}
+
+static FORCEINLINE float __extract_element(__vec8_f v, uint32_t index) {
+ return v[index];
+ // return ((float *)&v)[index];
+}
+
+static FORCEINLINE void __insert_element(__vec8_f *v, uint32_t index, float val) {
+ (*v)[index] = val;
+// ((float *)v)[index] = val;
+}
+
+template <class RetVecType> RetVecType __smear_float(float f);
+template <> static FORCEINLINE __vec8_f __smear_float<__vec8_f>(float f) {
+ return _mm512_set_16to16_ps(0,0,0,0,0,0,0,0, f,f,f,f,f,f,f,f);
+}
+
+template <class RetVecType> RetVecType __setzero_float();
+template <> static FORCEINLINE __vec8_f __setzero_float<__vec8_f>() {
+ return _mm512_setzero_ps();
+}
+
+template <class RetVecType> RetVecType __undef_float();
+template <> static FORCEINLINE __vec8_f __undef_float<__vec8_f>() {
+ return __vec8_f();
+}
+
+static FORCEINLINE __vec8_f __broadcast_float(__vec8_f v, int index) {
+ float val = __extract_element(v, index & 0x7);
+ return _mm512_set_16to16_ps(0,0,0,0,0,0,0,0, val,val,val,val,val,val,val,val);
+}
+
+#if 1
+static FORCEINLINE __vec8_f __shuffle_float(__vec8_f v, __vec8_i32 index) {
+ return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v)));
+}
+#endif
+ROTATE(__vec8_f, float, float)
+SHUFFLE2(__vec8_f, float, float)
+
+#if 0
+LOADS(__vec8_f, float)
+#else
+template <int ALIGN> static FORCEINLINE __vec8_f __load(const __vec8_f *p) {
+ __vec8_f v;
+ v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
+ v = _mm512_extloadunpackhi_ps(v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
+ return __select(0xFF,v,FZERO);
+}
+#endif
+
+#if 0
+STORES(__vec8_f, float)
+#else
+template <int ALIGN> static FORCEINLINE void __store(__vec8_f *p, __vec8_f v)
+{
+ _mm512_mask_extpackstorelo_ps( p, 0xFF, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE);
+ _mm512_mask_extpackstorehi_ps((uint8_t*)p+64, 0xFF, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE);
+}
+#endif
+
+#endif /* evghenii::float */
+
+static FORCEINLINE float __exp_uniform_float(float v) { return expf(v);}
+static FORCEINLINE __vec8_f __exp_varying_float(__vec8_f v) { return _mm512_mask_exp_ps(FZERO, 0xFF, v); }
+
+
+static FORCEINLINE float __log_uniform_float(float v) { return logf(v);}
+static FORCEINLINE __vec8_f __log_varying_float(__vec8_f v) { return _mm512_mask_log_ps(FZERO, 0xFF, v); }
+
+static FORCEINLINE float __pow_uniform_float(float a, float b) { return powf(a, b);}
+static FORCEINLINE __vec8_f __pow_varying_float(__vec8_f a, __vec8_f b) { return _mm512_mask_pow_ps(FZERO, 0xFF, a,b); }
+
+
+static FORCEINLINE int __intbits(float v) {
+ union {
+ float f;
+ int i;
+ } u;
+ u.f = v;
+ return u.i;
+}
+
+static FORCEINLINE float __floatbits(int v) {
+ union {
+ float f;
+ int i;
+ } u;
+ u.i = v;
+ return u.f;
+}
+
+static FORCEINLINE float __half_to_float_uniform(int16_t h) {
+ static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift
+
+ int32_t o = ((int32_t)(h & 0x7fff)) << 13; // exponent/mantissa bits
+ uint32_t exp = shifted_exp & o; // just the exponent
+ o += (127 - 15) << 23; // exponent adjust
+
+ // handle exponent special cases
+ if (exp == shifted_exp) // Inf/NaN?
+ o += (128 - 16) << 23; // extra exp adjust
+ else if (exp == 0) { // Zero/Denormal?
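+ // (explanatory note, not in the original source) Denormal halves have no
+ // implicit leading 1, so the shifted bits cannot be used directly: the two
+ // lines below renormalize by bumping the exponent field by one and then
+ // subtracting 2^-14 (the float whose bit pattern is 113 << 23).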
+ o += 1 << 23; // extra exp adjust + o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize + } + + o |= ((int32_t)(h & 0x8000)) << 16; // sign bit + return __floatbits(o); +} + + +static FORCEINLINE __vec8_f __half_to_float_varying(__vec8_i16 v) { + __vec8_f ret; + for (int i = 0; i < 8; ++i) + ret[i] = __half_to_float_uniform(v[i]); + return ret; +} + + +static FORCEINLINE int16_t __float_to_half_uniform(float f) { + uint32_t sign_mask = 0x80000000u; + int32_t o; + + int32_t fint = __intbits(f); + int32_t sign = fint & sign_mask; + fint ^= sign; + + int32_t f32infty = 255 << 23; + o = (fint > f32infty) ? 0x7e00 : 0x7c00; + + // (De)normalized number or zero + // update fint unconditionally to save the blending; we don't need it + // anymore for the Inf/NaN case anyway. + const uint32_t round_mask = ~0xfffu; + const int32_t magic = 15 << 23; + const int32_t f16infty = 31 << 23; + + int32_t fint2 = __intbits(__floatbits(fint & round_mask) * __floatbits(magic)) - round_mask; + fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed + + if (fint < f32infty) + o = fint2 >> 13; // Take the bits! + + return (o | (sign >> 16)); +} + + +static FORCEINLINE __vec8_i16 __float_to_half_varying(__vec8_f v) { + __vec8_i16 ret; + for (int i = 0; i < 8; ++i) + ret[i] = __float_to_half_uniform(v[i]); + return ret; +} + + +#if 0 /* evghenii::double */ +/////////////////////////////////////////////////////////////////////////// +// double + +BINARY_OP(__vec8_d, __add, +) +BINARY_OP(__vec8_d, __sub, -) +BINARY_OP(__vec8_d, __mul, *) +BINARY_OP(__vec8_d, __div, /) + +CMP_OP(__vec8_d, double, double, __equal, ==) +CMP_OP(__vec8_d, double, double, __not_equal, !=) +CMP_OP(__vec8_d, double, double, __less_than, <) +CMP_OP(__vec8_d, double, double, __less_equal, <=) +CMP_OP(__vec8_d, double, double, __greater_than, >) +CMP_OP(__vec8_d, double, double, __greater_equal, >=) + +static FORCEINLINE __vec8_i1 __ordered_double(__vec8_d a, __vec8_d b) { + __vec8_i1 ret; + ret.v = 0; + for (int i = 0; i < 8; ++i) + ret.v |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1 << i) : 0; + return ret; +} + +static FORCEINLINE __vec8_i1 __unordered_double(__vec8_d a, __vec8_d b) { + __vec8_i1 ret; + ret.v = 0; + for (int i = 0; i < 8; ++i) + ret.v |= ((a[i] != a[i]) || (b[i] != b[i])) ? 
(1 << i) : 0; + return ret; +} + +#if 0 + case Instruction::FRem: intrinsic = "__frem"; break; +#endif + +SELECT(__vec8_d) +INSERT_EXTRACT(__vec8_d, double) +SMEAR(__vec8_d, double, double) +SETZERO(__vec8_d, double) +UNDEF(__vec8_d, double) +BROADCAST(__vec8_d, double, double) +ROTATE(__vec8_d, double, double) +SHUFFLES(__vec8_d, double, double) +LOAD_STORE(__vec8_d, double) +#else /* evghenii::double */ +/////////////////////////////////////////////////////////////////////////// +// double +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec8_d __add(__vec8_d a, __vec8_d b) { + return _mm512_add_pd(a, b); +} +static FORCEINLINE __vec8_d __sub(__vec8_d a, __vec8_d b) { + return _mm512_sub_pd(a, b); +} +static FORCEINLINE __vec8_d __mul(__vec8_d a, __vec8_d b) { + return _mm512_mul_pd(a, b); +} + +static FORCEINLINE __vec8_d __div(__vec8_d a, __vec8_d b) { + return _mm512_div_pd(a, b); +} + +static FORCEINLINE __vec8_i1 __equal_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpeq_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __equal_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmpeq_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpneq_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmpneq_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __less_than_double(__vec8_d a, __vec8_d b) { + return _mm512_cmplt_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __less_than_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmplt_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __less_equal_double(__vec8_d a, __vec8_d b) { + return _mm512_cmple_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __less_equal_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmple_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __greater_than_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpnle_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __greater_than_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmpnle_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __greater_equal_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpnlt_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __greater_equal_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmpnlt_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __ordered_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpord_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __unordered_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpunord_pd_mask(a, b); +} + +static FORCEINLINE __vec8_d __select(__vec8_i1 mask, __vec8_d a, __vec8_d b) { + return _mm512_mask_mov_pd(b, mask, a); +} + + +static FORCEINLINE __vec8_d __select(bool cond, __vec8_d a, __vec8_d b) { + return cond ? 
a : b;
+}
+
+static FORCEINLINE double __extract_element(__vec8_d v, uint32_t index) {
+ return ((double *)&v)[index];
+}
+
+static FORCEINLINE void __insert_element(__vec8_d *v, uint32_t index, double val) {
+ ((double *)v)[index] = val;
+}
+
+template <class RetVecType> RetVecType __smear_double(double d);
+template <> static FORCEINLINE __vec8_d __smear_double<__vec8_d>(double d) { return _mm512_set1_pd(d); }
+
+template <class RetVecType> RetVecType __setzero_double();
+template <> static FORCEINLINE __vec8_d __setzero_double<__vec8_d>() { return _mm512_setzero_pd(); }
+
+template <class RetVecType> RetVecType __undef_double();
+template <> static FORCEINLINE __vec8_d __undef_double<__vec8_d>() { return __vec8_d();}
+
+static FORCEINLINE __vec8_d __broadcast_double(__vec8_d v, int index) {
+ double val = __extract_element(v, index & 0xf);
+ return _mm512_set1_pd(val);
+}
+
+ROTATE(__vec8_d, double, double)
+SHUFFLES(__vec8_d, double, double)
+
+template <int ALIGN> static FORCEINLINE __vec8_d __load(const __vec8_d *p) {
+ __vec8_d ret;
+ ret.v = _mm512_extloadunpacklo_pd(ret.v, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
+ ret.v = _mm512_extloadunpackhi_pd(ret.v, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
+ return ret;
+}
+
+template <int ALIGN> static FORCEINLINE void __store(__vec8_d *p, __vec8_d v) {
+ _mm512_extpackstorelo_pd(p, v.v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
+ _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
+}
+
+
+#if 0
+template <> static FORCEINLINE __vec8_d __load<64>(const __vec8_d *p) {
+ return _mm512_load_pd(p);
+}
+template <> static FORCEINLINE __vec8_d __load<128>(const __vec8_d *p) {
+ return __load<64>(p);
+}
+template <> static FORCEINLINE void __store<64>(__vec8_d *p, __vec8_d v) {
+ _mm512_store_pd(p, v.v);
+}
+template <> static FORCEINLINE void __store<128>(__vec8_d *p, __vec8_d v) {
+ __store<64>(p, v);
+}
+#endif
+#endif /* evghenii::double */
+
+///////////////////////////////////////////////////////////////////////////
+// casts
+
+
+#define CAST(TO, STO, FROM, SFROM, FUNC) \
+static FORCEINLINE TO FUNC(TO, FROM val) { \
+ TO ret; \
+ for (int i = 0; i < 8; ++i) \
+ ret[i] = (STO)((SFROM)(val[i])); \
+ return ret; \
+}
+
+// sign extension conversions
+CAST(__vec8_i64, int64_t, __vec8_i32, int32_t, __cast_sext)
+CAST(__vec8_i64, int64_t, __vec8_i16, int16_t, __cast_sext)
+CAST(__vec8_i64, int64_t, __vec8_i8, int8_t, __cast_sext)
+CAST(__vec8_i32, int32_t, __vec8_i16, int16_t, __cast_sext)
+CAST(__vec8_i32, int32_t, __vec8_i8, int8_t, __cast_sext)
+CAST(__vec8_i16, int16_t, __vec8_i8, int8_t, __cast_sext)
+
+#define CAST_SEXT_I1(TYPE) \
+static FORCEINLINE TYPE __cast_sext(TYPE, __vec8_i1 v) { \
+ TYPE ret; \
+ for (int i = 0; i < 8; ++i) { \
+ ret[i] = 0; \
+ if (v.v & (1 << i)) \
+ ret[i] = ~ret[i]; \
+ } \
+ return ret; \
+}
+
+CAST_SEXT_I1(__vec8_i8)
+CAST_SEXT_I1(__vec8_i16)
+#if 0
+CAST_SEXT_I1(__vec8_i32)
+#else
+static FORCEINLINE __vec8_i32 __cast_sext(const __vec8_i32 &, const __vec8_i1 &val)
+{
+ __vec8_i32 ret = _mm512_setzero_epi32();
+ __vec8_i32 one = _mm512_set1_epi32(-1);
+ return _mm512_mask_mov_epi32(ret, 0xFF & val, one);
+}
+#endif
+CAST_SEXT_I1(__vec8_i64)
+
+// zero extension
+CAST(__vec8_i64, uint64_t, __vec8_i32, uint32_t, __cast_zext)
+CAST(__vec8_i64, uint64_t, __vec8_i16, uint16_t, __cast_zext)
+CAST(__vec8_i64, uint64_t, __vec8_i8, uint8_t, __cast_zext)
+CAST(__vec8_i32, uint32_t, __vec8_i16, uint16_t, __cast_zext)
+CAST(__vec8_i32, uint32_t, __vec8_i8, uint8_t, __cast_zext)
+CAST(__vec8_i16, uint16_t, __vec8_i8, uint8_t, __cast_zext)
+
+#define
CAST_ZEXT_I1(TYPE) \ +static FORCEINLINE TYPE __cast_zext(TYPE, __vec8_i1 v) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = (v.v & (1 << i)) ? 1 : 0; \ + return ret; \ +} + +CAST_ZEXT_I1(__vec8_i8) +CAST_ZEXT_I1(__vec8_i16) +#if 0 +CAST_ZEXT_I1(__vec8_i32) +#else +static FORCEINLINE __vec8_i32 __cast_zext(const __vec8_i32 &, const __vec8_i1 &val) +{ + __vec8_i32 ret = _mm512_setzero_epi32(); + __vec8_i32 one = _mm512_set1_epi32(1); + return _mm512_mask_mov_epi32(ret, 0xFF & val, one); +} +#endif +CAST_ZEXT_I1(__vec8_i64) + +// truncations +CAST(__vec8_i32, int32_t, __vec8_i64, int64_t, __cast_trunc) +CAST(__vec8_i16, int16_t, __vec8_i64, int64_t, __cast_trunc) +CAST(__vec8_i8, int8_t, __vec8_i64, int64_t, __cast_trunc) +CAST(__vec8_i16, int16_t, __vec8_i32, int32_t, __cast_trunc) +CAST(__vec8_i8, int8_t, __vec8_i32, int32_t, __cast_trunc) +CAST(__vec8_i8, int8_t, __vec8_i16, int16_t, __cast_trunc) + +// signed int to float/double +#if 0 +CAST(__vec8_f, float, __vec8_i8, int8_t, __cast_sitofp) +CAST(__vec8_f, float, __vec8_i16, int16_t, __cast_sitofp) +CAST(__vec8_f, float, __vec8_i32, int32_t, __cast_sitofp) +#else +static FORCEINLINE __vec8_f __cast_sitofp(__vec8_f, __vec8_i8 val) {return _mm512_mask_extload_ps(FZERO, 0xFF, &val, _MM_UPCONV_PS_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec8_f __cast_sitofp(__vec8_f, __vec8_i16 val) {return _mm512_mask_extload_ps(FZERO, 0xFF, &val, _MM_UPCONV_PS_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec8_f __cast_sitofp(__vec8_f, __vec8_i32 val) {return _mm512_mask_cvtfxpnt_round_adjustepi32_ps(FZERO, 0xFF, val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} +#endif +CAST(__vec8_f, float, __vec8_i64, int64_t, __cast_sitofp) +#if 0 +CAST(__vec8_d, double, __vec8_i8, int8_t, __cast_sitofp) +CAST(__vec8_d, double, __vec8_i16, int16_t, __cast_sitofp) +CAST(__vec8_d, double, __vec8_i32, int32_t, __cast_sitofp) +#else +static FORCEINLINE __vec8_d __cast_sitofp(__vec8_d, __vec8_i8 val) { + __vec8_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return _mm512_cvtepi32lo_pd(vi); +} + +static FORCEINLINE __vec8_d __cast_sitofp(__vec8_d, __vec8_i16 val) { + __vec8_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return _mm512_cvtepi32lo_pd(vi); +} + +static FORCEINLINE __vec8_d __cast_sitofp(__vec8_d, __vec8_i32 val) { + __vec8_d ret; + return _mm512_cvtepi32lo_pd(val); +} +#endif +CAST(__vec8_d, double, __vec8_i64, int64_t, __cast_sitofp) + +// unsigned int to float/double +#if 0 +CAST(__vec8_f, float, __vec8_i8, uint8_t, __cast_uitofp) +CAST(__vec8_f, float, __vec8_i16, uint16_t, __cast_uitofp) +CAST(__vec8_f, float, __vec8_i32, uint32_t, __cast_uitofp) +#else +static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i8 val) {return _mm512_mask_extload_ps(FZERO, 0xFF, &val, _MM_UPCONV_PS_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i16 val) {return _mm512_mask_extload_ps(FZERO, 0xFF, &val, _MM_UPCONV_PS_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i32 val) {return _mm512_mask_cvtfxpnt_round_adjustepu32_ps(FZERO, 0xFF, val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} +#endif +CAST(__vec8_f, float, __vec8_i64, uint64_t, __cast_uitofp) +#if 0 +CAST(__vec8_d, double, __vec8_i8, uint8_t, __cast_uitofp) +CAST(__vec8_d, double, __vec8_i16, uint16_t, __cast_uitofp) +CAST(__vec8_d, double, 
__vec8_i32, uint32_t, __cast_uitofp) +#else +static FORCEINLINE __vec8_d __cast_uitofp(__vec8_d, __vec8_i8 val) { + __vec8_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return _mm512_cvtepu32lo_pd(vi); +} + +static FORCEINLINE __vec8_d __cast_uitofp(__vec8_d, __vec8_i16 val) { + __vec8_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return _mm512_cvtepu32lo_pd(vi); +} + +static FORCEINLINE __vec8_d __cast_uitofp(__vec8_d, __vec8_i32 val) { + __vec8_d ret; + return _mm512_cvtepu32lo_pd(val); +} +#endif +CAST(__vec8_d, double, __vec8_i64, uint64_t, __cast_uitofp) + +#if 0 +static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i1 v) { + __vec8_f ret; + for (int i = 0; i < 8; ++i) + ret[i] = (v.v & (1 << i)) ? 1. : 0.; + return ret; +} +#else +static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i1 v) +{ + const __m512 ret = _mm512_setzero_ps(); + const __m512 one = _mm512_set1_ps(1.0); + return _mm512_mask_mov_ps(ret, v & 0xFF, one); +} +#endif + +// float/double to signed int +CAST(__vec8_i8, int8_t, __vec8_f, float, __cast_fptosi) +CAST(__vec8_i16, int16_t, __vec8_f, float, __cast_fptosi) +#if 0 +CAST(__vec8_i32, int32_t, __vec8_f, float, __cast_fptosi) +#else +static FORCEINLINE __vec8_i32 __cast_fptosi(__vec8_i32, __vec8_f val) { + return _mm512_mask_cvtfxpnt_round_adjustps_epi32(IZERO, 0xFF, val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); +} +#endif +CAST(__vec8_i64, int64_t, __vec8_f, float, __cast_fptosi) +CAST(__vec8_i8, int8_t, __vec8_d, double, __cast_fptosi) +CAST(__vec8_i16, int16_t, __vec8_d, double, __cast_fptosi) +#if 1 +CAST(__vec8_i32, int32_t, __vec8_d, double, __cast_fptosi) +#else +#endif +CAST(__vec8_i64, int64_t, __vec8_d, double, __cast_fptosi) + +// float/double to unsigned int +CAST(__vec8_i8, uint8_t, __vec8_f, float, __cast_fptoui) +CAST(__vec8_i16, uint16_t, __vec8_f, float, __cast_fptoui) +#if 0 +CAST(__vec8_i32, uint32_t, __vec8_f, float, __cast_fptoui) +#else +static FORCEINLINE __vec8_i32 __cast_fptoui(__vec8_i32, __vec8_f val) { + return _mm512_mask_cvtfxpnt_round_adjustps_epu32(IZERO, 0xFF, val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); +} +#endif +CAST(__vec8_i64, uint64_t, __vec8_f, float, __cast_fptoui) +CAST(__vec8_i8, uint8_t, __vec8_d, double, __cast_fptoui) +CAST(__vec8_i16, uint16_t, __vec8_d, double, __cast_fptoui) +#if 1 +CAST(__vec8_i32, uint32_t, __vec8_d, double, __cast_fptoui) +#else +#endif +CAST(__vec8_i64, uint64_t, __vec8_d, double, __cast_fptoui) + +// float/double conversions +#if 0 +CAST(__vec8_f, float, __vec8_d, double, __cast_fptrunc) +CAST(__vec8_d, double, __vec8_f, float, __cast_fpext) +#else +static FORCEINLINE __vec8_f __cast_fptrunc(__vec8_f, __vec8_d val) { + return _mm512_mask_cvtpd_pslo(FZERO, 0xFF, val); +} +static FORCEINLINE __vec8_d __cast_fpext(__vec8_d, __vec8_f val) { + return _mm512_cvtpslo_pd(val); +} +#endif + +typedef union { + int32_t i32; + float f; + int64_t i64; + double d; +} BitcastUnion; + +#define CAST_BITS(TO, TO_ELT, FROM, FROM_ELT) \ +static FORCEINLINE TO __cast_bits(TO, FROM val) { \ + TO r; \ + for (int i = 0; i < 8; ++i) { \ + BitcastUnion u; \ + u.FROM_ELT = val[i]; \ + r[i] = u.TO_ELT; \ + } \ + return r; \ +} + +#if 0 +CAST_BITS(__vec8_f, f, __vec8_i32, i32) +CAST_BITS(__vec8_i32, i32, __vec8_f, f) +#else +static FORCEINLINE __vec8_f __cast_bits(__vec8_f, __vec8_i32 val) { + return _mm512_castsi512_ps(val); +} +static FORCEINLINE __vec8_i32 __cast_bits(__vec8_i32, __vec8_f val) 
{
+ return _mm512_castps_si512(val);
+}
+#endif
+
+#if 0
+CAST_BITS(__vec8_d, d, __vec8_i64, i64)
+CAST_BITS(__vec8_i64, i64, __vec8_d, d)
+#else
+static FORCEINLINE __vec8_i64 __cast_bits(__vec8_i64, __vec8_d val) {
+ return *(__vec8_i64*)&val;
+}
+static FORCEINLINE __vec8_d __cast_bits(__vec8_d, __vec8_i64 val) {
+ return *(__vec8_d*)&val;
+}
+#endif
+
+#define CAST_BITS_SCALAR(TO, FROM) \
+static FORCEINLINE TO __cast_bits(TO, FROM v) { \
+ union { \
+ TO to; \
+ FROM from; \
+ } u; \
+ u.from = v; \
+ return u.to; \
+}
+
+CAST_BITS_SCALAR(uint32_t, float)
+CAST_BITS_SCALAR(int32_t, float)
+CAST_BITS_SCALAR(float, uint32_t)
+CAST_BITS_SCALAR(float, int32_t)
+CAST_BITS_SCALAR(uint64_t, double)
+CAST_BITS_SCALAR(int64_t, double)
+CAST_BITS_SCALAR(double, uint64_t)
+CAST_BITS_SCALAR(double, int64_t)
+
+///////////////////////////////////////////////////////////////////////////
+// various math functions
+
+static FORCEINLINE void __fastmath() {
+}
+
+static FORCEINLINE float __round_uniform_float(float v) {
+ return roundf(v);
+}
+
+static FORCEINLINE float __floor_uniform_float(float v) {
+ return floorf(v);
+}
+
+static FORCEINLINE float __ceil_uniform_float(float v) {
+ return ceilf(v);
+}
+
+static FORCEINLINE double __round_uniform_double(double v) {
+ return round(v);
+}
+
+static FORCEINLINE double __floor_uniform_double(double v) {
+ return floor(v);
+}
+
+static FORCEINLINE double __ceil_uniform_double(double v) {
+ return ceil(v);
+}
+
+#if 0
+UNARY_OP(__vec8_f, __round_varying_float, roundf)
+UNARY_OP(__vec8_f, __floor_varying_float, floorf)
+UNARY_OP(__vec8_f, __ceil_varying_float, ceilf)
+#else
+static FORCEINLINE __vec8_f __round_varying_float(__vec8_f v) {
+ return _mm512_mask_round_ps(FZERO, 0xFF, v, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);
+}
+
+static FORCEINLINE __vec8_f __floor_varying_float(__vec8_f v) {
+ return _mm512_mask_floor_ps(FZERO, 0xFF, v);
+}
+
+static FORCEINLINE __vec8_f __ceil_varying_float(__vec8_f v) {
+ return _mm512_mask_ceil_ps(FZERO, 0xFF, v);
+}
+#endif
+
+#if 0
+UNARY_OP(__vec8_d, __round_varying_double, round)
+UNARY_OP(__vec8_d, __floor_varying_double, floor)
+UNARY_OP(__vec8_d, __ceil_varying_double, ceil)
+#else
+static FORCEINLINE __vec8_d __round_varying_float(__vec8_d v) {
+ return _mm512_svml_round_pd(v);
+}
+
+static FORCEINLINE __vec8_d __floor_varying_float(__vec8_d v) {
+ return _mm512_floor_pd(v);
+}
+
+static FORCEINLINE __vec8_d __ceil_varying_float(__vec8_d v) {
+ return _mm512_ceil_pd(v);
+}
+#endif
+
+
+// min/max
+
+static FORCEINLINE float __min_uniform_float(float a, float b) { return (a<b) ? a : b; }
+static FORCEINLINE float __max_uniform_float(float a, float b) { return (a>b) ? a : b; }
+static FORCEINLINE double __min_uniform_double(double a, double b) { return (a<b) ? a : b; }
+static FORCEINLINE double __max_uniform_double(double a, double b) { return (a>b) ? a : b; }
+
+static FORCEINLINE int32_t __min_uniform_int32(int32_t a, int32_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int32_t __max_uniform_int32(int32_t a, int32_t b) { return (a>b) ? a : b; }
+static FORCEINLINE int32_t __min_uniform_uint32(uint32_t a, uint32_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int32_t __max_uniform_uint32(uint32_t a, uint32_t b) { return (a>b) ? a : b; }
+
+static FORCEINLINE int64_t __min_uniform_int64(int64_t a, int64_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int64_t __max_uniform_int64(int64_t a, int64_t b) { return (a>b) ? a : b; }
+static FORCEINLINE int64_t __min_uniform_uint64(uint64_t a, uint64_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int64_t __max_uniform_uint64(uint64_t a, uint64_t b) { return (a>b) ?
a : b; } + + +#if 0 +BINARY_OP_FUNC(__vec8_f, __max_varying_float, __max_uniform_float) +BINARY_OP_FUNC(__vec8_f, __min_varying_float, __min_uniform_float) +#else +static FORCEINLINE __vec8_f __max_varying_float (__vec8_f v1, __vec8_f v2) { return _mm512_mask_gmax_ps(FZERO, 0xFF, v1, v2);} +static FORCEINLINE __vec8_f __min_varying_float (__vec8_f v1, __vec8_f v2) { return _mm512_mask_gmin_ps(FZERO, 0xFF, v1, v2);} +#endif + +#if 0 +BINARY_OP_FUNC(__vec8_d, __max_varying_double, __max_uniform_double) +BINARY_OP_FUNC(__vec8_d, __min_varying_double, __min_uniform_double) +#else +static FORCEINLINE __vec8_d __max_varying_double(__vec8_d v1, __vec8_d v2) { return _mm512_gmax_pd(v1,v2); } +static FORCEINLINE __vec8_d __min_varying_double(__vec8_d v1, __vec8_d v2) { return _mm512_gmin_pd(v1,v2); } +#endif + +#if 0 +BINARY_OP_FUNC(__vec8_i32, __max_varying_int32, __max_uniform_int32) +BINARY_OP_FUNC(__vec8_i32, __min_varying_int32, __min_uniform_int32) +BINARY_OP_FUNC(__vec8_i32, __max_varying_uint32, __max_uniform_uint32) +BINARY_OP_FUNC(__vec8_i32, __min_varying_uint32, __min_uniform_uint32) +#else +static FORCEINLINE __vec8_i32 __max_varying_int32 (__vec8_i32 v1, __vec8_i32 v2) { return _mm512_mask_max_epi32(IZERO,0xFF, v1, v2);} +static FORCEINLINE __vec8_i32 __min_varying_int32 (__vec8_i32 v1, __vec8_i32 v2) { return _mm512_mask_min_epi32(IZERO,0xFF, v1, v2);} +static FORCEINLINE __vec8_i32 __max_varying_uint32(__vec8_i32 v1, __vec8_i32 v2) { return _mm512_mask_max_epu32(IZERO,0xFF, v1, v2);} +static FORCEINLINE __vec8_i32 __min_varying_uint32(__vec8_i32 v1, __vec8_i32 v2) { return _mm512_mask_min_epu32(IZERO,0xFF, v1, v2);} +#endif + +BINARY_OP_FUNC(__vec8_i64, __max_varying_int64, __max_uniform_int64) +BINARY_OP_FUNC(__vec8_i64, __min_varying_int64, __min_uniform_int64) +BINARY_OP_FUNC(__vec8_i64, __max_varying_uint64, __max_uniform_uint64) +BINARY_OP_FUNC(__vec8_i64, __min_varying_uint64, __min_uniform_uint64) + +// sqrt/rsqrt/rcp + +static FORCEINLINE float __rsqrt_uniform_float(float v) { + return 1.f / sqrtf(v); +} + +static FORCEINLINE float __rcp_uniform_float(float v) { + return 1.f / v; +} + +static FORCEINLINE float __sqrt_uniform_float(float v) { + return sqrtf(v); +} + +static FORCEINLINE double __sqrt_uniform_double(double v) { + return sqrt(v); +} + +#if 0 +UNARY_OP(__vec8_f, __rcp_varying_float, __rcp_uniform_float) +UNARY_OP(__vec8_f, __rsqrt_varying_float, __rsqrt_uniform_float) +UNARY_OP(__vec8_f, __sqrt_varying_float, __sqrt_uniform_float) +#else +static FORCEINLINE __vec8_f __rcp_varying_float(__vec8_f v) { +#ifdef ISPC_FAST_MATH + return _mm512_mask_rcp23_ps(FZERO, 0xFF, v); // Approximation with 23 bits of accuracy. 
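+ // (explanatory note, not in the original source) rcp23/rsqrt23 trade accuracy
+ // for speed; when ISPC_FAST_MATH is not defined, the #else branches below use
+ // the more accurate recip/invsqrt intrinsics instead.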
+#else + return _mm512_mask_recip_ps(FZERO, 0xFF, v); +#endif +} + +static FORCEINLINE __vec8_f __rsqrt_varying_float(__vec8_f v) { +#ifdef ISPC_FAST_MATH + return _mm512_mask_rsqrt23_ps(FZERO,0xFF,v); // Approximation with 0.775ULP accuracy +#else + return _mm512_mask_invsqrt_ps(FZERO,0xFF,v); +#endif +} +static FORCEINLINE __vec8_f __sqrt_varying_float (__vec8_f v) { return _mm512_mask_sqrt_ps(FZERO,0xFF,v);} +#endif + +#if 0 +UNARY_OP(__vec8_d, __sqrt_varying_double, __sqrt_uniform_double) +#else +static FORCEINLINE __vec8_d __sqrt_varying_double(__vec8_d v) { return _mm512_sqrt_pd(v); } +#endif + +/////////////////////////////////////////////////////////////////////////// +// svml +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec8_f __svml_logf(__vec8_f v) { return _mm512_mask_log_ps(FZERO,0xFF,v); } +static FORCEINLINE __vec8_f __svml_expf(__vec8_f v) { return _mm512_mask_exp_ps(FZERO,0xFF,v); } +static FORCEINLINE __vec8_f __svml_cosf(__vec8_f v) { return _mm512_mask_cos_ps(FZERO,0xFF,v); } +static FORCEINLINE __vec8_f __svml_powf(__vec8_f a, __vec8_f b) { return _mm512_mask_pow_ps(FZERO,0xFF,a,b); } + +static FORCEINLINE __vec8_d __svml_logd(__vec8_d v) { return _mm512_log_pd(v); } +static FORCEINLINE __vec8_d __svml_expd(__vec8_d v) { return _mm512_exp_pd(v); } +static FORCEINLINE __vec8_d __svml_cosd(__vec8_d v) { return _mm512_cos_pd(v); } +static FORCEINLINE __vec8_d __svml_powd(__vec8_d a, __vec8_d b) { return _mm512_pow_pd(a,b); } + +/////////////////////////////////////////////////////////////////////////// +// bit ops + +static FORCEINLINE int32_t __popcnt_int32(uint32_t v) { + int count = 0; + for (; v != 0; v >>= 1) + count += (v & 1); + return count; +} + +static FORCEINLINE int32_t __popcnt_int64(uint64_t v) { + int count = 0; + for (; v != 0; v >>= 1) + count += (v & 1); + return count; +} + +static FORCEINLINE int32_t __count_trailing_zeros_i32(uint32_t v) { + if (v == 0) + return 32; + + int count = 0; + while ((v & 1) == 0) { + ++count; + v >>= 1; + } + return count; +} + +static FORCEINLINE int64_t __count_trailing_zeros_i64(uint64_t v) { + if (v == 0) + return 64; + + int count = 0; + while ((v & 1) == 0) { + ++count; + v >>= 1; + } + return count; +} + +static FORCEINLINE int32_t __count_leading_zeros_i32(uint32_t v) { + if (v == 0) + return 32; + + int count = 0; + while ((v & (1<<31)) == 0) { + ++count; + v <<= 1; + } + return count; +} + +static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { + if (v == 0) + return 64; + + int count = 0; + while ((v & (1ull<<63)) == 0) { + ++count; + v <<= 1; + } + return count; +} + +/////////////////////////////////////////////////////////////////////////// +// reductions + +#if 0 +REDUCE_ADD(float, __vec8_f, __reduce_add_float) +REDUCE_MINMAX(float, __vec8_f, __reduce_min_float, <) +REDUCE_MINMAX(float, __vec8_f, __reduce_max_float, >) +#else +static FORCEINLINE float __reduce_add_float(__vec8_f v) { return _mm512_mask_reduce_add_ps(0xFF,v); } +static FORCEINLINE float __reduce_min_float(__vec8_f v) { return _mm512_mask_reduce_min_ps(0xFF,v); } +static FORCEINLINE float __reduce_max_float(__vec8_f v) { return _mm512_mask_reduce_max_ps(0xFF,v); } +#endif + +#if 0 +REDUCE_ADD(double, __vec8_d, __reduce_add_double) +REDUCE_MINMAX(double, __vec8_d, __reduce_min_double, <) +REDUCE_MINMAX(double, __vec8_d, __reduce_max_double, >) +#else +static FORCEINLINE float __reduce_add_double(__vec8_d v) { return _mm512_reduce_add_pd(v); } +static FORCEINLINE float 
__reduce_min_double(__vec8_d v) { return _mm512_reduce_min_pd(v); } +static FORCEINLINE float __reduce_max_double(__vec8_d v) { return _mm512_reduce_max_pd(v); } +#endif + + + +#if 0 +REDUCE_ADD (int64_t, __vec8_i32, __reduce_add_int32) +REDUCE_MINMAX(int32_t, __vec8_i32, __reduce_min_int32, <) +REDUCE_MINMAX(int32_t, __vec8_i32, __reduce_max_int32, >) +REDUCE_MINMAX(uint32_t, __vec8_i32, __reduce_min_uint32, <) +REDUCE_MINMAX(uint32_t, __vec8_i32, __reduce_max_uint32, >) +#else +static FORCEINLINE int64_t __reduce_add_int32 (__vec8_i32 v) { return _mm512_mask_reduce_add_epi32(0xFF, v);} +static FORCEINLINE int32_t __reduce_min_int32 (__vec8_i32 v) { return _mm512_mask_reduce_min_epi32(0xFF, v);} +static FORCEINLINE int32_t __reduce_max_int32 (__vec8_i32 v) { return _mm512_mask_reduce_max_epi32(0xFF, v);} +static FORCEINLINE uint32_t __reduce_min_uint32 (__vec8_i32 v) { return _mm512_mask_reduce_min_epu32(0xFF, v);} +static FORCEINLINE uint32_t __reduce_max_uint32 (__vec8_i32 v) { return _mm512_mask_reduce_max_epu32(0xFF, v);} +#endif + +REDUCE_ADD ( int16_t, __vec8_i8, __reduce_add_int8) +REDUCE_ADD ( int32_t, __vec8_i16, __reduce_add_int16) +REDUCE_ADD ( int64_t, __vec8_i64, __reduce_add_int64) +REDUCE_MINMAX( int64_t, __vec8_i64, __reduce_min_int64, <) +REDUCE_MINMAX( int64_t, __vec8_i64, __reduce_max_int64, >) +REDUCE_MINMAX(uint64_t, __vec8_i64, __reduce_min_uint64, <) +REDUCE_MINMAX(uint64_t, __vec8_i64, __reduce_max_uint64, >) + +/////////////////////////////////////////////////////////////////////////// +// masked load/store + +static FORCEINLINE __vec8_i8 __masked_load_i8(void *p, + __vec8_i1 mask) { + __vec8_i8 ret; + int8_t *ptr = (int8_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +static FORCEINLINE __vec8_i16 __masked_load_i16(void *p, + __vec8_i1 mask) { + __vec8_i16 ret; + int16_t *ptr = (int16_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +#if 0 +static FORCEINLINE __vec8_i32 __masked_load_i32(void *p, + __vec8_i1 mask) { + __vec8_i32 ret; + int32_t *ptr = (int32_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} +#else +static FORCEINLINE __vec8_i32 __masked_load_i32(void *p, __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_mask_load_epi32(__vec8_i32(), mask, p); +#else + __vec8_i32 tmp; + tmp = _mm512_mask_extloadunpacklo_epi32(tmp, 0xFF, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp = _mm512_mask_extloadunpackhi_epi32(tmp, 0xFF, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __vec8_i32 ret; + return _mm512_mask_mov_epi32(ret, 0xFF & mask, tmp); +#endif +} +#endif + +#if 0 +static FORCEINLINE __vec8_f __masked_load_float(void *p, + __vec8_i1 mask) { + __vec8_f ret; + float *ptr = (float *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} +#else +static FORCEINLINE __vec8_f __masked_load_float(void *p, __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_mask_load_ps(_mm512_undefined_ps(), mask,p); +#else + __vec8_f tmp; + tmp = _mm512_mask_extloadunpacklo_ps(tmp, 0xFF, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp = _mm512_mask_extloadunpackhi_ps(tmp, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + __vec8_f ret; + return _mm512_mask_mov_ps(ret, 0xFF & mask, tmp); +#endif +} +#endif + +static FORCEINLINE __vec8_i64 __masked_load_i64(void *p, + __vec8_i1 mask) { + __vec8_i64 ret; + int64_t *ptr 
= (int64_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +#if 0 +static FORCEINLINE __vec8_d __masked_load_double(void *p, + __vec8_i1 mask) { + __vec8_d ret; + double *ptr = (double *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} +#else +static FORCEINLINE __vec8_d __masked_load_double(void *p, __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + __vec8_d ret = FZERO; + ret = _mm512_mask_load_pd(ret, 0xFF & mask, p); + return ret; +#else + __vec8_d tmp = FZERO; + tmp.v = _mm512_mask_extloadunpacklo_pd(tmp.v, 0xFF, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_extloadunpackhi_pd(tmp.v, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + __vec8_d ret = FZERO; + ret.v = _mm512_mask_mov_pd(ret.v, mask, tmp.v); + return ret; +#endif +} +#endif + + +static FORCEINLINE void __masked_store_i8(void *p, __vec8_i8 val, + __vec8_i1 mask) { + int8_t *ptr = (int8_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +static FORCEINLINE void __masked_store_i16(void *p, __vec8_i16 val, + __vec8_i1 mask) { + int16_t *ptr = (int16_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +#if 0 +static FORCEINLINE void __masked_store_i32(void *p, __vec8_i32 val, + __vec8_i1 mask) { + int32_t *ptr = (int32_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} +#else +static FORCEINLINE void __masked_store_i32(void *p, __vec8_i32 val, __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_mask_store_epi32(p, mask, val.v); +#else + __vec8_i32 tmp; + tmp = _mm512_extloadunpacklo_epi32(tmp, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp = _mm512_extloadunpackhi_epi32(tmp, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp = _mm512_mask_mov_epi32(tmp, 0xFF & mask, val); + _mm512_mask_extpackstorelo_epi32( p, 0xFF, tmp, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF, tmp, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +#endif +} +#endif + +#if 0 +static FORCEINLINE void __masked_store_float(void *p, __vec8_f val, + __vec8_i1 mask) { + float *ptr = (float *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} +#else +static FORCEINLINE void __masked_store_float(void *p, __vec8_f val, + __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_mask_store_ps(p, 0xFF & mask, val.v); +#else + __vec8_f tmp = FZERO; + tmp = _mm512_extloadunpacklo_ps(tmp, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp = _mm512_extloadunpackhi_ps(tmp, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp = _mm512_mask_mov_ps(tmp, 0xFF & mask, val); + _mm512_mask_extpackstorelo_ps( p, 0xFF, tmp, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_ps((uint8_t*)p+64, 0xFF, tmp, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); +#endif +} +#endif + +static FORCEINLINE void __masked_store_i64(void *p, __vec8_i64 val, + __vec8_i1 mask) { + int64_t *ptr = (int64_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +#if 0 +static FORCEINLINE void __masked_store_double(void *p, __vec8_d val, + __vec8_i1 mask) { + double *ptr = (double *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} +#else +static FORCEINLINE void __masked_store_double(void *p, __vec8_d val, + __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + 
_mm512_mask_store_pd(p, mask, val.v); +#else + __vec8_d tmp; + tmp.v = _mm512_extloadunpacklo_pd(tmp.v, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_pd(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_pd(tmp.v, mask, val.v); + _mm512_extpackstorelo_pd(p, tmp.v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); +#endif +} +#endif + +static FORCEINLINE void __masked_store_blend_i8(void *p, __vec8_i8 val, + __vec8_i1 mask) { + __masked_store_i8(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i16(void *p, __vec8_i16 val, + __vec8_i1 mask) { + __masked_store_i16(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i32(void *p, __vec8_i32 val, + __vec8_i1 mask) { + __masked_store_i32(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_float(void *p, __vec8_f val, + __vec8_i1 mask) { + __masked_store_float(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i64(void *p, __vec8_i64 val, + __vec8_i1 mask) { + __masked_store_i64(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_double(void *p, __vec8_d val, + __vec8_i1 mask) { + __masked_store_double(p, val, mask); +} + +/////////////////////////////////////////////////////////////////////////// +// gather/scatter + +// offsets * offsetScale is in bytes (for all of these) + +#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, __vec8_i1 mask) { \ + VTYPE ret; \ + int8_t *base = (int8_t *)b; \ + for (int i = 0; i < 8; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset[i]); \ + ret[i] = *ptr; \ + } \ + return ret; \ +} + + +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec8_i8, int8_t, __vec8_i32, __gather_base_offsets32_i8) +#else +static FORCEINLINE __vec8_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t scale, __vec8_i32 offsets, __vec8_i1 mask) +{ + // (iw): need to temporarily store as int because gathers can only return ints. 
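+ // (explanatory note, not in the original source) As elsewhere in this header,
+ // only the low 8 of the 16 KNC lanes hold __vec8_* data, so the 16-wide gather
+ // below is driven with the mask restricted to 0xFF.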
+ __vec8_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), 0xFF & mask, offsets, base, + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + // now, downconverting to chars into temporary char vector + __vec8_i8 ret; + _mm512_mask_extstore_epi32(ret.data,0xFF,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; +} +#endif +GATHER_BASE_OFFSETS(__vec8_i8, int8_t, __vec8_i64, __gather_base_offsets64_i8) +/****************/ +GATHER_BASE_OFFSETS(__vec8_i16, int16_t, __vec8_i32, __gather_base_offsets32_i16) +GATHER_BASE_OFFSETS(__vec8_i16, int16_t, __vec8_i64, __gather_base_offsets64_i16) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec8_i32, int32_t, __vec8_i32, __gather_base_offsets32_i32) +#else +static FORCEINLINE __vec8_i32 __gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec8_i32 offsets, __vec8_i1 mask) +{ + return _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), 0xFF & mask, offsets, + base, _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); +} +#endif +GATHER_BASE_OFFSETS(__vec8_i32, int32_t, __vec8_i64, __gather_base_offsets64_i32) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec8_f, float, __vec8_i32, __gather_base_offsets32_float) +#else +static FORCEINLINE __vec8_f __gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec8_i32 offsets, __vec8_i1 mask) +{ + return _mm512_mask_i32extgather_ps(_mm512_undefined_ps(), 0xFF & mask, offsets, + base, _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); +} +#endif +GATHER_BASE_OFFSETS(__vec8_f, float, __vec8_i64, __gather_base_offsets64_float) +/****************/ +GATHER_BASE_OFFSETS(__vec8_i64, int64_t, __vec8_i32, __gather_base_offsets32_i64) +GATHER_BASE_OFFSETS(__vec8_i64, int64_t, __vec8_i64, __gather_base_offsets64_i64) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec8_d, double, __vec8_i32, __gather_base_offsets32_double) +#else +static FORCEINLINE __vec8_d __gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec8_i32 offsets, __vec8_i1 mask) +{ + __vec8_d ret; + ret.v = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); +#if 0 + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + const __mmask8 mask8 = 0x00FF & (mask >> 8); /* evghenii::testme */ + ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask8, shuffled_offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); +#endif + return ret; +} +#endif +GATHER_BASE_OFFSETS(__vec8_d, double, __vec8_i64, __gather_base_offsets64_double) + +#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec8_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs[i]; \ + ret[i] = *ptr; \ + } \ + return ret; \ +} +#define GATHER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ +static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec8_i1 mask) { \ + return FUNC1(0, 1, ptrs, mask); \ +} + + +#if 1 +/***********/ +GATHER_GENERALF(__vec8_i8, int8_t, __vec8_i32, __gather32_i8, __gather_base_offsets32_i8) +GATHER_GENERALF(__vec8_i16, int16_t, __vec8_i32, __gather32_i16, __gather_base_offsets32_i16) +GATHER_GENERALF(__vec8_i32, int32_t, __vec8_i32, __gather32_i32, __gather_base_offsets32_i32) +GATHER_GENERALF(__vec8_i64, int64_t, __vec8_i32, __gather32_i64, __gather_base_offsets32_i64) +GATHER_GENERALF(__vec8_f, float, __vec8_i32, __gather32_float, __gather_base_offsets32_float) +GATHER_GENERALF(__vec8_d, double, 
__vec8_i32, __gather32_double, __gather_base_offsets32_double) +/***********/ +GATHER_GENERAL(__vec8_i8, int8_t, __vec8_i64, __gather64_i8); +GATHER_GENERAL(__vec8_i16, int16_t, __vec8_i64, __gather64_i16); +GATHER_GENERAL(__vec8_i32, int32_t, __vec8_i64, __gather64_i32); +GATHER_GENERAL(__vec8_i64, int64_t, __vec8_i64, __gather64_i64); +GATHER_GENERAL(__vec8_f, float, __vec8_i64, __gather64_float); +GATHER_GENERAL(__vec8_d, double, __vec8_i64, __gather64_double); +/***********/ +#endif + +// scatter + +#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, VTYPE val, \ + __vec8_i1 mask) { \ + int8_t *base = (int8_t *)b; \ + for (int i = 0; i < 8; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset[i]); \ + *ptr = val[i]; \ + } \ +} + + +/*****************/ +SCATTER_BASE_OFFSETS(__vec8_i8, int8_t, __vec8_i32, __scatter_base_offsets32_i8) +SCATTER_BASE_OFFSETS(__vec8_i8, int8_t, __vec8_i64, __scatter_base_offsets64_i8) +/*****************/ +SCATTER_BASE_OFFSETS(__vec8_i16, int16_t, __vec8_i32, __scatter_base_offsets32_i16) +SCATTER_BASE_OFFSETS(__vec8_i16, int16_t, __vec8_i64, __scatter_base_offsets64_i16) +/*****************/ +#if 0 +SCATTER_BASE_OFFSETS(__vec8_i32, int32_t, __vec8_i32, __scatter_base_offsets32_i32) +#else +static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec8_i32 offsets, __vec8_i32 val, __vec8_i1 mask) +{ + _mm512_mask_i32extscatter_epi32(b, 0xFF & mask, offsets, val, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); +} +#endif +SCATTER_BASE_OFFSETS(__vec8_i32, int32_t, __vec8_i64, __scatter_base_offsets64_i32) +/*****************/ +#if 0 +SCATTER_BASE_OFFSETS(__vec8_f, float, __vec8_i32, __scatter_base_offsets32_float) +#else +static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scale, __vec8_i32 offsets, + __vec8_f val, __vec8_i1 mask) +{ + _mm512_mask_i32extscatter_ps(base, 0xFF & mask, offsets, val, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); +} +#endif +SCATTER_BASE_OFFSETS(__vec8_f, float, __vec8_i64, __scatter_base_offsets64_float) +/*****************/ +SCATTER_BASE_OFFSETS(__vec8_i64, int64_t, __vec8_i32, __scatter_base_offsets32_i64) +SCATTER_BASE_OFFSETS(__vec8_i64, int64_t, __vec8_i64, __scatter_base_offsets64_i64) +/*****************/ +#if 0 /* evghenii::to implement */ +SCATTER_BASE_OFFSETS(__vec8_d, double, __vec8_i32, __scatter_base_offsets32_double) +#else /* evghenii:testme */ +static FORCEINLINE void __scatter_base_offsets32_double(void *base, uint32_t scale, __vec8_i32 offsets, + __vec8_d val, __vec8_i1 mask) +{ + _mm512_mask_i32loextscatter_pd(base, mask, offsets, val.v, + _MM_DOWNCONV_PD_NONE, scale, + _MM_HINT_NONE); +} +#endif +SCATTER_BASE_OFFSETS(__vec8_d, double, __vec8_i64, __scatter_base_offsets64_double) + +#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ +static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec8_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs[i]; \ + *ptr = val[i]; \ + } \ +} +#define SCATTER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ +static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec8_i1 mask) { \ + return FUNC1(0, 1, ptrs, val, mask); \ +} + +#if 1 +/***********/ +SCATTER_GENERALF(__vec8_i8, int8_t, __vec8_i32, __scatter32_i8, __scatter_base_offsets32_i8) +SCATTER_GENERALF(__vec8_i16, int16_t, __vec8_i32, __scatter32_i16, 
__scatter_base_offsets32_i16) +SCATTER_GENERALF(__vec8_i32, int32_t, __vec8_i32, __scatter32_i32, __scatter_base_offsets32_i32) +SCATTER_GENERALF(__vec8_i64, int64_t, __vec8_i32, __scatter32_i64, __scatter_base_offsets32_i64) +SCATTER_GENERALF(__vec8_f, float, __vec8_i32, __scatter32_float, __scatter_base_offsets32_float) +SCATTER_GENERALF(__vec8_d, double, __vec8_i32, __scatter32_double, __scatter_base_offsets32_double) +/***********/ +SCATTER_GENERAL(__vec8_i8, int8_t, __vec8_i64, __scatter64_i8) +SCATTER_GENERAL(__vec8_i16, int16_t, __vec8_i64, __scatter64_i16) +SCATTER_GENERAL(__vec8_i32, int32_t, __vec8_i64, __scatter64_i32) +SCATTER_GENERAL(__vec8_f, float, __vec8_i64, __scatter64_float) +SCATTER_GENERAL(__vec8_i64, int64_t, __vec8_i64, __scatter64_i64) +SCATTER_GENERAL(__vec8_d, double, __vec8_i64, __scatter64_double) +/***********/ +#endif + +/////////////////////////////////////////////////////////////////////////// +// packed load/store + +#if 0 +static FORCEINLINE int32_t __packed_load_active(int32_t *ptr, __vec8_i32 *val, + __vec8_i1 mask) { + int count = 0; + for (int i = 0; i < 8; ++i) { + if ((mask.v & (1 << i)) != 0) { + val->operator[](i) = *ptr++; + ++count; + } + } + return count; +} +static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, + __vec8_i32 val, + __vec8_i1 mask) { + int count = 0; + for (int i = 0; i < 8; ++i) { + if ((mask.v & (1 << i)) != 0) { + *ptr++ = val[i]; + ++count; + } + } + return count; +} +static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, + __vec8_i32 *val, + __vec8_i1 mask) { + int count = 0; + for (int i = 0; i < 8; ++i) { + if ((mask.v & (1 << i)) != 0) { + val->operator[](i) = *ptr++; + ++count; + } + } + return count; +} +static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, + __vec8_i32 val, + __vec8_i1 mask) { + int count = 0; + for (int i = 0; i < 8; ++i) { + if ((mask.v & (1 << i)) != 0) { + *ptr++ = val[i]; + ++count; + } + } + return count; +} +#else +static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec8_i32 *val, + __vec8_i1 mask) { + __vec8_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, 0xFF & mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, 0xFF & mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(0xFF & mask)); +} +static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec8_i32 val, + __vec8_i1 mask) { + _mm512_mask_extpackstorelo_epi32(p, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(0xFF & mask)); +} +static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec8_i32 *val, + __vec8_i1 mask) { + __vec8_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, 0xFF & mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, 0xFF & mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(0xFF & mask)); +} +static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec8_i32 val, + __vec8_i1 mask) { + _mm512_mask_extpackstorelo_epi32(p, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(0xFF & mask)); +} +#endif + 
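// Illustrative aside (not part of the patch above): the packed load/store primitives implement stream compaction/expansion per 8-wide chunk. Below is a self-contained scalar model of the same semantics, using a plain 8-bit mask and int arrays instead of the __vec8_* types; all names here are illustrative only.
#include <stdint.h>

// Write only the active lanes of val[], densely packed, to dst; return the
// number of values written (what __packed_store_active reports via popcount).
static inline int packed_store_active_ref(int32_t *dst, const int32_t val[8], uint8_t mask) {
    int count = 0;
    for (int i = 0; i < 8; ++i)
        if (mask & (1u << i))
            dst[count++] = val[i];
    return count;
}

// Fill only the active lanes of val[] from consecutive values at src; return
// the number of values consumed (mirrors the scalar reference loops above).
static inline int packed_load_active_ref(const int32_t *src, int32_t val[8], uint8_t mask) {
    int count = 0;
    for (int i = 0; i < 8; ++i)
        if (mask & (1u << i))
            val[i] = src[count++];
    return count;
}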
+/////////////////////////////////////////////////////////////////////////// +// aos/soa + +static FORCEINLINE void __soa_to_aos3_float(__vec8_f v0, __vec8_f v1, __vec8_f v2, + float *ptr) { + for (int i = 0; i < 8; ++i) { + *ptr++ = __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + } +} + +static FORCEINLINE void __aos_to_soa3_float(float *ptr, __vec8_f *out0, __vec8_f *out1, + __vec8_f *out2) { + for (int i = 0; i < 8; ++i) { + __insert_element(out0, i, *ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + } +} + +static FORCEINLINE void __soa_to_aos4_float(__vec8_f v0, __vec8_f v1, __vec8_f v2, + __vec8_f v3, float *ptr) { + for (int i = 0; i < 8; ++i) { + *ptr++ = __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + *ptr++ = __extract_element(v3, i); + } +} + +static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec8_f *out0, __vec8_f *out1, + __vec8_f *out2, __vec8_f *out3) { + for (int i = 0; i < 8; ++i) { + __insert_element(out0, i, *ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + __insert_element(out3, i, *ptr++); + } +} + +/////////////////////////////////////////////////////////////////////////// +// prefetch + +static FORCEINLINE void __prefetch_read_uniform_1(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T0); // prefetch into L1$ +} + +static FORCEINLINE void __prefetch_read_uniform_2(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T1); // prefetch into L2$ +} + +static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *p) { + // There is no L3$ on KNC, don't want to pollute L2$ unecessarily +} + +static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T2); // prefetch into L2$ with non-temporal hint + // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint +} + +/////////////////////////////////////////////////////////////////////////// +// atomics + +static FORCEINLINE uint32_t __atomic_add(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAdd((LONG volatile *)p, v) - v; +#else + return __sync_fetch_and_add(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_sub(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAdd((LONG volatile *)p, -v) + v; +#else + return __sync_fetch_and_sub(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_and(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAnd((LONG volatile *)p, v); +#else + return __sync_fetch_and_and(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_or(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedOr((LONG volatile *)p, v); +#else + return __sync_fetch_and_or(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_xor(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedXor((LONG volatile *)p, v); +#else + return __sync_fetch_and_xor(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_min(uint32_t *p, uint32_t v) { + int32_t old, min; + do { + old = *((volatile int32_t *)p); + min = (old < (int32_t)v) ? old : (int32_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_max(uint32_t *p, uint32_t v) { + int32_t old, max; + do { + old = *((volatile int32_t *)p); + max = (old > (int32_t)v) ? 
old : (int32_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_umin(uint32_t *p, uint32_t v) { + uint32_t old, min; + do { + old = *((volatile uint32_t *)p); + min = (old < v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_umax(uint32_t *p, uint32_t v) { + uint32_t old, max; + do { + old = *((volatile uint32_t *)p); + max = (old > v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_xchg(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedExchange((LONG volatile *)p, v); +#else + return __sync_lock_test_and_set(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_cmpxchg(uint32_t *p, uint32_t cmpval, + uint32_t newval) { +#ifdef _MSC_VER + return InterlockedCompareExchange((LONG volatile *)p, newval, cmpval); +#else + return __sync_val_compare_and_swap(p, cmpval, newval); +#endif +} + +static FORCEINLINE uint64_t __atomic_add(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAdd64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_add(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_sub(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAdd64((LONGLONG volatile *)p, -v) + v; +#else + return __sync_fetch_and_sub(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_and(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAnd64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_and(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_or(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedOr64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_or(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_xor(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedXor64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_xor(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_min(uint64_t *p, uint64_t v) { + int64_t old, min; + do { + old = *((volatile int64_t *)p); + min = (old < (int64_t)v) ? old : (int64_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_max(uint64_t *p, uint64_t v) { + int64_t old, max; + do { + old = *((volatile int64_t *)p); + max = (old > (int64_t)v) ? old : (int64_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_umin(uint64_t *p, uint64_t v) { + uint64_t old, min; + do { + old = *((volatile uint64_t *)p); + min = (old < v) ? 
old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_umax(uint64_t *p, uint64_t v) { + uint64_t old, max; + do { + old = *((volatile uint64_t *)p); + max = (old > v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_xchg(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedExchange64((LONGLONG volatile *)p, v); +#else + return __sync_lock_test_and_set(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, + uint64_t newval) { +#ifdef _MSC_VER + return InterlockedCompareExchange64((LONGLONG volatile *)p, newval, cmpval); +#else + return __sync_val_compare_and_swap(p, cmpval, newval); +#endif +} + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} + +#endif // !WIN32 + +#undef FORCEINLINE +#undef PRE_ALIGN +#undef POST_ALIGN diff --git a/examples/intrinsics/knc-i1x8unsafe_fast.h b/examples/intrinsics/knc-i1x8unsafe_fast.h new file mode 100644 index 00000000..05be27bd --- /dev/null +++ b/examples/intrinsics/knc-i1x8unsafe_fast.h @@ -0,0 +1,86 @@ +#define __ZMM64BIT__ +#include "knc-i1x8.h" + +/* the following tests fail because on KNC native vec8_i32 and vec8_float are 512 and not 256 bits in size.
+ * + * Using test compiler: Intel(r) SPMD Program Compiler (ispc), 1.4.5dev (build commit d68dbbc7bce74803 @ 20130919, LLVM 3.3) + * Using C/C++ compiler: icpc (ICC) 14.0.0 20130728 + * + */ + +/* knc-i1x8unsafe_fast.h fails: + * ---------------------------- +1 / 1206 tests FAILED compilation: + ./tests/ptr-assign-lhs-math-1.ispc +33 / 1206 tests FAILED execution: + ./tests/array-gather-simple.ispc + ./tests/array-gather-vary.ispc + ./tests/array-multidim-gather-scatter.ispc + ./tests/array-scatter-vary.ispc + ./tests/atomics-5.ispc + ./tests/atomics-swap.ispc + ./tests/cfor-array-gather-vary.ispc + ./tests/cfor-gs-improve-varying-1.ispc + ./tests/cfor-struct-gather-2.ispc + ./tests/cfor-struct-gather-3.ispc + ./tests/cfor-struct-gather.ispc + ./tests/gather-struct-vector.ispc + ./tests/global-array-4.ispc + ./tests/gs-improve-varying-1.ispc + ./tests/half-1.ispc + ./tests/half-3.ispc + ./tests/half.ispc + ./tests/launch-3.ispc + ./tests/launch-4.ispc + ./tests/masked-scatter-vector.ispc + ./tests/masked-struct-scatter-varying.ispc + ./tests/new-delete-6.ispc + ./tests/ptr-24.ispc + ./tests/ptr-25.ispc + ./tests/short-vec-15.ispc + ./tests/struct-gather-2.ispc + ./tests/struct-gather-3.ispc + ./tests/struct-gather.ispc + ./tests/struct-ref-lvalue.ispc + ./tests/struct-test-118.ispc + ./tests/struct-vary-index-expr.ispc + ./tests/typedef-2.ispc + ./tests/vector-varying-scatter.ispc +*/ + +/* knc-i1x8.h fails: + * ---------------------------- +1 / 1206 tests FAILED compilation: + ./tests/ptr-assign-lhs-math-1.ispc +3 / 1206 tests FAILED execution: + ./tests/half-1.ispc + ./tests/half-3.ispc + ./tests/half.ispc +*/ + +/* knc-i1x8.h fails: + * ---------------------------- +1 / 1206 tests FAILED compilation: + ./tests/ptr-assign-lhs-math-1.ispc +4 / 1206 tests FAILED execution: + ./tests/half-1.ispc + ./tests/half-3.ispc + ./tests/half.ispc + ./tests/test-141.ispc +*/ + +/* generic-16.h fails: (from which knc-i1x8.h & knc-i1x16.h are derived) + * ---------------------------- +1 / 1206 tests FAILED compilation: + ./tests/ptr-assign-lhs-math-1.ispc +6 / 1206 tests FAILED execution: + ./tests/func-overload-max.ispc + ./tests/half-1.ispc + ./tests/half-3.ispc + ./tests/half.ispc + ./tests/test-141.ispc + ./tests/test-143.ispc +*/ + + + diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index bf383c88..8baef8cb 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1511,6 +1511,22 @@ static FORCEINLINE int64_t __count_trailing_zeros_i64(const __vec1_i64 mask) { // reductions /////////////////////////////////////////////////////////////////////////// +static FORCEINLINE int16_t __reduce_add_i8(__vec16_i8 v) { + // TODO: improve this! + int16_t ret = 0; + for (int i = 0; i < 16; ++i) + ret += v.v[i]; + return ret; +} + +static FORCEINLINE int32_t __reduce_add_i16(__vec16_i16 v) { + // TODO: improve this!
+ int32_t ret = 0; + for (int i = 0; i < 16; ++i) + ret += v.v[i]; + return ret; +} + static FORCEINLINE uint32_t __reduce_add_i32(__vec16_i32 v) { return _mm512_reduce_add_epi32(v); } @@ -2105,9 +2121,24 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, #endif } +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE #undef PRE_ALIGN #undef POST_ALIGN - - - diff --git a/examples/intrinsics/knc2x.h b/examples/intrinsics/knc2x.h index 0041a6c9..a1b1fc9d 100644 --- a/examples/intrinsics/knc2x.h +++ b/examples/intrinsics/knc2x.h @@ -1607,6 +1607,9 @@ static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { /////////////////////////////////////////////////////////////////////////// // reductions +REDUCE_ADD(int16_t, __vec32_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec32_i16, __reduce_add_int16) + static FORCEINLINE float __reduce_add_float(__vec32_f v) { return _mm512_reduce_add_ps(v.v1) + _mm512_reduce_add_ps(v.v2); } @@ -2052,7 +2055,24 @@ static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec32_f *out0, __vec32 } */ +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE #undef PRE_ALIGN #undef POST_ALIGN - diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index d4739d61..ff00d920 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -2528,6 +2528,22 @@ static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { /////////////////////////////////////////////////////////////////////////// // reductions +static FORCEINLINE int16_t __reduce_add_int8(__vec4_i8 v) { + // TODO: improve + int16_t ret = 0; + for (int i = 0; i < 4; ++i) + ret += __extract_element(v, i); + return ret; +} + +static FORCEINLINE int32_t __reduce_add_int16(__vec4_i16 v) { + // TODO: improve + int32_t ret = 0; + for (int i = 0; i < 4; ++i) + ret += __extract_element(v, i); + return ret; +} + static FORCEINLINE float __reduce_add_float(__vec4_f v) { float r = bits_as_float(_mm_extract_ps(v.v, 0)); r += bits_as_float(_mm_extract_ps(v.v, 1)); @@ -3984,6 +4000,22 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, #endif } +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE - - diff --git a/examples/mandelbrot/mandelbrot.cpp 
b/examples/mandelbrot/mandelbrot.cpp index 7e73768f..d2bebb96 100644 --- a/examples/mandelbrot/mandelbrot.cpp +++ b/examples/mandelbrot/mandelbrot.cpp @@ -109,7 +109,7 @@ int main() { minSerial = std::min(minSerial, dt); } - printf("[mandelbrot serial]:\t\t[%.3f] millon cycles\n", minSerial); + printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial); writePPM(buf, width, height, "mandelbrot-serial.ppm"); printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC); diff --git a/examples/mandelbrot_tasks/Makefile b/examples/mandelbrot_tasks/Makefile index 7e83e618..1a565ffd 100644 --- a/examples/mandelbrot_tasks/Makefile +++ b/examples/mandelbrot_tasks/Makefile @@ -1,7 +1,7 @@ -EXAMPLE=mandelbrot -CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp -ISPC_SRC=mandelbrot.ispc +EXAMPLE=mandelbrot_tasks +CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp +ISPC_SRC=mandelbrot_tasks.ispc ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 ISPC_ARM_TARGETS=neon diff --git a/examples/mandelbrot_tasks/mandelbrot.cpp b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp similarity index 97% rename from examples/mandelbrot_tasks/mandelbrot.cpp rename to examples/mandelbrot_tasks/mandelbrot_tasks.cpp index a01cfe43..698daf0f 100644 --- a/examples/mandelbrot_tasks/mandelbrot.cpp +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp @@ -42,7 +42,7 @@ #include #include #include "../timing.h" -#include "mandelbrot_ispc.h" +#include "mandelbrot_tasks_ispc.h" using namespace ispc; extern void mandelbrot_serial(float x0, float y0, float x1, float y1, @@ -137,7 +137,7 @@ int main(int argc, char *argv[]) { minSerial = std::min(minSerial, dt); } - printf("[mandelbrot serial]:\t\t[%.3f] millon cycles\n", minSerial); + printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial); writePPM(buf, width, height, "mandelbrot-serial.ppm"); printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC); diff --git a/examples/mandelbrot_tasks/mandelbrot.ispc b/examples/mandelbrot_tasks/mandelbrot_tasks.ispc similarity index 100% rename from examples/mandelbrot_tasks/mandelbrot.ispc rename to examples/mandelbrot_tasks/mandelbrot_tasks.ispc diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj index b92de72f..3a8fca79 100644 --- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj @@ -21,7 +21,7 @@ {E80DA7D4-AB22-4648-A068-327307156BE6} Win32Proj - mandelbrot + mandelbrot_tasks @@ -65,22 +65,22 @@ true $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot + mandelbrot_tasks true $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot + mandelbrot_tasks false $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot + mandelbrot_tasks false $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot + mandelbrot_tasks @@ -153,12 +153,12 @@ - - + + - + Document ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 diff --git a/examples/mandelbrot_tasks/mandelbrot_serial.cpp b/examples/mandelbrot_tasks/mandelbrot_tasks_serial.cpp similarity index 100% rename from examples/mandelbrot_tasks/mandelbrot_serial.cpp rename to examples/mandelbrot_tasks/mandelbrot_tasks_serial.cpp diff --git a/examples/noise/Makefile b/examples/noise/Makefile index 8cc72689..58d1cf3b 100644 --- a/examples/noise/Makefile +++ b/examples/noise/Makefile @@ -1,6 +1,6 @@ EXAMPLE=noise -CPP_SRC=$(EXAMPLE).cpp $(EXAMPLE)_serial.cpp +CPP_SRC=noise.cpp noise_serial.cpp 
ISPC_SRC=noise.ispc ISPC_IA_TARGETS=sse2,sse4,avx-x2 ISPC_ARM_TARGETS=neon diff --git a/examples/noise/noise.cpp b/examples/noise/noise.cpp index 58552ce3..123f98c7 100644 --- a/examples/noise/noise.cpp +++ b/examples/noise/noise.cpp @@ -106,7 +106,7 @@ int main() { minSerial = std::min(minSerial, dt); } - printf("[noise serial]:\t\t\t[%.3f] millon cycles\n", minSerial); + printf("[noise serial]:\t\t\t[%.3f] million cycles\n", minSerial); writePPM(buf, width, height, "noise-serial.ppm"); printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC); diff --git a/examples/perf.py b/examples/perf.py deleted file mode 100755 index 8503bd8c..00000000 --- a/examples/perf.py +++ /dev/null @@ -1,262 +0,0 @@ -#!/usr/bin/python -# // Author: Filippov Ilia - -from optparse import OptionParser -import sys -import os -import operator -import time -import glob -import string -import platform - -def build_test(): - global build_log - global is_windows - if is_windows == False: - os.system("make clean >> "+build_log) - return os.system("make >> "+build_log+" 2>> "+build_log) - else: - os.system("msbuild /t:clean >> " + build_log) - return os.system("msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /t:rebuild >> " + build_log) - -def execute_test(command): - global perf_temp - r = 0 - if os.path.exists(perf_temp): - os.remove(perf_temp) - for k in range(int(options.number)): - r = r + os.system(command) - return r - -#gathers all tests results and made an item test from answer structure -def run_test(command, c1, c2, test): - global perf_temp - if build_test() != 0: - sys.stdout.write("ERROR: Compilation fails\n") - return - if execute_test(command) != 0: - sys.stdout.write("ERROR: Execution fails\n") - return - tasks = [] #list of results with tasks, it will be test[2] - ispc = [] #list of results without tasks, it will be test[1] - j = 1 - for line in open(perf_temp): # we take test output - if "speedup" in line: # we are interested only in lines with speedup - if j == c1: # we are interested only in lines with c1 numbers - sys.stdout.write(line) - line = line.expandtabs(0) - line = line.replace("("," ") - line = line.split(",") - for i in range(len(line)): - subline = line[i].split(" ") - number = float(subline[1][:-1]) - if "speedup from ISPC + tasks" in line[i]: - tasks.append(number) - else: - ispc.append(number) - c1 = c1 + c2 - j+=1 - test[1] = test[1] + ispc - test[2] = test[2] + tasks - - -def cpu_get(): - p = open("/proc/stat", 'r') - cpu = p.readline() - p.close() - cpu = cpu.split(" ") - cpu_usage = (int(cpu[2]) + int(cpu[3]) + int(cpu[4])) - cpu_all = cpu_usage + int(cpu[5]) - return [cpu_usage, cpu_all] - -#returns cpu_usage -def cpu_check(): - if is_windows == False: - cpu1 = cpu_get() - time.sleep(1) - cpu2 = cpu_get() - cpu_percent = (float(cpu1[0] - cpu2[0])/float(cpu1[1] - cpu2[1]))*100 - else: - os.system("wmic cpu get loadpercentage /value > cpu_temp") - c = open("cpu_temp", 'r') - c_lines = c.readlines() - c.close() - os.remove("cpu_temp") - t = "0" - for i in c_lines[2]: - if i.isdigit(): - t = t + i - cpu_percent = int(t) - return cpu_percent - -#returns geomean of list -def geomean(par): - temp = 1 - l = len(par) - for i in range(l): - temp = temp * par[i] - temp = temp ** (1.0/l) - return round(temp, 2) - -#takes an answer struct and print it. 
-#answer struct: list answer contains lists test -#test[0] - name of test -#test[1] - list of results without tasks -#test[2] - list of results with tasks -#test[1] or test[2] may be empty -def print_answer(answer): - sys.stdout.write("Name of test:\t\tISPC:\tISPC + tasks:\n") - max_t = [0,0] - diff_t = [0,0] - geomean_t = [0,0] - list_of_max = [[],[]] - for i in range(len(answer)): - for t in range(1,3): - if len(answer[i][t]) == 0: - max_t[t-1] = "n/a" - diff_t[t-1] = "n/a" - else: - list_of_max[t-1].append(max(answer[i][t])) - max_t[t-1] = str(max(answer[i][t])) - diff_t[t-1] = str(max(answer[i][t]) - min(answer[i][t])) - sys.stdout.write("%s:\n" % answer[i][0]) - sys.stdout.write("\t\tmax:\t%s\t%s\n" % (max_t[0], max_t[1])) - sys.stdout.write("\t\tdiff:\t%s\t%s\n" % (diff_t[0], diff_t[1])) - - geomean_t[0] = geomean(list_of_max[0]) - geomean_t[1] = geomean(list_of_max[1]) - sys.stdout.write("---------------------------------------------\n") - sys.stdout.write("Geomean:\t\t%s\t%s\n" % (geomean_t[0], geomean_t[1])) - -###Main### -# parsing options -parser = OptionParser() -parser.add_option('-n', '--number', dest='number', - help='number of repeats', default="3") -parser.add_option('-c', '--config', dest='config', - help='config file of tests', default="./perf.ini") -parser.add_option('-p', '--path', dest='path', - help='path to examples directory', default="./") -(options, args) = parser.parse_args() - -global is_windows -is_windows = (platform.system() == 'Windows' or - 'CYGWIN_NT' in platform.system()) - -# save corrent path -pwd = os.getcwd() -pwd = pwd + os.sep -if is_windows: - pwd = "..\\" - -# check if cpu usage is low now -cpu_percent = cpu_check() -if cpu_percent > 20: - sys.stdout.write("Warning: CPU Usage is very high.\n") - sys.stdout.write("Close other applications.\n") - -# check that required compilers exist -PATH_dir = string.split(os.getenv("PATH"), os.pathsep) -compiler_exists = False -ref_compiler_exists = False -if is_windows == False: - compiler = "ispc" - ref_compiler = "g++" -else: - compiler = "ispc.exe" - ref_compiler = "cl.exe" -for counter in PATH_dir: - if os.path.exists(counter + os.sep + compiler): - compiler_exists = True - if os.path.exists(counter + os.sep + ref_compiler): - ref_compiler_exists = True -if not compiler_exists: - sys.stderr.write("Fatal error: ISPC compiler not found.\n") - sys.stderr.write("Added path to ispc compiler to your PATH variable.\n") - sys.exit() -if not ref_compiler_exists: - sys.stderr.write("Fatal error: reference compiler %s not found.\n" % ref_compiler) - sys.stderr.write("Added path to %s compiler to your PATH variable.\n" % ref_compiler) - sys.exit() - -# checks that config file exists -path_config = os.path.normpath(options.config) -if os.path.exists(path_config) == False: - sys.stderr.write("Fatal error: config file not found: %s.\n" % options.config) - sys.stderr.write("Set path to your config file in --config.\n") - sys.exit() - -# read lines from config file except comments -f = open(path_config, 'r') -f_lines = f.readlines() -f.close() -lines =[] -for i in range(len(f_lines)): - if f_lines[i][0] != "%": - lines.append(f_lines[i]) -length = len(lines) - -# prepare build.log and perf_temp files -global build_log -build_log = pwd + "build.log" -if is_windows == False: - if os.path.exists(build_log): - os.remove(build_log) -else: - if os.path.exists("build.log"): - os.remove("build.log") -global perf_temp -perf_temp = pwd + "perf_temp" - -i = 0 -answer = [] -sys.stdout.write("Okey go go go!\n\n") -# loop for all tests 
-while i < length-2: - # we read name of test - sys.stdout.write("%s" % lines[i]) - test = [lines[i][:-1],[],[]] - # read location of test - folder = lines[i+1] - folder = folder[:-1] - folder = os.path.normpath(options.path + os.sep + folder) - # check that test exists - if os.path.exists(folder) == False: - sys.stdout.write("Fatal error: Can't find test %s. Your path is: \"%s\".\n" % (lines[i][:-1], options.path)) - sys.stdout.write("Change current location to /examples or set path to /examples in --path.\n") - exit(0) - os.chdir(folder) - # read parameters of test - command = lines[i+2] - command = command[:-1] - if is_windows == False: - command = "./"+command + " >> " + perf_temp - else: - command = "x64\\Release\\"+command + " >> " + perf_temp - # parsing config parameters - next_line = lines[i+3] - if next_line[0] == "!": # we should take only one part of test output - R = next_line.split(' ') - c1 = int(R[1]) #c1 is a number of string which we want to use in test output - c2 = int(R[2]) #c2 is total number of strings in test output - i = i+1 - else: - c1 = 1 - c2 = 1 - next_line = lines[i+3] - if next_line[0] == "^": #we should concatenate result of this test with previous one - run_test(command, c1, c2, answer[len(answer)-1]) - i = i+1 - else: #we run this test and append it's result to answer structure - run_test(command, c1, c2, test) - answer.append(test) - # preparing next loop iteration - os.chdir(pwd) - i+=4 - -# delete temp file -if os.path.exists(perf_temp): - os.remove(perf_temp) -#print collected answer -print_answer(answer) diff --git a/examples/sort/sort.cpp b/examples/sort/sort.cpp index 1d05b247..f5e4264a 100644 --- a/examples/sort/sort.cpp +++ b/examples/sort/sort.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2013, Intel Corporation + Copyright (c) 2013, Durham University All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,7 +13,7 @@ notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its + * Neither the name of Durham University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
@@ -86,7 +86,8 @@ int main (int argc, char *argv[]) tISPC1 += get_elapsed_mcycles(); - progressbar (i, m); + if (argc != 3) + progressbar (i, m); } printf("[sort ispc]:\t[%.3f] million cycles\n", tISPC1); @@ -103,10 +104,11 @@ int main (int argc, char *argv[]) tISPC2 += get_elapsed_mcycles(); - progressbar (i, m); + if (argc != 3) + progressbar (i, m); } - - printf("[sort ispc+tasks]:\t[%.3f] million cycles\n", tISPC2); + + printf("[sort ispc + tasks]:\t[%.3f] million cycles\n", tISPC2); srand (0); @@ -120,13 +122,13 @@ int main (int argc, char *argv[]) tSerial += get_elapsed_mcycles(); - progressbar (i, m); + if (argc != 3) + progressbar (i, m); } printf("[sort serial]:\t\t[%.3f] million cycles\n", tSerial); - printf("\t\t\t\t(%.2fx speedup from ISPC serial)\n", tSerial/tISPC1); - printf("\t\t\t\t(%.2fx speedup from ISPC with tasks)\n", tSerial/tISPC2); + printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", tSerial/tISPC1, tSerial/tISPC2); delete code; delete order; diff --git a/examples/sort/sort.ispc b/examples/sort/sort.ispc index 65df4736..25ea90f4 100644 --- a/examples/sort/sort.ispc +++ b/examples/sort/sort.ispc @@ -1,5 +1,5 @@ /* - Copyright (c) 2013, Intel Corporation + Copyright (c) 2013, Durham University All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,7 +13,7 @@ notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its + * Neither the name of Durham University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. @@ -172,7 +172,7 @@ task void bumpup (uniform int h[], uniform int g[]) static void prefix_sum (uniform int num, uniform int h[]) { - uniform int * uniform g = uniform new int [num+1]; + uniform int * uniform g = uniform new uniform int [num+1]; uniform int i; launch[num] addup (h, g+1); @@ -191,9 +191,9 @@ export void sort_ispc (uniform int n, uniform unsigned int code[], uniform int o uniform int num = ntasks < 1 ? num_cores () : ntasks; uniform int span = n / num; uniform int hsize = 256*programCount*num; - uniform int * uniform hist = uniform new int [hsize]; - uniform int64 * uniform pair = uniform new int64 [n]; - uniform int64 * uniform temp = uniform new int64 [n]; + uniform int * uniform hist = uniform new uniform int [hsize]; + uniform int64 * uniform pair = uniform new uniform int64 [n]; + uniform int64 * uniform temp = uniform new uniform int64 [n]; uniform int pass, i; #if DEBUG diff --git a/examples/sort/sort_serial.cpp b/examples/sort/sort_serial.cpp index ba955c77..38bbdda6 100644 --- a/examples/sort/sort_serial.cpp +++ b/examples/sort/sort_serial.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2013, Intel Corporation + Copyright (c) 2013, Durham University All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,7 +13,7 @@ notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its + * Neither the name of Durham University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
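// Illustrative aside (not part of the patch): the "million cycles" figures printed by the examples above come from an rdtsc-style cycle counter like the __clock() helpers this patch adds to the intrinsics headers. A hedged, self-contained sketch of the same idea using the compiler intrinsic instead of inline asm; the function name below is illustrative only.
#include <stdint.h>
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif

static inline uint64_t read_cycle_counter(void) {
    // Read the time-stamp counter; the __clock() helpers above additionally
    // execute cpuid first to serialize instruction ordering before rdtsc.
    return __rdtsc();
}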
diff --git a/examples/stencil/stencil.cpp b/examples/stencil/stencil.cpp index 9d5b3ee6..593d901f 100644 --- a/examples/stencil/stencil.cpp +++ b/examples/stencil/stencil.cpp @@ -130,7 +130,7 @@ int main() { minTimeSerial = std::min(minTimeSerial, dt); } - printf("[stencil serial]:\t\t[%.3f] millon cycles\n", minTimeSerial); + printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial); printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks); diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp index b4ced5c7..c9c2fa7b 100644 --- a/examples/tasksys.cpp +++ b/examples/tasksys.cpp @@ -365,7 +365,7 @@ lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) static inline int32_t lAtomicAdd(volatile int32_t *v, int32_t delta) { #ifdef ISPC_IS_WINDOWS - return InterlockedAdd((volatile LONG *)v, delta); + return InterlockedExchangeAdd((volatile LONG *)v, delta)+delta; #else return __sync_fetch_and_add(v, delta); #endif diff --git a/examples/volume_rendering/volume.cpp b/examples/volume_rendering/volume.cpp index 7d8b8e99..458cd407 100644 --- a/examples/volume_rendering/volume.cpp +++ b/examples/volume_rendering/volume.cpp @@ -204,7 +204,7 @@ int main(int argc, char *argv[]) { minSerial = std::min(minSerial, dt); } - printf("[volume serial]:\t\t[%.3f] millon cycles\n", minSerial); + printf("[volume serial]:\t\t[%.3f] million cycles\n", minSerial); writePPM(image, width, height, "volume-serial.ppm"); printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", diff --git a/expr.cpp b/expr.cpp index fc3d295a..614cb5e5 100644 --- a/expr.cpp +++ b/expr.cpp @@ -1911,6 +1911,40 @@ lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1, } +/* Returns true if shifting right by the given amount will lead to + inefficient code. (Assumes x86 target. May also warn inaccurately if + later optimization simplify the shift amount more than we are able to + see at this point.) */ +static bool +lIsDifficultShiftAmount(Expr *expr) { + // Uniform shifts (of uniform values) are no problem. + if (expr->GetType()->IsVaryingType() == false) + return false; + + ConstExpr *ce = dynamic_cast(expr); + if (ce) { + // If the shift is by a constant amount, *and* it's the same amount + // in all vector lanes, we're in good shape. + uint32_t amount[ISPC_MAX_NVEC]; + int count = ce->GetValues(amount); + for (int i = 1; i < count; ++i) + if (amount[i] != amount[0]) + return true; + return false; + } + + TypeCastExpr *tce = dynamic_cast(expr); + if (tce && tce->expr) { + // Finally, if the shift amount is given by a uniform value that's + // been smeared out into a varying, we have the same shift for all + // lanes and are also in good shape. 
+ return (tce->expr->GetType()->IsUniformType() == false); + } + + return true; +} + + llvm::Value * BinaryExpr::GetValue(FunctionEmitContext *ctx) const { if (!arg0 || !arg1) { @@ -1951,9 +1985,8 @@ BinaryExpr::GetValue(FunctionEmitContext *ctx) const { case BitAnd: case BitXor: case BitOr: { - if (op == Shr && arg1->GetType()->IsVaryingType() && - dynamic_cast(arg1) == NULL) - PerformanceWarning(pos, "Shift right is extremely inefficient for " + if (op == Shr && lIsDifficultShiftAmount(arg1)) + PerformanceWarning(pos, "Shift right is inefficient for " "varying shift amounts."); return lEmitBinaryBitOp(op, value0, value1, arg0->GetType()->IsUnsignedType(), ctx); @@ -2207,6 +2240,49 @@ lConstFoldBinaryIntOp(ConstExpr *constArg0, ConstExpr *constArg1, } +/* Returns true if the given arguments (which are assumed to be the + operands of a divide) represent a divide that can be performed by one of + the __fast_idiv functions. + */ +static bool +lCanImproveVectorDivide(Expr *arg0, Expr *arg1, int *divisor) { + const Type *type = arg0->GetType(); + if (!type) + return false; + + // The value being divided must be an int8/16/32. + if (!(Type::EqualIgnoringConst(type, AtomicType::VaryingInt8) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt8) || + Type::EqualIgnoringConst(type, AtomicType::VaryingInt16) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt16) || + Type::EqualIgnoringConst(type, AtomicType::VaryingInt32) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt32))) + return false; + + // The divisor must be the same compile-time constant value for all of + // the vector lanes. + ConstExpr *ce = dynamic_cast(arg1); + if (!ce) + return false; + int64_t div[ISPC_MAX_NVEC]; + int count = ce->GetValues(div); + for (int i = 1; i < count; ++i) + if (div[i] != div[0]) + return false; + *divisor = div[0]; + + // And finally, the divisor must be >= 2 and <128 (for 8-bit divides), + // and <256 otherwise. + if (*divisor < 2) + return false; + if (Type::EqualIgnoringConst(type, AtomicType::VaryingInt8) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt8)) + return *divisor < 128; + else + return *divisor < 256; +} + + Expr * BinaryExpr::Optimize() { if (arg0 == NULL || arg1 == NULL) @@ -2269,6 +2345,32 @@ BinaryExpr::Optimize() { } } + int divisor; + if (op == Div && lCanImproveVectorDivide(arg0, arg1, &divisor)) { + Debug(pos, "Improving vector divide by constant %d", divisor); + + std::vector idivFuns; + m->symbolTable->LookupFunction("__fast_idiv", &idivFuns); + if (idivFuns.size() == 0) { + Warning(pos, "Couldn't find __fast_idiv to optimize integer divide. " + "Are you compiling with --nostdlib?"); + return this; + } + + Expr *idivSymExpr = new FunctionSymbolExpr("__fast_idiv", idivFuns, pos); + ExprList *args = new ExprList(arg0, pos); + args->exprs.push_back(new ConstExpr(AtomicType::UniformInt32, divisor, arg1->pos)); + Expr *idivCall = new FunctionCallExpr(idivSymExpr, args, pos); + + idivCall = ::TypeCheck(idivCall); + if (idivCall == NULL) + return NULL; + + Assert(Type::EqualIgnoringConst(GetType(), idivCall->GetType())); + idivCall = new TypeCastExpr(GetType(), idivCall, pos); + return ::Optimize(idivCall); + } + // From here on out, we're just doing constant folding, so if both args // aren't constants then we're done... 
if (constArg0 == NULL || constArg1 == NULL) @@ -3021,6 +3123,14 @@ static llvm::Value * lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, llvm::Value *expr1, llvm::Value *expr2, const Type *type) { +#if 0 // !defined(LLVM_3_1) + // Though it should be equivalent, this seems to cause non-trivial + // performance regressions versus the below. This may be related to + // http://llvm.org/bugs/show_bug.cgi?id=16941. + if (test->getType() != LLVMTypes::Int1VectorType) + test = ctx->TruncInst(test, LLVMTypes::Int1VectorType); + return ctx->SelectInst(test, expr1, expr2, "select"); +#else llvm::Value *resultPtr = ctx->AllocaInst(expr1->getType(), "selectexpr_tmp"); // Don't need to worry about masking here ctx->StoreInst(expr2, resultPtr); @@ -3029,6 +3139,7 @@ lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, PointerType::GetUniform(type)->LLVMType(g->ctx)); ctx->StoreInst(expr1, resultPtr, test, type, PointerType::GetUniform(type)); return ctx->LoadInst(resultPtr, "selectexpr_final"); +#endif // !LLVM_3_1 } @@ -6059,9 +6170,9 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) - // If we have a bool vector of i32 elements, first truncate - // down to a single bit + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) + // If we have a bool vector of non-i1 elements, first + // truncate down to a single bit. exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); // And then do an unisgned int->float cast cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int @@ -6103,8 +6214,8 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) - // truncate i32 bool vector values to i1s + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) + // truncate bool vector values to i1s exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int to double exprVal, targetType, cOpName); @@ -6141,7 +6252,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6177,7 +6288,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6219,7 +6330,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6259,7 +6370,7 @@ 
lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6305,7 +6416,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6345,7 +6456,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6391,7 +6502,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6429,7 +6540,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6523,12 +6634,12 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, if (fromType->IsUniformType()) { if (toType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) { - // extend out to i32 bool values from i1 here. then we'll - // turn into a vector below, the way it does for everyone - // else... + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) { + // extend out to a bool as an i8/i16/i32 from the i1 here. + // Then we'll turn that into a vector below, the way it + // does for everyone else... cast = ctx->SExtInst(cast, LLVMTypes::BoolVectorType->getElementType(), - LLVMGetName(cast, "to_i32bool")); + LLVMGetName(cast, "to_i_bool")); } } else diff --git a/fail_db.txt b/fail_db.txt new file mode 100644 index 00000000..31db9961 --- /dev/null +++ b/fail_db.txt @@ -0,0 +1,951 @@ +% List of known fails. +% The list is unordered and contains information about commonly used platforms / configurations. +% Our goal is to maintain this list for Linux, MacOS and Windows with reasonably new compilers. +% Note that it's important which C++ compiler was used. For example, gcc 4.4 is known to produce +% considerably more fails with generic targets than gcc 4.7 or later. +% Using old compilers (gcc 4.4 is considered to be relatively old) may cause LLVM bugs. +% To avoid them, you can use LLVM selfbuild.
+% +./tests/masked-scatter-vector.ispc runfail x86-64 sse2-i32x4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-11.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-11.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-uint16.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-uint8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-uint16.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * 
+./tests/avg-up-uint8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/broadcast-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/broadcast-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/count-leading-trailing-zeros-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/count-leading-trailing-zeros-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/exclusive-scan-add-10.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/exclusive-scan-add-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/exclusive-scan-add-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/exclusive-scan-and-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/exclusive-scan-or-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-uniform-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-uniform-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-uniform-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-varying-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-varying-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-varying-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-varying-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/half-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/idiv.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/int64-max-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/int64-max.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/int64-min-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/int64-min.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-11.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-12.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-13.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/new-delete-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/phi-opts-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/phi-opts-4.ispc 
runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/popcnt-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/popcnt-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/popcnt-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-add-int16-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-add-int16.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-10.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-12.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-13.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/rotate-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/rotate-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/rotate-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/rotate-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/rotate-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/rotate.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/short-vec-14.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle-flatten.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-11.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/soa-27.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/soa-28.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-128.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-129.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-130.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-57.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/uint64-max-1.ispc runfail x86-64 generic-4 Linux 
LLVM 3.3 g++4.4 -O2 * +./tests/uint64-max.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/uint64-min-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/uint64-min.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-gather-ifs.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-gather-multi-unif.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-gather-unif.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-mixed-unif-vary-indexing-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-mixed-unif-vary-indexing-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-mixed-unif-vary-indexing.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-multidim-gather.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-scatter-unif-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-scatter-vary.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-struct-gather.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-down-uint16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-down-uint8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-up-uint16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * 
+./tests/avg-up-uint8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/broadcast-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/broadcast-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/cfor-array-gather-ifs.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/cfor-array-gather-unif.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/cfor-array-multidim-gather.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/cfor-array-struct-gather.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/cfor-struct-test-114.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/cfor-unif-struct-test-114.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/count-leading-trailing-zeros-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/count-leading-trailing-zeros-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/exclusive-scan-add-10.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/exclusive-scan-add-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/exclusive-scan-add-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/exclusive-scan-and-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/exclusive-scan-or-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-uniform-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-uniform-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-uniform-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-varying-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-varying-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-varying-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-varying-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/gather-int16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/gather-to-vload-neg-offset.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/global-array-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/half-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/idiv.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/int64-max-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/int64-max.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/int64-min-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/int64-min.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-11.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-12.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-13.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 
* +./tests/local-atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/masked-scatter-struct.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/masked-scatter-vector.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/nested-structs-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/new-delete-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/pass-varying-lvalue-to-ref.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/phi-opts-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/phi-opts-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/popcnt-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/popcnt-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/popcnt-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-add-int16-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-add-int16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-10.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-12.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-13.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/rotate-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/rotate-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/rotate-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/rotate-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/rotate-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/rotate.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/scatter-int16-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/scatter-int16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/scatter-mask-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/scatter-mask-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/short-vec-12.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/short-vec-14.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle-flatten.ispc runfail 
x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-11.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/soa-28.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/struct-test-114.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-128.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-129.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-130.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-57.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/uint64-max-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/uint64-max.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/uint64-min-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/uint64-min.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/unif-struct-test-114.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/varying-struct-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/varying-struct-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/varying-struct-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/write-same-loc.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-11.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * 
+./tests/local-atomics-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-8.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-swap.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/memset-varying.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-1.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/masked-scatter-struct.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail 
x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/broadcast-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/half-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-11.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-12.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-13.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/memset-varying.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/rotate-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/rotate-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle-flatten.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle.ispc 
runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-10.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-11.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/test-129.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/test-130.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/rotate.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shift1.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shift1.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/broadcast-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/half-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-1.ispc runfail x86-64 generic-4 Linux LLVM 
3.4 g++4.7 -O2 * +./tests/local-atomics-11.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-12.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-13.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/memset-varying.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/rotate-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/rotate-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle-flatten.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-10.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-11.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/test-129.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/test-130.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-11.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * 
+./tests/local-atomics-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-8.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-swap.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/memset-varying.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-1.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shift1.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shift1.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * 
+./tests/reduce-equal-11.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/broadcast-1.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/half-3.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-1.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-13.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-5.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-9.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/memset-varying.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/rotate-2.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle-4.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-1.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-10.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-11.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-2.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-3.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-4.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-5.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-6.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-7.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-8.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 
-O2 * +./tests/shuffle2-9.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/test-129.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/test-130.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-11.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-12.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-13.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-5.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-6.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-7.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-8.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-swap.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/memset-varying.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-1.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-12.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * 
+./tests/reduce-equal-2.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-3.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-7.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/broadcast-1.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/half-3.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-1.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-13.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-5.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-9.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/memset-varying.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/rotate-2.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle-4.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-1.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-10.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-11.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-2.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-3.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-4.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-5.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-6.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-7.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-8.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-9.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/test-129.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/test-130.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc 
runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\atomics-13.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc compfail x86 sse4-i16x8 Windows LLVM 
3.3 cl -O2 * +.\tests\reduce-equal-11.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-13.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-5.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-6.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-4.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-5.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-6.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\atomics-13.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-11.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-13.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-5.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-6.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * 
+.\tests\switch-10.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\avg-down-int8.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\avg-up-int8.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc 
runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\avg-down-int8.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\avg-up-int8.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\atomics-13.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-11.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * 
+.\tests\funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\atomics-13.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\avg-down-int8.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\avg-up-int8.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\test-141.ispc runfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * 
+.\tests\max-uint.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\test-141.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx2-i32x16 Windows 
LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1-i64x4 Windows LLVM 3.4 cl -O2 * diff --git a/ispc.cpp b/ispc.cpp index 480ff99a..0d9a4190 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -106,7 +106,7 @@ static void __cpuidex(int info[4], int level, int count) { static const char * lGetSystemISA() { #ifdef __arm__ - return "neon"; + return "neon-i32x4"; #else int info[4]; __cpuid(info, 1); @@ -121,19 +121,19 @@ lGetSystemISA() { int info2[4]; __cpuidex(info2, 7, 0); if ((info2[1] & (1 << 5)) != 0) - return "avx2"; + return "avx2-i32x8"; else - return "avx1.1"; + return "avx1.1-i32x8"; } // Regular AVX - return "avx"; + return "avx1-i32x8"; } else if ((info[2] & (1 << 19)) != 0) - return "sse4"; + return "sse4-i32x4"; else if ((info[3] & (1 << 26)) != 0) - return "sse2"; + return "sse2-i32x4"; else { - fprintf(stderr, "Unable to detect supported SSE/AVX ISA. Exiting.\n"); + Error(SourcePos(), "Unable to detect supported SSE/AVX ISA. Exiting."); exit(1); } #endif @@ -141,14 +141,20 @@ lGetSystemISA() { static const char *supportedCPUs[] = { + "sm_35", +#ifdef ISPC_ARM_ENABLED // FIXME: LLVM supports a ton of different ARM CPU variants--not just // cortex-a9 and a15. We should be able to handle any of them that also // have NEON support. - "sm_35", "cortex-a9", "cortex-a15", + "cortex-a9", "cortex-a15", +#endif "atom", "penryn", "core2", "corei7", "corei7-avx" #if !defined(LLVM_3_1) , "core-avx-i", "core-avx2" #endif // LLVM 3.2+ +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) + , "slm" +#endif // LLVM 3.4+ }; Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : @@ -184,22 +190,25 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // If a CPU was specified explicitly, try to pick the best // possible ISA based on that. 
if (!strcmp(cpu, "core-avx2")) - isa = "avx2"; - else if (!strcmp(cpu, "cortex-a9") || - !strcmp(cpu, "cortex-a15")) - isa = "neon"; + isa = "avx2-i32x8"; else if (!strcmp(cpu, "sm_35")) isa = "nvptx64"; +#ifdef ISPC_ARM_ENABLED + else if (!strcmp(cpu, "cortex-a9") || + !strcmp(cpu, "cortex-a15")) + isa = "neon-i32x4"; +#endif else if (!strcmp(cpu, "core-avx-i")) - isa = "avx1.1"; + isa = "avx1.1-i32x8"; else if (!strcmp(cpu, "sandybridge") || !strcmp(cpu, "corei7-avx")) - isa = "avx"; + isa = "avx1-i32x8"; else if (!strcmp(cpu, "corei7") || - !strcmp(cpu, "penryn")) - isa = "sse4"; + !strcmp(cpu, "penryn") || + !strcmp(cpu, "slm")) + isa = "sse4-i32x4"; else - isa = "sse2"; + isa = "sse2-i32x4"; Warning(SourcePos(), "No --target specified on command-line. " "Using ISA \"%s\" based on specified CPU \"%s\".", isa, cpu); @@ -209,12 +218,12 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // supports. isa = lGetSystemISA(); Warning(SourcePos(), "No --target specified on command-line. " - "Using system ISA \"%s\".", isa); + "Using default system target \"%s\".", isa); } } -#if !defined(__arm__) - if (cpu == NULL && !strcmp(isa, "neon")) +#if defined(ISPC_ARM_ENABLED) && !defined(__arm__) + if (cpu == NULL && !strncmp(isa, "neon", 4)) // If we're compiling NEON on an x86 host and the CPU wasn't // supplied, don't go and set the CPU based on the host... cpu = "cortex-a9"; @@ -242,8 +251,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } } if (foundCPU == false) { - fprintf(stderr, "Error: CPU type \"%s\" unknown. Supported CPUs: " - "%s.\n", cpu, SupportedTargetCPUs().c_str()); + Error(SourcePos(), "Error: CPU type \"%s\" unknown. Supported CPUs: " + "%s.", cpu, SupportedCPUs().c_str()); return; } } @@ -251,10 +260,12 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_cpu = cpu; if (arch == NULL) { - if (!strcmp(isa, "neon")) - arch = "arm"; - else if (!strcmp(isa, "nvptx64")) + if (!strcmp(isa, "nvptx64")) arch = "nvptx64"; +#ifdef ISPC_ARM_ENABLED + else if (!strncmp(isa, "neon", 4)) + arch = "arm"; +#endif else arch = "x86-64"; } @@ -284,40 +295,98 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } // Check default LLVM generated targets - if (!strcasecmp(isa, "sse2")) { + if (!strcasecmp(isa, "sse2") || + !strcasecmp(isa, "sse2-i32x4")) { this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; - this->m_attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt"; + this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" +#if defined(LLVM_3_4) + ",-sse4.1,-sse4.2" +#else + ",-sse41,-sse42" +#endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "sse2-x2")) { + else if (!strcasecmp(isa, "sse2-x2") || + !strcasecmp(isa, "sse2-i32x8")) { this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 8; - this->m_attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt"; + this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" +#if defined(LLVM_3_4) + ",-sse4.1,-sse4.2" +#else + ",-sse41,-sse42" +#endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "sse4")) { + else if (!strcasecmp(isa, "sse4") || + !strcasecmp(isa, "sse4-i32x4")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; // TODO: why not sse42 and popcnt? 
- this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" +#if defined(LLVM_3_4) + ",+sse4.1,-sse4.2" +#else + ",+sse41,-sse42" +#endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "sse4x2") || !strcasecmp(isa, "sse4-x2")) { + else if (!strcasecmp(isa, "sse4x2") || + !strcasecmp(isa, "sse4-x2") || + !strcasecmp(isa, "sse4-i32x8")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 8; - this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" +#if defined(LLVM_3_4) + ",+sse4.1,-sse4.2" +#else + ",+sse41,-sse42" +#endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "generic-4")) { + else if (!strcasecmp(isa, "sse4-i8x16")) { + this->m_isa = Target::SSE4; + this->m_nativeVectorWidth = 16; + this->m_vectorWidth = 16; + this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" +#if defined(LLVM_3_4) + ",+sse4.1,-sse4.2" +#else + ",+sse41,-sse42" +#endif + ; + this->m_maskingIsFree = false; + this->m_maskBitCount = 8; + } + else if (!strcasecmp(isa, "sse4-i16x8")) { + this->m_isa = Target::SSE4; + this->m_nativeVectorWidth = 8; + this->m_vectorWidth = 8; + this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" +#if defined(LLVM_3_4) + ",+sse4.1,-sse4.2" +#else + ",+sse41,-sse42" +#endif + ; + this->m_maskingIsFree = false; + this->m_maskBitCount = 16; + } + else if (!strcasecmp(isa, "generic-4") || + !strcasecmp(isa, "generic-x4")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; @@ -327,7 +396,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-8")) { + else if (!strcasecmp(isa, "generic-8") || + !strcasecmp(isa, "generic-x8")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -337,7 +407,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-16")) { + else if (!strcasecmp(isa, "generic-16") || + !strcasecmp(isa, "generic-x16")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; @@ -347,7 +418,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-32")) { + else if (!strcasecmp(isa, "generic-32") || + !strcasecmp(isa, "generic-x32")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 32; this->m_vectorWidth = 32; @@ -357,7 +429,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-64")) { + else if (!strcasecmp(isa, "generic-64") || + !strcasecmp(isa, "generic-x64")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 64; this->m_vectorWidth = 64; @@ -367,14 +440,17 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } 
- else if (!strcasecmp(isa, "generic-1")) { + else if (!strcasecmp(isa, "generic-1") || + !strcasecmp(isa, "generic-x1")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 1; this->m_vectorWidth = 1; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "avx") || !strcasecmp(isa, "avx1")) { + else if (!strcasecmp(isa, "avx") || + !strcasecmp(isa, "avx1") || + !strcasecmp(isa, "avx1-i32x8")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -382,7 +458,18 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "avx-x2") || !strcasecmp(isa, "avx1-x2")) { + else if (!strcasecmp(isa, "avx-i64x4") || + !strcasecmp(isa, "avx1-i64x4")) { + this->m_isa = Target::AVX; + this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ + this->m_vectorWidth = 4; + this->m_attributes = "+avx,+popcnt,+cmov"; + this->m_maskingIsFree = false; + this->m_maskBitCount = 64; + } + else if (!strcasecmp(isa, "avx-x2") || + !strcasecmp(isa, "avx1-x2") || + !strcasecmp(isa, "avx1-i32x16")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 16; @@ -390,11 +477,18 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "avx1.1")) { + else if (!strcasecmp(isa, "avx1.1") || + !strcasecmp(isa, "avx1.1-i32x8")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; - this->m_attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand"; + this->m_attributes = "+avx,+popcnt,+cmov,+f16c" +#if defined(LLVM_3_4) + ",+rdrnd" +#else + ",+rdrand" +#endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; @@ -403,11 +497,18 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasRand = true; #endif } - else if (!strcasecmp(isa, "avx1.1-x2")) { + else if (!strcasecmp(isa, "avx1.1-x2") || + !strcasecmp(isa, "avx1.1-i32x16")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 16; - this->m_attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand"; + this->m_attributes = "+avx,+popcnt,+cmov,+f16c" +#if defined(LLVM_3_4) + ",+rdrnd" +#else + ",+rdrand" +#endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; @@ -416,11 +517,17 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasRand = true; #endif } - else if (!strcasecmp(isa, "avx2")) { + else if (!strcasecmp(isa, "avx2") || + !strcasecmp(isa, "avx2-i32x8")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" +#if defined(LLVM_3_4) + ",+rdrnd" +#else + ",+rdrand" +#endif #ifndef LLVM_3_1 ",+fma" #endif // !LLVM_3_1 @@ -434,11 +541,17 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasGather = true; #endif } - else if (!strcasecmp(isa, "avx2-x2")) { + else if (!strcasecmp(isa, "avx2-x2") || + !strcasecmp(isa, "avx2-i32x16")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" +#if defined(LLVM_3_4) + ",+rdrnd" +#else + 
",+rdrand" +#endif #ifndef LLVM_3_1 ",+fma" #endif // !LLVM_3_1 @@ -452,8 +565,28 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasGather = true; #endif } - else if (!strcasecmp(isa, "neon")) { - this->m_isa = Target::NEON; +#ifdef ISPC_ARM_ENABLED + else if (!strcasecmp(isa, "neon-i8x16")) { + this->m_isa = Target::NEON8; + this->m_nativeVectorWidth = 16; + this->m_vectorWidth = 16; + this->m_attributes = "+neon,+fp16"; + this->m_hasHalf = true; // ?? + this->m_maskingIsFree = false; + this->m_maskBitCount = 8; + } + else if (!strcasecmp(isa, "neon-i16x8")) { + this->m_isa = Target::NEON16; + this->m_nativeVectorWidth = 8; + this->m_vectorWidth = 8; + this->m_attributes = "+neon,+fp16"; + this->m_hasHalf = true; // ?? + this->m_maskingIsFree = false; + this->m_maskBitCount = 16; + } + else if (!strcasecmp(isa, "neon") || + !strcasecmp(isa, "neon-i32x4")) { + this->m_isa = Target::NEON32; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; this->m_attributes = "+neon,+fp16"; @@ -461,6 +594,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } +#endif else if (!strcasecmp(isa, "nvptx64")) { this->m_isa = Target::NVPTX64; this->m_nativeVectorWidth = 1; @@ -478,8 +612,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : #endif } else { - fprintf(stderr, "Target ISA \"%s\" is unknown. Choices are: %s\n", - isa, SupportedTargetISAs()); + Error(SourcePos(), "Target \"%s\" is unknown. Choices are: %s.", + isa, SupportedTargets()); error = true; } @@ -491,8 +625,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : llvm::Reloc::Default; std::string featuresString = m_attributes; llvm::TargetOptions options; - if (m_isa == Target::NEON) +#ifdef ISPC_ARM_ENABLED + if (m_isa == Target::NEON8 || m_isa == Target::NEON16 || + m_isa == Target::NEON32) options.FloatABIType = llvm::FloatABI::Hard; +#endif #if !defined(LLVM_3_1) if (g->opt.disableFMA == false) options.AllowFPOpFusion = llvm::FPOpFusion::Fast; @@ -551,6 +688,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // Initialize target-specific "target-feature" attribute. 
if (!m_attributes.empty()) { llvm::AttrBuilder attrBuilder; + if (m_isa != Target::NVPTX64) + attrBuilder.addAttribute("target-cpu", this->m_cpu); attrBuilder.addAttribute("target-features", this->m_attributes); this->m_tf_attributes = new llvm::AttributeSet( llvm::AttributeSet::get( @@ -570,7 +709,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : std::string -Target::SupportedTargetCPUs() { +Target::SupportedCPUs() { std::string ret; int count = sizeof(supportedCPUs) / sizeof(supportedCPUs[0]); for (int i = 0; i < count; ++i) { @@ -583,30 +722,45 @@ Target::SupportedTargetCPUs() { const char * -Target::SupportedTargetArchs() { - return "nvptx64, arm, x86, x86-64"; +Target::SupportedArchs() { + return "nvptx64, " +#ifdef ISPC_ARM_ENABLED + "arm, " +#endif + "x86, x86-64"; } const char * -Target::SupportedTargetISAs() { - return "nvptx64, neon, sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2" - ", avx1.1, avx1.1-x2, avx2, avx2-x2" - ", generic-1, generic-4, generic-8, generic-16, generic-32"; +Target::SupportedTargets() { + return "nvptx64, " +#ifdef ISPC_ARM_ENABLED + "neon-i8x16, neon-16x8, neon-32x4, " +#endif + "sse2-i32x4, sse2-i32x8, " + "sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, " + "avx1-i32x8, avx1-i32x16, avx1-i64x4, " + "avx1.1-i32x8, avx1.1-i32x16, " + "avx2-i32x8, avx2-i32x16, " + "generic-x1, generic-x4, generic-x8, generic-x16, " + "generic-x32, generic-x64"; } std::string Target::GetTripleString() const { llvm::Triple triple; - if (m_arch == "arm") { - triple.setTriple("armv7-eabi"); - } - else if (m_arch == "nvptx64") + if (m_arch == "nvptx64") { triple.setTriple("nvptx64"); } - else { +#ifdef ISPC_ARM_ENABLED + else if (m_arch == "arm") { + triple.setTriple("armv7-eabi"); + } +#endif + else + { // Start with the host triple as the default triple.setTriple(llvm::sys::getDefaultTargetTriple()); @@ -629,10 +783,16 @@ Target::GetTripleString() const { const char * Target::ISAToString(ISA isa) { switch (isa) { - case Target::NEON: - return "neon"; case Target::NVPTX64: return "nvptx64"; +#ifdef ISPC_ARM_ENABLED + case Target::NEON8: + return "neon-8"; + case Target::NEON16: + return "neon-16"; + case Target::NEON32: + return "neon-32"; +#endif case Target::SSE2: return "sse2"; case Target::SSE4: @@ -803,6 +963,7 @@ Globals::Globals() { includeStdlib = true; runCPP = true; debugPrint = false; + debugIR = -1; disableWarnings = false; warningsAsErrors = false; quiet = false; diff --git a/ispc.h b/ispc.h index de41a3e8..e2a58ba9 100644 --- a/ispc.h +++ b/ispc.h @@ -38,7 +38,7 @@ #ifndef ISPC_H #define ISPC_H -#define ISPC_VERSION "1.4.5dev" +#define ISPC_VERSION "1.5.1dev" #if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) #error "Only LLVM 3.1, 3.2, 3.3 and the 3.4 development branch are supported" @@ -59,6 +59,7 @@ #include #include #include +#include #include /** @def ISPC_MAX_NVEC maximum vector size of any of the compliation @@ -66,6 +67,9 @@ */ #define ISPC_MAX_NVEC 64 +// Number of final optimization phase +#define LAST_OPT_NUMBER 1000 + // Forward declarations of a number of widely-used LLVM types namespace llvm { class AttributeSet; @@ -175,7 +179,12 @@ public: flexible/performant of them will apear last in the enumerant. Note also that __best_available_isa() needs to be updated if ISAs are added or the enumerant values are reordered. 
*/ - enum ISA { NVPTX64, NEON, SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, NUM_ISAS }; + enum ISA { NVPTX64, +#ifdef ISPC_ARM_ENABLED + NEON32, NEON16, NEON8, +#endif + SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, + NUM_ISAS }; /** Initializes the given Target pointer for a target of the given name, if the name is a known target. Returns true if the @@ -183,16 +192,16 @@ public: Target(const char *arch, const char *cpu, const char *isa, bool pic); /** Returns a comma-delimited string giving the names of the currently - supported target ISAs. */ - static const char *SupportedTargetISAs(); + supported compilation targets. */ + static const char *SupportedTargets(); /** Returns a comma-delimited string giving the names of the currently - supported target CPUs. */ - static std::string SupportedTargetCPUs(); + supported CPUs. */ + static std::string SupportedCPUs(); /** Returns a comma-delimited string giving the names of the currently - supported target architectures. */ - static const char *SupportedTargetArchs(); + supported architectures. */ + static const char *SupportedArchs(); /** Returns a triple string specifying the target architecture, vendor, and environment. */ @@ -494,6 +503,16 @@ struct Globals { ispc's execution. */ bool debugPrint; + /** Indicates which stages of optimization we want to dump. */ + std::set debug_stages; + + /** Indicates after which optimization we want to generate + DebugIR information. */ + int debugIR; + + /** Indicates which phases of optimization we want to switch off. */ + std::set off_stages; + /** Indicates whether all warning messages should be surpressed. */ bool disableWarnings; diff --git a/ispc.vcxproj b/ispc.vcxproj index 96682fe3..58fa5b08 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -22,6 +22,8 @@ + + @@ -45,18 +47,23 @@ - - + + + + - - + + + + + 4146;4800;4996;4355;4624;4005;4003;4018 @@ -99,11 +106,14 @@ Document - %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py x86 > $(Configuration)/gen-stdlib-x86.cpp; -%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DISPC_TARGET_GENERIC=1 -DPI=3.1415926535 | python stdlib2cpp.py generic > $(Configuration)/gen-stdlib-generic.cpp; + %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.14159265358979 | python stdlib2cpp.py mask1 > $(Configuration)/gen-stdlib-mask1.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.14159265358979 | python stdlib2cpp.py mask8 > $(Configuration)/gen-stdlib-mask8.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.14159265358979 | python stdlib2cpp.py mask16 > $(Configuration)/gen-stdlib-mask16.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.14159265358979 | python stdlib2cpp.py mask32 > $(Configuration)/gen-stdlib-mask32.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=64 -DISPC=1 -DPI=3.14159265358979 | python stdlib2cpp.py mask64 > $(Configuration)/gen-stdlib-mask64.cpp; - $(Configuration)/gen-stdlib-generic.cpp;$(Configuration)/gen-stdlib-x86.cpp - Building gen-stdlib-{generic,x86}.cpp + $(Configuration)/gen-stdlib-mask1.cpp;$(Configuration)/gen-stdlib-mask8.cpp;$(Configuration)/gen-stdlib-mask16.cpp;$(Configuration)/gen-stdlib-mask32.cpp;$(Configuration)/gen-stdlib-mask64.cpp + Building gen-stdlib-{mask1,8,16,32,64}.cpp @@ -111,7 +121,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins/dispatch.ll | 
python bitcode2cpp.py dispatch.ll > $(Configuration)/gen-bitcode-dispatch.cpp $(Configuration)/gen-bitcode-dispatch.cpp - builtins\util.m4 + builtins\util.m4;builtins\svml.m4 Building gen-bitcode-dispatch.cpp @@ -120,7 +130,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 32bit > $(Configuration)/gen-bitcode-sse4-32bit.cpp $(Configuration)/gen-bitcode-sse4-32bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-32bit.cpp @@ -129,16 +139,52 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 64bit > $(Configuration)/gen-bitcode-sse4-64bit.cpp $(Configuration)/gen-bitcode-sse4-64bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-64bit.cpp + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 32bit > $(Configuration)/gen-bitcode-sse4-8-32bit.cpp + $(Configuration)/gen-bitcode-sse4-8-32bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-8-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 64bit > $(Configuration)/gen-bitcode-sse4-8-64bit.cpp + $(Configuration)/gen-bitcode-sse4-8-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-8-64bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 32bit > $(Configuration)/gen-bitcode-sse4-16-32bit.cpp + $(Configuration)/gen-bitcode-sse4-16-32bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-16-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 64bit > $(Configuration)/gen-bitcode-sse4-16-64bit.cpp + $(Configuration)/gen-bitcode-sse4-16-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-16-64bit.cpp + + Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 32bit > $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-x2-32bit.cpp @@ -147,7 +193,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 64bit > $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-x2-64bit.cpp @@ -156,7 +202,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% 
-DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 32bit > $(Configuration)/gen-bitcode-sse2-32bit.cpp $(Configuration)/gen-bitcode-sse2-32bit.cpp - builtins\util.m4;builtins\target-sse2-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-32bit.cpp @@ -165,7 +211,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 64bit > $(Configuration)/gen-bitcode-sse2-64bit.cpp $(Configuration)/gen-bitcode-sse2-64bit.cpp - builtins\util.m4;builtins\target-sse2-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-64bit.cpp @@ -174,7 +220,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 32bit > $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp - builtins\util.m4;builtins\target-sse2-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-x2-32bit.cpp @@ -183,29 +229,16 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 64bit > $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp - builtins\util.m4;builtins\target-sse2-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-x2-64bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-neon.ll | python bitcode2cpp.py builtins\target-neon.ll > gen-bitcode-neon.cpp - gen-bitcode-neon.cpp - builtins\util.m4 - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-neon.ll | python bitcode2cpp.py builtins\target-neon.ll > gen-bitcode-neon.cpp - gen-bitcode-neon.cpp - builtins\util.m4 - Building gen-bitcode-neon.cpp - Building gen-bitcode-neon.cpp - - Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp $(Configuration)/gen-bitcode-avx1-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx1-32bit.cpp @@ -214,7 +247,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp $(Configuration)/gen-bitcode-avx1-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx1-64bit.cpp @@ -223,7 +256,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 32bit > $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx1-x2-32bit.cpp @@ -232,16 +265,34 @@ 
Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 64bit > $(Configuration)/gen-bitcode-avx1-x2-64bit.cpp $(Configuration)/gen-bitcode-avx1-x2-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx1-x2-64bit.cpp + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp + $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll + Building gen-bitcode-avx1-i64x4-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp + $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll + Building gen-bitcode-avx1-i64x4-64bit.cpp + + Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 32bit > $(Configuration)/gen-bitcode-avx11-32bit.cpp $(Configuration)/gen-bitcode-avx11-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx11-32bit.cpp @@ -250,7 +301,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 64bit > $(Configuration)/gen-bitcode-avx11-64bit.cpp $(Configuration)/gen-bitcode-avx11-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx11-64bit.cpp @@ -259,7 +310,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 32bit > $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx11-x2-32bit.cpp @@ -268,7 +319,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 64bit > $(Configuration)/gen-bitcode-avx11-x2-64bit.cpp $(Configuration)/gen-bitcode-avx11-x2-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx11-x2-64bit.cpp @@ -277,7 +328,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 32bit > $(Configuration)/gen-bitcode-avx2-32bit.cpp 
$(Configuration)/gen-bitcode-avx2-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx2-32bit.cpp @@ -286,7 +337,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 64bit > $(Configuration)/gen-bitcode-avx2-64bit.cpp $(Configuration)/gen-bitcode-avx2-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx2-64bit.cpp @@ -295,7 +346,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 32bit > $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx2-x2-32bit.cpp @@ -304,7 +355,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 64bit > $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx2-x2-64bit.cpp @@ -313,7 +364,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 32bit > $(Configuration)/gen-bitcode-generic-1-32bit.cpp $(Configuration)/gen-bitcode-generic-1-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-1-32bit.cpp @@ -322,7 +373,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 64bit > $(Configuration)/gen-bitcode-generic-1-64bit.cpp $(Configuration)/gen-bitcode-generic-1-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-1-64bit.cpp @@ -331,7 +382,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 32bit > $(Configuration)/gen-bitcode-generic-4-32bit.cpp $(Configuration)/gen-bitcode-generic-4-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-4-32bit.cpp @@ -340,7 +391,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 64bit > $(Configuration)/gen-bitcode-generic-4-64bit.cpp $(Configuration)/gen-bitcode-generic-4-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-4-64bit.cpp @@ -349,7 +400,7 @@ 
Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 32bit > $(Configuration)/gen-bitcode-generic-8-32bit.cpp $(Configuration)/gen-bitcode-generic-8-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-8-32bit.cpp @@ -358,7 +409,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 64bit > $(Configuration)/gen-bitcode-generic-8-64bit.cpp $(Configuration)/gen-bitcode-generic-8-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-8-64bit.cpp @@ -367,7 +418,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 32bit > $(Configuration)/gen-bitcode-generic-16-32bit.cpp $(Configuration)/gen-bitcode-generic-16-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-16-32bit.cpp @@ -376,7 +427,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 64bit > $(Configuration)/gen-bitcode-generic-16-64bit.cpp $(Configuration)/gen-bitcode-generic-16-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-16-64bit.cpp @@ -385,7 +436,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 32bit > $(Configuration)/gen-bitcode-generic-32-32bit.cpp $(Configuration)/gen-bitcode-generic-32-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-32-32bit.cpp @@ -394,7 +445,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 64bit > $(Configuration)/gen-bitcode-generic-32-64bit.cpp $(Configuration)/gen-bitcode-generic-32-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-32-64bit.cpp @@ -403,7 +454,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 32bit > $(Configuration)/gen-bitcode-generic-64-32bit.cpp $(Configuration)/gen-bitcode-generic-64-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-64-32bit.cpp @@ -412,7 +463,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 64bit > $(Configuration)/gen-bitcode-generic-64-64bit.cpp $(Configuration)/gen-bitcode-generic-64-64bit.cpp - 
builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-64-64bit.cpp diff --git a/lex.ll b/lex.ll index f6633fce..3655220f 100644 --- a/lex.ll +++ b/lex.ll @@ -76,7 +76,9 @@ static int allTokens[] = { TOKEN_TASK, TOKEN_TRUE, TOKEN_TYPEDEF, TOKEN_UNIFORM, TOKEN_UNMASKED, TOKEN_UNSIGNED, TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE, TOKEN_STRING_C_LITERAL, TOKEN_DOTDOTDOT, - TOKEN_FLOAT_CONSTANT, + TOKEN_FLOAT_CONSTANT, TOKEN_DOUBLE_CONSTANT, + TOKEN_INT8_CONSTANT, TOKEN_UINT8_CONSTANT, + TOKEN_INT16_CONSTANT, TOKEN_UINT16_CONSTANT, TOKEN_INT32_CONSTANT, TOKEN_UINT32_CONSTANT, TOKEN_INT64_CONSTANT, TOKEN_UINT64_CONSTANT, TOKEN_INC_OP, TOKEN_DEC_OP, TOKEN_LEFT_OP, TOKEN_RIGHT_OP, TOKEN_LE_OP, @@ -150,6 +152,11 @@ void ParserInit() { tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\""; tokenToName[TOKEN_DOTDOTDOT] = "..."; tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT"; + tokenToName[TOKEN_DOUBLE_CONSTANT] = "TOKEN_DOUBLE_CONSTANT"; + tokenToName[TOKEN_INT8_CONSTANT] = "TOKEN_INT8_CONSTANT"; + tokenToName[TOKEN_UINT8_CONSTANT] = "TOKEN_UINT8_CONSTANT"; + tokenToName[TOKEN_INT16_CONSTANT] = "TOKEN_INT16_CONSTANT"; + tokenToName[TOKEN_UINT16_CONSTANT] = "TOKEN_UINT16_CONSTANT"; tokenToName[TOKEN_INT32_CONSTANT] = "TOKEN_INT32_CONSTANT"; tokenToName[TOKEN_UINT32_CONSTANT] = "TOKEN_UINT32_CONSTANT"; tokenToName[TOKEN_INT64_CONSTANT] = "TOKEN_INT64_CONSTANT"; @@ -260,6 +267,11 @@ void ParserInit() { tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\""; tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'"; tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant"; + tokenNameRemap["TOKEN_DOUBLE_CONSTANT"] = "double constant"; + tokenNameRemap["TOKEN_INT8_CONSTANT"] = "int8 constant"; + tokenNameRemap["TOKEN_UINT8_CONSTANT"] = "unsigned int8 constant"; + tokenNameRemap["TOKEN_INT16_CONSTANT"] = "int16 constant"; + tokenNameRemap["TOKEN_UINT16_CONSTANT"] = "unsigned int16 constant"; tokenNameRemap["TOKEN_INT32_CONSTANT"] = "int32 constant"; tokenNameRemap["TOKEN_UINT32_CONSTANT"] = "unsigned int32 constant"; tokenNameRemap["TOKEN_INT64_CONSTANT"] = "int64 constant"; @@ -333,6 +345,9 @@ INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]* INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\. FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?) HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?) +FORTRAN_DOUBLE_NUMBER (([0-9]+\.[0-9]*[dD])|([0-9]+\.[0-9]*[dD][-+]?[0-9]+)|([0-9]+[dD][-+]?[0-9]+)|(\.[0-9]*[dD][-+]?[0-9]+)) + + IDENT [a-zA-Z_][a-zA-Z_0-9]* ZO_SWIZZLE ([01]+[w-z]+)+|([01]+[rgba]+)+|([01]+[uv]+)+ @@ -427,6 +442,17 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA return lParseInteger(true); } +{FORTRAN_DOUBLE_NUMBER} { + RT; + { + int i = 0; + while (yytext[i] != 'd' && yytext[i] != 'D') i++; + yytext[i] = 'E'; + } + yylval.doubleVal = atof(yytext); + return TOKEN_DOUBLE_CONSTANT; +} + {FLOAT_NUMBER} { RT; @@ -440,6 +466,8 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA return TOKEN_FLOAT_CONSTANT; } + + "++" { RT; return TOKEN_INC_OP; } "--" { RT; return TOKEN_DEC_OP; } "<<" { RT; return TOKEN_LEFT_OP; } @@ -599,7 +627,22 @@ lParseInteger(bool dotdotdot) { } else { // No u or l suffix - // First, see if we can fit this into a 32-bit integer... + // If we're compiling to an 8-bit mask target and the constant + // fits into 8 bits, return an 8-bit int. 
+ if (g->target->getMaskBitCount() == 8) {
+ if (yylval.intVal <= 0x7fULL)
+ return TOKEN_INT8_CONSTANT;
+ else if (yylval.intVal <= 0xffULL)
+ return TOKEN_UINT8_CONSTANT;
+ }
+ // And similarly for 16-bit masks and constants
+ if (g->target->getMaskBitCount() == 16) {
+ if (yylval.intVal <= 0x7fffULL)
+ return TOKEN_INT16_CONSTANT;
+ else if (yylval.intVal <= 0xffffULL)
+ return TOKEN_UINT16_CONSTANT;
+ }
+ // Otherwise, see if we can fit this into a 32-bit integer...
 if (yylval.intVal <= 0x7fffffffULL)
 return TOKEN_INT32_CONSTANT;
 else if (yylval.intVal <= 0xffffffffULL)
diff --git a/llvm_patches/3_3_r172868-vmovups-vinsertf128.patch b/llvm_patches/3_3_r172868-vmovups-vinsertf128.patch
new file mode 100644
index 00000000..36bb5572
--- /dev/null
+++ b/llvm_patches/3_3_r172868-vmovups-vinsertf128.patch
@@ -0,0 +1,102 @@
+This patch needs to be applied to LLVM 3.3 to fix a performance regression introduced by revision r172868.
+The regression is due to increased register pressure after that revision, which causes spills when there are multiple loads.
+The regression is fixed in 3.4, but the 3.4 changes are not back-portable,
+so we roll back r172868 to avoid the regression with 3.3.
+
+Index: test/CodeGen/X86/sandybridge-loads.ll
+===================================================================
+--- test/CodeGen/X86/sandybridge-loads.ll (revision 191082)
++++ test/CodeGen/X86/sandybridge-loads.ll (working copy)
+@@ -1,24 +1,5 @@
+ ; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
+
+-;CHECK: wideloads
+-;CHECK: vmovaps
+-;CHECK: vinsertf128
+-;CHECK: vmovaps
+-;CHECK-NOT: vinsertf128
+-;CHECK: ret
+-
+-define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
+- %v0 = load <8 x float>* %a, align 16 ; <---- unaligned!
+- %v1 = load <8 x float>* %b, align 32 ; <---- aligned!
+- %m0 = fcmp olt <8 x float> %v1, %v0
+- %v2 = load <8 x float>* %c, align 32 ; <---- aligned!
+- %m1 = fcmp olt <8 x float> %v2, %v0
+- %mand = and <8 x i1> %m1, %m0
+- %r = zext <8 x i1> %mand to <8 x i32>
+- store <8 x i32> %r, <8 x i32>* undef, align 32
+- ret void
+-}
+-
+ ; CHECK: widestores
+ ; loads:
+ ; CHECK: vmovaps
+Index: test/CodeGen/X86/v8i1-masks.ll
+===================================================================
+--- test/CodeGen/X86/v8i1-masks.ll (revision 172868)
++++ test/CodeGen/X86/v8i1-masks.ll (revision 172866)
+@@ -1,7 +1,7 @@
+ ; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
+
+ ;CHECK: and_masks
+-;CHECK: vmovaps
++;CHECK: vmovups
+ ;CHECK: vcmpltp
+ ;CHECK: vcmpltp
+ ;CHECK: vandps
+Index: lib/Target/X86/X86ISelLowering.cpp
+===================================================================
+--- lib/Target/X86/X86ISelLowering.cpp (revision 191077)
++++ lib/Target/X86/X86ISelLowering.cpp (working copy)
+@@ -16756,42 +16756,9 @@
+ EVT MemVT = Ld->getMemoryVT();
+ DebugLoc dl = Ld->getDebugLoc();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+- unsigned RegSz = RegVT.getSizeInBits();
+
+- // On Sandybridge unaligned 256bit loads are inefficient.
+ ISD::LoadExtType Ext = Ld->getExtensionType(); +- unsigned Alignment = Ld->getAlignment(); +- bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8; +- if (RegVT.is256BitVector() && !Subtarget->hasInt256() && +- !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) { +- unsigned NumElems = RegVT.getVectorNumElements(); +- if (NumElems < 2) +- return SDValue(); + +- SDValue Ptr = Ld->getBasePtr(); +- SDValue Increment = DAG.getConstant(16, TLI.getPointerTy()); +- +- EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), +- NumElems/2); +- SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, +- Ld->getPointerInfo(), Ld->isVolatile(), +- Ld->isNonTemporal(), Ld->isInvariant(), +- Alignment); +- Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); +- SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, +- Ld->getPointerInfo(), Ld->isVolatile(), +- Ld->isNonTemporal(), Ld->isInvariant(), +- std::min(16U, Alignment)); +- SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, +- Load1.getValue(1), +- Load2.getValue(1)); +- +- SDValue NewVec = DAG.getUNDEF(RegVT); +- NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl); +- NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl); +- return DCI.CombineTo(N, NewVec, TF, true); +- } +- + // If this is a vector EXT Load then attempt to optimize it using a + // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the + // expansion is still better than scalar code. +@@ -16805,6 +16772,7 @@ + assert(MemVT.isVector() && "Must load a vector from memory"); + + unsigned NumElems = RegVT.getVectorNumElements(); ++ unsigned RegSz = RegVT.getSizeInBits(); + unsigned MemSz = MemVT.getSizeInBits(); + assert(RegSz > MemSz && "Register size must be greater than the mem size"); + diff --git a/llvm_patches/r183327-AVX2-GATHER.patch b/llvm_patches/3_3_r183327-AVX2-GATHER.patch old mode 100755 new mode 100644 similarity index 100% rename from llvm_patches/r183327-AVX2-GATHER.patch rename to llvm_patches/3_3_r183327-AVX2-GATHER.patch diff --git a/llvm_patches/r184575-x86-shift.patch b/llvm_patches/3_3_r184575-x86-shift.patch similarity index 100% rename from llvm_patches/r184575-x86-shift.patch rename to llvm_patches/3_3_r184575-x86-shift.patch diff --git a/llvmutil.cpp b/llvmutil.cpp index 26c18bf5..275cf794 100644 --- a/llvmutil.cpp +++ b/llvmutil.cpp @@ -115,13 +115,29 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0); LLVMTypes::DoublePointerType = llvm::PointerType::get(LLVMTypes::DoubleType, 0); - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: LLVMTypes::MaskType = LLVMTypes::BoolVectorType = llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.getVectorWidth()); - else { - Assert(target.getMaskBitCount() == 32); + break; + case 8: + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt8Ty(*ctx), target.getVectorWidth()); + break; + case 16: + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt16Ty(*ctx), target.getVectorWidth()); + break; + case 32: LLVMTypes::MaskType = LLVMTypes::BoolVectorType = llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.getVectorWidth()); + break; + case 64: + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt64Ty(*ctx), target.getVectorWidth()); + break; + default: + 
FATAL("Unhandled mask width for initializing MaskType"); } LLVMTypes::Int1VectorType = @@ -154,12 +170,30 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { std::vector maskOnes; llvm::Constant *onMask = NULL; - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: onMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 1, false /*unsigned*/); // 0x1 - else + break; + case 8: + onMask = llvm::ConstantInt::get(llvm::Type::getInt8Ty(*ctx), -1, + true /*signed*/); // 0xff + break; + case 16: + onMask = llvm::ConstantInt::get(llvm::Type::getInt16Ty(*ctx), -1, + true /*signed*/); // 0xffff + break; + case 32: onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1, true /*signed*/); // 0xffffffff + break; + case 64: + onMask = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*ctx), -1, + true /*signed*/); // 0xffffffffffffffffull + break; + default: + FATAL("Unhandled mask width for onMask"); + } for (int i = 0; i < target.getVectorWidth(); ++i) maskOnes.push_back(onMask); @@ -167,13 +201,30 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { std::vector maskZeros; llvm::Constant *offMask = NULL; - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: offMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 0, true /*signed*/); - else + break; + case 8: + offMask = llvm::ConstantInt::get(llvm::Type::getInt8Ty(*ctx), 0, + true /*signed*/); + break; + case 16: + offMask = llvm::ConstantInt::get(llvm::Type::getInt16Ty(*ctx), 0, + true /*signed*/); + break; + case 32: offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0, true /*signed*/); - + break; + case 64: + offMask = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*ctx), 0, + true /*signed*/); + break; + default: + FATAL("Unhandled mask width for offMask"); + } for (int i = 0; i < target.getVectorWidth(); ++i) maskZeros.push_back(offMask); LLVMMaskAllOff = llvm::ConstantVector::get(maskZeros); @@ -441,12 +492,20 @@ LLVMUInt64Vector(const uint64_t *ivec) { llvm::Constant * LLVMBoolVector(bool b) { llvm::Constant *v; - if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + if (LLVMTypes::BoolVectorType == LLVMTypes::Int64VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int64Type, b ? 0xffffffffffffffffull : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) v = llvm::ConstantInt::get(LLVMTypes::Int32Type, b ? 0xffffffff : 0, false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int16Type, b ? 0xffff : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int8VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int8Type, b ? 0xff : 0, + false /*unsigned*/); else { - Assert(LLVMTypes::BoolVectorType->getElementType() == - llvm::Type::getInt1Ty(*g->ctx)); + Assert(LLVMTypes::BoolVectorType == LLVMTypes::Int1VectorType); v = b ? LLVMTrue : LLVMFalse; } @@ -462,12 +521,20 @@ LLVMBoolVector(const bool *bvec) { std::vector vals; for (int i = 0; i < g->target->getVectorWidth(); ++i) { llvm::Constant *v; - if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + if (LLVMTypes::BoolVectorType == LLVMTypes::Int64VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int64Type, bvec[i] ? 0xffffffffffffffffull : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) v = llvm::ConstantInt::get(LLVMTypes::Int32Type, bvec[i] ? 
0xffffffff : 0, false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int16Type, bvec[i] ? 0xffff : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int8VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int8Type, bvec[i] ? 0xff : 0, + false /*unsigned*/); else { - Assert(LLVMTypes::BoolVectorType->getElementType() == - llvm::Type::getInt1Ty(*g->ctx)); + Assert(LLVMTypes::BoolVectorType == LLVMTypes::Int1VectorType); v = bvec[i] ? LLVMTrue : LLVMFalse; } diff --git a/main.cpp b/main.cpp index 9ab0b793..ce6b5d4c 100644 --- a/main.cpp +++ b/main.cpp @@ -85,13 +85,16 @@ usage(int ret) { printf(" \t\taddressing calculations are done by default, even\n"); printf(" \t\ton 64-bit target architectures.)\n"); printf(" [--arch={%s}]\t\tSelect target architecture\n", - Target::SupportedTargetArchs()); + Target::SupportedArchs()); printf(" [--c++-include-file=]\t\tSpecify name of file to emit in #include statement in generated C++ code.\n"); #ifndef ISPC_IS_WINDOWS printf(" [--colored-output]\t\tAlways use terminal colors in error/warning messages.\n"); #endif - printf(" [--cpu=]\t\t\tSelect target CPU type\n"); - printf(" ={%s}\n", Target::SupportedTargetCPUs().c_str()); + printf(" "); + char cpuHelp[2048]; + sprintf(cpuHelp, "[--cpu=]\t\t\tSelect target CPU type\n={%s}\n", + Target::SupportedCPUs().c_str()); + PrintWithWordBreaks(cpuHelp, 16, TerminalWidth(), stdout); printf(" [-D]\t\t\t\t#define given value when running preprocessor\n"); printf(" [--dev-stub ]\t\tEmit device-side offload stub functions to file\n"); printf(" [--emit-asm]\t\t\tGenerate assembly language file as output\n"); @@ -127,7 +130,11 @@ usage(int ret) { printf(" [--pic]\t\t\t\tGenerate position-independent code\n"); #endif // !ISPC_IS_WINDOWS printf(" [--quiet]\t\t\t\tSuppress all output\n"); - printf(" [--target=]\t\t\tSelect target ISA. ={%s}\n", Target::SupportedTargetISAs()); + printf(" "); + char targetHelp[2048]; + sprintf(targetHelp, "[--target=]\t\t\tSelect target ISA and width.\n" + "={%s}", Target::SupportedTargets()); + PrintWithWordBreaks(targetHelp, 24, TerminalWidth(), stdout); printf(" [--version]\t\t\t\tPrint ispc version\n"); printf(" [--werror]\t\t\t\tTreat warnings as errors\n"); printf(" [--woff]\t\t\t\tDisable warnings\n"); @@ -156,6 +163,11 @@ devUsage(int ret) { printf(" disable-uniform-control-flow\t\tDisable uniform control flow optimizations\n"); printf(" disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n"); printf(" [--yydebug]\t\t\t\tPrint debugging information during parsing\n"); + printf(" [--debug-phase=]\t\tSet optimization phases to dump. --debug-phase=first,210:220,300,305,310:last\n"); +#ifdef LLVM_3_4 + printf(" [--debug-ir=]\t\tSet optimization phase to generate debugIR after it\n"); +#endif + printf(" [--off-phase=]\t\tSwitch off optimization phases. --off-phase=first,210:220,300,305,310:last\n"); exit(ret); } @@ -212,6 +224,47 @@ lSignal(void *) { } +static int ParsingPhaseName(char * stage) { + if (strncmp(stage, "first", 5) == 0) { + return 0; + } + else if (strncmp(stage, "last", 4) == 0) { + return LAST_OPT_NUMBER; + } + else { + int t = atoi(stage); + if (t < 0 || t > LAST_OPT_NUMBER) { + fprintf(stderr, "Phases must be from 0 to %d. 
%s is incorrect.\n", LAST_OPT_NUMBER, stage); + exit(0); + } + else { + return t; + } + } +} + + +static std::set ParsingPhases(char * stages) { + std::set phases; + int begin = ParsingPhaseName(stages); + int end = begin; + + for (unsigned i = 0; i < strlen(stages); i++) { + if ((stages[i] == ',') || (i == strlen(stages) - 1)) { + for (int j = begin; j < end + 1; j++) { + phases.insert(j); + } + begin = ParsingPhaseName(stages + i + 1); + end = begin; + } + else if (stages[i] == ':') { + end = ParsingPhaseName(stages + i + 1); + } + } + return phases; +} + + static void lParseInclude(const char *path) { #ifdef ISPC_IS_WINDOWS @@ -254,6 +307,8 @@ int main(int Argc, char *Argv[]) { LLVMInitializeX86Disassembler(); LLVMInitializeX86TargetMC(); #endif // !__ARM__ + +#ifdef ISPC_ARM_ENABLED // Generating ARM from x86 is more likely to be useful, though. LLVMInitializeARMTargetInfo(); LLVMInitializeARMTarget(); @@ -261,6 +316,7 @@ int main(int Argc, char *Argv[]) { LLVMInitializeARMAsmParser(); LLVMInitializeARMDisassembler(); LLVMInitializeARMTargetMC(); +#endif LLVMInitializeNVPTXTargetInfo(); LLVMInitializeNVPTXTarget(); @@ -282,7 +338,6 @@ int main(int Argc, char *Argv[]) { // as we're parsing below g = new Globals; - bool debugSet = false, optSet = false; Module::OutputType ot = Module::Object; bool generatePIC = false; const char *arch = NULL, *cpu = NULL, *target = NULL; @@ -325,7 +380,6 @@ int main(int Argc, char *Argv[]) { g->emitInstrumentation = true; else if (!strcmp(argv[i], "-g")) { g->generateDebuggingSymbols = true; - debugSet = true; } else if (!strcmp(argv[i], "--emit-asm")) ot = Module::Asm; @@ -452,12 +506,10 @@ int main(int Argc, char *Argv[]) { } else if (!strcmp(argv[i], "-O0")) { g->opt.level = 0; - optSet = true; } else if (!strcmp(argv[i], "-O") || !strcmp(argv[i], "-O1") || !strcmp(argv[i], "-O2") || !strcmp(argv[i], "-O3")) { g->opt.level = 1; - optSet = true; } else if (!strcmp(argv[i], "-")) ; @@ -498,6 +550,20 @@ int main(int Argc, char *Argv[]) { } hostStubFileName = argv[i]; } + else if (strncmp(argv[i], "--debug-phase=", 14) == 0) { + fprintf(stderr, "WARNING: Adding debug phases may change the way PassManager" + "handles the phases and it may possibly make some bugs go" + "away or introduce the new ones.\n"); + g->debug_stages = ParsingPhases(argv[i] + strlen("--debug-phase=")); + } +#ifdef LLVM_3_4 + else if (strncmp(argv[i], "--debug-ir=", 11) == 0) { + g->debugIR = ParsingPhaseName(argv[i] + strlen("--debug-ir=")); + } +#endif + else if (strncmp(argv[i], "--off-phase=", 12) == 0) { + g->off_stages = ParsingPhases(argv[i] + strlen("--off-phase=")); + } else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) { lPrintVersion(); return 0; @@ -517,12 +583,6 @@ int main(int Argc, char *Argv[]) { } } - // If the user specified -g, then the default optimization level is 0. - // If -g wasn't specified, the default optimization level is 1 (full - // optimization). 
- if (debugSet && !optSet) - g->opt.level = 0; - if (g->enableFuzzTest) { if (g->fuzzTestSeed == -1) { #ifdef ISPC_IS_WINDOWS diff --git a/module.cpp b/module.cpp index 85bf242c..755a5dc4 100644 --- a/module.cpp +++ b/module.cpp @@ -1877,6 +1877,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre char *p = targetMacro; while (*p) { *p = toupper(*p); + if (*p == '-') *p = '_'; ++p; } opts.addMacroDef(targetMacro); diff --git a/opt.cpp b/opt.cpp index ba32c639..75eae20c 100644 --- a/opt.cpp +++ b/opt.cpp @@ -63,6 +63,9 @@ #include #include #endif +#if defined (LLVM_3_4) + #include +#endif #include #include #include @@ -85,6 +88,7 @@ #include #include #include +#include #if defined(LLVM_3_1) #include #else @@ -108,7 +112,8 @@ #endif static llvm::Pass *CreateIntrinsicsOptPass(); -static llvm::Pass *CreateVSelMovmskOptPass(); +static llvm::Pass *CreateInstructionSimplifyPass(); +static llvm::Pass *CreatePeepholePass(); static llvm::Pass *CreateImproveMemoryOpsPass(); static llvm::Pass *CreateGatherCoalescePass(); @@ -117,6 +122,8 @@ static llvm::Pass *CreateReplacePseudoMemoryOpsPass(); static llvm::Pass *CreateIsCompileTimeConstantPass(bool isLastTry); static llvm::Pass *CreateMakeInternalFuncsStaticPass(); +static llvm::Pass *CreateDebugPass(char * output); + #define DEBUG_START_PASS(NAME) \ if (g->debugPrint && \ (getenv("FUNC") == NULL || \ @@ -393,6 +400,54 @@ lGetMaskStatus(llvm::Value *mask, int vecWidth = -1) { } +/////////////////////////////////////////////////////////////////////////// +// This is a wrapper over llvm::PassManager. It duplicates the PassManager function run() +// and extends the PassManager function add() with extra checks and debug passes. +// This wrapper can control: +// - whether the optimization pass with a given number is switched off, +// - whether to dump the LLVM IR after the optimization pass with a given number, +// - whether to generate LLVM debug IR (for gdb) after the optimization pass with a given number.
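The DebugPassManager class added just below keys this behavior off a running pass number plus two integer sets (g->debug_stages and g->off_stages, filled in from the new --debug-phase= and --off-phase= options). As a rough, LLVM-free sketch of only that numbering and lookup rule (all names below are illustrative and are not part of this patch):

    #include <cstdio>
    #include <set>

    // Illustrative stand-ins for g->off_stages / g->debug_stages.
    struct StageConfig {
        std::set<int> offStages;    // pass numbers to skip entirely
        std::set<int> debugStages;  // pass numbers to dump IR after
    };

    // Mirrors the numbering rule described above: an explicit stage number
    // overrides the counter, otherwise the counter advances by one per pass.
    class StageCounter {
    public:
        StageCounter() : number(0) { }
        int next(int stage = -1) {
            if (stage == -1)
                ++number;
            else
                number = stage;
            return number;
        }
    private:
        int number;
    };

    int main() {
        StageConfig cfg;
        cfg.offStages.insert(215);    // e.g. --off-phase=215
        cfg.debugStages.insert(241);  // e.g. --debug-phase=241

        StageCounter counter;
        const int stages[] = { 100, -1, -1, 215, -1, 241 };
        for (int s : stages) {
            int n = counter.next(s);
            if (cfg.offStages.count(n)) {
                printf("pass %d: skipped (--off-phase)\n", n);
                continue;
            }
            printf("pass %d: added\n", n);
            if (cfg.debugStages.count(n))
                printf("pass %d: dump LLVM IR here (--debug-phase)\n", n);
        }
        return 0;
    }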
+class DebugPassManager { +public: + DebugPassManager():number(0){} + void add(llvm::Pass * P, int stage); + bool run(llvm::Module& M) {return PM.run(M);} + llvm::PassManager& getPM() {return PM;} + +private: + llvm::PassManager PM; + int number; +}; + +void +DebugPassManager::add(llvm::Pass * P, int stage = -1) { + // taking number of optimization + if (stage == -1) { + number++; + } + else { + number = stage; + } + if (g->off_stages.find(number) == g->off_stages.end()) { + // adding optimization (not switched off) + PM.add(P); + if (g->debug_stages.find(number) != g->debug_stages.end()) { + // adding dump of LLVM IR after optimization + char buf[100]; + sprintf(buf, "\n\n*****LLVM IR after phase %d: %s*****\n\n", + number, P->getPassName()); + PM.add(CreateDebugPass(buf)); + } +#ifdef LLVM_3_4 + if (g->debugIR == number) { + // adding generating of LLVM IR debug after optimization + char buf[100]; + sprintf(buf, "Debug_IR_after_%d_phase.bc", number); + PM.add(llvm::createDebugIRPass(true, true, ".", buf)); + } +#endif + } +} /////////////////////////////////////////////////////////////////////////// void @@ -401,14 +456,8 @@ Optimize(llvm::Module *module, int optLevel) { printf("*** Code going into optimization ***\n"); module->dump(); } - - llvm::PassManager optPM; - optPM.add(llvm::createVerifierPass()); - -#if 0 - std::string err; - optPM.add(llvm::createPrintModulePass(new llvm::raw_fd_ostream("-", err))); -#endif + DebugPassManager optPM; + optPM.add(llvm::createVerifierPass(),0); llvm::TargetLibraryInfo *targetLibraryInfo = new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple())); @@ -425,7 +474,7 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(new llvm::TargetTransformInfo(targetMachine->getScalarTargetTransformInfo(), targetMachine->getVectorTargetTransformInfo())); #else // LLVM 3.3+ - targetMachine->addAnalysisPasses(optPM); + targetMachine->addAnalysisPasses(optPM.getPM()); #endif #endif @@ -437,11 +486,11 @@ Optimize(llvm::Module *module, int optLevel) { // run absolutely no optimizations, since the front-end needs us to // take the various __pseudo_* functions it has emitted and turn // them into something that can actually execute. 
- optPM.add(CreateImproveMemoryOpsPass()); + optPM.add(CreateImproveMemoryOpsPass(), 100); if (g->opt.disableHandlePseudoMemoryOps == false) optPM.add(CreateReplacePseudoMemoryOpsPass()); - optPM.add(CreateIntrinsicsOptPass()); + optPM.add(CreateIntrinsicsOptPass(), 102); optPM.add(CreateIsCompileTimeConstantPass(true)); optPM.add(llvm::createFunctionInliningPass()); optPM.add(CreateMakeInternalFuncsStaticPass()); @@ -460,7 +509,7 @@ Optimize(llvm::Module *module, int optLevel) { llvm::initializeInstrumentation(*registry); llvm::initializeTarget(*registry); - optPM.add(llvm::createGlobalDCEPass()); + optPM.add(llvm::createGlobalDCEPass(), 200); // Early optimizations to try to reduce the total amount of code to // work with if we can @@ -469,16 +518,19 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createDeadInstEliminationPass()); optPM.add(llvm::createCFGSimplificationPass()); + optPM.add(llvm::createPromoteMemoryToRegisterPass()); + optPM.add(llvm::createAggressiveDCEPass()); + if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) { - optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createInstructionCombiningPass(), 210); optPM.add(CreateImproveMemoryOpsPass()); } if (!g->opt.disableMaskAllOnOptimizations) { - optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateIntrinsicsOptPass(), 215); + optPM.add(CreateInstructionSimplifyPass()); } - optPM.add(llvm::createDeadInstEliminationPass()); + optPM.add(llvm::createDeadInstEliminationPass(), 220); // Max struct size threshold for scalar replacement is // 1) 4 fields (r,g,b,w) @@ -508,9 +560,10 @@ Optimize(llvm::Module *module, int optLevel) { #if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3) // Starting from 3.4 this functionality was moved to // InstructionCombiningPass. See r184459 for details. - optPM.add(llvm::createSimplifyLibCallsPass()); + optPM.add(llvm::createSimplifyLibCallsPass(), 240); #endif - optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createAggressiveDCEPass()); + optPM.add(llvm::createInstructionCombiningPass(), 241); optPM.add(llvm::createJumpThreadingPass()); optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold)); @@ -518,75 +571,85 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createTailCallEliminationPass()); if (!g->opt.disableMaskAllOnOptimizations) { - optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateIntrinsicsOptPass(), 250); + optPM.add(CreateInstructionSimplifyPass()); } if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) { - optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createInstructionCombiningPass(), 255); optPM.add(CreateImproveMemoryOpsPass()); if (g->opt.disableCoalescing == false && g->target->getISA() != Target::GENERIC) { // It is important to run this here to make it easier to // finding matching gathers we can coalesce.. 
- optPM.add(llvm::createEarlyCSEPass()); + optPM.add(llvm::createEarlyCSEPass(), 260); optPM.add(CreateGatherCoalescePass()); } } - optPM.add(llvm::createFunctionInliningPass()); + optPM.add(llvm::createFunctionInliningPass(), 265); optPM.add(llvm::createConstantPropagationPass()); optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) { - optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createInstructionCombiningPass(), 270); optPM.add(CreateImproveMemoryOpsPass()); } - optPM.add(llvm::createIPSCCPPass()); + optPM.add(llvm::createIPSCCPPass(), 275); optPM.add(llvm::createDeadArgEliminationPass()); + optPM.add(llvm::createAggressiveDCEPass()); optPM.add(llvm::createInstructionCombiningPass()); optPM.add(llvm::createCFGSimplificationPass()); - if (g->opt.disableHandlePseudoMemoryOps == false) - optPM.add(CreateReplacePseudoMemoryOpsPass()); - optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + if (g->opt.disableHandlePseudoMemoryOps == false) { + optPM.add(CreateReplacePseudoMemoryOpsPass(),280); + } + optPM.add(CreateIntrinsicsOptPass(),281); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createFunctionInliningPass()); optPM.add(llvm::createArgumentPromotionPass()); optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold, false)); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createReassociatePass()); optPM.add(llvm::createLoopRotatePass()); optPM.add(llvm::createLICMPass()); optPM.add(llvm::createLoopUnswitchPass(false)); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createIndVarSimplifyPass()); optPM.add(llvm::createLoopIdiomPass()); optPM.add(llvm::createLoopDeletionPass()); - if (g->opt.unrollLoops) - optPM.add(llvm::createLoopUnrollPass()); - optPM.add(llvm::createGVNPass()); + if (g->opt.unrollLoops) { + optPM.add(llvm::createLoopUnrollPass(), 300); + } + optPM.add(llvm::createGVNPass(), 301); optPM.add(CreateIsCompileTimeConstantPass(true)); optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createMemCpyOptPass()); optPM.add(llvm::createSCCPPass()); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createJumpThreadingPass()); optPM.add(llvm::createCorrelatedValuePropagationPass()); optPM.add(llvm::createDeadStoreEliminationPass()); optPM.add(llvm::createAggressiveDCEPass()); optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); + optPM.add(CreatePeepholePass()); + optPM.add(llvm::createFunctionInliningPass()); + optPM.add(llvm::createAggressiveDCEPass()); optPM.add(llvm::createStripDeadPrototypesPass()); optPM.add(CreateMakeInternalFuncsStaticPass()); optPM.add(llvm::createGlobalDCEPass()); @@ -595,7 +658,7 @@ Optimize(llvm::Module *module, int optLevel) { // Finish up by making sure we didn't mess anything up in the IR along // the way. 
- optPM.add(llvm::createVerifierPass()); + optPM.add(llvm::createVerifierPass(), LAST_OPT_NUMBER); optPM.run(*module); if (g->debugPrint) { @@ -670,14 +733,17 @@ IntrinsicsOpt::IntrinsicsOpt() // All of the mask instructions we may encounter. Note that even if // compiling for AVX, we may still encounter the regular 4-wide SSE // MOVMSK instruction. - llvm::Function *sseMovmsk = + llvm::Function *ssei8Movmsk = + llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse2_pmovmskb_128); + maskInstructions.push_back(ssei8Movmsk); + llvm::Function *sseFloatMovmsk = llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse_movmsk_ps); - maskInstructions.push_back(sseMovmsk); + maskInstructions.push_back(sseFloatMovmsk); maskInstructions.push_back(m->module->getFunction("__movmsk")); - llvm::Function *avxMovmsk = + llvm::Function *avxFloatMovmsk = llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_movmsk_ps_256); - Assert(avxMovmsk != NULL); - maskInstructions.push_back(avxMovmsk); + Assert(avxFloatMovmsk != NULL); + maskInstructions.push_back(avxFloatMovmsk); // And all of the blend instructions blendInstructions.push_back(BlendInstruction( @@ -924,80 +990,153 @@ CreateIntrinsicsOptPass() { @todo The better thing to do would be to submit a patch to LLVM to get these; they're presumably pretty simple patterns to match. */ -class VSelMovmskOpt : public llvm::BasicBlockPass { +class InstructionSimplifyPass : public llvm::BasicBlockPass { public: - VSelMovmskOpt() + InstructionSimplifyPass() : BasicBlockPass(ID) { } const char *getPassName() const { return "Vector Select Optimization"; } bool runOnBasicBlock(llvm::BasicBlock &BB); static char ID; + +private: + static bool simplifySelect(llvm::SelectInst *selectInst, + llvm::BasicBlock::iterator iter); + static llvm::Value *simplifyBoolVec(llvm::Value *value); + static bool simplifyCall(llvm::CallInst *callInst, + llvm::BasicBlock::iterator iter); }; -char VSelMovmskOpt::ID = 0; +char InstructionSimplifyPass::ID = 0; + + +llvm::Value * +InstructionSimplifyPass::simplifyBoolVec(llvm::Value *value) { + llvm::TruncInst *trunc = llvm::dyn_cast(value); + if (trunc != NULL) { + // Convert trunc({sext,zext}(i1 vector)) -> (i1 vector) + llvm::SExtInst *sext = llvm::dyn_cast(value); + if (sext && + sext->getOperand(0)->getType() == LLVMTypes::Int1VectorType) + return sext->getOperand(0); + + llvm::ZExtInst *zext = llvm::dyn_cast(value); + if (zext && + zext->getOperand(0)->getType() == LLVMTypes::Int1VectorType) + return zext->getOperand(0); + } + + llvm::ICmpInst *icmp = llvm::dyn_cast(value); + if (icmp != NULL) { + // icmp(ne, {sext,zext}(foo), zeroinitializer) -> foo + if (icmp->getSignedPredicate() == llvm::CmpInst::ICMP_NE) { + llvm::Value *op1 = icmp->getOperand(1); + if (llvm::isa(op1)) { + llvm::Value *op0 = icmp->getOperand(0); + llvm::SExtInst *sext = llvm::dyn_cast(op0); + if (sext) + return sext->getOperand(0); + llvm::ZExtInst *zext = llvm::dyn_cast(op0); + if (zext) + return zext->getOperand(0); + } + } + } + return NULL; +} bool -VSelMovmskOpt::runOnBasicBlock(llvm::BasicBlock &bb) { - DEBUG_START_PASS("VSelMovmaskOpt"); +InstructionSimplifyPass::simplifySelect(llvm::SelectInst *selectInst, + llvm::BasicBlock::iterator iter) { + if (selectInst->getType()->isVectorTy() == false) + return false; + + llvm::Value *factor = selectInst->getOperand(0); + + // Simplify all-on or all-off mask values + MaskStatus maskStatus = lGetMaskStatus(factor); + llvm::Value *value = NULL; + if (maskStatus == ALL_ON) + // Mask 
all on -> replace with the first select value + value = selectInst->getOperand(1); + else if (maskStatus == ALL_OFF) + // Mask all off -> replace with the second select value + value = selectInst->getOperand(2); + if (value != NULL) { + llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), + iter, value); + return true; + } + + // Sometimes earlier LLVM optimization passes generate unnecessarily + // complex expressions for the selection vector, which in turn confuses + // the code generators and leads to sub-optimal code (particularly for + // 8 and 16-bit masks). We'll try to simplify them out here so that + // the code generator patterns match.. + if ((factor = simplifyBoolVec(factor)) != NULL) { + llvm::Instruction *newSelect = + llvm::SelectInst::Create(factor, selectInst->getOperand(1), + selectInst->getOperand(2), + selectInst->getName()); + llvm::ReplaceInstWithInst(selectInst, newSelect); + return true; + } + + return false; +} + + +bool +InstructionSimplifyPass::simplifyCall(llvm::CallInst *callInst, + llvm::BasicBlock::iterator iter) { + llvm::Function *calledFunc = callInst->getCalledFunction(); + + // Turn a __movmsk call with a compile-time constant vector into the + // equivalent scalar value. + if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk")) + return false; + + uint64_t mask; + if (lGetMask(callInst->getArgOperand(0), &mask) == true) { + llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), + iter, LLVMInt64(mask)); + return true; + } + return false; +} + + +bool +InstructionSimplifyPass::runOnBasicBlock(llvm::BasicBlock &bb) { + DEBUG_START_PASS("InstructionSimplify"); bool modifiedAny = false; restart: for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) { llvm::SelectInst *selectInst = llvm::dyn_cast(&*iter); - if (selectInst != NULL && selectInst->getType()->isVectorTy()) { - llvm::Value *factor = selectInst->getOperand(0); - - MaskStatus maskStatus = lGetMaskStatus(factor); - llvm::Value *value = NULL; - if (maskStatus == ALL_ON) - // Mask all on -> replace with the first select value - value = selectInst->getOperand(1); - else if (maskStatus == ALL_OFF) - // Mask all off -> replace with the second select value - value = selectInst->getOperand(2); - - if (value != NULL) { - llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), - iter, value); - modifiedAny = true; - goto restart; - } + if (selectInst && simplifySelect(selectInst, iter)) { + modifiedAny = true; + goto restart; } - llvm::CallInst *callInst = llvm::dyn_cast(&*iter); - if (callInst == NULL) - continue; - - llvm::Function *calledFunc = callInst->getCalledFunction(); - if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk")) - continue; - - uint64_t mask; - if (lGetMask(callInst->getArgOperand(0), &mask) == true) { -#if 0 - fprintf(stderr, "mask %d\n", mask); - callInst->getArgOperand(0)->dump(); - fprintf(stderr, "-----------\n"); -#endif - llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), - iter, LLVMInt64(mask)); + if (callInst && simplifyCall(callInst, iter)) { modifiedAny = true; goto restart; } } - DEBUG_END_PASS("VSelMovMskOpt"); + DEBUG_END_PASS("InstructionSimplify"); return modifiedAny; } static llvm::Pass * -CreateVSelMovmskOptPass() { - return new VSelMovmskOpt; +CreateInstructionSimplifyPass() { + return new InstructionSimplifyPass; } @@ -4240,6 +4379,42 @@ CreateIsCompileTimeConstantPass(bool isLastTry) { return new IsCompileTimeConstantPass(isLastTry); } 
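One of the rewrites in the InstructionSimplifyPass above (simplifyCall) folds a __movmsk call whose argument is a compile-time-constant mask into a plain 64-bit integer. A standalone model of that folding, for illustration only (the real pass inspects LLVM IR through lGetMask, and the exact bit layout is determined by the target):

    #include <cstdint>
    #include <cstdio>

    // Model of folding __movmsk on a known mask: lane i contributes bit i
    // when that lane of the mask is on.
    static uint64_t foldMovmsk(const bool *lanes, int width) {
        uint64_t bits = 0;
        for (int i = 0; i < width; ++i)
            if (lanes[i])
                bits |= (uint64_t)1 << i;
        return bits;
    }

    int main() {
        const bool lanes[8] = { true, true, false, false, true, false, false, true };
        // Lanes 0, 1, 4 and 7 are on, so this prints 0x93.
        printf("__movmsk folds to 0x%llx\n", (unsigned long long)foldMovmsk(lanes, 8));
        return 0;
    }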
+////////////////////////////////////////////////////////////////////////// +// DebugPass + +/** This pass is added in list of passes after optimizations which + we want to debug and print dump of LLVM IR in stderr. Also it + prints name and number of previous optimization. + */ +class DebugPass : public llvm::ModulePass { +public: + static char ID; + DebugPass(char * output) : ModulePass(ID) { + sprintf(str_output, "%s", output); + } + + const char *getPassName() const { return "Dump LLVM IR"; } + bool runOnModule(llvm::Module &m); + +private: + char str_output[100]; +}; + +char DebugPass::ID = 0; + +bool +DebugPass::runOnModule(llvm::Module &module) { + fprintf(stderr, "%s", str_output); + fflush(stderr); + module.dump(); + return true; +} + +static llvm::Pass * +CreateDebugPass(char * output) { + return new DebugPass(output); +} + /////////////////////////////////////////////////////////////////////////// // MakeInternalFuncsStaticPass @@ -4273,6 +4448,14 @@ char MakeInternalFuncsStaticPass::ID = 0; bool MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { const char *names[] = { + "__avg_up_uint8", + "__avg_up_int8", + "__avg_up_uint16", + "__avg_up_int16", + "__avg_down_uint8", + "__avg_down_int8", + "__avg_down_uint16", + "__avg_down_int16", "__fast_masked_vload", "__gather_factored_base_offsets32_i8", "__gather_factored_base_offsets32_i16", "__gather_factored_base_offsets32_i32", "__gather_factored_base_offsets32_i64", @@ -4352,3 +4535,391 @@ static llvm::Pass * CreateMakeInternalFuncsStaticPass() { return new MakeInternalFuncsStaticPass; } + + +/////////////////////////////////////////////////////////////////////////// +// PeepholePass + +class PeepholePass : public llvm::BasicBlockPass { +public: + PeepholePass(); + + const char *getPassName() const { return "Peephole Optimizations"; } + bool runOnBasicBlock(llvm::BasicBlock &BB); + + static char ID; +}; + +char PeepholePass::ID = 0; + +PeepholePass::PeepholePass() + : BasicBlockPass(ID) { +} + +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) + +using namespace llvm::PatternMatch; + +template +struct CastClassTypes_match { + Op_t Op; + const llvm::Type *fromType, *toType; + + CastClassTypes_match(const Op_t &OpMatch, const llvm::Type *f, + const llvm::Type *t) + : Op(OpMatch), fromType(f), toType(t) {} + + template + bool match(OpTy *V) { + if (llvm::Operator *O = llvm::dyn_cast(V)) + return (O->getOpcode() == Opcode && Op.match(O->getOperand(0)) && + O->getType() == toType && + O->getOperand(0)->getType() == fromType); + return false; + } +}; + +template +inline CastClassTypes_match +m_SExt8To16(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int8VectorType, + LLVMTypes::Int16VectorType); +} + +template +inline CastClassTypes_match +m_ZExt8To16(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int8VectorType, + LLVMTypes::Int16VectorType); +} + + +template +inline CastClassTypes_match +m_Trunc16To8(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int16VectorType, + LLVMTypes::Int8VectorType); +} + +template +inline CastClassTypes_match +m_SExt16To32(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int16VectorType, + LLVMTypes::Int32VectorType); +} + +template +inline CastClassTypes_match +m_ZExt16To32(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int16VectorType, + LLVMTypes::Int32VectorType); +} + + +template +inline CastClassTypes_match +m_Trunc32To16(const OpTy &Op) { + return CastClassTypes_match( + Op, + 
LLVMTypes::Int32VectorType, + LLVMTypes::Int16VectorType); +} + +template +struct UDiv2_match { + Op_t Op; + + UDiv2_match(const Op_t &OpMatch) + : Op(OpMatch) {} + + template + bool match(OpTy *V) { + llvm::BinaryOperator *bop; + llvm::ConstantDataVector *cdv; + if ((bop = llvm::dyn_cast(V)) && + (cdv = llvm::dyn_cast(bop->getOperand(1))) && + cdv->getSplatValue() != NULL) { + const llvm::APInt &apInt = cdv->getUniqueInteger(); + + switch (bop->getOpcode()) { + case llvm::Instruction::UDiv: + // divide by 2 + return (apInt.isIntN(2) && Op.match(bop->getOperand(0))); + case llvm::Instruction::LShr: + // shift left by 1 + return (apInt.isIntN(1) && Op.match(bop->getOperand(0))); + default: + return false; + } + } + return false; + } +}; + +template +inline UDiv2_match +m_UDiv2(const V &v) { + return UDiv2_match(v); +} + +template +struct SDiv2_match { + Op_t Op; + + SDiv2_match(const Op_t &OpMatch) + : Op(OpMatch) {} + + template + bool match(OpTy *V) { + llvm::BinaryOperator *bop; + llvm::ConstantDataVector *cdv; + if ((bop = llvm::dyn_cast(V)) && + (cdv = llvm::dyn_cast(bop->getOperand(1))) && + cdv->getSplatValue() != NULL) { + const llvm::APInt &apInt = cdv->getUniqueInteger(); + + switch (bop->getOpcode()) { + case llvm::Instruction::SDiv: + // divide by 2 + return (apInt.isIntN(2) && Op.match(bop->getOperand(0))); + case llvm::Instruction::AShr: + // shift left by 1 + return (apInt.isIntN(1) && Op.match(bop->getOperand(0))); + default: + return false; + } + } + return false; + } +}; + +template +inline SDiv2_match +m_SDiv2(const V &v) { + return SDiv2_match(v); +} + +// Returns true if the given function has a call to an intrinsic function +// in its definition. +static bool +lHasIntrinsicInDefinition(llvm::Function *func) { + llvm::Function::iterator bbiter = func->begin(); + for (; bbiter != func->end(); ++bbiter) { + for (llvm::BasicBlock::iterator institer = bbiter->begin(); + institer != bbiter->end(); ++institer) { + if (llvm::isa(institer)) + return true; + } + } + return false; +} + +static llvm::Instruction * +lGetBinaryIntrinsic(const char *name, llvm::Value *opa, llvm::Value *opb) { + llvm::Function *func = m->module->getFunction(name); + Assert(func != NULL); + + // Make sure that the definition of the llvm::Function has a call to an + // intrinsic function in its instructions; otherwise we will generate + // infinite loops where we "helpfully" turn the default implementations + // of target builtins like __avg_up_uint8 that are implemented with plain + // arithmetic ops into recursive calls to themselves. 
+ if (lHasIntrinsicInDefinition(func)) + return lCallInst(func, opa, opb, name); + else + return NULL; +} + +////////////////////////////////////////////////// + +static llvm::Instruction * +lMatchAvgUpUInt8(llvm::Value *inst) { + // (unsigned int8)(((unsigned int16)a + (unsigned int16)b + 1)/2) + llvm::Value *opa, *opb; + const llvm::APInt *delta; + if (match(inst, m_Trunc16To8(m_UDiv2(m_CombineOr( + m_CombineOr( + m_Add(m_ZExt8To16(m_Value(opa)), + m_Add(m_ZExt8To16(m_Value(opb)), m_APInt(delta))), + m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_APInt(delta)), + m_ZExt8To16(m_Value(opb)))), + m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_ZExt8To16(m_Value(opb))), + m_APInt(delta))))))) { + if (delta->isIntN(1) == false) + return NULL; + + return lGetBinaryIntrinsic("__avg_up_uint8", opa, opb); + } + return NULL; +} + + +static llvm::Instruction * +lMatchAvgDownUInt8(llvm::Value *inst) { + // (unsigned int8)(((unsigned int16)a + (unsigned int16)b)/2) + llvm::Value *opa, *opb; + if (match(inst, m_Trunc16To8(m_UDiv2( + m_Add(m_ZExt8To16(m_Value(opa)), + m_ZExt8To16(m_Value(opb))))))) { + return lGetBinaryIntrinsic("__avg_down_uint8", opa, opb); + } + return NULL; +} + +static llvm::Instruction * +lMatchAvgUpUInt16(llvm::Value *inst) { + // (unsigned int16)(((unsigned int32)a + (unsigned int32)b + 1)/2) + llvm::Value *opa, *opb; + const llvm::APInt *delta; + if (match(inst, m_Trunc32To16(m_UDiv2(m_CombineOr( + m_CombineOr( + m_Add(m_ZExt16To32(m_Value(opa)), + m_Add(m_ZExt16To32(m_Value(opb)), m_APInt(delta))), + m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_APInt(delta)), + m_ZExt16To32(m_Value(opb)))), + m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_ZExt16To32(m_Value(opb))), + m_APInt(delta))))))) { + if (delta->isIntN(1) == false) + return NULL; + + return lGetBinaryIntrinsic("__avg_up_uint16", opa, opb); + } + return NULL; +} + + +static llvm::Instruction * +lMatchAvgDownUInt16(llvm::Value *inst) { + // (unsigned int16)(((unsigned int32)a + (unsigned int32)b)/2) + llvm::Value *opa, *opb; + if (match(inst, m_Trunc32To16(m_UDiv2( + m_Add(m_ZExt16To32(m_Value(opa)), + m_ZExt16To32(m_Value(opb))))))) { + return lGetBinaryIntrinsic("__avg_down_uint16", opa, opb); + } + return NULL; +} + + +static llvm::Instruction * +lMatchAvgUpInt8(llvm::Value *inst) { + // (int8)(((int16)a + (int16)b + 1)/2) + llvm::Value *opa, *opb; + const llvm::APInt *delta; + if (match(inst, m_Trunc16To8(m_SDiv2(m_CombineOr( + m_CombineOr( + m_Add(m_SExt8To16(m_Value(opa)), + m_Add(m_SExt8To16(m_Value(opb)), m_APInt(delta))), + m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_APInt(delta)), + m_SExt8To16(m_Value(opb)))), + m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_SExt8To16(m_Value(opb))), + m_APInt(delta))))))) { + if (delta->isIntN(1) == false) + return NULL; + + return lGetBinaryIntrinsic("__avg_up_int8", opa, opb); + } + return NULL; +} + + +static llvm::Instruction * +lMatchAvgDownInt8(llvm::Value *inst) { + // (int8)(((int16)a + (int16)b)/2) + llvm::Value *opa, *opb; + if (match(inst, m_Trunc16To8(m_SDiv2( + m_Add(m_SExt8To16(m_Value(opa)), + m_SExt8To16(m_Value(opb))))))) { + return lGetBinaryIntrinsic("__avg_down_int8", opa, opb); + } + return NULL; +} + +static llvm::Instruction * +lMatchAvgUpInt16(llvm::Value *inst) { + // (int16)(((int32)a + (int32)b + 1)/2) + llvm::Value *opa, *opb; + const llvm::APInt *delta; + if (match(inst, m_Trunc32To16(m_SDiv2(m_CombineOr( + m_CombineOr( + m_Add(m_SExt16To32(m_Value(opa)), + m_Add(m_SExt16To32(m_Value(opb)), m_APInt(delta))), + m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_APInt(delta)), + 
m_SExt16To32(m_Value(opb)))), + m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_SExt16To32(m_Value(opb))), + m_APInt(delta))))))) { + if (delta->isIntN(1) == false) + return NULL; + + return lGetBinaryIntrinsic("__avg_up_int16", opa, opb); + } + return NULL; +} + +static llvm::Instruction * +lMatchAvgDownInt16(llvm::Value *inst) { + // (int16)(((int32)a + (int32)b)/2) + llvm::Value *opa, *opb; + if (match(inst, m_Trunc32To16(m_SDiv2( + m_Add(m_SExt16To32(m_Value(opa)), + m_SExt16To32(m_Value(opb))))))) { + return lGetBinaryIntrinsic("__avg_down_int16", opa, opb); + } + return NULL; +} +#endif // !LLVM_3_1 && !LLVM_3_2 + +bool +PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) { + DEBUG_START_PASS("PeepholePass"); + + bool modifiedAny = false; + restart: + for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) { + llvm::Instruction *inst = &*iter; + + llvm::Instruction *builtinCall = NULL; +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) + if (!builtinCall) + builtinCall = lMatchAvgUpUInt8(inst); + if (!builtinCall) + builtinCall = lMatchAvgUpUInt16(inst); + if (!builtinCall) + builtinCall = lMatchAvgDownUInt8(inst); + if (!builtinCall) + builtinCall = lMatchAvgDownUInt16(inst); + if (!builtinCall) + builtinCall = lMatchAvgUpInt8(inst); + if (!builtinCall) + builtinCall = lMatchAvgUpInt16(inst); + if (!builtinCall) + builtinCall = lMatchAvgDownInt8(inst); + if (!builtinCall) + builtinCall = lMatchAvgDownInt16(inst); +#endif // !LLVM_3_1 && !LLVM_3_2 + if (builtinCall != NULL) { + llvm::ReplaceInstWithInst(inst, builtinCall); + modifiedAny = true; + goto restart; + } + } + + DEBUG_END_PASS("PeepholePass"); + + return modifiedAny; +} + +static llvm::Pass * +CreatePeepholePass() { + return new PeepholePass; +} diff --git a/parse.yy b/parse.yy index 3ad815cf..933a3455 100644 --- a/parse.yy +++ b/parse.yy @@ -149,7 +149,8 @@ struct ForeachDimension { %union { uint64_t intVal; - float floatVal; + float floatVal; + double doubleVal; std::string *stringVal; const char *constCharPtr; @@ -179,11 +180,13 @@ struct ForeachDimension { } +%token TOKEN_INT8_CONSTANT TOKEN_UINT8_CONSTANT +%token TOKEN_INT16_CONSTANT TOKEN_UINT16_CONSTANT %token TOKEN_INT32_CONSTANT TOKEN_UINT32_CONSTANT %token TOKEN_INT64_CONSTANT TOKEN_UINT64_CONSTANT %token TOKEN_INT32DOTDOTDOT_CONSTANT TOKEN_UINT32DOTDOTDOT_CONSTANT %token TOKEN_INT64DOTDOTDOT_CONSTANT TOKEN_UINT64DOTDOTDOT_CONSTANT -%token TOKEN_FLOAT_CONSTANT TOKEN_STRING_C_LITERAL +%token TOKEN_FLOAT_CONSTANT TOKEN_DOUBLE_CONSTANT TOKEN_STRING_C_LITERAL %token TOKEN_IDENTIFIER TOKEN_STRING_LITERAL TOKEN_TYPE_NAME TOKEN_NULL %token TOKEN_PTR_OP TOKEN_INC_OP TOKEN_DEC_OP TOKEN_LEFT_OP TOKEN_RIGHT_OP %token TOKEN_LE_OP TOKEN_GE_OP TOKEN_EQ_OP TOKEN_NE_OP @@ -291,6 +294,22 @@ primary_expression Error(@1, "Undeclared symbol \"%s\".%s", name, alts.c_str()); } } + | TOKEN_INT8_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformInt8->GetAsConstType(), + (int8_t)yylval.intVal, @1); + } + | TOKEN_UINT8_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformUInt8->GetAsConstType(), + (uint8_t)yylval.intVal, @1); + } + | TOKEN_INT16_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformInt16->GetAsConstType(), + (int16_t)yylval.intVal, @1); + } + | TOKEN_UINT16_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformUInt16->GetAsConstType(), + (uint16_t)yylval.intVal, @1); + } | TOKEN_INT32_CONSTANT { $$ = new ConstExpr(AtomicType::UniformInt32->GetAsConstType(), (int32_t)yylval.intVal, @1); @@ -309,7 +328,11 @@ primary_expression } | TOKEN_FLOAT_CONSTANT { $$ = new 
ConstExpr(AtomicType::UniformFloat->GetAsConstType(), - (float)yylval.floatVal, @1); + yylval.floatVal, @1); + } + | TOKEN_DOUBLE_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformDouble->GetAsConstType(), + yylval.doubleVal, @1); } | TOKEN_TRUE { $$ = new ConstExpr(AtomicType::UniformBool->GetAsConstType(), true, @1); @@ -1233,7 +1256,10 @@ declarator ; int_constant - : TOKEN_INT32_CONSTANT { $$ = yylval.intVal; } + : TOKEN_INT8_CONSTANT { $$ = yylval.intVal; } + | TOKEN_INT16_CONSTANT { $$ = yylval.intVal; } + | TOKEN_INT32_CONSTANT { $$ = yylval.intVal; } + | TOKEN_INT64_CONSTANT { $$ = yylval.intVal; } ; direct_declarator @@ -2148,8 +2174,27 @@ lAddFunctionParams(Declarator *decl) { /** Add a symbol for the built-in mask variable to the symbol table */ static void lAddMaskToSymbolTable(SourcePos pos) { - const Type *t = g->target->getMaskBitCount() == 1 ? - AtomicType::VaryingBool : AtomicType::VaryingUInt32; + const Type *t = NULL; + switch (g->target->getMaskBitCount()) { + case 1: + t = AtomicType::VaryingBool; + break; + case 8: + t = AtomicType::VaryingUInt8; + break; + case 16: + t = AtomicType::VaryingUInt16; + break; + case 32: + t = AtomicType::VaryingUInt32; + break; + case 64: + t = AtomicType::VaryingUInt64; + break; + default: + FATAL("Unhandled mask bitsize in lAddMaskToSymbolTable"); + } + t = t->GetAsConstType(); Symbol *maskSymbol = new Symbol("__mask", pos, t); m->symbolTable->AddVariable(maskSymbol); @@ -2241,7 +2286,11 @@ lGetConstantInt(Expr *expr, int *value, SourcePos pos, const char *usage) { Error(pos, "%s must be representable with a 32-bit integer.", usage); return false; } - *value = (int)ci->getZExtValue(); + const Type *type = expr->GetType(); + if (type->IsUnsignedType()) + *value = (int)ci->getZExtValue(); + else + *value = (int)ci->getSExtValue(); return true; } } diff --git a/examples/perf.ini b/perf.ini similarity index 84% rename from examples/perf.ini rename to perf.ini index 3814bf16..249c25f4 100755 --- a/examples/perf.ini +++ b/perf.ini @@ -10,44 +10,48 @@ %**************************************************************************************************** AOBench aobench -ao 10 512 512 +10 512 512 #*** Deferred Shading deferred -deferred_shading data/pp1280x720.bin +data/pp1280x720.bin #*** Mandelbrot Set mandelbrot -mandelbrot + #*** Mandelbrot Set mandelbrot_tasks -mandelbrot + ^ #*** Perlin Noise Function noise -noise + #*** Binomial Options options -options + ! 1 2 #*** Black-Scholes Options options -options + ! 2 2 #*** Ray Tracer rt -rt sponza +sponza #*** 3D Stencil stencil -stencil + #*** Volume Rendering volume_rendering -volume camera.dat density_highres.vol +camera.dat density_highres.vol +#*** +Sort +sort +1000000 1 #*** diff --git a/perf.py b/perf.py new file mode 100755 index 00000000..576a5c7d --- /dev/null +++ b/perf.py @@ -0,0 +1,489 @@ +#!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# // Author: Filippov Ilia + +def print_file(line): + if options.output != "": + output = open(options.output, 'w') + output.writelines(line) + output.close() + +def build_test(commands): + os.system(commands[4]) + test = os.system(commands[1]) + if options.ref: + ref = os.system(commands[3]) + return (options.ref and ref) or test + +def execute_test(commands): + r = 0 + common.remove_if_exists(perf_temp+"_test") + common.remove_if_exists(perf_temp+"_ref") + for k in range(int(options.number)): + r = r + os.system(commands[0]) + if options.ref: + r = r + os.system(commands[2]) + return r + +#gathers all tests results and made an item test from answer structure +def run_test(commands, c1, c2, test, test_ref, b_serial): + if build_test(commands) != 0: + error("Compilation fails of test %s\n" % test[0], 0) + return + if execute_test(commands) != 0: + error("Execution fails of test %s\n" % test[0], 0) + return + print_debug("TEST COMPILER:\n", s, perf_log) + analyse_test(c1, c2, test, b_serial, perf_temp+"_test") + if options.ref: + print_debug("REFERENCE COMPILER:\n", s, perf_log) + analyse_test(c1, c2, test_ref, b_serial, perf_temp+"_ref") + + +def analyse_test(c1, c2, test, b_serial, perf_temp_n): + tasks = [] #list of results with tasks, it will be test[2] + ispc = [] #list of results without tasks, it will be test[1] + absolute_tasks = [] #list of absolute results with tasks, it will be test[4] + absolute_ispc = [] #list of absolute results without tasks, ut will be test[3] + serial = [] #list serial times, it will be test[5] + j = 1 + for line in open(perf_temp_n): # we take test output + if "speedup" in line: # we are interested only in lines with speedup + if j == c1: # we are interested only in lines with c1 numbers + line = line.expandtabs(0) + line = line.replace("("," ") + line = line.split(",") + for i in range(len(line)): + subline = line[i].split(" ") + number = float(subline[1][:-1]) + if "speedup from ISPC + tasks" in line[i]: + tasks.append(number) + else: + ispc.append(number) + c1 = c1 + c2 + j+=1 + if "million cycles" in line: + if j == c1: + line = line.replace("]","[") + line = line.split("[") + number = float(line[3]) + if "tasks" in line[1]: + absolute_tasks.append(number) + else: + if "ispc" in line[1]: + absolute_ispc.append(number) + if "serial" in line[1]: + serial.append(number) + + if len(ispc) != 0: + if len(tasks) != 0: + print_debug("ISPC speedup / ISPC + tasks speedup / ISPC time / ISPC + tasks time / serial time\n", s, perf_log) + for i in range(0,len(serial)): + print_debug("%10s 
/\t%10s\t /%9s / %10s\t /%10s\n" % + (ispc[i], tasks[i], absolute_ispc[i], absolute_tasks[i], serial[i]), s, perf_log) + else: + print_debug("ISPC speedup / ISPC time / serial time\n", s, perf_log) + for i in range(0,len(serial)): + print_debug("%10s /%9s /%10s\n" % (ispc[i], absolute_ispc[i], serial[i]), s, perf_log) + else: + if len(tasks) != 0: + print_debug("ISPC + tasks speedup / ISPC + tasks time / serial time\n", s, perf_log) + for i in range(0,len(serial)): + print_debug("%10s\t / %10s\t /%10s\n" % (tasks[i], absolute_tasks[i], serial[i]), s, perf_log) + + test[1] = test[1] + ispc + test[2] = test[2] + tasks + test[3] = test[3] + absolute_ispc + test[4] = test[4] + absolute_tasks + if b_serial == True: + #if we concatenate outputs we should use only the first serial answer. + test[5] = test[5] + serial + +def cpu_get(): + p = open("/proc/stat", 'r') + cpu = p.readline() + p.close() + cpu = cpu.split(" ") + cpu_usage = (int(cpu[2]) + int(cpu[3]) + int(cpu[4])) + cpu_all = cpu_usage + int(cpu[5]) + return [cpu_usage, cpu_all] + +#returns cpu_usage +def cpu_check(): + if is_windows == False: + if is_mac == False: + cpu1 = cpu_get() + time.sleep(1) + cpu2 = cpu_get() + cpu_percent = (float(cpu1[0] - cpu2[0])/float(cpu1[1] - cpu2[1]))*100 + else: + os.system("sysctl -n vm.loadavg > cpu_temp") + c = open("cpu_temp", 'r') + c_line = c.readline() + c.close + os.remove("cpu_temp") + R = c_line.split(' ') + cpu_percent = float(R[1]) * 3 + else: + os.system("wmic cpu get loadpercentage /value > cpu_temp") + c = open("cpu_temp", 'r') + c_lines = c.readlines() + c.close() + os.remove("cpu_temp") + t = "0" + for i in c_lines[2]: + if i.isdigit(): + t = t + i + cpu_percent = int(t) + return cpu_percent + +#returns geomean of list +def geomean(par): + temp = 1 + l = len(par) + for i in range(l): + temp = temp * par[i] + temp = temp ** (1.0/l) + return round(temp, 2) + +#takes an answer struct and print it. 
+#answer struct: list answer contains lists test +#test[0] - name of test +#test[1] - list of results without tasks +#test[2] - list of results with tasks +#test[3] - list of absolute results without tasks +#test[4] - list of absolute results with tasks +#test[5] - list of absolute time without ISPC (serial) +#test[1..4] may be empty +def print_answer(answer): + filelist = [] + print_debug("--------------------------------------------------------------------------\n", s, perf_log) + print_debug("test name:\t ISPC speedup: ISPC + tasks speedup: | " + + " ISPC time: ISPC + tasks time: serial:\n", s, perf_log) + filelist.append("test name,ISPC speedup,diff," + + "ISPC + tasks speedup,diff,ISPC time,diff,ISPC + tasks time,diff,serial,diff\n") + max_t = [0,0,0,0,0] + diff_t = [0,0,0,0,0] + geomean_t = [0,0,0,0,0] + list_of_max = [[],[],[],[],[]] + list_of_compare = [[],[],[],[],[],[]] + for i in range(len(answer)): + list_of_compare[0].append(answer[i][0]) + for t in range(1,6): + if len(answer[i][t]) == 0: + max_t[t-1] = "n/a" + diff_t[t-1] = "n/a" + list_of_compare[t].append(0); + else: + if t < 3: + mm = max(answer[i][t]) + else: + mm = min(answer[i][t]) + list_of_compare[t].append(mm) + max_t[t-1] = '%.2f' % mm + list_of_max[t-1].append(mm) + diff_t[t-1] = '%.2f' % (max(answer[i][t]) - min(answer[i][t])) + print_debug("%s:\n" % answer[i][0], s, perf_log) + print_debug("\t\tmax:\t%5s\t\t%10s\t|min:%10s\t%10s\t%10s\n" % + (max_t[0], max_t[1], max_t[2], max_t[3], max_t[4]), s, perf_log) + print_debug("\t\tdiff:\t%5s\t\t%10s\t|%14s\t%10s\t%10s\n" % + (diff_t[0], diff_t[1], diff_t[2], diff_t[3], diff_t[4]), s, perf_log) + for t in range(0,5): + if max_t[t] == "n/a": + max_t[t] = "" + if diff_t[t] == "n/a": + diff_t[t] = "" + filelist.append(answer[i][0] + "," + + max_t[0] + "," + diff_t[0] + "," + max_t[1] + "," + diff_t[1] + "," + + max_t[2] + "," + diff_t[2] + "," + max_t[3] + "," + diff_t[3] + "," + + max_t[4] + "," + diff_t[4] + "\n") + for i in range(0,5): + geomean_t[i] = geomean(list_of_max[i]) + print_debug("---------------------------------------------------------------------------------\n", s, perf_log) + print_debug("Geomean:\t\t%5s\t\t%10s\t|%14s\t%10s\t%10s\n" % + (geomean_t[0], geomean_t[1], geomean_t[2], geomean_t[3], geomean_t[4]), s, perf_log) + filelist.append("Geomean," + str(geomean_t[0]) + ",," + str(geomean_t[1]) + + ",," + str(geomean_t[2]) + ",," + str(geomean_t[3]) + ",," + str(geomean_t[4]) + "\n") + print_file(filelist) + return list_of_compare + + +def compare(A, B): + print_debug("\n\n_____________________PERFORMANCE REPORT____________________________\n", False, "") + print_debug("test name: ISPC time: ISPC time ref: %:\n", False, "") + for i in range(0,len(A[0])): + if B[3][i] == 0: + p1 = 0 + else: + p1 = 100 - 100 * A[3][i]/B[3][i] + print_debug("%21s: %10.2f %10.2f %10.2f" % (A[0][i], A[3][i], B[3][i], abs(p1)), False, "") + if p1 < -1: + print_debug(" <+", False, "") + if p1 > 1: + print_debug(" <-", False, "") + print_debug("\n", False, "") + print_debug("\n", False, "") + + print_debug("test name: TASKS time: TASKS time ref: %:\n", False, "") + for i in range(0,len(A[0])): + if B[4][i] == 0: + p2 = 0 + else: + p2 = 100 - 100 * A[4][i]/B[4][i] + print_debug("%21s: %10.2f %10.2f %10.2f" % (A[0][i], A[4][i], B[4][i], abs(p2)), False, "") + if p2 < -1: + print_debug(" <+", False, "") + if p2 > 1: + print_debug(" <-", False, "") + print_debug("\n", False, "") + if "performance.log" in options.in_file: + print_debug("\n\n_________________Watch performance.log for 
details________________\n", False, "") + else: + print_debug("\n\n__________________________________________________________________\n", False, "") + + + +def perf(options1, args): + global options + options = options1 + global s + s = options.silent + + # save current OS + global is_windows + is_windows = (platform.system() == 'Windows' or + 'CYGWIN_NT' in platform.system()) + global is_mac + is_mac = (platform.system() == 'Darwin') + + # save current path + pwd = os.getcwd() + pwd = pwd + os.sep + pwd1 = pwd + if is_windows: + pwd1 = "..\\..\\" + + # check if cpu usage is low now + cpu_percent = cpu_check() + if cpu_percent > 20: + error("CPU Usage is very high.\nClose other applications.\n", 2) + + global ispc_test + global ispc_ref + global ref_compiler + global refc_compiler + # check that required compilers exist + PATH_dir = string.split(os.getenv("PATH"), os.pathsep) + ispc_test_exists = False + ispc_ref_exists = False + ref_compiler_exists = False + if is_windows == False: + ispc_test = "ispc" + ref_compiler = "g++" + refc_compiler = "gcc" + if options.compiler != "": + if options.compiler == "clang" or options.compiler == "clang++": + ref_compiler = "clang++" + refc_compiler = "clang" + if options.compiler == "icc" or options.compiler == "icpc": + ref_compiler = "icpc" + refc_compiler = "icc" + else: + ispc_test = "ispc.exe" + ref_compiler = "cl.exe" + ispc_ref = options.ref + if options.ref != "": + options.ref = True + for counter in PATH_dir: + if os.path.exists(counter + os.sep + ispc_test): + ispc_test_exists = True + if os.path.exists(counter + os.sep + ref_compiler): + ref_compiler_exists = True + if os.path.exists(counter + os.sep + ispc_ref): + ispc_ref_exists = True + if not ispc_test_exists: + error("ISPC compiler not found.\nAdded path to ispc compiler to your PATH variable.\n", 1) + if not ref_compiler_exists: + error("C/C++ compiler %s not found.\nAdded path to %s compiler to your PATH variable.\n" % (ref_compiler, ref_compiler), 1) + if options.ref: + if not ispc_ref_exists: + error("ISPC reference compiler not found.\nAdded path to ispc reference compiler to your PATH variable.\n", 1) + + # checks that config file exists + path_config = os.path.normpath(options.config) + if os.path.exists(path_config) == False: + error("config file not found: %s.\nSet path to your config file in --config.\n" % options.config, 1) + sys.exit() + + # read lines from config file except comments + f = open(path_config, 'r') + f_lines = f.readlines() + f.close() + lines =[] + for i in range(len(f_lines)): + if f_lines[i][0] != "%": + lines.append(f_lines[i]) + length = len(lines) + + # prepare build.log, perf_temp and perf.log files + global perf_log + if options.in_file: + perf_log = pwd + options.in_file + common.remove_if_exists(perf_log) + else: + perf_log = "" + global build_log + build_log = pwd + os.sep + "logs" + os.sep + "perf_build.log" + common.remove_if_exists(build_log) + if os.path.exists(pwd + os.sep + "logs") == False: + os.makedirs(pwd + os.sep + "logs") + + global perf_temp + perf_temp = pwd + "perf_temp" + # end of preparations + + print_debug("Okey go go go!\n\n", s, perf_log) + + #print compilers versions + common.print_version(ispc_test, ispc_ref, ref_compiler, False, perf_log, is_windows) + + # begin + i = 0 + answer = [] + answer_ref = [] + + # loop for all tests + while i < length-2: + # we read name of test + print_debug("%s" % lines[i], s, perf_log) + test = [lines[i][:-1],[],[],[],[],[]] + test_ref = [lines[i][:-1],[],[],[],[],[]] + # read location of test + 
folder = lines[i+1] + folder = folder[:-1] + folder = os.path.normpath(options.path + os.sep + "examples" + os.sep + folder) + # check that test exists + if os.path.exists(folder) == False: + error("Can't find test %s. Your path is: \"%s\".\nChange current location to ISPC_HOME or set path to ISPC_HOME in --path.\n" % + (lines[i][:-1], options.path), 1) + os.chdir(folder) + # read parameters of test + command = lines[i+2] + command = command[:-1] + if is_windows == False: + ex_command_ref = "./ref " + command + " >> " + perf_temp + "_ref" + ex_command = "./test " + command + " >> " + perf_temp + "_test" + bu_command_ref = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=ref ISPC="+ispc_ref+" >> "+build_log+" 2>> "+build_log + bu_command = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=test ISPC="+ispc_test+" >> "+build_log+" 2>> "+build_log + re_command = "make clean >> "+build_log + else: + ex_command_ref = "x64\\Release\\ref.exe " + command + " >> " + perf_temp + "_ref" + ex_command = "x64\\Release\\test.exe " + command + " >> " + perf_temp + "_test" + bu_command_ref = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=ref /t:rebuild >> " + build_log + bu_command = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=test /t:rebuild >> " + build_log + re_command = "msbuild /t:clean >> " + build_log + commands = [ex_command, bu_command, ex_command_ref, bu_command_ref, re_command] + # parsing config parameters + next_line = lines[i+3] + if next_line[0] == "!": # we should take only one part of test output + R = next_line.split(' ') + c1 = int(R[1]) #c1 is a number of string which we want to use in test output + c2 = int(R[2]) #c2 is total number of strings in test output + i = i+1 + else: + c1 = 1 + c2 = 1 + next_line = lines[i+3] + if next_line[0] == "^": #we should concatenate result of this test with previous one + run_test(commands, c1, c2, answer[len(answer)-1], answer_ref[len(answer)-1], False) + i = i+1 + else: #we run this test and append it's result to answer structure + run_test(commands, c1, c2, test, test_ref, True) + answer.append(test) + answer_ref.append(test_ref) + + # preparing next loop iteration + os.chdir(pwd1) + i+=4 + + # delete temp file + common.remove_if_exists(perf_temp+"_test") + common.remove_if_exists(perf_temp+"_ref") + + #print collected answer + print_debug("\n\nTEST COMPILER:\n", s, perf_log) + A = print_answer(answer) + if options.ref != "": + print_debug("\n\nREFERENCE COMPILER:\n", s, perf_log) + B = print_answer(answer_ref) + # print perf report + compare(A,B) + + + +###Main### +from optparse import OptionParser +import sys +import os +import operator +import time +import glob +import string +import platform +# our functions +import common +print_debug = common.print_debug +error = common.error + +if __name__ == "__main__": + # parsing options + parser = OptionParser() + parser.add_option('-n', '--number', dest='number', + help='number of repeats', default="3") + parser.add_option('-c', '--config', dest='config', + help='config file of tests', default="./perf.ini") + parser.add_option('-p', '--path', dest='path', + help='path to ispc root', default=".") + parser.add_option('-s', '--silent', dest='silent', + help='silent mode, only table output', default=False, action="store_true") + parser.add_option('-o', '--output', dest='output', + help='output file for script reading', default="") + parser.add_option('--compiler', dest='compiler', + help='C/C++ compiler', default="") 
+ parser.add_option('-r', '--ref', dest='ref', + help='set reference compiler for compare', default="") + parser.add_option('-f', '--file', dest='in_file', + help='file to save perf output', default="") + (options, args) = parser.parse_args() + perf(options, args) diff --git a/run_tests.py b/run_tests.py index 7c6b1eb8..4ee80fe3 100755 --- a/run_tests.py +++ b/run_tests.py @@ -1,179 +1,54 @@ #!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # test-running driver for ispc - -from optparse import OptionParser -import multiprocessing -from ctypes import c_int -import os -import sys -import glob -import re -import signal -import random -import string -import subprocess -import shlex -import platform -import tempfile -import os.path -import time - -# disable fancy error/warning printing with ANSI colors, so grepping for error -# messages doesn't get confused -os.environ["TERM"] = "dumb" - -# This script is affected by http://bugs.python.org/issue5261 on OSX 10.5 Leopard -# git history has a workaround for that issue. 
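Note on the perf.ini layout consumed by perf() above: each entry is read as a group of lines holding the test name, a directory under examples/, the arguments passed to ./test and ./ref, and an options line that may start with "!" (keep line c1 of c2 output lines) or "^" (accumulate the result into the previous test). A minimal Python sketch of a reader for that layout; the entry contents are purely illustrative and the details are inferred from the parsing loop above rather than taken from a real config:

    def read_perf_entries(lines):
        # Sketch of the entry layout parsed by perf(): name, folder under
        # examples/, command-line arguments, then an options line that may
        # begin with "!" (pick line c1 of c2) or "^" (fold into previous).
        # Details are inferred from the loop above, not from a spec.
        i = 0
        while i + 3 < len(lines):
            name, folder, command, opts = (lines[i + k].rstrip("\n") for k in range(4))
            if opts.startswith("!"):
                c1, c2 = (int(x) for x in opts.split()[1:3])
            else:
                c1, c2 = 1, 1
            yield name, folder, command, c1, c2, opts.startswith("^")
            i += 4

    # Hypothetical entry, four lines per test:
    sample = ["aobench\n", "aobench\n", "--scale 1\n", "\n"]
    for entry in read_perf_entries(sample):
        print(entry)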
- -is_windows = (platform.system() == 'Windows' or - 'CYGWIN_NT' in platform.system()) - -parser = OptionParser() -parser.add_option("-r", "--random-shuffle", dest="random", help="Randomly order tests", - default=False, action="store_true") -parser.add_option("-g", "--generics-include", dest="include_file", help="Filename for header implementing functions for generics", - default=None) -parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", - default="") -parser.add_option('-t', '--target', dest='target', - help='Set compilation target (neon, sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', - default="sse4") -parser.add_option('-a', '--arch', dest='arch', - help='Set architecture (arm, x86, x86-64)', - default="x86-64") -parser.add_option("-c", "--compiler", dest="compiler_exe", help="Compiler binary to use to run tests", - default=None) -parser.add_option('-o', '--no-opt', dest='no_opt', help='Disable optimization', - default=False, action="store_true") -parser.add_option('-j', '--jobs', dest='num_jobs', help='Maximum number of jobs to run in parallel', - default="1024", type="int") -parser.add_option('-v', '--verbose', dest='verbose', help='Enable verbose output', - default=False, action="store_true") -parser.add_option('--wrap-exe', dest='wrapexe', - help='Executable to wrap test runs with (e.g. "valgrind")', - default="") -parser.add_option('--time', dest='time', help='Enable time output', - default=False, action="store_true") - -(options, args) = parser.parse_args() - -if options.target == 'neon': - options.arch = 'arm' - -# use relative path to not depend on host directory, which may possibly -# have white spaces and unicode characters. -if not is_windows: - ispc_exe = "./ispc" -else: - ispc_exe = ".\\Release\\ispc.exe" - -# checks the required ispc compiler otherwise prints an error message -if not os.path.exists(ispc_exe): - sys.stderr.write("Fatal error: missing ispc compiler: %s\n" % ispc_exe) - sys.exit() - -ispc_exe += " " + options.ispc_flags - -if __name__ == '__main__': - sys.stdout.write("ispc compiler: %s\n" % ispc_exe) - -is_generic_target = (options.target.find("generic-") != -1 and - options.target != "generic-1") -if is_generic_target and options.include_file == None: - if options.target == "generic-4": - sys.stderr.write("No generics #include specified; using examples/intrinsics/sse4.h\n") - options.include_file = "examples/intrinsics/sse4.h" - elif options.target == "generic-8": - sys.stderr.write("No generics #include specified and no default available for \"generic-8\" target.\n") - sys.exit(1) - elif options.target == "generic-16": - sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-16.h\n") - options.include_file = "examples/intrinsics/generic-16.h" - elif options.target == "generic-32": - sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-32.h\n") - options.include_file = "examples/intrinsics/generic-32.h" - elif options.target == "generic-64": - sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-64.h\n") - options.include_file = "examples/intrinsics/generic-64.h" - -if options.compiler_exe == None: - if is_windows: - options.compiler_exe = "cl.exe" - else: - options.compiler_exe = "g++" - -# checks the required compiler otherwise prints an error message -PATH_dir = string.split(os.getenv("PATH"), os.pathsep) -compiler_exists = False - -for counter in PATH_dir: - if 
os.path.exists(counter + os.sep + options.compiler_exe): - compiler_exists = True - break - -if not compiler_exists: - sys.stderr.write("Fatal error: missing the required compiler: %s \n" % - options.compiler_exe) - sys.exit() - -ispc_root = "." - -# if no specific test files are specified, run all of the tests in tests/, -# failing_tests/, and tests_errors/ -if len(args) == 0: - files = glob.glob(ispc_root + os.sep + "tests" + os.sep + "*ispc") + \ - glob.glob(ispc_root + os.sep + "failing_tests" + os.sep + "*ispc") + \ - glob.glob(ispc_root + os.sep + "tests_errors" + os.sep + "*ispc") -else: - if is_windows: - argfiles = [ ] - for f in args: - # we have to glob ourselves if this is being run under a DOS - # shell, as it passes wildcard as is. - argfiles += glob.glob(f) - else: - argfiles = args - - files = [ ] - for f in argfiles: - if os.path.splitext(string.lower(f))[1] != ".ispc": - sys.stdout.write("Ignoring file %s, which doesn't have an .ispc extension.\n" % f) - else: - files += [ f ] - -# max_test_length is used to issue exact number of whitespace characters when -# updating status. Otherwise update causes new lines standard 80 char terminal -# on both Linux and Windows. -max_test_length = 0 -for f in files: - max_test_length = max(max_test_length, len(f)) - -# randomly shuffle the tests if asked to do so -if (options.random): - random.seed() - random.shuffle(files) - -# counter -total_tests = 0 - - # utility routine to print an update on the number of tests that have been # finished. Should be called with the lock held.. def update_progress(fn, total_tests_arg, counter, max_test_length_arg): counter.value += 1 - progress_str = " Done %d / %d [%s]" % (counter.value, total_tests_arg, fn) - # spaces to clear out detrius from previous printing... - spaces_needed = max_test_length_arg - len(fn) - for x in range(spaces_needed): - progress_str += ' ' - progress_str += '\r' - sys.stdout.write(progress_str) - sys.stdout.flush() + if options.non_interactive == False: + progress_str = " Done %d / %d [%s]" % (counter.value, total_tests_arg, fn) + # spaces to clear out detrius from previous printing... + spaces_needed = max_test_length_arg - len(fn) + for x in range(spaces_needed): + progress_str += ' ' + progress_str += '\r' + sys.stdout.write(progress_str) + sys.stdout.flush() def run_command(cmd): if options.verbose: - sys.stdout.write("Running: %s\n" % cmd) + print_debug("Running: %s\n" % cmd, s, run_tests_log) # Here's a bit tricky part. To pass a command for execution we should # break down the line in to arguments. 
shlex class is designed exactly @@ -201,9 +76,9 @@ def run_cmds(compile_cmds, run_cmd, filename, expect_failure): (return_code, output) = run_command(cmd) compile_failed = (return_code != 0) if compile_failed: - sys.stdout.write("Compilation of test %s failed \n" % filename) + print_debug("Compilation of test %s failed \n" % filename, s, run_tests_log) if output != "": - sys.stdout.write("%s" % output.encode("utf-8")) + print_debug("%s" % output.encode("utf-8"), s, run_tests_log) return (1, 0) (return_code, output) = run_command(run_cmd) @@ -212,11 +87,11 @@ def run_cmds(compile_cmds, run_cmd, filename, expect_failure): surprise = ((expect_failure and not run_failed) or (not expect_failure and run_failed)) if surprise == True: - sys.stderr.write("Test %s %s (return code %d) \n" % \ + print_debug("Test %s %s (return code %d) \n" % \ (filename, "unexpectedly passed" if expect_failure else "failed", - return_code)) + return_code), s, run_tests_log) if output != "": - sys.stdout.write("%s\n" % output.encode("utf-8")) + print_debug("%s\n" % output.encode("utf-8"), s, run_tests_log) if surprise == True: return (0, 1) else: @@ -231,7 +106,7 @@ def add_prefix(path): else: input_prefix = "" path = input_prefix + path - path = os.path.normpath(path) + path = os.path.abspath(path) return path @@ -294,12 +169,12 @@ def run_test(testname): firstline = firstline.rstrip() file.close() - if (output.find(firstline) == -1): - sys.stderr.write("Didn't see expected error message %s from test %s.\nActual output:\n%s\n" % \ - (firstline, testname, output)) + if re.search(firstline, output) == None: + print_debug("Didn't see expected error message %s from test %s.\nActual output:\n%s\n" % \ + (firstline, testname, output), s, run_tests_log) return (1, 0) elif got_error == False: - sys.stderr.write("Unexpectedly no errors issued from test %s\n" % testname) + print_debug("Unexpectedly no errors issued from test %s\n" % testname, s, run_tests_log) return (1, 0) else: return (0, 0) @@ -325,8 +200,7 @@ def run_test(testname): break file.close() if match == -1: - sys.stderr.write("Fatal error: unable to find function signature " + \ - "in test %s\n" % testname) + error("unable to find function signature in test %s\n" % testname, 0) return (1, 0) else: global is_generic_target @@ -359,10 +233,13 @@ def run_test(testname): gcc_isa="" if options.target == 'generic-4': gcc_isa = '-msse4.2' - if options.target == 'generic-8': + if (options.target == 'generic-8'): + if (options.include_file.find("knc-i1x8.h")!=-1 or options.include_file.find("knc-i1x8unsafe_fast.h")!=-1): + gcc_isa = '-mmic' + else: gcc_isa = '-mavx' if (options.target == 'generic-16' or options.target == 'generic-32' or options.target == 'generic-64') \ - and (options.include_file.find("knc.h")!=-1 or options.include_file.find("knc2x.h")!=-1): + and (options.include_file.find("knc-i1x16.h")!=-1 or options.include_file.find("knc.h")!=-1 or options.include_file.find("knc2x.h")!=-1): gcc_isa = '-mmic' cc_cmd = "%s -O2 -I. %s %s test_static.cpp -DTEST_SIG=%d %s -o %s" % \ @@ -401,7 +278,21 @@ def run_test(testname): # pull tests to run from the given queue and run them. Multiple copies of # this function will be running in parallel across all of the CPU cores of # the system. 
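The glob_var argument added to run_tasks_from_queue() in the next hunk exists because multiprocessing on Windows spawns fresh interpreters instead of forking, so module-level globals set after import in the parent are not visible to worker processes and must be passed explicitly. A self-contained Python sketch of the same pattern; the names here are illustrative, not the actual run_tests.py globals:

    import multiprocessing

    def worker(queue, shared_state):
        # On Windows there is no fork(): each worker re-imports the module,
        # so any parent-process state it needs must arrive through args,
        # just as run_tasks_from_queue() unpacks glob_var.
        is_windows, verbose = shared_state
        while True:
            item = queue.get()
            if item is None:              # sentinel: no more work
                break
            if verbose:
                print("running", item, "on", "Windows" if is_windows else "POSIX")

    if __name__ == "__main__":
        q = multiprocessing.Queue()
        for name in ["a.ispc", "b.ispc"]:
            q.put(name)
        q.put(None)
        glob_var = (False, True)          # stands in for the real shared tuple
        p = multiprocessing.Process(target=worker, args=(q, glob_var))
        p.start()
        p.join()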
-def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test_length_arg, counter, mutex): +def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test_length_arg, counter, mutex, glob_var): + # This is needed on windows because windows doen't copy globals from parent process whili multiprocessing + global is_windows + is_windows = glob_var[0] + global options + options = glob_var[1] + global s + s = glob_var[2] + global ispc_exe + ispc_exe = glob_var[3] + global is_generic_target + is_generic_target = glob_var[4] + global run_tests_log + run_tests_log = glob_var[5] + if is_windows: tmpdir = "tmp%d" % os.getpid() os.mkdir(tmpdir) @@ -444,14 +335,266 @@ def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test skip_files += [ filename ] -task_threads = [] - def sigint(signum, frame): for t in task_threads: t.terminate() sys.exit(1) -if __name__ == '__main__': + +def file_check(compfails, runfails): + errors = len(compfails) + len(runfails) + new_compfails = [] + new_runfails = [] + new_passes_compfails = [] + new_passes_runfails = [] +# Open file fail_db.txt + f = open(test_states, 'r') + f_lines = f.readlines() + f.close() +# Detect OS + if platform.system() == 'Windows' or 'CYGWIN_NT' in platform.system(): + OS = "Windows" + else: + if platform.system() == 'Darwin': + OS = "Mac" + else: + OS = "Linux" +# Detect opt_set + if options.no_opt == True: + opt = "-O0" + else: + opt = "-O2" +# Detect LLVM version + temp1 = common.take_lines(ispc_exe + " --version", "first") + llvm_version = temp1[-10:-2] +# Detect compiler version + if is_windows == False: + temp1 = common.take_lines(options.compiler_exe + " --version", "first") + temp2 = re.search("[0-9]*\.[0-9]*\.[0-9]", temp1) + if temp2 == None: + temp3 = re.search("[0-9]*\.[0-9]*", temp1) + else: + temp3 = re.search("[0-9]*\.[0-9]*", temp2.group()) + compiler_version = options.compiler_exe + temp3.group() + else: + compiler_version = "cl" + new_line = " "+options.arch.rjust(6)+" "+options.target.rjust(14)+" "+OS.rjust(7)+" "+llvm_version+" "+compiler_version.rjust(10)+" "+opt+" *\n" + + new_compfails = compfails[:] + new_runfails = runfails[:] + new_f_lines = f_lines[:] + for j in range(0, len(f_lines)): + if (((" "+options.arch+" ") in f_lines[j]) and + ((" "+options.target+" ") in f_lines[j]) and + ((" "+OS+" ") in f_lines[j]) and + ((" "+llvm_version+" ") in f_lines[j]) and + ((" "+compiler_version+" ") in f_lines[j]) and + ((" "+opt+" ") in f_lines[j])): + if (" compfail " in f_lines[j]): + f = 0 + for i in range(0, len(compfails)): + if compfails[i] in f_lines[j]: + new_compfails.remove(compfails[i]) + else: + f = f + 1 + if f == len(compfails): + temp3 = f_lines[j].split(" ") + new_passes_compfails.append(temp3[0]) + if options.update == "FP": + new_f_lines.remove(f_lines[j]) + if (" runfail " in f_lines[j]): + f = 0 + for i in range(0, len(runfails)): + if runfails[i] in f_lines[j]: + new_runfails.remove(runfails[i]) + else: + f = f + 1 + if f == len(runfails): + temp3 = f_lines[j].split(" ") + new_passes_runfails.append(temp3[0]) + if options.update == "FP": + new_f_lines.remove(f_lines[j]) + if len(new_runfails) != 0: + print_debug("NEW RUNFAILS:\n", s, run_tests_log) + for i in range (0,len(new_runfails)): + new_f_lines.append(new_runfails[i] + " runfail " + new_line) + print_debug("\t" + new_runfails[i] + "\n", s, run_tests_log) + if len(new_compfails) != 0: + print_debug("NEW COMPFAILS:\n", s, run_tests_log) + for i in range (0,len(new_compfails)): + 
new_f_lines.append(new_compfails[i] + " compfail " + new_line) + print_debug("\t" + new_compfails[i] + "\n", s, run_tests_log) + if len(new_runfails) == 0 and len(new_compfails) == 0: + print_debug("No new fails\n", s, run_tests_log) + if len(new_passes_runfails) != 0: + print_debug("NEW PASSES after RUNFAILS:\n", s, run_tests_log) + for i in range (0,len(new_passes_runfails)): + print_debug("\t" + new_passes_runfails[i] + "\n", s, run_tests_log) + if len(new_passes_compfails) != 0: + print_debug("NEW PASSES after COMPFAILS:\n", s, run_tests_log) + for i in range (0,len(new_passes_compfails)): + print_debug("\t" + new_passes_compfails[i] + "\n", s, run_tests_log) + + if options.update != "": + output = open(test_states, 'w') + output.writelines(new_f_lines) + output.close() + return [new_runfails, new_compfails, new_passes_runfails, new_passes_compfails, new_line, errors] + +def verify(): + # Open file fail_db.txt + f = open(test_states, 'r') + f_lines = f.readlines() + f.close() + check = [["g++", "clang", "cl"],["-O0", "-O2"],["x86","x86-64"], + ["Linux","Windows","Mac"],["LLVM 3.1","LLVM 3.2","LLVM 3.3","LLVM head"], + ["sse2-i32x4", "sse2-i32x8", "sse4-i32x4", "sse4-i32x8", "sse4-i16x8", + "sse4-i8x16", "avx1-i32x8", "avx1-i32x16", "avx1-i64x4", "avx1.1-i32x8", "avx1.1-i32x16", + "avx2-i32x8", "avx2-i32x16", "generic-1", "generic-4", "generic-8", + "generic-16", "generic-32", "generic-64"]] + for i in range (0,len(f_lines)): + if f_lines[i][0] == "%": + continue + for j in range(0,len(check)): + temp = 0 + for t in range(0,len(check[j])): + if " " + check[j][t] + " " in f_lines[i]: + temp = temp + 1 + if temp != 1: + print_debug("error in line " + str(i) + "\n", False, run_tests_log) + break + + +def run_tests(options1, args, print_version): + global options + options = options1 + global s + s = options.silent + + # prepare run_tests_log and fail_db files + global run_tests_log + if options.in_file: + run_tests_log = os.getcwd() + os.sep + options.in_file + if print_version == 1: + common.remove_if_exists(run_tests_log) + else: + run_tests_log = "" + global test_states + test_states = "fail_db.txt" + if options.verify: + verify() + return 0 + + # disable fancy error/warning printing with ANSI colors, so grepping for error + # messages doesn't get confused + os.environ["TERM"] = "dumb" + + # This script is affected by http://bugs.python.org/issue5261 on OSX 10.5 Leopard + # git history has a workaround for that issue. + global is_windows + is_windows = (platform.system() == 'Windows' or + 'CYGWIN_NT' in platform.system()) + + if options.target == 'neon': + options.arch = 'arm' + + # use relative path to not depend on host directory, which may possibly + # have white spaces and unicode characters. 
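For reference, file_check() above stores one record per known failure in fail_db.txt: the test name, the word "runfail" or "compfail", then the right-justified arch / target / OS / LLVM version / compiler / optimization columns built in new_line, terminated by " *". A hedged Python sketch of that record format; the sample values are made up:

    def fail_db_line(test, kind, arch, target, OS, llvm_version, compiler, opt):
        # Mirrors the new_line construction in file_check(); the trailing " *"
        # marks the end of the record.
        return (test + " " + kind + "  " + arch.rjust(6) + " " + target.rjust(14) +
                " " + OS.rjust(7) + " " + llvm_version + " " + compiler.rjust(10) +
                " " + opt + " *\n")

    # Hypothetical entry -- the values are illustrative only.
    print(fail_db_line("tests/foo.ispc", "runfail", "x86-64", "avx2-i32x8",
                       "Linux", "LLVM 3.3", "g++4.7", "-O2"), end="")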
+ global ispc_exe + if not is_windows: + ispc_exe = "./ispc" + else: + ispc_exe = ".\\Release\\ispc.exe" + + # checks the required ispc compiler otherwise prints an error message + if not os.path.exists(ispc_exe): + error("missing ispc compiler: %s\n" % ispc_exe, 1) + ispc_exe += " " + options.ispc_flags + print_debug("ispc compiler: %s\n" % ispc_exe, s, run_tests_log) + + global is_generic_target + is_generic_target = (options.target.find("generic-") != -1 and + options.target != "generic-1" and options.target != "generic-x1") + if is_generic_target and options.include_file == None: + if options.target == "generic-4" or options.target == "generic-x4": + error("No generics #include specified; using examples/intrinsics/sse4.h\n", 2) + options.include_file = "examples/intrinsics/sse4.h" + options.target = "generic-4" + elif options.target == "generic-8" or options.target == "generic-x8": + error("No generics #include specified and no default available for \"generic-8\" target.\n", 1) + options.target = "generic-8" + elif options.target == "generic-16" or options.target == "generic-x16": + error("No generics #include specified; using examples/intrinsics/generic-16.h\n", 2) + options.include_file = "examples/intrinsics/generic-16.h" + options.target = "generic-16" + elif options.target == "generic-32" or options.target == "generic-x32": + error("No generics #include specified; using examples/intrinsics/generic-32.h\n", 2) + options.include_file = "examples/intrinsics/generic-32.h" + options.target = "generic-32" + elif options.target == "generic-64" or options.target == "generic-x64": + error("No generics #include specified; using examples/intrinsics/generic-64.h\n", 2) + options.include_file = "examples/intrinsics/generic-64.h" + options.target = "generic-64" + + if options.compiler_exe == None: + if is_windows: + options.compiler_exe = "cl.exe" + else: + options.compiler_exe = "g++" + + # checks the required compiler otherwise prints an error message + PATH_dir = string.split(os.getenv("PATH"), os.pathsep) + compiler_exists = False + + for counter in PATH_dir: + if os.path.exists(counter + os.sep + options.compiler_exe): + compiler_exists = True + break + + if not compiler_exists: + error("missing the required compiler: %s \n" % options.compiler_exe, 1) + + # print compilers versions + if print_version > 0: + common.print_version(ispc_exe, "", options.compiler_exe, False, run_tests_log, is_windows) + + ispc_root = "." + + # if no specific test files are specified, run all of the tests in tests/, + # failing_tests/, and tests_errors/ + if len(args) == 0: + files = glob.glob(ispc_root + os.sep + "tests" + os.sep + "*ispc") + \ + glob.glob(ispc_root + os.sep + "tests_errors" + os.sep + "*ispc") + else: + if is_windows: + argfiles = [ ] + for f in args: + # we have to glob ourselves if this is being run under a DOS + # shell, as it passes wildcard as is. + argfiles += glob.glob(f) + else: + argfiles = args + + files = [ ] + for f in argfiles: + if os.path.splitext(string.lower(f))[1] != ".ispc": + error("Ignoring file %s, which doesn't have an .ispc extension.\n" % f, 2) + else: + files += [ f ] + + # max_test_length is used to issue exact number of whitespace characters when + # updating status. Otherwise update causes new lines standard 80 char terminal + # on both Linux and Windows. 
+ max_test_length = 0 + for f in files: + max_test_length = max(max_test_length, len(f)) + + # randomly shuffle the tests if asked to do so + if (options.random): + random.seed() + random.shuffle(files) + + # counter total_tests = len(files) compile_error_files = [ ] @@ -460,7 +603,7 @@ if __name__ == '__main__': nthreads = min(multiprocessing.cpu_count(), options.num_jobs) nthreads = min(nthreads, len(files)) - sys.stdout.write("Running %d jobs in parallel. Running %d tests.\n" % (nthreads, total_tests)) + print_debug("Running %d jobs in parallel. Running %d tests.\n" % (nthreads, total_tests), s, run_tests_log) # put each of the test filenames into a queue q = multiprocessing.Queue() @@ -480,41 +623,111 @@ if __name__ == '__main__': start_time = time.time() # launch jobs to run tests + glob_var = [is_windows, options, s, ispc_exe, is_generic_target, run_tests_log] + global task_threads + task_threads = [0] * nthreads for x in range(nthreads): - t = multiprocessing.Process(target=run_tasks_from_queue, args=(q, qret, qskip, total_tests, max_test_length, finished_tests_counter, finished_tests_counter_lock)) - task_threads.append(t) - t.start() - + task_threads[x] = multiprocessing.Process(target=run_tasks_from_queue, args=(q, qret, qskip, total_tests, + max_test_length, finished_tests_counter, finished_tests_counter_lock, glob_var)) + task_threads[x].start() # wait for them to all finish and then return the number that failed # (i.e. return 0 if all is ok) for t in task_threads: t.join() - sys.stdout.write("\n") + if options.non_interactive == False: + print_debug("\n", s, run_tests_log) elapsed_time = time.time() - start_time - if options.time: - sys.stdout.write("Elapsed time: %d s\n" % elapsed_time) while not qret.empty(): - (c, r, s) = qret.get() + (c, r, skip) = qret.get() compile_error_files += c run_error_files += r - skip_files += s + skip_files += skip + if options.non_interactive: + print_debug(" Done %d / %d\n" % (finished_tests_counter.value, total_tests), s, run_tests_log) if len(skip_files) > 0: skip_files.sort() - sys.stdout.write("%d / %d tests SKIPPED:\n" % (len(skip_files), total_tests)) + print_debug("%d / %d tests SKIPPED:\n" % (len(skip_files), total_tests), s, run_tests_log) for f in skip_files: - sys.stdout.write("\t%s\n" % f) + print_debug("\t%s\n" % f, s, run_tests_log) if len(compile_error_files) > 0: compile_error_files.sort() - sys.stdout.write("%d / %d tests FAILED compilation:\n" % (len(compile_error_files), total_tests)) + print_debug("%d / %d tests FAILED compilation:\n" % (len(compile_error_files), total_tests), s, run_tests_log) for f in compile_error_files: - sys.stdout.write("\t%s\n" % f) + print_debug("\t%s\n" % f, s, run_tests_log) if len(run_error_files) > 0: run_error_files.sort() - sys.stdout.write("%d / %d tests FAILED execution:\n" % (len(run_error_files), total_tests)) + print_debug("%d / %d tests FAILED execution:\n" % (len(run_error_files), total_tests), s, run_tests_log) for f in run_error_files: - sys.stdout.write("\t%s\n" % f) + print_debug("\t%s\n" % f, s, run_tests_log) + if len(compile_error_files) == 0 and len(run_error_files) == 0: + print_debug("No fails\n", s, run_tests_log) - sys.exit(len(compile_error_files) + len(run_error_files)) + R = file_check(compile_error_files, run_error_files) + + if options.time: + print_debug("Elapsed time: %d s\n" % elapsed_time, s, run_tests_log) + + return R + + +from optparse import OptionParser +import multiprocessing +from ctypes import c_int +import os +import sys +import glob +import re +import 
signal +import random +import string +import subprocess +import shlex +import platform +import tempfile +import os.path +import time +# our functions +import common +print_debug = common.print_debug +error = common.error + +if __name__ == "__main__": + parser = OptionParser() + parser.add_option("-r", "--random-shuffle", dest="random", help="Randomly order tests", + default=False, action="store_true") + parser.add_option("-g", "--generics-include", dest="include_file", help="Filename for header implementing functions for generics", + default=None) + parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", + default="") + parser.add_option('-t', '--target', dest='target', + help='Set compilation target (sse2-i32x4, sse2-i32x8, sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, avx1-i32x8, avx1-i32x16, avx1.1-i32x8, avx1.1-i32x16, avx2-i32x8, avx2-i32x16, generic-x1, generic-x4, generic-x8, generic-x16, generic-x32, generic-x64)', + default="sse4") + parser.add_option('-a', '--arch', dest='arch', + help='Set architecture (arm, x86, x86-64)', + default="x86-64") + parser.add_option("-c", "--compiler", dest="compiler_exe", help="C/C++ compiler binary to use to run tests", + default=None) + parser.add_option('-o', '--no-opt', dest='no_opt', help='Disable optimization', + default=False, action="store_true") + parser.add_option('-j', '--jobs', dest='num_jobs', help='Maximum number of jobs to run in parallel', + default="1024", type="int") + parser.add_option('-v', '--verbose', dest='verbose', help='Enable verbose output', + default=False, action="store_true") + parser.add_option('--wrap-exe', dest='wrapexe', + help='Executable to wrap test runs with (e.g. "valgrind")', + default="") + parser.add_option('--time', dest='time', help='Enable time output', + default=False, action="store_true") + parser.add_option('--non-interactive', dest='non_interactive', help='Disable interactive status updates', + default=False, action="store_true") + parser.add_option('-u', "--update-errors", dest='update', help='Update file with fails (F of FP)', default="") + parser.add_option('-s', "--silent", dest='silent', help='enable silent mode without any output', default=False, + action = "store_true") + parser.add_option("--file", dest='in_file', help='file to save run_tests output', default="") + parser.add_option("--verify", dest='verify', help='verify the file fail_db.txt', default=False, action="store_true") + (options, args) = parser.parse_args() + L = run_tests(options, args, 1) + exit(0) diff --git a/stdlib.ispc b/stdlib.ispc index 4e06f5da..9b02d0ba 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -38,12 +38,23 @@ ispc code */ -#ifdef ISPC_TARGET_GENERIC -#define IntMaskType bool -#define UIntMaskType bool +#if (ISPC_MASK_BITS == 1) + #define IntMaskType bool + #define UIntMaskType bool +#elif (ISPC_MASK_BITS == 8) + #define IntMaskType int8 + #define UIntMaskType unsigned int8 +#elif (ISPC_MASK_BITS == 16) + #define IntMaskType int16 + #define UIntMaskType unsigned int16 +#elif (ISPC_MASK_BITS == 32) + #define IntMaskType int32 + #define UIntMaskType unsigned int32 +#elif (ISPC_MASK_BITS == 64) + #define IntMaskType int64 + #define UIntMaskType unsigned int64 #else -#define IntMaskType int32 -#define UIntMaskType unsigned int32 + #error Unknown value of ISPC_MASK_BITS #endif /////////////////////////////////////////////////////////////////////////// @@ -335,14 +346,15 @@ static inline int32 sign_extend(bool v) { return __sext_varying_bool(v); } + __declspec(safe) 
static inline uniform bool any(bool v) { // We only care about whether "any" is true for the active program instances, // so we have to make v with the current program mask. -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __any(v & __mask); #else - return __any(__sext_varying_bool(v) & __mask); + return __any((UIntMaskType)__sext_varying_bool(v) & __mask); #endif } @@ -350,11 +362,10 @@ __declspec(safe) static inline uniform bool all(bool v) { // As with any(), we need to explicitly mask v with the current program mask // so we're only looking at the current lanes - -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __all(v | !__mask); #else - return __all(__sext_varying_bool(v) | !__mask); + return __all((UIntMaskType)__sext_varying_bool(v) | !__mask); #endif } @@ -362,11 +373,10 @@ __declspec(safe) static inline uniform bool none(bool v) { // As with any(), we need to explicitly mask v with the current program mask // so we're only looking at the current lanes - -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __none(v & __mask); #else - return __none(__sext_varying_bool(v) & __mask); + return __none((UIntMaskType)__sext_varying_bool(v) & __mask); #endif } @@ -399,10 +409,10 @@ static inline int popcnt(int64 v) { __declspec(safe) static inline uniform int popcnt(bool v) { // As with any() and all(), only count across the active lanes -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __popcnt_int64(__movmsk(v & __mask)); #else - return __popcnt_int64(__movmsk(__sext_varying_bool(v) & __mask)); + return __popcnt_int64(__movmsk((UIntMaskType)__sext_varying_bool(v) & __mask)); #endif } @@ -880,21 +890,45 @@ static inline uniform double select(uniform bool c, uniform double a, /////////////////////////////////////////////////////////////////////////// // Horizontal ops / reductions +__declspec(safe) +static inline uniform int16 reduce_add(int8 x) { + return __reduce_add_int8(__mask ? x : (int8)0); +} + +__declspec(safe) +static inline uniform unsigned int16 reduce_add(unsigned int8 x) { + return __reduce_add_int8(__mask ? x : (int8)0); +} + +__declspec(safe) +static inline uniform int32 reduce_add(int16 x) { + return __reduce_add_int16(__mask ? x : (int16)0); +} + +__declspec(safe) +static inline uniform unsigned int32 reduce_add(unsigned int16 x) { + return __reduce_add_int16(__mask ? x : (int16)0); +} + __declspec(safe) static inline uniform float reduce_add(float x) { // zero the lanes where the mask is off return __reduce_add_float(__mask ? x : 0.); } - __declspec(safe) static inline uniform float reduce_min(float v) { // For the lanes where the mask is off, replace the given value with // infinity, so that it doesn't affect the result. int iflt_max = 0x7f800000; // infinity - // Must use __floatbits_varying_int32, not floatbits(), since with the - // latter the current mask enters into the returned result... - return __reduce_min_float(__mask ? v : __floatbits_varying_int32(iflt_max)); + // unmasked block is needed to make sure that argument for unmasked + // function __reduce_min_float() are calculated without a mask. + bool test = __mask; + uniform float result; + unmasked { + result = __reduce_min_float(test ? v : floatbits(iflt_max)); + } + return result; } __declspec(safe) @@ -902,13 +936,18 @@ static inline uniform float reduce_max(float v) { // For the lanes where the mask is off, replace the given value with // negative infinity, so that it doesn't affect the result. 
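The rewritten reductions above all follow the same masked-reduction pattern: lanes whose mask bit is off are replaced by the identity element of the operation (0 for add, +infinity for min, -infinity for max), and that substitution now happens inside an unmasked block so the argument reaching the unmasked __reduce_*() helper is computed without the current mask. A small Python model of the lane-masking idea (plain lists stand in for varying values; this is not ispc code):

    import math

    def masked_reduce_min(values, mask):
        # Off lanes contribute +infinity, the identity for min,
        # so only active lanes can influence the result.
        return min(v if m else math.inf for v, m in zip(values, mask))

    def masked_reduce_add(values, mask):
        # Off lanes contribute 0, the identity for addition.
        return sum(v if m else 0 for v, m in zip(values, mask))

    assert masked_reduce_min([3.0, -5.0, 7.0, 2.0], [True, False, True, True]) == 2.0
    assert masked_reduce_add([1, 2, 3, 4], [True, True, False, False]) == 3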
const int iflt_neg_max = 0xff800000; // -infinity - // Must use __floatbits_varying_int32, not floatbits(), since with the - // latter the current mask enters into the returned result... - return __reduce_max_float(__mask ? v : __floatbits_varying_int32(iflt_neg_max)); + // unmasked block is needed to make sure that argument for unmasked + // function __reduce_max_float() are calculated without a mask. + bool test = __mask; + uniform float result; + unmasked { + result = __reduce_max_float(test ? v : floatbits(iflt_neg_max)); + } + return result; } __declspec(safe) -static inline uniform int reduce_add(int x) { +static inline uniform int64 reduce_add(int32 x) { // Zero out the values for lanes that aren't running return __reduce_add_int32(__mask ? x : 0); } @@ -930,7 +969,7 @@ static inline uniform int reduce_max(int v) { } __declspec(safe) -static inline uniform unsigned int reduce_add(unsigned int x) { +static inline uniform unsigned int64 reduce_add(unsigned int32 x) { // Set values for non-running lanes to zero so they don't affect the // result. return __reduce_add_int32(__mask ? x : 0); @@ -960,17 +999,27 @@ static inline uniform double reduce_add(double x) { __declspec(safe) static inline uniform double reduce_min(double v) { int64 iflt_max = 0x7ff0000000000000; // infinity - // Must use __doublebits_varying_int64, not doublebits(), since with the - // latter the current mask enters into the returned result... - return __reduce_min_double(__mask ? v : __doublebits_varying_int64(iflt_max)); + // unmasked block is needed to make sure that argument for unmasked + // function __reduce_min_double() are calculated without a mask. + bool test = __mask; + uniform double result; + unmasked { + result = __reduce_min_double(test ? v : doublebits(iflt_max)); + } + return result; } __declspec(safe) static inline uniform double reduce_max(double v) { const int64 iflt_neg_max = 0xfff0000000000000; // -infinity - // Must use __doublebits_varying_int64, not doublebits(), since with the - // latter the current mask enters into the returned result... - return __reduce_max_double(__mask ? v : __doublebits_varying_int64(iflt_neg_max)); + // unmasked block is needed to make sure that argument for unmasked + // function __reduce_max_double() are calculated without a mask. + bool test = __mask; + uniform double result; + unmasked { + result = __reduce_max_double(test ? v : doublebits(iflt_neg_max)); + } + return result; } __declspec(safe) @@ -1325,88 +1374,88 @@ static inline uniform double max(uniform double a, uniform double b) { // int8 -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int8 min(uniform unsigned int8 a, uniform unsigned int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int8 max(uniform unsigned int8 a, uniform unsigned int8 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int8 min(uniform int8 a, uniform int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int8 max(uniform int8 a, uniform int8 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int8 min(unsigned int8 a, unsigned int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int8 max(unsigned int8 a, unsigned int8 b) { return (a > b) ? 
a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int8 min(int8 a, int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int8 max(int8 a, int8 b) { return (a > b) ? a : b; } // int16 -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int16 min(uniform unsigned int16 a, uniform unsigned int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int16 max(uniform unsigned int16 a, uniform unsigned int16 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int16 min(uniform int16 a, uniform int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int16 max(uniform int16 a, uniform int16 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int16 min(unsigned int16 a, unsigned int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int16 max(unsigned int16 a, unsigned int16 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int16 min(int16 a, int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int16 max(int16 a, int16 b) { return (a > b) ? a : b; } @@ -1510,6 +1559,18 @@ static inline uniform float clamp(uniform float v, uniform float low, uniform fl return min(max(v, low), high); } +// double + +__declspec(safe,cost2) +static inline double clamp(double v, double low, double high) { + return min(max(v, low), high); +} + +__declspec(safe,cost2) +static inline uniform double clamp(uniform double v, uniform double low, uniform double high) { + return min(max(v, low), high); +} + // int8 __declspec(safe,cost2) @@ -2134,7 +2195,7 @@ static inline uniform float frexp(uniform float x, uniform int * uniform pw2) { __declspec(safe) static inline float sin(float x_full) { if (__math_lib == __math_lib_svml) { - return __svml_sin(x_full); + return __svml_sinf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -2267,8 +2328,10 @@ static inline float asin(float x) { bool isnan = (x > 1); float v; - if (__math_lib == __math_lib_svml || - __math_lib == __math_lib_system) { + if (__math_lib == __math_lib_svml) { + return __svml_asinf(x); + } + else if (__math_lib == __math_lib_system) { float ret; foreach_active (i) { uniform float r = __stdlib_asinf(extract(x, i)); @@ -2371,7 +2434,7 @@ static inline uniform float asin(uniform float x) { __declspec(safe) static inline float cos(float x_full) { if (__math_lib == __math_lib_svml) { - return __svml_cos(x_full); + return __svml_cosf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -2499,18 +2562,28 @@ static inline float acos(float v) { return 1.57079637050628662109375 - asin(v); } +__declspec(safe) +static inline double acos(const double v) { + return 1.57079637050628662109375d0 - asin(v); +} + __declspec(safe) static inline uniform float acos(uniform float v) { return 1.57079637050628662109375 - asin(v); } +__declspec(safe) +static inline uniform double acos(const uniform double v) { + return 1.57079637050628662109375d0 - asin(v); +} + __declspec(safe) static inline void sincos(float x_full, varying float * uniform sin_result, varying float * uniform cos_result) { if (__math_lib == __math_lib_svml) { - __svml_sincos(x_full, sin_result, cos_result); + __svml_sincosf(x_full, sin_result, 
cos_result); } else if (__math_lib == __math_lib_system) { foreach_active (i) { @@ -2642,7 +2715,7 @@ static inline void sincos(uniform float x_full, uniform float * uniform sin_resu __declspec(safe) static inline float tan(float x_full) { if (__math_lib == __math_lib_svml) { - return __svml_tan(x_full); + return __svml_tanf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -2793,7 +2866,7 @@ static inline uniform float tan(uniform float x_full) { __declspec(safe) static inline float atan(float x_full) { if (__math_lib == __math_lib_svml) { - return __svml_atan(x_full); + return __svml_atanf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -2888,7 +2961,7 @@ static inline uniform float atan(uniform float x_full) { __declspec(safe) static inline float atan2(float y, float x) { if (__math_lib == __math_lib_svml) { - return __svml_atan2(y, x); + return __svml_atan2f(y, x); } else if (__math_lib == __math_lib_system) { float ret; @@ -2951,7 +3024,7 @@ static inline float exp(float x_full) { return __exp_varying_float(x_full); } else if (__math_lib == __math_lib_svml) { - return __svml_exp(x_full); + return __svml_expf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -3119,7 +3192,7 @@ static inline void __range_reduce_log(float input, varying float * uniform reduc static const int nonexponent_mask = 0x807FFFFF; // We want the reduced version to have an exponent of -1 which is -1 + 127 after biasing or 126 - static const int exponent_neg1 = (126 << 23); + static const int exponent_neg1 = (126l << 23); // NOTE(boulos): We don't need to mask anything out since we know // the sign bit has to be 0. If it's 1, we need to return infinity/nan // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN). @@ -3142,7 +3215,7 @@ static inline void __range_reduce_log(uniform float input, uniform float * unifo uniform int int_version = intbits(input); static const uniform int nonexponent_mask = 0x807FFFFF; - static const uniform int exponent_neg1 = (126 << 23); + static const uniform int exponent_neg1 = (126ul << 23); uniform int biased_exponent = int_version >> 23; uniform int offset_exponent = biased_exponent + 1; *exponent = offset_exponent - 127; // get the real value @@ -3158,7 +3231,7 @@ static inline float log(float x_full) { return __log_varying_float(x_full); } else if (__math_lib == __math_lib_svml) { - return __svml_log(x_full); + return __svml_logf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -3333,7 +3406,7 @@ static inline float pow(float a, float b) { return __pow_varying_float(a, b); } else if (__math_lib == __math_lib_svml) { - return __svml_pow(a, b); + return __svml_powf(a, b); } else if (__math_lib == __math_lib_system) { float ret; @@ -3423,7 +3496,11 @@ static inline uniform double frexp(uniform double x, uniform int * uniform pw2) __declspec(safe) static inline double sin(double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_sind(x); + } + else if (__math_lib == __math_lib_ispc_fast) return sin((float)x); else { double ret; @@ -3444,8 +3521,30 @@ static inline uniform double sin(uniform double x) { } __declspec(safe) -static inline double cos(double x) { - if (__math_lib == __math_lib_ispc_fast) +static inline double asin(const double x) { + if (__math_lib == __math_lib_svml) + { + return __svml_asind(x); + } + else if (__math_lib == __math_lib_ispc_fast) + return asin((float)x); + else { + double ret; + foreach_active (i) { + uniform double r = 
__stdlib_asin(extract(x, i)); + ret = insert(ret, i, r); + } + return ret; + } +} + +__declspec(safe) +static inline double cos(const double x) { + if (__math_lib == __math_lib_svml) + { + return __svml_cosd(x); + } + else if (__math_lib == __math_lib_ispc_fast) return cos((float)x); else { double ret; @@ -3468,7 +3567,11 @@ static inline uniform double cos(uniform double x) { __declspec(safe) static inline void sincos(double x, varying double * uniform sin_result, varying double * uniform cos_result) { - if (__math_lib == __math_lib_ispc_fast) { + if (__math_lib == __math_lib_svml) + { + __svml_sincosd(x, sin_result, cos_result); + } + else if (__math_lib == __math_lib_ispc_fast) { float sr, cr; sincos((float)x, &sr, &cr); *sin_result = sr; @@ -3499,7 +3602,11 @@ static inline void sincos(uniform double x, uniform double * uniform sin_result, __declspec(safe) static inline double tan(double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_tand(x); + } + else if (__math_lib == __math_lib_ispc_fast) return tan((float)x); else { double ret; @@ -3543,7 +3650,11 @@ static inline uniform double atan(uniform double x) { __declspec(safe) static inline double atan2(double y, double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_atan2d(y,x); + } + else if (__math_lib == __math_lib_ispc_fast) return atan2((float)y, (float)x); else { double ret; @@ -3565,7 +3676,11 @@ static inline uniform double atan2(uniform double y, uniform double x) { __declspec(safe) static inline double exp(double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_expd(x); + } + else if (__math_lib == __math_lib_ispc_fast) return exp((float)x); else { double ret; @@ -3587,7 +3702,11 @@ static inline uniform double exp(uniform double x) { __declspec(safe) static inline double log(double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_logd(x); + } + else if (__math_lib == __math_lib_ispc_fast) return log((float)x); else { double ret; @@ -3609,7 +3728,11 @@ static inline uniform double log(uniform double x) { __declspec(safe) static inline double pow(double a, double b) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_powd(a,b); + } + else if (__math_lib == __math_lib_ispc_fast) return pow((float)a, (float)b); else { double ret; @@ -3640,18 +3763,18 @@ static inline uniform float half_to_float(uniform unsigned int16 h) { else { // https://gist.github.com/2144712 // Fabian "ryg" Giesen. - static const uniform unsigned int32 shifted_exp = 0x7c00 << 13; // exponent mask after shift + static const uniform unsigned int32 shifted_exp = 0x7c00ul << 13; // exponent mask after shift uniform int32 o = ((int32)(h & 0x7fff)) << 13; // exponent/mantissa bits uniform unsigned int32 exp = shifted_exp & o; // just the exponent - o += (127 - 15) << 23; // exponent adjust + o += (uniform int32)(127 - 15) << 23; // exponent adjust // handle exponent special cases if (exp == shifted_exp) // Inf/NaN? - o += (128 - 16) << 23; // extra exp adjust + o += (uniform unsigned int32)(128 - 16) << 23; // extra exp adjust else if (exp == 0) { // Zero/Denormal? 
- o += 1 << 23; // extra exp adjust - o = intbits(floatbits(o) - floatbits(113 << 23)); // renormalize + o += 1ul << 23; // extra exp adjust + o = intbits(floatbits(o) - floatbits(113ul << 23)); // renormalize } o |= ((int32)(h & 0x8000)) << 16; // sign bit @@ -3668,17 +3791,17 @@ static inline float half_to_float(unsigned int16 h) { // https://gist.github.com/2144712 // Fabian "ryg" Giesen. - const unsigned int32 shifted_exp = 0x7c00 << 13; // exponent mask after shift + const unsigned int32 shifted_exp = 0x7c00ul << 13; // exponent mask after shift - int32 o = ((int32)(h & 0x7fff)) << 13; // exponent/mantissa bits + int32 o = ((int32)(h & 0x7ffful)) << 13; // exponent/mantissa bits unsigned int32 exp = shifted_exp & o; // just the exponent - o += (127 - 15) << 23; // exponent adjust + o += (int32)(127 - 15) << 23; // exponent adjust - int32 infnan_val = o + ((128 - 16) << 23); - int32 zerodenorm_val = intbits(floatbits(o + (1<<23)) - floatbits(113 << 23)); + int32 infnan_val = o + ((int32)(128 - 16) << 23); + int32 zerodenorm_val = intbits(floatbits(o + (1ul<<23)) - floatbits(113ul << 23)); int32 reg_val = (exp == 0) ? zerodenorm_val : o; - int32 sign_bit = ((int32)(h & 0x8000)) << 16; + int32 sign_bit = ((int32)(h & 0x8000ul)) << 16; return floatbits(((exp == shifted_exp) ? infnan_val : reg_val) | sign_bit); } } @@ -3708,16 +3831,16 @@ static inline uniform int16 float_to_half(uniform float f) { // NaN->qNaN and Inf->Inf // unconditional assignment here, will override with right value for // the regular case below. - uniform int32 f32infty = 255 << 23; - o = (fint > f32infty) ? 0x7e00 : 0x7c00; + uniform int32 f32infty = 255ul << 23; + o = (fint > f32infty) ? 0x7e00u : 0x7c00u; // (De)normalized number or zero // update fint unconditionally to save the blending; we don't need it // anymore for the Inf/NaN case anyway. - const uniform unsigned int32 round_mask = ~0xfffu; - const uniform int32 magic = 15 << 23; - const uniform int32 f16infty = 31 << 23; + const uniform unsigned int32 round_mask = ~0xffful; + const uniform int32 magic = 15ul << 23; + const uniform int32 f16infty = 31ul << 23; uniform int32 fint2 = intbits(floatbits(fint & round_mask) * floatbits(magic)) - round_mask; fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed @@ -3754,16 +3877,16 @@ static inline int16 float_to_half(float f) { // NaN->qNaN and Inf->Inf // unconditional assignment here, will override with right value for // the regular case below. - int32 f32infty = 255 << 23; - o = (fint > f32infty) ? 0x7e00 : 0x7c00; + int32 f32infty = 255ul << 23; + o = (fint > f32infty) ? 0x7e00u : 0x7c00u; // (De)normalized number or zero // update fint unconditionally to save the blending; we don't need it // anymore for the Inf/NaN case anyway. - const unsigned int32 round_mask = ~0xfffu; - const int32 magic = 15 << 23; - const int32 f16infty = 31 << 23; + const unsigned int32 round_mask = ~0xffful; + const int32 magic = 15ul << 23; + const int32 f16infty = 31ul << 23; // Shift exponent down, denormalize if necessary. // NOTE This represents half-float denormals using single precision denormals. @@ -3782,7 +3905,7 @@ static inline int16 float_to_half(float f) { // FP16 denormals are rare in practice, I don't know. Whatever slow path your HW // may or may not have for denormals, this may well hit it. 
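The half_to_float() conversion above (Fabian Giesen's trick) shifts the half's exponent and mantissa into float position, rebiases the exponent by 127 - 15, and then patches the Inf/NaN and zero/denormal cases. A Python model of that bit manipulation, using struct to reinterpret bits; it mirrors the scalar logic only, not the ispc masking or the float_to_half direction:

    import math
    import struct

    def bits_to_float(u):
        return struct.unpack("<f", struct.pack("<I", u & 0xffffffff))[0]

    def half_to_float(h):
        shifted_exp = 0x7c00 << 13              # exponent mask after shift
        o = (h & 0x7fff) << 13                  # exponent/mantissa bits
        exp = shifted_exp & o
        o += (127 - 15) << 23                   # rebias the exponent
        if exp == shifted_exp:                  # Inf/NaN
            o += (128 - 16) << 23
        elif exp == 0:                          # zero/denormal: renormalize
            o += 1 << 23
            o = struct.unpack("<I", struct.pack("<f",
                    bits_to_float(o) - bits_to_float(113 << 23)))[0]
        return bits_to_float(o | ((h & 0x8000) << 16))

    assert half_to_float(0x3C00) == 1.0
    assert half_to_float(0xC000) == -2.0
    assert math.isinf(half_to_float(0x7C00))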
float fscale = floatbits(fint & round_mask) * floatbits(magic); - fscale = min(fscale, floatbits((31 << 23) - 0x1000)); + fscale = min(fscale, floatbits((31ul << 23) - 0x1000ul)); int32 fint2 = intbits(fscale) - round_mask; if (fint < f32infty) @@ -3949,7 +4072,7 @@ float_to_srgb8(float inval) // Do the table lookup and unpack bias, scale unsigned int tab = table[(intbits(inval) - 0x39000000u) >> 20]; unsigned int bias = (tab >> 16) << 9; - unsigned int scale = tab & 0xffff; + unsigned int scale = tab & 0xfffful; // Grab next-highest mantissa bits and perform linear interpolation unsigned int t = (intbits(inval) >> 12) & 0xff; @@ -3999,7 +4122,7 @@ float_to_srgb8(uniform float inval) // Do the table lookup and unpack bias, scale uniform unsigned int tab = table[(intbits(inval) - 0x39000000u) >> 20]; uniform unsigned int bias = (tab >> 16) << 9; - uniform unsigned int scale = tab & 0xffff; + uniform unsigned int scale = tab & 0xfffful; // Grab next-highest mantissa bits and perform linear interpolation uniform unsigned int t = (intbits(inval) >> 12) & 0xff; @@ -4046,14 +4169,14 @@ static inline uniform unsigned int random(uniform RNGState * uniform state) static inline float frandom(varying RNGState * uniform state) { unsigned int irand = random(state); - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; return floatbits(0x3F800000 | irand)-1.0f; } static inline uniform float frandom(uniform RNGState * uniform state) { uniform unsigned int irand = random(state); - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; return floatbits(0x3F800000 | irand)-1.0f; } @@ -4061,18 +4184,18 @@ static inline void seed_rng(varying RNGState * uniform state, unsigned int seed) { state->z1 = seed; state->z2 = seed ^ 0xbeeff00d; - state->z3 = ((seed & 0xffff) << 16) | (seed >> 16); - state->z4 = (((seed & 0xff) << 24) | ((seed & 0xff00) << 8) | - ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24); + state->z3 = ((seed & 0xfffful) << 16) | (seed >> 16); + state->z4 = (((seed & 0xfful) << 24) | ((seed & 0xff00ul) << 8) | + ((seed & 0xff0000ul) >> 8) | (seed & 0xff000000ul) >> 24); } static inline void seed_rng(uniform RNGState * uniform state, uniform unsigned int seed) { state->z1 = seed; state->z2 = seed ^ 0xbeeff00d; - state->z3 = ((seed & 0xffff) << 16) | (seed >> 16); - state->z4 = (((seed & 0xff) << 24) | ((seed & 0xff00) << 8) | - ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24); + state->z3 = ((seed & 0xfffful) << 16) | (seed >> 16); + state->z4 = (((seed & 0xfful) << 24) | ((seed & 0xff00ul) << 8) | + ((seed & 0xff0000ul) >> 8) | (seed & 0xff000000ul) >> 24); } @@ -4090,7 +4213,7 @@ static inline uniform bool rdrand(float * uniform ptr) { uniform int32 irand; uniform bool success = __rdrand_i32(&irand); if (success) { - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; *ptr = floatbits(0x3F800000 | irand)-1.0f; } return success; @@ -4110,7 +4233,7 @@ static inline bool rdrand(varying float * uniform ptr) { // in vector form. However, we need to be careful to not // clobber any existing already-set values in *ptr with // inactive lanes here... 
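The frandom() and rdrand() float paths above rely on a standard mantissa trick: keep 23 random bits, OR them into the bit pattern of 1.0f (0x3F800000) to get a float uniformly distributed in [1, 2), and subtract 1.0 to map it to [0, 1). A short Python illustration using struct; it shows the arithmetic only, not the LFSR generator or the RDRAND instruction:

    import random
    import struct

    def bits_to_float(u):
        return struct.unpack("<f", struct.pack("<I", u))[0]

    def frandom(irand):
        irand &= (1 << 23) - 1                           # keep 23 mantissa bits
        return bits_to_float(0x3F800000 | irand) - 1.0   # [1, 2) -> [0, 1)

    x = frandom(random.getrandbits(32))
    assert 0.0 <= x < 1.0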
- irand &= (1<<23)-1; + irand &= (1ul<<23)-1; *ptr = floatbits(0x3F800000 | irand)-1.0f; success = true; } @@ -4130,7 +4253,7 @@ static inline bool rdrand(float * ptr) { foreach_active (index) { uniform int32 irand; if (__rdrand_i32(&irand)) { - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; *ptrs[index] = floatbits(0x3F800000 | irand)-1.0f; success = true; } @@ -4264,3 +4387,720 @@ static inline bool rdrand(int64 * ptr) { return success; } } + +/////////////////////////////////////////////////////////////////////////// +// Fast vector integer division + +/* These tables and the algorithms in the __fast_idiv() functions below are + from Halide; the idea is based on the paper "Division by Invariant + Integers using Multiplication" by Granlund and Montgomery. + + Copyright (c) 2012 MIT CSAIL + + Developed by: + + The Halide team + MIT CSAIL + http://halide-lang.org + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
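To make the tables that follow concrete: each row holds a (method, multiplier, shift) triple, apparently indexed by divisor - 2, and the __fast_idiv() helpers select one of three strategies from it. A hedged Python model of the unsigned 8-bit case, checked against the first few rows of __idiv_table_u8 below; the real code uses vector multiply-high instructions, this is only the scalar arithmetic:

    # (method, multiplier, shift) rows for divisors 2..9, copied from
    # __idiv_table_u8 below; the full table continues through divisor 256.
    IDIV_U8 = [(0, 0, 1), (1, 171, 1), (0, 0, 2), (1, 205, 2),
               (1, 171, 2), (2, 37, 2), (0, 0, 3), (1, 57, 1)]

    def fast_udiv8(x, d):
        method, mult, shift = IDIV_U8[d - 2]
        if method == 0:                       # divisor is a power of two
            return x >> shift
        t = (x * mult) >> 8                   # multiply-high for 8-bit operands
        if method == 1:
            return t >> shift
        return (t + ((x - t) >> 1)) >> shift  # method 2: extra rounding step

    for d in range(2, 10):
        for x in range(256):
            assert fast_udiv8(x, d) == x // d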
+ */ + +static const uniform int64 __idiv_table_u8[][3] = { + {0, 0LL, 1}, {1, 171LL, 1}, {0, 0LL, 2}, + {1, 205LL, 2}, {1, 171LL, 2}, {2, 37LL, 2}, + {0, 0LL, 3}, {1, 57LL, 1}, {1, 205LL, 3}, + {2, 117LL, 3}, {1, 171LL, 3}, {1, 79LL, 2}, + {2, 37LL, 3}, {1, 137LL, 3}, {0, 0LL, 4}, + {1, 241LL, 4}, {1, 57LL, 2}, {1, 27LL, 1}, + {1, 205LL, 4}, {2, 135LL, 4}, {2, 117LL, 4}, + {2, 101LL, 4}, {1, 171LL, 4}, {1, 41LL, 2}, + {1, 79LL, 3}, {1, 19LL, 1}, {2, 37LL, 4}, + {2, 27LL, 4}, {1, 137LL, 4}, {2, 9LL, 4}, + {0, 0LL, 5}, {1, 249LL, 5}, {1, 241LL, 5}, + {1, 235LL, 5}, {1, 57LL, 3}, {1, 111LL, 4}, + {1, 27LL, 2}, {2, 165LL, 5}, {1, 205LL, 5}, + {1, 25LL, 2}, {2, 135LL, 5}, {1, 191LL, 5}, + {1, 187LL, 5}, {2, 109LL, 5}, {2, 101LL, 5}, + {1, 175LL, 5}, {1, 171LL, 5}, {2, 79LL, 5}, + {1, 41LL, 3}, {1, 161LL, 5}, {1, 79LL, 4}, + {1, 155LL, 5}, {1, 19LL, 2}, {1, 149LL, 5}, + {2, 37LL, 5}, {1, 9LL, 1}, {2, 27LL, 5}, + {1, 139LL, 5}, {1, 137LL, 5}, {2, 13LL, 5}, + {2, 9LL, 5}, {2, 5LL, 5}, {0, 0LL, 6}, + {1, 253LL, 6}, {1, 249LL, 6}, {1, 245LL, 6}, + {1, 121LL, 5}, {1, 119LL, 5}, {1, 235LL, 6}, + {1, 231LL, 6}, {1, 57LL, 4}, {1, 225LL, 6}, + {1, 111LL, 5}, {1, 219LL, 6}, {1, 27LL, 3}, + {1, 213LL, 6}, {2, 165LL, 6}, {1, 13LL, 2}, + {1, 205LL, 6}, {1, 203LL, 6}, {1, 25LL, 3}, + {1, 99LL, 5}, {2, 135LL, 6}, {1, 193LL, 6}, + {1, 191LL, 6}, {1, 189LL, 6}, {1, 187LL, 6}, + {1, 185LL, 6}, {1, 183LL, 6}, {1, 181LL, 6}, + {1, 179LL, 6}, {1, 177LL, 6}, {1, 175LL, 6}, + {1, 173LL, 6}, {1, 171LL, 6}, {1, 169LL, 6}, + {1, 21LL, 3}, {1, 83LL, 5}, {1, 41LL, 4}, + {1, 163LL, 6}, {1, 161LL, 6}, {2, 63LL, 6}, + {1, 79LL, 5}, {2, 57LL, 6}, {1, 155LL, 6}, + {2, 51LL, 6}, {1, 19LL, 3}, {1, 151LL, 6}, + {1, 149LL, 6}, {1, 37LL, 4}, {2, 37LL, 6}, + {1, 145LL, 6}, {1, 9LL, 2}, {1, 143LL, 6}, + {2, 27LL, 6}, {2, 25LL, 6}, {1, 139LL, 6}, + {1, 69LL, 5}, {1, 137LL, 6}, {2, 15LL, 6}, + {2, 13LL, 6}, {2, 11LL, 6}, {2, 9LL, 6}, + {2, 7LL, 6}, {2, 5LL, 6}, {2, 3LL, 6}, + {0, 0LL, 7}, {1, 255LL, 7}, {1, 127LL, 6}, + {1, 63LL, 5}, {1, 125LL, 6}, {1, 31LL, 4}, + {1, 123LL, 6}, {1, 61LL, 5}, {1, 121LL, 6}, + {1, 15LL, 3}, {1, 119LL, 6}, {1, 59LL, 5}, + {1, 235LL, 7}, {1, 117LL, 6}, {1, 29LL, 4}, + {1, 115LL, 6}, {1, 57LL, 5}, {1, 113LL, 6}, + {1, 225LL, 7}, {1, 7LL, 2}, {1, 111LL, 6}, + {1, 55LL, 5}, {1, 219LL, 7}, {1, 109LL, 6}, + {1, 27LL, 4}, {1, 215LL, 7}, {1, 107LL, 6}, + {1, 53LL, 5}, {1, 211LL, 7}, {1, 105LL, 6}, + {1, 13LL, 3}, {1, 207LL, 7}, {1, 103LL, 6}, + {1, 51LL, 5}, {1, 203LL, 7}, {1, 101LL, 6}, + {1, 25LL, 4}, {1, 199LL, 7}, {1, 99LL, 6}, + {1, 197LL, 7}, {1, 49LL, 5}, {1, 97LL, 6}, + {1, 193LL, 7}, {1, 3LL, 1}, {1, 191LL, 7}, + {1, 95LL, 6}, {1, 189LL, 7}, {1, 47LL, 5}, + {1, 187LL, 7}, {1, 93LL, 6}, {1, 185LL, 7}, + {1, 23LL, 4}, {1, 183LL, 7}, {1, 91LL, 6}, + {1, 181LL, 7}, {1, 45LL, 5}, {1, 179LL, 7}, + {1, 89LL, 6}, {1, 177LL, 7}, {1, 11LL, 3}, + {1, 175LL, 7}, {1, 87LL, 6}, {1, 173LL, 7}, + {1, 43LL, 5}, {1, 171LL, 7}, {1, 85LL, 6}, + {1, 169LL, 7}, {2, 81LL, 7}, {1, 21LL, 4}, + {1, 167LL, 7}, {1, 83LL, 6}, {1, 165LL, 7}, + {1, 41LL, 5}, {2, 71LL, 7}, {1, 163LL, 7}, + {1, 81LL, 6}, {1, 161LL, 7}, {1, 5LL, 2}, + {2, 63LL, 7}, {1, 159LL, 7}, {1, 79LL, 6}, + {1, 157LL, 7}, {2, 57LL, 7}, {1, 39LL, 5}, + {1, 155LL, 7}, {1, 77LL, 6}, {2, 51LL, 7}, + {1, 153LL, 7}, {1, 19LL, 4}, {2, 47LL, 7}, + {1, 151LL, 7}, {1, 75LL, 6}, {1, 149LL, 7}, + {2, 41LL, 7}, {1, 37LL, 5}, {1, 147LL, 7}, + {2, 37LL, 7}, {1, 73LL, 6}, {1, 145LL, 7}, + {2, 33LL, 7}, {1, 9LL, 3}, {2, 31LL, 7}, + {1, 143LL, 7}, {1, 71LL, 6}, {2, 27LL, 7}, + {1, 
141LL, 7}, {2, 25LL, 7}, {1, 35LL, 5}, + {1, 139LL, 7}, {2, 21LL, 7}, {1, 69LL, 6}, + {2, 19LL, 7}, {1, 137LL, 7}, {1, 17LL, 4}, + {2, 15LL, 7}, {1, 135LL, 7}, {2, 13LL, 7}, + {1, 67LL, 6}, {2, 11LL, 7}, {1, 133LL, 7}, + {2, 9LL, 7}, {1, 33LL, 5}, {2, 7LL, 7}, + {1, 131LL, 7}, {2, 5LL, 7}, {1, 65LL, 6}, + {2, 3LL, 7}, {1, 129LL, 7}, {0, 0LL, 8}, +}; +static const uniform int64 __idiv_table_s8[][3] = { + {0, 0LL, 1}, {1, 86LL, 0}, {0, 0LL, 2}, + {1, 103LL, 1}, {1, 43LL, 0}, {1, 147LL, 2}, + {0, 0LL, 3}, {1, 57LL, 1}, {1, 103LL, 2}, + {1, 187LL, 3}, {1, 43LL, 1}, {1, 79LL, 2}, + {1, 147LL, 3}, {1, 137LL, 3}, {0, 0LL, 4}, + {1, 121LL, 3}, {1, 57LL, 2}, {1, 27LL, 1}, + {1, 103LL, 3}, {1, 49LL, 2}, {1, 187LL, 4}, + {1, 179LL, 4}, {1, 43LL, 2}, {1, 41LL, 2}, + {1, 79LL, 3}, {1, 19LL, 1}, {1, 147LL, 4}, + {1, 71LL, 3}, {1, 137LL, 4}, {1, 133LL, 4}, + {0, 0LL, 5}, {1, 125LL, 4}, {1, 121LL, 4}, + {1, 59LL, 3}, {1, 57LL, 3}, {1, 111LL, 4}, + {1, 27LL, 2}, {1, 211LL, 5}, {1, 103LL, 4}, + {1, 25LL, 2}, {1, 49LL, 3}, {1, 6LL, 0}, + {1, 47LL, 3}, {1, 23LL, 2}, {1, 45LL, 3}, + {1, 11LL, 1}, {1, 43LL, 3}, {1, 21LL, 2}, + {1, 41LL, 3}, {1, 81LL, 4}, {1, 79LL, 4}, + {1, 39LL, 3}, {1, 19LL, 2}, {1, 75LL, 4}, + {1, 147LL, 5}, {1, 9LL, 1}, {1, 71LL, 4}, + {1, 35LL, 3}, {1, 137LL, 5}, {1, 135LL, 5}, + {1, 133LL, 5}, {1, 131LL, 5}, {0, 0LL, 6}, + {1, 127LL, 5}, {1, 63LL, 4}, {1, 31LL, 3}, + {1, 61LL, 4}, {1, 15LL, 2}, {1, 59LL, 4}, + {1, 29LL, 3}, {1, 57LL, 4}, {1, 113LL, 5}, + {1, 7LL, 1}, {1, 55LL, 4}, {1, 27LL, 3}, + {1, 107LL, 5}, {1, 53LL, 4}, {1, 13LL, 2}, + {1, 103LL, 5}, {1, 51LL, 4}, {1, 25LL, 3}, + {1, 99LL, 5}, {1, 49LL, 4}, {1, 97LL, 5}, + {1, 3LL, 0}, {1, 95LL, 5}, {1, 47LL, 4}, + {1, 93LL, 5}, {1, 23LL, 3}, {1, 91LL, 5}, + {1, 45LL, 4}, {1, 89LL, 5}, {1, 11LL, 2}, + {1, 87LL, 5}, {1, 43LL, 4}, {1, 85LL, 5}, + {1, 21LL, 3}, {1, 83LL, 5}, {1, 41LL, 4}, + {1, 163LL, 6}, {1, 81LL, 5}, {1, 5LL, 1}, + {1, 79LL, 5}, {1, 157LL, 6}, {1, 39LL, 4}, + {1, 77LL, 5}, {1, 19LL, 3}, {1, 151LL, 6}, + {1, 75LL, 5}, {1, 37LL, 4}, {1, 147LL, 6}, + {1, 73LL, 5}, {1, 9LL, 2}, {1, 143LL, 6}, + {1, 71LL, 5}, {1, 141LL, 6}, {1, 35LL, 4}, + {1, 69LL, 5}, {1, 137LL, 6}, {1, 17LL, 3}, + {1, 135LL, 6}, {1, 67LL, 5}, {1, 133LL, 6}, + {1, 33LL, 4}, {1, 131LL, 6}, {1, 65LL, 5}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 
0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, +}; +static const uniform int64 __idiv_table_u16[][3] = { + {0, 0LL, 1}, {1, 43691LL, 1}, {0, 0LL, 2}, + {1, 52429LL, 2}, {1, 43691LL, 2}, {2, 9363LL, 2}, + {0, 0LL, 3}, {1, 58255LL, 3}, {1, 52429LL, 3}, + {1, 47663LL, 3}, {1, 43691LL, 3}, {1, 20165LL, 2}, + {2, 9363LL, 3}, {1, 34953LL, 3}, {0, 0LL, 4}, + {1, 61681LL, 4}, {1, 58255LL, 4}, {1, 55189LL, 4}, + {1, 52429LL, 4}, {2, 34329LL, 4}, {1, 47663LL, 4}, + {2, 25645LL, 4}, {1, 43691LL, 4}, {2, 18351LL, 4}, + {1, 20165LL, 3}, {2, 12137LL, 4}, {2, 9363LL, 4}, + {1, 18079LL, 3}, {1, 34953LL, 4}, {2, 2115LL, 4}, + {0, 0LL, 5}, {1, 63551LL, 5}, {1, 61681LL, 5}, + {1, 59919LL, 5}, {1, 58255LL, 5}, {1, 7085LL, 2}, + {1, 55189LL, 5}, {2, 42011LL, 5}, {1, 52429LL, 5}, + {2, 36765LL, 5}, {2, 34329LL, 5}, {1, 48771LL, 5}, + {1, 47663LL, 5}, {1, 11651LL, 3}, {2, 25645LL, 5}, + {2, 23705LL, 5}, {1, 43691LL, 5}, {2, 20063LL, 5}, + {2, 18351LL, 5}, {1, 41121LL, 5}, {1, 20165LL, 4}, + {1, 39569LL, 5}, {2, 12137LL, 5}, {2, 10725LL, 5}, + {2, 9363LL, 5}, {2, 8049LL, 5}, {1, 18079LL, 4}, + {1, 35545LL, 5}, {1, 34953LL, 5}, {1, 8595LL, 3}, + {2, 2115LL, 5}, {2, 1041LL, 5}, {0, 0LL, 6}, + {1, 4033LL, 2}, {1, 63551LL, 6}, {1, 31301LL, 5}, + {1, 61681LL, 6}, {2, 56039LL, 6}, {1, 59919LL, 6}, + {1, 59075LL, 6}, {1, 58255LL, 6}, {1, 57457LL, 6}, + {1, 7085LL, 3}, {2, 46313LL, 6}, {1, 55189LL, 6}, + {1, 6809LL, 3}, {2, 42011LL, 6}, {1, 53093LL, 6}, + {1, 52429LL, 6}, {1, 25891LL, 5}, {2, 36765LL, 6}, + {1, 25267LL, 5}, {2, 34329LL, 6}, {1, 49345LL, 6}, + {1, 48771LL, 6}, {1, 48211LL, 6}, {1, 47663LL, 6}, + {2, 28719LL, 6}, {1, 11651LL, 4}, {2, 26647LL, 6}, + {2, 25645LL, 6}, {2, 24665LL, 6}, {2, 23705LL, 6}, + {1, 44151LL, 6}, {1, 43691LL, 6}, {2, 20945LL, 6}, + {2, 20063LL, 6}, {1, 42367LL, 6}, {2, 18351LL, 6}, + {1, 5191LL, 3}, {1, 41121LL, 6}, {1, 20361LL, 5}, + {1, 20165LL, 5}, {1, 19973LL, 5}, {1, 39569LL, 6}, + {2, 12863LL, 6}, {2, 12137LL, 6}, {1, 2405LL, 2}, + {2, 10725LL, 6}, {1, 37787LL, 6}, {2, 9363LL, 6}, + {1, 18559LL, 5}, {2, 8049LL, 6}, {2, 7409LL, 6}, + {1, 18079LL, 5}, {1, 35849LL, 6}, {1, 35545LL, 6}, + {2, 4957LL, 6}, {1, 34953LL, 6}, {1, 4333LL, 3}, + {1, 8595LL, 4}, {2, 2665LL, 6}, {2, 2115LL, 6}, + {2, 1573LL, 6}, {2, 1041LL, 6}, {2, 517LL, 6}, + {0, 0LL, 7}, {1, 16257LL, 5}, {1, 4033LL, 3}, + {1, 16009LL, 5}, {1, 63551LL, 7}, {1, 63073LL, 7}, + {1, 31301LL, 6}, {1, 31069LL, 6}, {1, 61681LL, 7}, + {1, 61231LL, 7}, {2, 56039LL, 7}, {1, 30175LL, 6}, + {1, 59919LL, 7}, {1, 29747LL, 6}, {1, 59075LL, 7}, + {1, 29331LL, 6}, {1, 58255LL, 7}, {1, 57853LL, 7}, + {1, 57457LL, 7}, {1, 28533LL, 6}, {1, 7085LL, 4}, + {1, 14075LL, 5}, {2, 46313LL, 7}, {1, 27777LL, 6}, + {1, 55189LL, 7}, {1, 13707LL, 5}, {1, 6809LL, 4}, + {2, 42705LL, 7}, {2, 42011LL, 7}, {1, 53431LL, 7}, + {1, 53093LL, 7}, {1, 52759LL, 7}, {1, 52429LL, 7}, + {2, 38671LL, 7}, {1, 25891LL, 6}, {1, 6433LL, 4}, + {2, 36765LL, 7}, {2, 36145LL, 7}, {1, 25267LL, 6}, + {2, 34927LL, 7}, {2, 34329LL, 7}, {1, 49637LL, 7}, + {1, 49345LL, 7}, {2, 32577LL, 7}, {1, 48771LL, 7}, + {2, 31443LL, 7}, {1, 48211LL, 7}, {1, 47935LL, 
7}, + {1, 47663LL, 7}, {2, 29251LL, 7}, {2, 28719LL, 7}, + {1, 2929LL, 3}, {1, 11651LL, 5}, {1, 23173LL, 6}, + {2, 26647LL, 7}, {1, 2865LL, 3}, {2, 25645LL, 7}, + {1, 1417LL, 2}, {2, 24665LL, 7}, {1, 44859LL, 7}, + {2, 23705LL, 7}, {2, 23233LL, 7}, {1, 44151LL, 7}, + {1, 2745LL, 3}, {1, 43691LL, 7}, {2, 21393LL, 7}, + {2, 20945LL, 7}, {1, 43019LL, 7}, {2, 20063LL, 7}, + {1, 21291LL, 6}, {1, 42367LL, 7}, {1, 21077LL, 6}, + {2, 18351LL, 7}, {1, 41735LL, 7}, {1, 5191LL, 4}, + {2, 17111LL, 7}, {1, 41121LL, 7}, {2, 16305LL, 7}, + {1, 20361LL, 6}, {1, 40525LL, 7}, {1, 20165LL, 6}, + {1, 40137LL, 7}, {1, 19973LL, 6}, {1, 39757LL, 7}, + {1, 39569LL, 7}, {2, 13231LL, 7}, {2, 12863LL, 7}, + {1, 39017LL, 7}, {2, 12137LL, 7}, {2, 11779LL, 7}, + {1, 2405LL, 3}, {2, 11073LL, 7}, {2, 10725LL, 7}, + {1, 18979LL, 6}, {1, 37787LL, 7}, {2, 9699LL, 7}, + {2, 9363LL, 7}, {1, 37283LL, 7}, {1, 18559LL, 6}, + {2, 8373LL, 7}, {2, 8049LL, 7}, {1, 4579LL, 4}, + {2, 7409LL, 7}, {2, 7093LL, 7}, {1, 18079LL, 6}, + {1, 36003LL, 7}, {1, 35849LL, 7}, {2, 5857LL, 7}, + {1, 35545LL, 7}, {1, 35395LL, 7}, {2, 4957LL, 7}, + {1, 35099LL, 7}, {1, 34953LL, 7}, {1, 4351LL, 4}, + {1, 4333LL, 4}, {2, 3507LL, 7}, {1, 8595LL, 5}, + {2, 2943LL, 7}, {2, 2665LL, 7}, {1, 16981LL, 6}, + {2, 2115LL, 7}, {2, 1843LL, 7}, {2, 1573LL, 7}, + {1, 33421LL, 7}, {2, 1041LL, 7}, {1, 33157LL, 7}, + {2, 517LL, 7}, {1, 32897LL, 7}, {0, 0LL, 8}, +}; +static const uniform int64 __idiv_table_s16[][3] = { + {0, 0LL, 1}, {1, 21846LL, 0}, {0, 0LL, 2}, + {1, 26215LL, 1}, {1, 10923LL, 0}, {1, 18725LL, 1}, + {0, 0LL, 3}, {1, 7282LL, 0}, {1, 26215LL, 2}, + {1, 5958LL, 0}, {1, 10923LL, 1}, {1, 20165LL, 2}, + {1, 18725LL, 2}, {1, 34953LL, 3}, {0, 0LL, 4}, + {1, 30841LL, 3}, {1, 3641LL, 0}, {1, 55189LL, 4}, + {1, 26215LL, 3}, {1, 49933LL, 4}, {1, 2979LL, 0}, + {1, 45591LL, 4}, {1, 10923LL, 2}, {1, 5243LL, 1}, + {1, 20165LL, 3}, {1, 38837LL, 4}, {1, 18725LL, 3}, + {1, 18079LL, 3}, {1, 34953LL, 4}, {1, 16913LL, 3}, + {0, 0LL, 5}, {1, 1986LL, 0}, {1, 30841LL, 4}, + {1, 3745LL, 1}, {1, 3641LL, 1}, {1, 7085LL, 2}, + {1, 55189LL, 5}, {1, 26887LL, 4}, {1, 26215LL, 4}, + {1, 51151LL, 5}, {1, 49933LL, 5}, {1, 12193LL, 3}, + {1, 2979LL, 1}, {1, 11651LL, 3}, {1, 45591LL, 5}, + {1, 44621LL, 5}, {1, 10923LL, 3}, {1, 2675LL, 1}, + {1, 5243LL, 2}, {1, 41121LL, 5}, {1, 20165LL, 4}, + {1, 19785LL, 4}, {1, 38837LL, 5}, {1, 38131LL, 5}, + {1, 18725LL, 4}, {1, 36793LL, 5}, {1, 18079LL, 4}, + {1, 17773LL, 4}, {1, 34953LL, 5}, {1, 8595LL, 3}, + {1, 16913LL, 4}, {1, 33289LL, 5}, {0, 0LL, 6}, + {1, 4033LL, 2}, {1, 993LL, 0}, {1, 31301LL, 5}, + {1, 30841LL, 5}, {1, 15197LL, 4}, {1, 3745LL, 2}, + {1, 14769LL, 4}, {1, 3641LL, 2}, {1, 57457LL, 6}, + {1, 7085LL, 3}, {1, 55925LL, 6}, {1, 55189LL, 6}, + {1, 6809LL, 3}, {1, 26887LL, 5}, {1, 26547LL, 5}, + {1, 26215LL, 5}, {1, 25891LL, 5}, {1, 51151LL, 6}, + {1, 25267LL, 5}, {1, 49933LL, 6}, {1, 24673LL, 5}, + {1, 12193LL, 4}, {1, 48211LL, 6}, {1, 2979LL, 2}, + {1, 5891LL, 3}, {1, 11651LL, 4}, {1, 11523LL, 4}, + {1, 45591LL, 6}, {1, 45101LL, 6}, {1, 44621LL, 6}, + {1, 44151LL, 6}, {1, 10923LL, 4}, {1, 43241LL, 6}, + {1, 2675LL, 2}, {1, 662LL, 0}, {1, 5243LL, 3}, + {1, 5191LL, 3}, {1, 41121LL, 6}, {1, 20361LL, 5}, + {1, 20165LL, 5}, {1, 19973LL, 5}, {1, 19785LL, 5}, + {1, 1225LL, 1}, {1, 38837LL, 6}, {1, 2405LL, 2}, + {1, 38131LL, 6}, {1, 37787LL, 6}, {1, 18725LL, 5}, + {1, 18559LL, 5}, {1, 36793LL, 6}, {1, 36473LL, 6}, + {1, 18079LL, 5}, {1, 35849LL, 6}, {1, 17773LL, 5}, + {1, 35247LL, 6}, {1, 34953LL, 6}, {1, 4333LL, 3}, + {1, 8595LL, 4}, {1, 
34101LL, 6}, {1, 16913LL, 5}, + {1, 33555LL, 6}, {1, 33289LL, 6}, {1, 33027LL, 6}, + {0, 0LL, 7}, {1, 16257LL, 5}, {1, 4033LL, 3}, + {1, 16009LL, 5}, {1, 993LL, 1}, {1, 31537LL, 6}, + {1, 31301LL, 6}, {1, 31069LL, 6}, {1, 30841LL, 6}, + {1, 3827LL, 3}, {1, 15197LL, 5}, {1, 30175LL, 6}, + {1, 3745LL, 3}, {1, 29747LL, 6}, {1, 14769LL, 5}, + {1, 29331LL, 6}, {1, 3641LL, 3}, {1, 28927LL, 6}, + {1, 57457LL, 7}, {1, 28533LL, 6}, {1, 7085LL, 4}, + {1, 14075LL, 5}, {1, 55925LL, 7}, {1, 27777LL, 6}, + {1, 55189LL, 7}, {1, 13707LL, 5}, {1, 6809LL, 4}, + {1, 54121LL, 7}, {1, 26887LL, 6}, {1, 6679LL, 4}, + {1, 26547LL, 6}, {1, 6595LL, 4}, {1, 26215LL, 6}, + {1, 6513LL, 4}, {1, 25891LL, 6}, {1, 6433LL, 4}, + {1, 51151LL, 7}, {1, 50841LL, 7}, {1, 25267LL, 6}, + {1, 6279LL, 4}, {1, 49933LL, 7}, {1, 24819LL, 6}, + {1, 24673LL, 6}, {1, 49057LL, 7}, {1, 12193LL, 5}, + {1, 24245LL, 6}, {1, 48211LL, 7}, {1, 749LL, 1}, + {1, 2979LL, 3}, {1, 23697LL, 6}, {1, 5891LL, 4}, + {1, 2929LL, 3}, {1, 11651LL, 5}, {1, 23173LL, 6}, + {1, 11523LL, 5}, {1, 2865LL, 3}, {1, 45591LL, 7}, + {1, 1417LL, 2}, {1, 45101LL, 7}, {1, 11215LL, 5}, + {1, 44621LL, 7}, {1, 44385LL, 7}, {1, 44151LL, 7}, + {1, 2745LL, 3}, {1, 10923LL, 5}, {1, 43465LL, 7}, + {1, 43241LL, 7}, {1, 43019LL, 7}, {1, 2675LL, 3}, + {1, 21291LL, 6}, {1, 331LL, 0}, {1, 21077LL, 6}, + {1, 5243LL, 4}, {1, 41735LL, 7}, {1, 5191LL, 4}, + {1, 10331LL, 5}, {1, 41121LL, 7}, {1, 40921LL, 7}, + {1, 20361LL, 6}, {1, 40525LL, 7}, {1, 20165LL, 6}, + {1, 20069LL, 6}, {1, 19973LL, 6}, {1, 39757LL, 7}, + {1, 19785LL, 6}, {1, 4923LL, 4}, {1, 1225LL, 2}, + {1, 39017LL, 7}, {1, 38837LL, 7}, {1, 19329LL, 6}, + {1, 2405LL, 3}, {1, 38305LL, 7}, {1, 38131LL, 7}, + {1, 18979LL, 6}, {1, 37787LL, 7}, {1, 18809LL, 6}, + {1, 18725LL, 6}, {1, 37283LL, 7}, {1, 18559LL, 6}, + {1, 36955LL, 7}, {1, 36793LL, 7}, {1, 4579LL, 4}, + {1, 36473LL, 7}, {1, 36315LL, 7}, {1, 18079LL, 6}, + {1, 36003LL, 7}, {1, 35849LL, 7}, {1, 35697LL, 7}, + {1, 17773LL, 6}, {1, 8849LL, 5}, {1, 35247LL, 7}, + {1, 35099LL, 7}, {1, 34953LL, 7}, {1, 4351LL, 4}, + {1, 4333LL, 4}, {1, 17261LL, 6}, {1, 8595LL, 5}, + {1, 535LL, 1}, {1, 34101LL, 7}, {1, 16981LL, 6}, + {1, 16913LL, 6}, {1, 16845LL, 6}, {1, 33555LL, 7}, + {1, 33421LL, 7}, {1, 33289LL, 7}, {1, 33157LL, 7}, + {1, 33027LL, 7}, {1, 32897LL, 7}, {1, 32769LL, 7}, +}; +static const uniform int64 __idiv_table_u32[][3] = { + {0, 0LL, 1}, {1, 2863311531LL, 1}, {0, 0LL, 2}, + {1, 3435973837LL, 2}, {1, 2863311531LL, 2}, {2, 613566757LL, 2}, + {0, 0LL, 3}, {1, 954437177LL, 1}, {1, 3435973837LL, 3}, + {1, 3123612579LL, 3}, {1, 2863311531LL, 3}, {1, 1321528399LL, 2}, + {2, 613566757LL, 3}, {1, 2290649225LL, 3}, {0, 0LL, 4}, + {1, 4042322161LL, 4}, {1, 954437177LL, 2}, {2, 2938661835LL, 4}, + {1, 3435973837LL, 4}, {2, 2249744775LL, 4}, {1, 3123612579LL, 4}, + {1, 2987803337LL, 4}, {1, 2863311531LL, 4}, {1, 1374389535LL, 3}, + {1, 1321528399LL, 3}, {2, 795364315LL, 4}, {2, 613566757LL, 4}, + {1, 2369637129LL, 4}, {1, 2290649225LL, 4}, {2, 138547333LL, 4}, + {0, 0LL, 5}, {1, 1041204193LL, 3}, {1, 4042322161LL, 5}, + {2, 3558687189LL, 5}, {1, 954437177LL, 3}, {2, 3134165325LL, 5}, + {2, 2938661835LL, 5}, {2, 2753184165LL, 5}, {1, 3435973837LL, 5}, + {1, 3352169597LL, 5}, {2, 2249744775LL, 5}, {1, 799063683LL, 3}, + {1, 3123612579LL, 5}, {2, 1813430637LL, 5}, {1, 2987803337LL, 5}, + {1, 2924233053LL, 5}, {1, 2863311531LL, 5}, {1, 1402438301LL, 4}, + {1, 1374389535LL, 4}, {1, 2694881441LL, 5}, {1, 1321528399LL, 4}, + {2, 891408307LL, 5}, {2, 795364315LL, 5}, {2, 702812831LL, 5}, + {2, 
613566757LL, 5}, {2, 527452125LL, 5}, {1, 2369637129LL, 5}, + {1, 582368447LL, 3}, {1, 2290649225LL, 5}, {1, 1126548799LL, 4}, + {2, 138547333LL, 5}, {2, 68174085LL, 5}, {0, 0LL, 6}, + {1, 4228890877LL, 6}, {1, 1041204193LL, 4}, {1, 128207979LL, 1}, + {1, 4042322161LL, 6}, {1, 1991868891LL, 5}, {2, 3558687189LL, 6}, + {1, 3871519817LL, 6}, {1, 954437177LL, 4}, {2, 3235934265LL, 6}, + {2, 3134165325LL, 6}, {1, 458129845LL, 3}, {2, 2938661835LL, 6}, + {1, 892460737LL, 4}, {2, 2753184165LL, 6}, {1, 3479467177LL, 6}, + {1, 3435973837LL, 6}, {1, 3393554407LL, 6}, {1, 3352169597LL, 6}, + {1, 827945503LL, 4}, {2, 2249744775LL, 6}, {1, 3233857729LL, 6}, + {1, 799063683LL, 4}, {1, 789879043LL, 4}, {1, 3123612579LL, 6}, + {1, 3088515809LL, 6}, {2, 1813430637LL, 6}, {2, 1746305385LL, 6}, + {1, 2987803337LL, 6}, {1, 2955676419LL, 6}, {1, 2924233053LL, 6}, + {2, 1491936009LL, 6}, {1, 2863311531LL, 6}, {2, 1372618415LL, 6}, + {1, 1402438301LL, 5}, {1, 2776544515LL, 6}, {1, 1374389535LL, 5}, + {2, 1148159575LL, 6}, {1, 2694881441LL, 6}, {2, 1042467791LL, 6}, + {1, 1321528399LL, 5}, {2, 940802361LL, 6}, {2, 891408307LL, 6}, + {2, 842937507LL, 6}, {2, 795364315LL, 6}, {2, 748664025LL, 6}, + {2, 702812831LL, 6}, {2, 657787785LL, 6}, {2, 613566757LL, 6}, + {2, 570128403LL, 6}, {2, 527452125LL, 6}, {2, 485518043LL, 6}, + {1, 2369637129LL, 6}, {2, 403800345LL, 6}, {1, 582368447LL, 4}, + {1, 1154949189LL, 5}, {1, 2290649225LL, 6}, {2, 248469183LL, 6}, + {1, 1126548799LL, 5}, {2, 174592167LL, 6}, {2, 138547333LL, 6}, + {1, 274877907LL, 3}, {2, 68174085LL, 6}, {2, 33818641LL, 6}, + {0, 0LL, 7}, {1, 266354561LL, 3}, {1, 4228890877LL, 7}, + {1, 4196609267LL, 7}, {1, 1041204193LL, 5}, {1, 4133502361LL, 7}, + {1, 128207979LL, 2}, {1, 4072265289LL, 7}, {1, 4042322161LL, 7}, + {1, 125400505LL, 2}, {1, 1991868891LL, 6}, {1, 1977538899LL, 6}, + {2, 3558687189LL, 7}, {1, 974744351LL, 5}, {1, 3871519817LL, 7}, + {1, 3844446251LL, 7}, {1, 954437177LL, 5}, {1, 3791419407LL, 7}, + {2, 3235934265LL, 7}, {1, 3739835469LL, 7}, {2, 3134165325LL, 7}, + {1, 3689636335LL, 7}, {1, 458129845LL, 4}, {1, 910191745LL, 5}, + {2, 2938661835LL, 7}, {1, 3593175255LL, 7}, {1, 892460737LL, 5}, + {1, 3546811703LL, 7}, {2, 2753184165LL, 7}, {1, 875407347LL, 5}, + {1, 3479467177LL, 7}, {2, 2620200175LL, 7}, {1, 3435973837LL, 7}, + {1, 3414632385LL, 7}, {1, 3393554407LL, 7}, {1, 3372735055LL, 7}, + {1, 3352169597LL, 7}, {1, 1665926709LL, 6}, {1, 827945503LL, 5}, + {1, 1645975491LL, 6}, {2, 2249744775LL, 7}, {1, 1626496491LL, 6}, + {1, 3233857729LL, 7}, {2, 2134925265LL, 7}, {1, 799063683LL, 5}, + {2, 2060591247LL, 7}, {1, 789879043LL, 5}, {1, 1570730897LL, 6}, + {1, 3123612579LL, 7}, {2, 1916962805LL, 7}, {1, 3088515809LL, 7}, + {2, 1847555765LL, 7}, {2, 1813430637LL, 7}, {1, 3037324939LL, 7}, + {2, 1746305385LL, 7}, {1, 3004130131LL, 7}, {1, 2987803337LL, 7}, + {2, 1648338801LL, 7}, {1, 2955676419LL, 7}, {1, 2939870663LL, 7}, + {1, 2924233053LL, 7}, {2, 1522554545LL, 7}, {2, 1491936009LL, 7}, + {1, 2878302691LL, 7}, {1, 2863311531LL, 7}, {1, 356059465LL, 4}, + {2, 1372618415LL, 7}, {2, 1343553873LL, 7}, {1, 1402438301LL, 6}, + {2, 1286310003LL, 7}, {1, 2776544515LL, 7}, {1, 1381296015LL, 6}, + {1, 1374389535LL, 6}, {1, 42735993LL, 1}, {2, 1148159575LL, 7}, + {1, 2708156719LL, 7}, {1, 2694881441LL, 7}, {1, 1340867839LL, 6}, + {2, 1042467791LL, 7}, {1, 663956297LL, 5}, {1, 1321528399LL, 6}, + {1, 2630410593LL, 7}, {2, 940802361LL, 7}, {1, 2605477791LL, 7}, + {2, 891408307LL, 7}, {1, 2581013211LL, 7}, {2, 842937507LL, 7}, + {1, 1278501893LL, 6}, {2, 
795364315LL, 7}, {2, 771906565LL, 7}, + {2, 748664025LL, 7}, {2, 725633745LL, 7}, {2, 702812831LL, 7}, + {2, 680198441LL, 7}, {2, 657787785LL, 7}, {2, 635578121LL, 7}, + {2, 613566757LL, 7}, {1, 2443359173LL, 7}, {2, 570128403LL, 7}, + {2, 548696263LL, 7}, {2, 527452125LL, 7}, {1, 1200340205LL, 6}, + {2, 485518043LL, 7}, {2, 464823301LL, 7}, {1, 2369637129LL, 7}, + {2, 423966729LL, 7}, {2, 403800345LL, 7}, {2, 383805589LL, 7}, + {1, 582368447LL, 5}, {2, 344322273LL, 7}, {1, 1154949189LL, 6}, + {1, 2300233531LL, 7}, {1, 2290649225LL, 7}, {1, 285143057LL, 4}, + {2, 248469183LL, 7}, {1, 2262369605LL, 7}, {1, 1126548799LL, 6}, + {2, 192835267LL, 7}, {2, 174592167LL, 7}, {2, 156496785LL, 7}, + {2, 138547333LL, 7}, {2, 120742053LL, 7}, {1, 274877907LL, 4}, + {1, 2190262207LL, 7}, {2, 68174085LL, 7}, {1, 2172947881LL, 7}, + {2, 33818641LL, 7}, {1, 2155905153LL, 7}, {0, 0LL, 8}, +}; +static const uniform int64 __idiv_table_s32[][3] = { + {0, 0LL, 1}, {1, 1431655766LL, 0}, {0, 0LL, 2}, + {1, 1717986919LL, 1}, {1, 715827883LL, 0}, {1, 2454267027LL, 2}, + {0, 0LL, 3}, {1, 954437177LL, 1}, {1, 1717986919LL, 2}, + {1, 780903145LL, 1}, {1, 715827883LL, 1}, {1, 1321528399LL, 2}, + {1, 2454267027LL, 3}, {1, 2290649225LL, 3}, {0, 0LL, 4}, + {1, 2021161081LL, 3}, {1, 954437177LL, 2}, {1, 1808407283LL, 3}, + {1, 1717986919LL, 3}, {1, 818089009LL, 2}, {1, 780903145LL, 2}, + {1, 2987803337LL, 4}, {1, 715827883LL, 2}, {1, 1374389535LL, 3}, + {1, 1321528399LL, 3}, {1, 1272582903LL, 3}, {1, 2454267027LL, 4}, + {1, 2369637129LL, 4}, {1, 2290649225LL, 4}, {1, 2216757315LL, 4}, + {0, 0LL, 5}, {1, 1041204193LL, 3}, {1, 2021161081LL, 4}, + {1, 3926827243LL, 5}, {1, 954437177LL, 3}, {1, 3714566311LL, 5}, + {1, 1808407283LL, 4}, {1, 3524075731LL, 5}, {1, 1717986919LL, 4}, + {1, 1676084799LL, 4}, {1, 818089009LL, 3}, {1, 799063683LL, 3}, + {1, 780903145LL, 3}, {1, 3054198967LL, 5}, {1, 2987803337LL, 5}, + {1, 2924233053LL, 5}, {1, 715827883LL, 3}, {1, 1402438301LL, 4}, + {1, 1374389535LL, 4}, {1, 2694881441LL, 5}, {1, 1321528399LL, 4}, + {1, 1296593901LL, 4}, {1, 1272582903LL, 4}, {1, 156180629LL, 1}, + {1, 2454267027LL, 5}, {1, 2411209711LL, 5}, {1, 2369637129LL, 5}, + {1, 582368447LL, 3}, {1, 2290649225LL, 5}, {1, 1126548799LL, 4}, + {1, 2216757315LL, 5}, {1, 2181570691LL, 5}, {0, 0LL, 6}, + {1, 2114445439LL, 5}, {1, 1041204193LL, 4}, {1, 128207979LL, 1}, + {1, 2021161081LL, 5}, {1, 1991868891LL, 5}, {1, 3926827243LL, 6}, + {1, 3871519817LL, 6}, {1, 954437177LL, 4}, {1, 3765450781LL, 6}, + {1, 3714566311LL, 6}, {1, 458129845LL, 3}, {1, 1808407283LL, 5}, + {1, 892460737LL, 4}, {1, 3524075731LL, 6}, {1, 1739733589LL, 5}, + {1, 1717986919LL, 5}, {1, 424194301LL, 3}, {1, 1676084799LL, 5}, + {1, 827945503LL, 4}, {1, 818089009LL, 4}, {1, 1616928865LL, 5}, + {1, 799063683LL, 4}, {1, 789879043LL, 4}, {1, 780903145LL, 4}, + {1, 3088515809LL, 6}, {1, 3054198967LL, 6}, {1, 3020636341LL, 6}, + {1, 2987803337LL, 6}, {1, 738919105LL, 4}, {1, 2924233053LL, 6}, + {1, 2893451653LL, 6}, {1, 715827883LL, 4}, {1, 354224107LL, 3}, + {1, 1402438301LL, 5}, {1, 2776544515LL, 6}, {1, 1374389535LL, 5}, + {1, 680390859LL, 4}, {1, 2694881441LL, 6}, {1, 333589693LL, 3}, + {1, 1321528399LL, 5}, {1, 2617884829LL, 6}, {1, 1296593901LL, 5}, + {1, 1284476201LL, 5}, {1, 1272582903LL, 5}, {1, 2521815661LL, 6}, + {1, 156180629LL, 2}, {1, 2476377541LL, 6}, {1, 2454267027LL, 6}, + {1, 1216273925LL, 5}, {1, 2411209711LL, 6}, {1, 1195121335LL, 5}, + {1, 2369637129LL, 6}, {1, 2349383821LL, 6}, {1, 582368447LL, 4}, + {1, 1154949189LL, 5}, {1, 2290649225LL, 6}, 
{1, 70991195LL, 1}, + {1, 1126548799LL, 5}, {1, 558694933LL, 4}, {1, 2216757315LL, 6}, + {1, 274877907LL, 3}, {1, 2181570691LL, 6}, {1, 2164392969LL, 6}, + {0, 0LL, 7}, {1, 266354561LL, 3}, {1, 2114445439LL, 6}, + {1, 1049152317LL, 5}, {1, 1041204193LL, 5}, {1, 4133502361LL, 7}, + {1, 128207979LL, 2}, {1, 4072265289LL, 7}, {1, 2021161081LL, 6}, + {1, 125400505LL, 2}, {1, 1991868891LL, 6}, {1, 1977538899LL, 6}, + {1, 3926827243LL, 7}, {1, 974744351LL, 5}, {1, 3871519817LL, 7}, + {1, 961111563LL, 5}, {1, 954437177LL, 5}, {1, 3791419407LL, 7}, + {1, 3765450781LL, 7}, {1, 1869917735LL, 6}, {1, 3714566311LL, 7}, + {1, 230602271LL, 3}, {1, 458129845LL, 4}, {1, 910191745LL, 5}, + {1, 1808407283LL, 6}, {1, 3593175255LL, 7}, {1, 892460737LL, 5}, + {1, 443351463LL, 4}, {1, 3524075731LL, 7}, {1, 875407347LL, 5}, + {1, 1739733589LL, 6}, {1, 432197967LL, 4}, {1, 1717986919LL, 6}, + {1, 3414632385LL, 7}, {1, 424194301LL, 4}, {1, 210795941LL, 3}, + {1, 1676084799LL, 6}, {1, 1665926709LL, 6}, {1, 827945503LL, 5}, + {1, 1645975491LL, 6}, {1, 818089009LL, 5}, {1, 1626496491LL, 6}, + {1, 1616928865LL, 6}, {1, 3214946281LL, 7}, {1, 799063683LL, 5}, + {1, 397222409LL, 4}, {1, 789879043LL, 5}, {1, 1570730897LL, 6}, + {1, 780903145LL, 5}, {1, 3105965051LL, 7}, {1, 3088515809LL, 7}, + {1, 3071261531LL, 7}, {1, 3054198967LL, 7}, {1, 759331235LL, 5}, + {1, 3020636341LL, 7}, {1, 3004130131LL, 7}, {1, 2987803337LL, 7}, + {1, 2971653049LL, 7}, {1, 738919105LL, 5}, {1, 2939870663LL, 7}, + {1, 2924233053LL, 7}, {1, 2908760921LL, 7}, {1, 2893451653LL, 7}, + {1, 2878302691LL, 7}, {1, 715827883LL, 5}, {1, 356059465LL, 4}, + {1, 354224107LL, 4}, {1, 2819260585LL, 7}, {1, 1402438301LL, 6}, + {1, 1395319325LL, 6}, {1, 2776544515LL, 7}, {1, 1381296015LL, 6}, + {1, 1374389535LL, 6}, {1, 42735993LL, 1}, {1, 680390859LL, 5}, + {1, 2708156719LL, 7}, {1, 2694881441LL, 7}, {1, 1340867839LL, 6}, + {1, 333589693LL, 4}, {1, 663956297LL, 5}, {1, 1321528399LL, 6}, + {1, 2630410593LL, 7}, {1, 2617884829LL, 7}, {1, 81421181LL, 2}, + {1, 1296593901LL, 6}, {1, 2581013211LL, 7}, {1, 1284476201LL, 6}, + {1, 1278501893LL, 6}, {1, 1272582903LL, 6}, {1, 2533436931LL, 7}, + {1, 2521815661LL, 7}, {1, 2510300521LL, 7}, {1, 156180629LL, 3}, + {1, 2487582869LL, 7}, {1, 2476377541LL, 7}, {1, 2465272709LL, 7}, + {1, 2454267027LL, 7}, {1, 2443359173LL, 7}, {1, 1216273925LL, 6}, + {1, 605457945LL, 5}, {1, 2411209711LL, 7}, {1, 1200340205LL, 6}, + {1, 1195121335LL, 6}, {1, 2379895299LL, 7}, {1, 2369637129LL, 7}, + {1, 2359467013LL, 7}, {1, 2349383821LL, 7}, {1, 2339386443LL, 7}, + {1, 582368447LL, 5}, {1, 2319644785LL, 7}, {1, 1154949189LL, 6}, + {1, 2300233531LL, 7}, {1, 2290649225LL, 7}, {1, 285143057LL, 4}, + {1, 70991195LL, 2}, {1, 2262369605LL, 7}, {1, 1126548799LL, 6}, + {1, 1121950641LL, 6}, {1, 558694933LL, 5}, {1, 2225732041LL, 7}, + {1, 2216757315LL, 7}, {1, 2207854675LL, 7}, {1, 274877907LL, 4}, + {1, 2190262207LL, 7}, {1, 2181570691LL, 7}, {1, 2172947881LL, 7}, + {1, 2164392969LL, 7}, {1, 2155905153LL, 7}, {1, 2147483649LL, 7}, +}; + +__declspec(safe) +static unmasked inline unsigned int8 +__fast_idiv(unsigned int8 numerator, uniform unsigned int8 divisor) { + uniform int64 method = __idiv_table_u8[divisor-2][0]; + uniform int64 multiplier = __idiv_table_u8[divisor-2][1]; + uniform int64 shift = __idiv_table_u8[divisor-2][2]; + + unsigned int16 mult = multiplier; + unsigned int16 val = numerator; + if (method == 0) + return numerator >> shift; + else if (method == 1) + return (val * mult) >> (8 + shift); + else { + val *= mult; + val >>= 8; + 
val += (numerator-val)>>1; + return (val >> shift); + } +} + +__declspec(safe) +static unmasked inline int8 __fast_idiv(int8 numerator, uniform int8 divisor) { + uniform int8 method = __idiv_table_s8[divisor-2][0]; + uniform int16 multiplier = __idiv_table_s8[divisor-2][1]; + uniform int8 shift = __idiv_table_s8[divisor-2][2]; + + if (method == 0) + return numerator >> shift; + else { + unsigned int8 sign = numerator >> 7; + numerator ^= sign; + int16 mul = (int16)numerator * (int16)multiplier; + mul >>= 8 + shift; + return (int8)mul ^ sign; + } +} + +__declspec(safe) +static unmasked inline unsigned int16 __fast_idiv(unsigned int16 numerator, + uniform unsigned int16 divisor) { + uniform int64 method = __idiv_table_u16[divisor-2][0]; + uniform int64 multiplier = __idiv_table_u16[divisor-2][1]; + uniform int64 shift = __idiv_table_u16[divisor-2][2]; + + unsigned int32 mult = multiplier; + unsigned int32 val = numerator; + if (method == 0) + return numerator >> shift; + else if (method == 1) + return (val * mult) >> (16 + shift); + else { + val *= mult; + val >>= 16; + val += (numerator-val)>>1; + return val >> shift; + } +} + +__declspec(safe) +static unmasked inline int16 __fast_idiv(int16 numerator, uniform int16 divisor) { + uniform int64 method = __idiv_table_s16[divisor-2][0]; + uniform int64 multiplier = __idiv_table_s16[divisor-2][1]; + uniform int64 shift = __idiv_table_s16[divisor-2][2]; + + if (method == 0) + return numerator >> shift; + else { + unsigned int16 sign = numerator >> 15; + numerator ^= sign; + int32 mul = (int32)numerator * (int32)multiplier; + mul >>= 16 + shift; + int16 result = mul; + return result ^ sign; + } +} + +__declspec(safe) +static unmasked inline unsigned int32 __fast_idiv(unsigned int32 numerator, + uniform unsigned int32 divisor) { + uniform int64 method = __idiv_table_u32[divisor-2][0]; + uniform int64 multiplier = __idiv_table_u32[divisor-2][1]; + uniform int64 shift = __idiv_table_u32[divisor-2][2]; + + unsigned int64 mult = multiplier; + unsigned int64 val = numerator; + if (method == 0) + return numerator >> shift; + else if (method == 1) + return (val * mult) >> (32 + shift); + else { + val *= mult; + val >>= 32; + val += (numerator-val)>>1; + return val >> shift; + } +} + +__declspec(safe) +static unmasked inline int32 __fast_idiv(int32 numerator, uniform int32 divisor) { + uniform int64 method = __idiv_table_s32[divisor-2][0]; + uniform int64 multiplier = __idiv_table_s32[divisor-2][1]; + uniform int64 shift = __idiv_table_s32[divisor-2][2]; + + if (method == 0) + return numerator >> shift; + else { + unsigned int32 sign = numerator >> 31; + numerator ^= sign; + int64 mul = (int64)numerator * (int64)multiplier; + mul >>= 32 + shift; + int32 result = mul; + return result ^ sign; + } +} + +/////////////////////////////////////////////////////////////////////////// +// Saturating int8/int16 ops + +__declspec(safe) +static unmasked inline unsigned int8 avg_up(unsigned int8 a, unsigned int8 b) { + return __avg_up_uint8(a, b); +} + +__declspec(safe) +static unmasked inline int8 avg_up(int8 a, int8 b) { + return __avg_up_int8(a, b); +} + +__declspec(safe) +static unmasked inline unsigned int16 avg_up(unsigned int16 a, unsigned int16 b) { + return __avg_up_uint16(a, b); +} + +__declspec(safe) +static unmasked inline int16 avg_up(int16 a, int16 b) { + return __avg_up_int16(a, b); +} + +__declspec(safe) +static unmasked inline unsigned int8 avg_down(unsigned int8 a, unsigned int8 b) { + return __avg_down_uint8(a, b); +} + +__declspec(safe)
+static unmasked inline int8 avg_down(int8 a, int8 b) { + return __avg_down_int8(a, b); +} + +__declspec(safe) +static unmasked inline unsigned int16 avg_down(unsigned int16 a, unsigned int16 b) { + return __avg_down_uint16(a, b); +} + +__declspec(safe) +static unmasked inline int16 avg_down(int16 a, int16 b) { + return __avg_down_int16(a, b); +} diff --git a/sym.cpp b/sym.cpp index f16f5e11..05f9996a 100644 --- a/sym.cpp +++ b/sym.cpp @@ -214,6 +214,17 @@ SymbolTable::LookupType(const char *name) const { return NULL; } +bool +SymbolTable::ContainsType(const Type *type) const { + TypeMapType::const_iterator iter = types.begin(); + while (iter != types.end()) { + if (iter->second == type) { + return true; + } + iter++; + } + return false; +} std::vector SymbolTable::ClosestVariableOrFunctionMatch(const char *str) const { diff --git a/sym.h b/sym.h index efb532a3..761c3612 100644 --- a/sym.h +++ b/sym.h @@ -219,6 +219,12 @@ public: @return Pointer to the Type, if found; otherwise NULL is returned. */ const Type *LookupType(const char *name) const; + + /** Look for a type given a pointer. + + @return True if found, False otherwise. + */ + bool ContainsType(const Type * type) const; /** This method returns zero or more strings with the names of symbols in the symbol table that nearly (but not exactly) match the given diff --git a/tests/aossoa-1.ispc b/tests/aossoa-1.ispc index 59964d6d..32d3bcba 100644 --- a/tests/aossoa-1.ispc +++ b/tests/aossoa-1.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 3 -#define maxProgramCount 64 +#define width 3ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 3; diff --git a/tests/aossoa-2.ispc b/tests/aossoa-2.ispc index 9ff82226..df8eae5c 100644 --- a/tests/aossoa-2.ispc +++ b/tests/aossoa-2.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 4 -#define maxProgramCount 64 +#define width 4ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 4; diff --git a/tests/aossoa-5.ispc b/tests/aossoa-5.ispc index eb4fed3a..d6346455 100644 --- a/tests/aossoa-5.ispc +++ b/tests/aossoa-5.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 3 -#define maxProgramCount 64 +#define width 3ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 3; diff --git a/tests/aossoa-6.ispc b/tests/aossoa-6.ispc index b64cd10b..7c177fde 100644 --- a/tests/aossoa-6.ispc +++ b/tests/aossoa-6.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 4 -#define maxProgramCount 64 +#define width 4ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 4; diff --git a/tests/atomics-12.ispc b/tests/atomics-12.ispc index c27ad99c..d6359555 100644 --- a/tests/atomics-12.ispc +++ b/tests/atomics-12.ispc @@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; float b = 0; if (programIndex < 30 && programIndex & 1) - b = atomic_or_global(&s, (1 << programIndex)); + b = atomic_or_global(&s, (1ul << programIndex)); RET[programIndex] = s; } @@ -15,6 +15,6 @@ export void result(uniform float RET[]) { uniform int sum = 0; for (uniform int i = 0; i < min(30, 
programCount); ++i) if (i & 1) - sum += (1 << i); + sum += (1ul << i); RET[programIndex] = sum; } diff --git a/tests/atomics-13.ispc b/tests/atomics-13.ispc index 86faaddb..dea3bfc3 100644 --- a/tests/atomics-13.ispc +++ b/tests/atomics-13.ispc @@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; int32 b = 0; if (programIndex < 32 && programIndex & 1) - b = atomic_or_global(&s, (1 << programIndex)); + b = atomic_or_global(&s, (1ul << programIndex)); RET[programIndex] = popcnt(reduce_max((int32)b)); } diff --git a/tests/atomics-4.ispc b/tests/atomics-4.ispc index 30b343d1..ac746ad2 100644 --- a/tests/atomics-4.ispc +++ b/tests/atomics-4.ispc @@ -5,10 +5,10 @@ uniform int32 s = 0; export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; - float b = atomic_or_global(&s, (1< 0 ? 1 : 0; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1; +} diff --git a/tests/coalesce-1.ispc b/tests/coalesce-1.ispc index acfe8cdf..39a79a91 100644 --- a/tests/coalesce-1.ispc +++ b/tests/coalesce-1.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; assert(programIndex <= 64); diff --git a/tests/coalesce-2.ispc b/tests/coalesce-2.ispc index 88b952a4..a047e456 100644 --- a/tests/coalesce-2.ispc +++ b/tests/coalesce-2.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; RET[programIndex] = buf[programIndex & 1]; diff --git a/tests/coalesce-3.ispc b/tests/coalesce-3.ispc index 7a05963f..c1718b4f 100644 --- a/tests/coalesce-3.ispc +++ b/tests/coalesce-3.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; RET[programIndex] = buf[(programIndex >> 2) * 16 + (programIndex & 3)]; diff --git a/tests/coalesce-4.ispc b/tests/coalesce-4.ispc index 1ddd4b89..182a4d4f 100644 --- a/tests/coalesce-4.ispc +++ b/tests/coalesce-4.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[2*programIndex]; diff --git a/tests/coalesce-5.ispc b/tests/coalesce-5.ispc index 2dd8d44e..385e8526 100644 --- a/tests/coalesce-5.ispc +++ b/tests/coalesce-5.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + 
uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[4*programIndex]; diff --git a/tests/coalesce-6.ispc b/tests/coalesce-6.ispc index 2a54a2db..8c630a45 100644 --- a/tests/coalesce-6.ispc +++ b/tests/coalesce-6.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[4*programIndex]; diff --git a/tests/coalesce-7.ispc b/tests/coalesce-7.ispc index 8ed628bd..29b56b8d 100644 --- a/tests/coalesce-7.ispc +++ b/tests/coalesce-7.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[4*programIndex]; diff --git a/tests/coalesce-8.ispc b/tests/coalesce-8.ispc index dfefaa19..f01ca9c3 100644 --- a/tests/coalesce-8.ispc +++ b/tests/coalesce-8.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; int index = (programIndex < 4) ? (programIndex & 1) : diff --git a/tests/count-leading-trailing-zeros-1.ispc b/tests/count-leading-trailing-zeros-1.ispc index 221d066d..3f12c07d 100644 --- a/tests/count-leading-trailing-zeros-1.ispc +++ b/tests/count-leading-trailing-zeros-1.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - RET[programIndex] = count_trailing_zeros(0xf0); + RET[programIndex] = count_trailing_zeros(0xf0ul); } export void result(uniform float RET[]) { diff --git a/tests/count-leading-trailing-zeros-4.ispc b/tests/count-leading-trailing-zeros-4.ispc index 475c18ca..4b849018 100644 --- a/tests/count-leading-trailing-zeros-4.ispc +++ b/tests/count-leading-trailing-zeros-4.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - int32 i = (1 << (programIndex % 28)); + int32 i = (1ul << (programIndex % 28)); RET[programIndex] = count_leading_zeros(i); } diff --git a/tests/double-consts.ispc b/tests/double-consts.ispc new file mode 100644 index 00000000..5f9a66d5 --- /dev/null +++ b/tests/double-consts.ispc @@ -0,0 +1,24 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + double a = aFOO[programIndex]; + // Test parsing of double constants. + double d1 = 1.0d40; + double d2 = 1.d40; + double d3 = 1d40; + double d4 = .1d41; + double d5 = 10000000000000000000000000000000000000000.d; + double d6 = 10000000000000000000000000000000000000000.0d; + + // All the constants should be equal and if it's evaluated as "float", + // then sqrt will evaluate to +inf. 
+ if (d1 == d2 && d1 == d3 && d1 == d4 && d1 == d5 && d1 == d6 && + ((float)sqrt(d1)) < 2e20) { + RET[programIndex] = a; + } +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/exclusive-scan-and-2.ispc b/tests/exclusive-scan-and-2.ispc index 5d2bcd1f..b742a91e 100644 --- a/tests/exclusive-scan-and-2.ispc +++ b/tests/exclusive-scan-and-2.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { RET[programIndex] = -1; - int32 a = ~(1 << programIndex); + int32 a = ~(1ul << programIndex); if ((programIndex < 32) && (programIndex & 1) == 0) { RET[programIndex] = exclusive_scan_and(a); } @@ -15,7 +15,7 @@ export void result(uniform float RET[]) { if ((programIndex & 1) == 0 && programIndex > 0 && programIndex < 32) { int val = 0xffffffff; for (int i = 0; i < programIndex-1; i += 2) - val &= ~(1< 32) break; + } + } + } + + for (int8 num = 0; num < 127; ++num) { + for (uniform int8 div = 2; div < 127; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 64) break; + } + } + } + + for (int16 num = 0; num < 32767; ++num) { + for (uniform int16 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 96) break; + } + } + } + + for (unsigned int16 num = 0; num < 0xffff; ++num) { + for (uniform unsigned int16 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 128) break; + } + } + } + + // randomly sample int32s... 
+ uniform RNGState state; + seed_rng(&state, 1234); + for (uniform int i = 0; i < 64k; ++i) { + unsigned int32 num = random(&state); + for (uniform unsigned int32 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("ui32 error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 160) break; + } + } + } + + for (uniform int64 i = 0; i < 64k; ++i) { + int32 num = random(&state); + if (num < 0) + continue; + for (uniform int32 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("si32 error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 192) break; + } + } + } + + RET[programIndex] = errorCount; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} + diff --git a/tests/kilo-mega-giga-2.ispc b/tests/kilo-mega-giga-2.ispc index 77e201ef..42545b8d 100644 --- a/tests/kilo-mega-giga-2.ispc +++ b/tests/kilo-mega-giga-2.ispc @@ -8,5 +8,5 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { } export void result(uniform float RET[]) { - RET[programIndex] = 2*1024*1024 + 5; + RET[programIndex] = 2ul*1024ul*1024ul + 5; } diff --git a/tests/ldexp-double.ispc b/tests/ldexp-double.ispc index 6b3ed734..e1b7a59f 100644 --- a/tests/ldexp-double.ispc +++ b/tests/ldexp-double.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - double a = 1 << (programIndex % 28); + double a = 1ul << (programIndex % 28); if (programIndex & 1) a = -a; RET[programIndex] = ldexp(a, 2); @@ -11,7 +11,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { export void result(uniform float RET[]) { int pi = programIndex % 28; - RET[programIndex] = (1 << (pi + 2)); + RET[programIndex] = (1ul << (pi + 2)); if (programIndex & 1) RET[programIndex] = -RET[programIndex]; } diff --git a/tests/ldexp-float.ispc b/tests/ldexp-float.ispc index a2ec9a27..305ae106 100644 --- a/tests/ldexp-float.ispc +++ b/tests/ldexp-float.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - float a = 1 << (programIndex % 28); + float a = 1ul << (programIndex % 28); if (programIndex & 1) a = -a; RET[programIndex] = ldexp(a, 2); @@ -11,7 +11,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { export void result(uniform float RET[]) { int pi = programIndex % 28; - RET[programIndex] = (1 << (pi + 2)); + RET[programIndex] = (1ul << (pi + 2)); if (programIndex & 1) RET[programIndex] = -RET[programIndex]; } diff --git a/tests/local-atomics-12.ispc b/tests/local-atomics-12.ispc index 23a30af5..358ffd34 100644 --- a/tests/local-atomics-12.ispc +++ b/tests/local-atomics-12.ispc @@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; float b = 0; if (programIndex < 29 && (programIndex & 1)) - b = atomic_or_local(&s, (1 << programIndex)); + b = atomic_or_local(&s, (1ul << programIndex)); RET[programIndex] = s; } @@ -15,6 +15,6 @@ export void result(uniform float RET[]) { uniform int sum = 0; for (uniform int i = 0; i < min(programCount, 29); ++i) if (i & 1) - sum += (1 << i); + sum += (1ul << i); RET[programIndex] = sum; } diff --git a/tests/local-atomics-13.ispc b/tests/local-atomics-13.ispc index 36fd1f1c..b9d35d09 100644 --- a/tests/local-atomics-13.ispc +++ b/tests/local-atomics-13.ispc @@ -7,7 +7,7 @@ export void f_f(uniform float RET[], 
uniform float aFOO[]) { float a = aFOO[programIndex]; int32 b = 0; if (programIndex < 28 && (programIndex & 1)) - b = atomic_or_local(&s, (1 << programIndex)); + b = atomic_or_local(&s, (1ul << programIndex)); RET[programIndex] = popcnt(reduce_max(b)); } diff --git a/tests/local-atomics-14.ispc b/tests/local-atomics-14.ispc index 4cf81809..25c52e60 100644 --- a/tests/local-atomics-14.ispc +++ b/tests/local-atomics-14.ispc @@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; float b = 0; if (programIndex < 32 && (programIndex & 1)) - b = atomic_or_local(&s, (1 << programIndex)); + b = atomic_or_local(&s, (1ul << programIndex)); RET[programIndex] = (s>>20); } @@ -15,6 +15,6 @@ export void result(uniform float RET[]) { uniform int sum = 0; for (uniform int i = 0; i < min(32, programCount); ++i) if (i & 1) - sum += (1 << i); + sum += (1ul << i); RET[programIndex] = ((unsigned int64)(0xffffffffff000000 | sum)) >> 20; } diff --git a/tests/local-atomics-4.ispc b/tests/local-atomics-4.ispc index f7f6a04a..b3648ab5 100644 --- a/tests/local-atomics-4.ispc +++ b/tests/local-atomics-4.ispc @@ -7,10 +7,10 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; float b = 0; if (programIndex < 29) - atomic_or_local(&s, (1< struct Foo" for assignment operator is not possible +// Type conversion from "const uniform int[0-9]*" to "soa<4> struct Foo" for assignment operator is not possible struct Pt { float x, y, z; }; diff --git a/tests_errors/soa-12.ispc b/tests_errors/soa-12.ispc index e2cd3242..c0420614 100644 --- a/tests_errors/soa-12.ispc +++ b/tests_errors/soa-12.ispc @@ -1,4 +1,4 @@ -// Can't convert between types "const uniform int32" and "soa<4> float" with different SOA widths +// Can't convert between types "const uniform int[0-9]*" and "soa<4> float" with different SOA widths struct Pt { float x, y, z; }; diff --git a/tests_errors/soa-3.ispc b/tests_errors/soa-3.ispc index b2be1b59..04dc84bc 100644 --- a/tests_errors/soa-3.ispc +++ b/tests_errors/soa-3.ispc @@ -1,4 +1,4 @@ -// syntax error, unexpected '-', expecting int32 constant +// syntax error, unexpected '-', expecting int struct F { float a, b, c; }; diff --git a/tests_errors/soa-4.ispc b/tests_errors/soa-4.ispc index b2be1b59..04dc84bc 100644 --- a/tests_errors/soa-4.ispc +++ b/tests_errors/soa-4.ispc @@ -1,4 +1,4 @@ -// syntax error, unexpected '-', expecting int32 constant +// syntax error, unexpected '-', expecting int struct F { float a, b, c; }; diff --git a/tests_errors/soa-9.ispc b/tests_errors/soa-9.ispc index 7c6a1df9..e9e7509a 100644 --- a/tests_errors/soa-9.ispc +++ b/tests_errors/soa-9.ispc @@ -1,4 +1,4 @@ -// Can't convert from pointer to SOA type "soa<8> struct A * uniform" to pointer to non-SOA type "void * varying" +// Can't convert from pointer to SOA type "soa<8> struct A \* uniform" to pointer to non-SOA type "void \* varying" struct A { float a, b; }; diff --git a/tests_errors/struct_arith.ispc b/tests_errors/struct_arith.ispc index 9d942880..df729d02 100644 --- a/tests_errors/struct_arith.ispc +++ b/tests_errors/struct_arith.ispc @@ -1,4 +1,4 @@ -// Assignment operator "+=" is illegal with struct type +// Assignment operator "\+=" is illegal with struct type struct Point { float x, y, z; }; diff --git a/tests_errors/vec-size-compile-constant.ispc b/tests_errors/vec-size-compile-constant.ispc index b9e61721..0eb6f90e 100644 --- a/tests_errors/vec-size-compile-constant.ispc +++ b/tests_errors/vec-size-compile-constant.ispc @@ 
-1,4 +1,4 @@ -// syntax error, unexpected identifier, expecting int32 constant +// syntax error, unexpected identifier, expecting int void foo(uniform int i) { float a; diff --git a/util.cpp b/util.cpp index dbea9517..6b121988 100644 --- a/util.cpp +++ b/util.cpp @@ -79,8 +79,8 @@ compiler under a debuffer; in this case, just return a reasonable default. */ -static int -lTerminalWidth() { +int +TerminalWidth() { if (g->disableLineWrap) return 1<<30; @@ -228,8 +228,8 @@ lFindIndent(int numColons, const char *buf) { /** Print the given string to the given FILE, assuming the given output column width. Break words as needed to avoid words spilling past the last column. */ -static void -lPrintWithWordBreaks(const char *buf, int indent, int columnWidth, FILE *out) { +void +PrintWithWordBreaks(const char *buf, int indent, int columnWidth, FILE *out) { #ifdef ISPC_IS_WINDOWS fputs(buf, out); fputs("\n", out); @@ -375,7 +375,7 @@ lPrint(const char *type, bool isError, SourcePos p, const char *fmt, return; printed.insert(formattedBuf); - lPrintWithWordBreaks(formattedBuf, indent, lTerminalWidth(), stderr); + PrintWithWordBreaks(formattedBuf, indent, TerminalWidth(), stderr); lPrintFileLineContext(p); free(errorBuf); diff --git a/util.h b/util.h index b247b8bd..7edf71f7 100644 --- a/util.h +++ b/util.h @@ -156,4 +156,18 @@ void GetDirectoryAndFileName(const std::string &currentDir, bool VerifyDataLayoutCompatibility(const std::string &module_dl, const std::string &lib_dl); +/** Print the given string to the given FILE, assuming the given output + column width. Break words as needed to avoid words spilling past the + last column. */ +void PrintWithWordBreaks(const char *buf, int indent, int columnWidth, + FILE *out); + +/** Returns the width of the terminal where the compiler is running. + Finding this out may fail in a variety of reasonable situations (piping + compiler output to 'less', redirecting output to a file, running the + compiler under a debugger); in this case, just return a reasonable + default. + */ +int TerminalWidth(); + #endif // ISPC_UTIL_H
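
Editor's note on the fast-division addition: the __fast_idiv() overloads added to stdlib.ispc above all reduce to the same Granlund-Montgomery pattern, looking up a (method, multiplier, shift) triple for the divisor and replacing the divide with a widening multiply and a shift. Method 0 covers power-of-two divisors, method 1 a multiplier that fits the word size, and method 2 the add-and-halve fixup for oversized multipliers. A minimal standalone C++ sketch of the unsigned 32-bit case follows; the fast_udiv32 helper, the hardcoded rows (copied from __idiv_table_u32 above), and the brute-force check are illustrative assumptions for this note, not part of the patch.

// Editorial sketch: models the unsigned 32-bit __fast_idiv() scheme above.
// fast_udiv32 and the sampled table rows are illustrative, not from the patch.
#include <cassert>
#include <cstdint>
#include <cstdio>

// Divide n by an invariant divisor described by (method, multiplier, shift),
// mirroring the three cases in __fast_idiv(unsigned int32, ...).
static uint32_t fast_udiv32(uint32_t n, int method, uint64_t multiplier, int shift) {
    if (method == 0)        // divisor is a power of two
        return n >> shift;
    if (method == 1)        // multiplier fits in 32 bits
        return (uint32_t)(((uint64_t)n * multiplier) >> (32 + shift));
    // method 2: multiplier needs an extra bit; recover it with the add-and-halve step
    uint32_t t = (uint32_t)(((uint64_t)n * multiplier) >> 32);
    return (t + ((n - t) >> 1)) >> shift;
}

int main() {
    // Rows copied from __idiv_table_u32: divisor 3 -> {1, 2863311531, 1},
    // divisor 7 -> {2, 613566757, 2}, divisor 8 -> {0, 0, 3}.
    struct Row { uint32_t d; int method; uint64_t mult; int shift; };
    const Row rows[] = {
        { 3, 1, 2863311531ULL, 1 },
        { 7, 2, 613566757ULL,  2 },
        { 8, 0, 0ULL,          3 },
    };
    for (const Row &r : rows)
        for (uint32_t n = 0; n < 1000000u; ++n)
            assert(fast_udiv32(n, r.method, r.mult, r.shift) == n / r.d);
    printf("multiply-and-shift results match n/d for the sampled divisors\n");
    return 0;
}

The signed overloads above reuse the same multiply: they fold the sign out first (numerator ^= sign, where sign is the arithmetic-shifted sign mask), divide in the non-negative domain, and xor the sign back in, which is why one table row per divisor serves both positive and negative numerators.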