diff --git a/Makefile b/Makefile index 1fc77dff..4af7aff4 100644 --- a/Makefile +++ b/Makefile @@ -140,7 +140,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ type.cpp util.cpp HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h -TARGETS=nvptx64 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ +TARGETS=nvptx64 avx2-i64x4 avx11-i64x4 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \ generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 ifneq ($(ARM_ENABLED), 0) diff --git a/alloy.py b/alloy.py index d56ef3a4..0aaf3d8d 100755 --- a/alloy.py +++ b/alloy.py @@ -212,10 +212,10 @@ def check_targets(): answer = answer + ["avx1-i32x8", "avx1-i32x16", "avx1-i64x4"] if AVX11 == False and "rdrand" in f_lines[i]: AVX11 = True; - answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"] + answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16", "avx1.1-i64x4"] if AVX2 == False and "avx2" in f_lines[i]: AVX2 = True; - answer = answer + ["avx2-i32x8", "avx2-i32x16"] + answer = answer + ["avx2-i32x8", "avx2-i32x16", "avx2-i64x4"] if current_OS == "MacOS": f_lines = take_lines("sysctl machdep.cpu.features", "first") if "SSE2" in f_lines: @@ -229,10 +229,10 @@ def check_targets(): answer = answer + ["avx1-i32x8", "avx1-i32x16", "avx1-i64x4"] if "RDRAND" in f_lines: AVX11 = True; - answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"] + answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16", "avx1.1-i64x4"] if "AVX2.0" in f_lines: AVX2 = True; - answer = answer + ["avx2-i32x8", "avx2-i32x16"] + answer = answer + ["avx2-i32x8", "avx2-i32x16", "avx2-i64x4"] answer = answer + ["generic-4", "generic-16", "generic-8", "generic-1", "generic-32", "generic-64"] # now check what targets we have with the help of SDE @@ -257,9 +257,9 @@ def check_targets(): if AVX == False and "snb" in f_lines[i]: answer_sde = answer_sde + [["-snb", "avx1-i32x8"], ["-snb", "avx1-i32x16"], ["-snb", "avx1-i64x4"]] if AVX11 == False and "ivb" in f_lines[i]: - answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["-ivb", "avx1.1-i32x16"]] + answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["-ivb", "avx1.1-i32x16"], ["-ivb", "avx1.1-i64x4"]] if AVX2 == False and "hsw" in f_lines[i]: - answer_sde = answer_sde + [["-hsw", "avx2-i32x8"], ["-hsw", "avx2-i32x16"]] + answer_sde = answer_sde + [["-hsw", "avx2-i32x8"], ["-hsw", "avx2-i32x16"], ["-hsw", "avx2-i64x4"]] return [answer, answer_sde] def build_ispc(version_LLVM, make): @@ -274,29 +274,38 @@ def build_ispc(version_LLVM, make): def execute_stability(stability, R, print_version): stability1 = copy.deepcopy(stability) - temp = run_tests.run_tests(stability1, [], print_version) + b_temp = run_tests.run_tests(stability1, [], print_version) + temp = b_temp[0] + time = b_temp[1] for j in range(0,4): R[j][0] = R[j][0] + temp[j] for i in range(0,len(temp[j])): R[j][1].append(temp[4]) number_of_fails = temp[5] number_of_new_fails = len(temp[0]) + len(temp[1]) + number_of_passes = len(temp[2]) + len(temp[3]) if number_of_fails == 0: str_fails = ". No fails" else: str_fails = ". Fails: " + str(number_of_fails) if number_of_new_fails == 0: - str_new_fails = ", No new fails.\n" + str_new_fails = ", No new fails" else: - str_new_fails = ", New fails: " + str(number_of_new_fails) + ".\n" - print_debug(temp[4][1:-3] + str_fails + str_new_fails, False, stability_log) + str_new_fails = ", New fails: " + str(number_of_new_fails) + if number_of_passes == 0: + str_new_passes = "." + else: + str_new_passes = ", " + str(number_of_passes) + " new passes." + if stability.time: + str_time = " " + time + "\n" + else: + str_time = "\n" + print_debug(temp[4][1:-3] + str_fails + str_new_fails + str_new_passes + str_time, False, stability_log) def run_special_tests(): i = 5 -def validation_run(only, only_targets, reference_branch, number, notify, update, make): - if os.environ["ISPC_HOME"] != os.getcwd(): - error("you ISPC_HOME and your current pass are different!\n", 2) +def validation_run(only, only_targets, reference_branch, number, notify, update, speed_number, make, perf_llvm, time): os.chdir(os.environ["ISPC_HOME"]) os.environ["PATH"] = os.environ["ISPC_HOME"] + ":" + os.environ["PATH"] if options.notify != "": @@ -322,9 +331,9 @@ def validation_run(only, only_targets, reference_branch, number, notify, update, stability.random = False stability.ispc_flags = "" stability.compiler_exe = None - stability.num_jobs = 1024 + stability.num_jobs = speed_number stability.verbose = False - stability.time = False + stability.time = time stability.non_interactive = True stability.update = update stability.include_file = None @@ -476,28 +485,36 @@ def validation_run(only, only_targets, reference_branch, number, notify, update, # prepare LLVM 3.3 as newest LLVM need_LLVM = check_LLVM(["3.3"]) if len(need_LLVM) != 0: - build_LLVM(need_LLVM[i], "", "", "", False, False, False, True, False, make) -# prepare reference point. build both test and reference compilers - try_do_LLVM("apply git", "git branch", True) - temp4 = take_lines("git branch", "all") - for line in temp4: - if "*" in line: - current_branch = line[2:-1] - stashing = True - sys.stdout.write("Please, don't interrupt script here! You can have not sync git status after interruption!\n") - if "No local changes" in take_lines("git stash", "first"): - stashing = False - #try_do_LLVM("stash current branch ", "git stash", True) - try_do_LLVM("checkout reference branch " + reference_branch + " ", "git checkout " + reference_branch, True) - sys.stdout.write(".\n") - build_ispc("3.3", make) - sys.stdout.write(".\n") - os.rename("ispc", "ispc_ref") - try_do_LLVM("checkout test branch " + current_branch + " ", "git checkout " + current_branch, True) - if stashing: - try_do_LLVM("return current branch ", "git stash pop", True) - sys.stdout.write("You can interrupt script now.\n") - build_ispc("3.3", make) + build_LLVM(need_LLVM[0], "", "", "", False, False, False, True, False, make) + if perf_llvm == False: + # prepare reference point. build both test and reference compilers + try_do_LLVM("apply git", "git branch", True) + temp4 = take_lines("git branch", "all") + for line in temp4: + if "*" in line: + current_branch = line[2:-1] + stashing = True + sys.stdout.write("Please, don't interrupt script here! You can have not sync git status after interruption!\n") + if "No local changes" in take_lines("git stash", "first"): + stashing = False + #try_do_LLVM("stash current branch ", "git stash", True) + try_do_LLVM("checkout reference branch " + reference_branch + " ", "git checkout " + reference_branch, True) + sys.stdout.write(".\n") + build_ispc("3.3", make) + sys.stdout.write(".\n") + os.rename("ispc", "ispc_ref") + try_do_LLVM("checkout test branch " + current_branch + " ", "git checkout " + current_branch, True) + if stashing: + try_do_LLVM("return current branch ", "git stash pop", True) + sys.stdout.write("You can interrupt script now.\n") + build_ispc("3.3", make) + else: + # build compiler with two different LLVM versions + if len(check_LLVM([reference_branch])) != 0: + error("you haven't got llvm called " + reference_branch, 1) + build_ispc("3.3", make) + os.rename("ispc", "ispc_ref") + build_ispc(reference_branch, make) # begin validation run for performance. output is inserted into perf() perf.perf(performance, []) if options.notify != "": @@ -560,16 +577,26 @@ def Main(): stability_log = os.getcwd() + os.sep + f_date + os.sep + "stability.log" current_path = os.getcwd() make = "make -j" + options.speed + if os.environ["ISPC_HOME"] != os.getcwd(): + error("you ISPC_HOME and your current path are different!\n", 2) + if options.perf_llvm == True: + if options.branch == "master": + options.branch = "trunk" try: + start_time = time.time() if options.build_llvm: build_LLVM(options.version, options.revision, options.folder, options.tarball, options.debug, options.selfbuild, options.extra, False, options.force, make) if options.validation_run: validation_run(options.only, options.only_targets, options.branch, - options.number_for_performance, options.notify, options.update, make) + options.number_for_performance, options.notify, options.update, int(options.speed), + make, options.perf_llvm, options.time) + elapsed_time = time.time() - start_time + if options.time: + print_debug("Elapsed time: " + time.strftime('%Hh%Mm%Ssec.', time.gmtime(elapsed_time)) + "\n", False, "") finally: os.chdir(current_path) - date_name = "alloy_results_" + datetime.datetime.now().strftime('%d_%m_%Y_%H_%M_%S') + date_name = "alloy_results_" + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') if os.path.exists(date_name): error("It's forbidden to run alloy two times in a second, logs are in ./logs", 1) os.rename(f_date, date_name) @@ -656,11 +683,15 @@ run_group.add_option('--update-errors', dest='update', run_group.add_option('--only-targets', dest='only_targets', help='set list of targets to test. Possible values - all subnames of targets.', default="") +run_group.add_option('--time', dest='time', + help='display time of testing', default=False, action='store_true') run_group.add_option('--only', dest='only', help='set types of tests. Possible values:\n' + '-O0, -O2, x86, x86-64, stability (test only stability), performance (test only performance)\n' + 'build (only build with different LLVM), 3.1, 3.2, 3.3, trunk, native (do not use SDE), current (do not rebuild ISPC).', default="") +run_group.add_option('--perf_LLVM', dest='perf_llvm', + help='compare LLVM 3.3 with "--compare-with", default trunk', default=False, action='store_true') parser.add_option_group(run_group) # options for activity "setup PATHS" setup_group = OptionGroup(parser, "Options for setup", diff --git a/ast.cpp b/ast.cpp index 83ee207d..60b20a80 100644 --- a/ast.cpp +++ b/ast.cpp @@ -223,7 +223,8 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc, else if ((fce = dynamic_cast(node)) != NULL) { fce->func = (Expr *)WalkAST(fce->func, preFunc, postFunc, data); fce->args = (ExprList *)WalkAST(fce->args, preFunc, postFunc, data); - fce->launchCountExpr = (Expr *)WalkAST(fce->launchCountExpr, preFunc, + for (int k = 0; k < 3; k++) + fce->launchCountExpr[0] = (Expr *)WalkAST(fce->launchCountExpr[0], preFunc, postFunc, data); } else if ((ie = dynamic_cast(node)) != NULL) { diff --git a/builtins.cpp b/builtins.cpp index 48ce9afb..e3141725 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -979,6 +979,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod } case Target::AVX11: { switch (g->target->getVectorWidth()) { + case 4: + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_avx11_i64x4_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_avx11_i64x4_64bit); + } + break; case 8: if (runtime32) { EXPORT_MODULE(builtins_bitcode_avx11_32bit); @@ -1002,6 +1010,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod } case Target::AVX2: { switch (g->target->getVectorWidth()) { + case 4: + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_avx2_i64x4_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_avx2_i64x4_64bit); + } + break; case 8: if (runtime32) { EXPORT_MODULE(builtins_bitcode_avx2_32bit); diff --git a/builtins/target-avx11-i64x4.ll b/builtins/target-avx11-i64x4.ll new file mode 100644 index 00000000..8fe75266 --- /dev/null +++ b/builtins/target-avx11-i64x4.ll @@ -0,0 +1,120 @@ +;; Copyright (c) 2013, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +include(`target-avx1-i64x4base.ll') + +ifelse(LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int min/max + +define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %m = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %m +} + +define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %m = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %m +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unsigned int min/max + +define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %m = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %m +} + +define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %m = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %m +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather + +gen_gather(i8) +gen_gather(i16) +gen_gather(i32) +gen_gather(float) +gen_gather(i64) +gen_gather(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float/half conversions + +define(`expand_4to8', ` + %$3 = shufflevector <4 x $1> %$2, <4 x $1> undef, <8 x i32> +') +define(`extract_4from8', ` + %$3 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> +') + +declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone +; 0 is round nearest even +declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone + +define <4 x float> @__half_to_float_varying(<4 x i16> %v4) nounwind readnone { + expand_4to8(i16, v4, v) + %r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v) + extract_4from8(float, r, ret) + ret <4 x float> %ret +} + +define <4 x i16> @__float_to_half_varying(<4 x float> %v4) nounwind readnone { + expand_4to8(float, v4, v) + %r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0) + extract_4from8(i16, r, ret) + ret <4 x i16> %ret +} + +define float @__half_to_float_uniform(i16 %v) nounwind readnone { + %v1 = bitcast i16 %v to <1 x i16> + %vv = shufflevector <1 x i16> %v1, <1 x i16> undef, + <8 x i32> + %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv) + %r = extractelement <8 x float> %rv, i32 0 + ret float %r +} + +define i16 @__float_to_half_uniform(float %v) nounwind readnone { + %v1 = bitcast float %v to <1 x float> + %vv = shufflevector <1 x float> %v1, <1 x float> undef, + <8 x i32> + ; round to nearest even + %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0) + %r = extractelement <8 x i16> %rv, i32 0 + ret i16 %r +} diff --git a/builtins/target-avx2-i64x4.ll b/builtins/target-avx2-i64x4.ll new file mode 100644 index 00000000..d74f32dc --- /dev/null +++ b/builtins/target-avx2-i64x4.ll @@ -0,0 +1,355 @@ +;; Copyright (c) 2013, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +ifelse(LLVM_VERSION, `LLVM_3_1', `', + `define(`HAVE_GATHER', `1')') + +include(`target-avx1-i64x4base.ll') + +ifelse(LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int min/max + +;; declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone +;; declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readonly + +define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %m = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %m +} + +define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %m = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %m +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unsigned int min/max + +;; declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readonly +;; declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readonly + +define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %m = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %m +} + +define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline { + %m = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %m +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float/half conversions + + + +define(`expand_4to8', ` + %$3 = shufflevector <4 x $1> %$2, <4 x $1> undef, <8 x i32> +') +define(`extract_4from8', ` + %$3 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> +') + +declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone +; 0 is round nearest even +declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone + +define <4 x float> @__half_to_float_varying(<4 x i16> %v4) nounwind readnone { + expand_4to8(i16, v4, v) + %r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v) + extract_4from8(float, r, ret) + ret <4 x float> %ret +} + +define <4 x i16> @__float_to_half_varying(<4 x float> %v4) nounwind readnone { + expand_4to8(float, v4, v) + %r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0) + extract_4from8(i16, r, ret) + ret <4 x i16> %ret +} + +define float @__half_to_float_uniform(i16 %v) nounwind readnone { + %v1 = bitcast i16 %v to <1 x i16> + %vv = shufflevector <1 x i16> %v1, <1 x i16> undef, + <8 x i32> + %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv) + %r = extractelement <8 x float> %rv, i32 0 + ret float %r +} + +define i16 @__float_to_half_uniform(float %v) nounwind readnone { + %v1 = bitcast float %v to <1 x float> + %vv = shufflevector <1 x float> %v1, <1 x float> undef, + <8 x i32> + ; round to nearest even + %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0) + %r = extractelement <8 x i16> %rv, i32 0 + ret i16 %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather + +declare void @llvm.trap() noreturn nounwind + + +ifelse(LLVM_VERSION, `LLVM_3_1', ` +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double)', ` + +gen_gather(i8) +gen_gather(i16) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 gathers + +declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %target, i8 * %ptr, + <4 x i32> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind +declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr, + <4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind + +define <4 x i32> @__gather_base_offsets32_i32(i8 * %ptr, + i32 %scale, <4 x i32> %offsets, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + + %v = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8 * %ptr, + <4 x i32> %offsets, <4 x i32> %vecmask, i8 %scale8) + ret <4 x i32> %v +} + + +define <4 x i32> @__gather_base_offsets64_i32(i8 * %ptr, + i32 %scale, <4 x i64> %offsets, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + + %v = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr, + <4 x i64> %offsets, <4 x i32> %vecmask, i8 %scale8) + + ret <4 x i32> %v +} + + +define <4 x i32> @__gather32_i32(<4 x i32> %ptrs, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + + %v = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8 * null, + <4 x i32> %ptrs, <4 x i32> %vecmask, i8 1) + + ret <4 x i32> %v +} + + +define <4 x i32> @__gather64_i32(<4 x i64> %ptrs, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + + %v = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null, + <4 x i64> %ptrs, <4 x i32> %vecmask, i8 1) + + ret <4 x i32> %v +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float gathers + +declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %target, i8 * %ptr, + <4 x i32> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind +declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr, + <4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind + +define <4 x float> @__gather_base_offsets32_float(i8 * %ptr, + i32 %scale, <4 x i32> %offsets, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + %mask = bitcast <4 x i32> %vecmask to <4 x float> + + %v = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8 * %ptr, + <4 x i32> %offsets, <4 x float> %mask, i8 %scale8) + + ret <4 x float> %v +} + + +define <4 x float> @__gather_base_offsets64_float(i8 * %ptr, + i32 %scale, <4 x i64> %offsets, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + %mask = bitcast <4 x i32> %vecmask to <4 x float> + + %v = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr, + <4 x i64> %offsets, <4 x float> %mask, i8 %scale8) + + ret <4 x float> %v +} + + +define <4 x float> @__gather32_float(<4 x i32> %ptrs, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + %mask = bitcast <4 x i32> %vecmask to <4 x float> + + %v = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8 * null, + <4 x i32> %ptrs, <4 x float> %mask, i8 1) + + ret <4 x float> %v +} + + +define <4 x float> @__gather64_float(<4 x i64> %ptrs, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32> + %mask = bitcast <4 x i32> %vecmask to <4 x float> + + %v = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null, + <4 x i64> %ptrs, <4 x float> %mask, i8 1) + + ret <4 x float> %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int64 gathers + +declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr, + <4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind +declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr, + <4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind + +define <4 x i64> @__gather_base_offsets32_i64(i8 * %ptr, + i32 %scale, <4 x i32> %offsets, + <4 x i64> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + + %v = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i32> %offsets, <4 x i64> %vecmask, i8 %scale8) + + ret <4 x i64> %v +} + + +define <4 x i64> @__gather_base_offsets64_i64(i8 * %ptr, + i32 %scale, <4 x i64> %offsets, + <4 x i64> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + + %v = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i64> %offsets, <4 x i64> %vecmask, i8 %scale8) + + ret <4 x i64> %v +} + + +define <4 x i64> @__gather32_i64(<4 x i32> %ptrs, + <4 x i64> %vecmask) nounwind readonly alwaysinline { + + %v = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null, + <4 x i32> %ptrs, <4 x i64> %vecmask, i8 1) + ret <4 x i64> %v +} + + +define <4 x i64> @__gather64_i64(<4 x i64> %ptrs, + <4 x i64> %vecmask) nounwind readonly alwaysinline { + %v = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null, + <4 x i64> %ptrs, <4 x i64> %vecmask, i8 1) + ret <4 x i64> %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double gathers + +declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr, + <4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind +declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr, + <4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind + +define <4 x double> @__gather_base_offsets32_double(i8 * %ptr, + i32 %scale, <4 x i32> %offsets, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = bitcast <4 x i64> %vecmask64 to <4 x double> + + %v = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i32> %offsets, <4 x double> %vecmask, i8 %scale8) + ret <4 x double> %v +} + +define <4 x double> @__gather_base_offsets64_double(i8 * %ptr, + i32 %scale, <4 x i64> %offsets, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = bitcast <4 x i64> %vecmask64 to <4 x double> + + %v = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i64> %offsets, <4 x double> %vecmask, i8 %scale8) + + ret <4 x double> %v +} + +define <4 x double> @__gather32_double(<4 x i32> %ptrs, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %vecmask = bitcast <4 x i64> %vecmask64 to <4 x double> + + %v = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null, + <4 x i32> %ptrs, <4 x double> %vecmask, i8 1) + + ret <4 x double> %v +} + +define <4 x double> @__gather64_double(<4 x i64> %ptrs, + <4 x i64> %vecmask64) nounwind readonly alwaysinline { + %vecmask = bitcast <4 x i64> %vecmask64 to <4 x double> + + %v = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null, + <4 x i64> %ptrs, <4 x double> %vecmask, i8 1) + + ret <4 x double> %v +} + +') diff --git a/builtins/util.m4 b/builtins/util.m4 index 11501780..c90e8adc 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -1813,7 +1813,7 @@ define(`stdlib_core', ` declare i32 @__fast_masked_vload() declare i8* @ISPCAlloc(i8**, i64, i32) nounwind -declare void @ISPCLaunch(i8**, i8*, i8*, i32) nounwind +declare void @ISPCLaunch(i8**, i8*, i8*, i32,i32,i32) nounwind declare void @ISPCSync(i8*) nounwind declare void @ISPCInstrument(i8*, i8*, i32, i64) nounwind diff --git a/cbackend.cpp b/cbackend.cpp index 481ca3fd..5a1ef705 100644 --- a/cbackend.cpp +++ b/cbackend.cpp @@ -558,8 +558,15 @@ char CWriter::ID = 0; static std::string CBEMangle(const std::string &S) { std::string Result; - for (unsigned i = 0, e = S.size(); i != e; ++i) - if (isalnum(S[i]) || S[i] == '_' || S[i] == '<' || S[i] == '>') { + for (unsigned i = 0, e = S.size(); i != e; ++i) { + if (i+1 != e && ((S[i] == '>' && S[i+1] == '>') || + (S[i] == '<' && S[i+1] == '<'))) { + Result += '_'; + Result += 'A'+(S[i]&15); + Result += 'A'+((S[i]>>4)&15); + Result += '_'; + i++; + } else if (isalnum(S[i]) || S[i] == '_' || S[i] == '<' || S[i] == '>') { Result += S[i]; } else { Result += '_'; @@ -567,6 +574,7 @@ static std::string CBEMangle(const std::string &S) { Result += 'A'+((S[i]>>4)&15); Result += '_'; } + } return Result; } diff --git a/ctx.cpp b/ctx.cpp index c50d22f9..3aee776a 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -3502,7 +3502,7 @@ FunctionEmitContext::ReturnInst() { llvm::Value * FunctionEmitContext::LaunchInst(llvm::Value *callee, std::vector &argVals, - llvm::Value *launchCount) { + llvm::Value *launchCount[3]){ if (callee == NULL) { AssertPos(currentPos, m->errorCount > 0); return NULL; @@ -3563,7 +3563,9 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee, args.push_back(launchGroupHandlePtr); args.push_back(fptr); args.push_back(voidmem); - args.push_back(launchCount); + args.push_back(launchCount[0]); + args.push_back(launchCount[1]); + args.push_back(launchCount[2]); return CallInst(flaunch, NULL, args, ""); } diff --git a/ctx.h b/ctx.h index 58f9aae3..4dd30053 100644 --- a/ctx.h +++ b/ctx.h @@ -542,7 +542,7 @@ public: he given argument values. */ llvm::Value *LaunchInst(llvm::Value *callee, std::vector &argVals, - llvm::Value *launchCount); + llvm::Value *launchCount[3]); void SyncInst(); diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 78d35ddc..d6bf6fd5 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -2009,43 +2009,37 @@ static FORCEINLINE void __masked_store_i16(void *p, __vec16_i16 val, ptr[i] = val[i]; } -static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1 mask) +static FORCEINLINE void __masked_store_i32(void *p, const __vec16_i32 val, const __vec16_i1 mask) { #ifdef ISPC_FORCE_ALIGNED_MEMORY _mm512_mask_store_epi32(p, mask, val.v); #else - __vec16_i32 tmp; - tmp.v = _mm512_extloadunpacklo_epi32(tmp.v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - tmp.v = _mm512_extloadunpackhi_epi32(tmp.v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - tmp.v = _mm512_mask_mov_epi32(tmp.v, mask, val.v); - _mm512_extpackstorelo_epi32(p, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_epi32((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_i32extscatter_epi32(p, mask, __ispc_stride1, val, _MM_DOWNCONV_EPI32_NONE, _MM_SCALE_4, _MM_HINT_NONE); #endif } -static FORCEINLINE void __masked_store_float(void *p, __vec16_f val, __vec16_i1 mask) +static FORCEINLINE void __masked_store_float(void *p, const __vec16_f val, const __vec16_i1 mask) { #ifdef ISPC_FORCE_ALIGNED_MEMORY _mm512_mask_store_ps(p, mask, val.v); #else - __vec16_f tmp; - tmp.v = _mm512_extloadunpacklo_ps(tmp.v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - tmp.v = _mm512_extloadunpackhi_ps(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - tmp.v = _mm512_mask_mov_ps(tmp.v, mask, val.v); - _mm512_extpackstorelo_ps(p, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_ps((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_mask_i32extscatter_ps(p, mask, __ispc_stride1, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_4, _MM_HINT_NONE); #endif } -static FORCEINLINE void __masked_store_i64(void *p, __vec16_i64 val, - __vec16_i1 mask) { - int64_t *ptr = (int64_t *)p; - for (int i = 0; i < 16; ++i) - if ((mask.v & (1 << i)) != 0) - ptr[i] = val[i]; +static FORCEINLINE void __masked_store_i64(void *p, const __vec16_i64 val, const __vec16_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + _mm512_mask_store_epi64(p, mask, val.v1); + _mm512_mask_store_epi64((uint8_t*)p+64, tmp_m, val.v2); +#else + _mm512_mask_i32loextscatter_epi64( p, mask, __ispc_stride1, val.v1, _MM_DOWNCONV_EPI64_NONE, _MM_SCALE_8, _MM_HINT_NONE); + _mm512_mask_i32loextscatter_epi64((int64_t*)p+8, _mm512_kswapb(mask,mask), __ispc_stride1, val.v2, _MM_DOWNCONV_EPI64_NONE, _MM_SCALE_8, _MM_HINT_NONE); +#endif } -static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, __vec16_i1 mask) +static FORCEINLINE void __masked_store_double(void *p, const __vec16_d val, const __vec16_i1 mask) { #ifdef ISPC_FORCE_ALIGNED_MEMORY __vec16_i1 tmp_m = mask; @@ -2053,19 +2047,8 @@ static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, __vec16_i1 _mm512_mask_store_pd(p, mask, val.v1); _mm512_mask_store_pd((uint8_t*)p+64, tmp_m, val.v2); #else - __vec16_d tmp; - __vec16_i1 tmp_m = mask; - tmp_m = _mm512_kswapb(tmp_m, tmp_m); - tmp.v1 = _mm512_extloadunpacklo_pd(tmp.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v1 = _mm512_extloadunpackhi_pd(tmp.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v2 = _mm512_extloadunpacklo_pd(tmp.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v2 = _mm512_extloadunpackhi_pd(tmp.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v1 = _mm512_mask_mov_pd(tmp.v1, mask, val.v1); - tmp.v2 = _mm512_mask_mov_pd(tmp.v2, tmp_m, val.v2); - _mm512_extpackstorelo_pd(p, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorelo_pd((uint8_t*)p+64, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_pd((uint8_t*)p+128, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_mask_i32loextscatter_pd( p, mask, __ispc_stride1, val.v1, _MM_DOWNCONV_PD_NONE, _MM_SCALE_8, _MM_HINT_NONE); + _mm512_mask_i32loextscatter_pd((double*)p+8, _mm512_kswapb(mask,mask), __ispc_stride1, val.v2, _MM_DOWNCONV_PD_NONE, _MM_SCALE_8, _MM_HINT_NONE); #endif } diff --git a/examples/mandelbrot_tasks3d/.gitignore b/examples/mandelbrot_tasks3d/.gitignore new file mode 100644 index 00000000..c2471c27 --- /dev/null +++ b/examples/mandelbrot_tasks3d/.gitignore @@ -0,0 +1,2 @@ +mandelbrot +*.ppm diff --git a/examples/mandelbrot_tasks3d/Makefile b/examples/mandelbrot_tasks3d/Makefile new file mode 100644 index 00000000..3dd44d65 --- /dev/null +++ b/examples/mandelbrot_tasks3d/Makefile @@ -0,0 +1,8 @@ + +EXAMPLE=mandelbrot_tasks3d +CPP_SRC=mandelbrot_tasks3d.cpp mandelbrot_tasks_serial.cpp +ISPC_SRC=mandelbrot_tasks3d.ispc +ISPC_IA_TARGETS=avx,sse2,sse4 +ISPC_ARM_TARGETS=neon + +include ../common.mk diff --git a/examples/mandelbrot_tasks3d/mandelbrot_tasks.vcxproj b/examples/mandelbrot_tasks3d/mandelbrot_tasks.vcxproj new file mode 100644 index 00000000..3a8fca79 --- /dev/null +++ b/examples/mandelbrot_tasks3d/mandelbrot_tasks.vcxproj @@ -0,0 +1,180 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {E80DA7D4-AB22-4648-A068-327307156BE6} + Win32Proj + mandelbrot_tasks + + + + Application + true + Unicode + + + Application + true + Unicode + + + Application + false + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + + + + + + + true + $(ProjectDir)..\..;$(ExecutablePath) + mandelbrot_tasks + + + true + $(ProjectDir)..\..;$(ExecutablePath) + mandelbrot_tasks + + + false + $(ProjectDir)..\..;$(ExecutablePath) + mandelbrot_tasks + + + false + $(ProjectDir)..\..;$(ExecutablePath) + mandelbrot_tasks + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(TargetDir) + true + Fast + + + Console + true + + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(TargetDir) + true + Fast + + + Console + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(TargetDir) + Fast + + + Console + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(TargetDir) + Fast + + + Console + true + true + true + + + + + + + + + + Document + ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 + + ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 + + $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h + $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h + ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 + + ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 + + $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h + $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h + + + + + + diff --git a/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp b/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp new file mode 100644 index 00000000..9cbb966a --- /dev/null +++ b/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp @@ -0,0 +1,146 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include +#include +#include +#include "../timing.h" +#include "mandelbrot_tasks3d_ispc.h" +using namespace ispc; + +extern void mandelbrot_serial(float x0, float y0, float x1, float y1, + int width, int height, int maxIterations, + int output[]); + +/* Write a PPM image file with the image of the Mandelbrot set */ +static void +writePPM(int *buf, int width, int height, const char *fn) { + FILE *fp = fopen(fn, "wb"); + fprintf(fp, "P6\n"); + fprintf(fp, "%d %d\n", width, height); + fprintf(fp, "255\n"); + for (int i = 0; i < width*height; ++i) { + // Map the iteration count to colors by just alternating between + // two greys. + char c = (buf[i] & 0x1) ? 240 : 20; + for (int j = 0; j < 3; ++j) + fputc(c, fp); + } + fclose(fp); + printf("Wrote image file %s\n", fn); +} + + +static void usage() { + fprintf(stderr, "usage: mandelbrot [--scale=]\n"); + exit(1); +} + +int main(int argc, char *argv[]) { + unsigned int width = 1536; + unsigned int height = 1024; + float x0 = -2; + float x1 = 1; + float y0 = -1; + float y1 = 1; + + if (argc == 1) + ; + else if (argc == 2) { + if (strncmp(argv[1], "--scale=", 8) == 0) { + float scale = atof(argv[1] + 8); + if (scale == 0.f) + usage(); + width *= scale; + height *= scale; + // round up to multiples of 16 + width = (width + 0xf) & ~0xf; + height = (height + 0xf) & ~0xf; + } + else + usage(); + } + else + usage(); + + int maxIterations = 512; + int *buf = new int[width*height]; + + // + // Compute the image using the ispc implementation; report the minimum + // time of three runs. + // + double minISPC = 1e30; + for (int i = 0; i < 3; ++i) { + // Clear out the buffer + for (unsigned int i = 0; i < width * height; ++i) + buf[i] = 0; + reset_and_start_timer(); + mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf); + double dt = get_elapsed_mcycles(); + minISPC = std::min(minISPC, dt); + } + + printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC); + writePPM(buf, width, height, "mandelbrot-ispc.ppm"); + + + // + // And run the serial implementation 3 times, again reporting the + // minimum time. + // + double minSerial = 1e30; + for (int i = 0; i < 3; ++i) { + // Clear out the buffer + for (unsigned int i = 0; i < width * height; ++i) + buf[i] = 0; + reset_and_start_timer(); + mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf); + double dt = get_elapsed_mcycles(); + minSerial = std::min(minSerial, dt); + } + + printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial); + writePPM(buf, width, height, "mandelbrot-serial.ppm"); + + printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC); + + return 0; +} diff --git a/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc b/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc new file mode 100644 index 00000000..395bdca4 --- /dev/null +++ b/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc @@ -0,0 +1,99 @@ +/* + Copyright (c) 2010-2012, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +static inline int +mandel(float c_re, float c_im, int count) { + float z_re = c_re, z_im = c_im; + int i; + for (i = 0; i < count; ++i) { + if (z_re * z_re + z_im * z_im > 4.) + break; + + float new_re = z_re*z_re - z_im*z_im; + float new_im = 2.f * z_re * z_im; + unmasked { + z_re = c_re + new_re; + z_im = c_im + new_im; + } + } + + return i; +} + + +/* Task to compute the Mandelbrot iterations for a single scanline. + */ +task void +mandelbrot_scanline(uniform float x0, uniform float dx, + uniform float y0, uniform float dy, + uniform int width, uniform int height, + uniform int xspan, uniform int yspan, + uniform int maxIterations, uniform int output[]) { + const uniform int xstart = taskIndex0 * xspan; + const uniform int xend = min(xstart + xspan, width); + + const uniform int ystart = taskIndex1 * yspan; + const uniform int yend = min(ystart + yspan, height); + + + foreach (yi = ystart ... yend, xi = xstart ... xend) { + float x = x0 + xi * dx; + float y = y0 + yi * dy; + + int index = yi * width + xi; + output[index] = mandel(x, y, maxIterations); + } + +} + +#if 1 +export void +mandelbrot_ispc(uniform float x0, uniform float y0, + uniform float x1, uniform float y1, + uniform int width, uniform int height, + uniform int maxIterations, uniform int output[]) { + uniform float dx = (x1 - x0) / width; + uniform float dy = (y1 - y0) / height; + const uniform int xspan = 16; /* make sure it is big enough to avoid false-sharing */ + const uniform int yspan = 16; + + +#if 1 + launch [width/xspan, height/yspan] +#else + launch [height/yspan][width/xspan] +#endif + mandelbrot_scanline(x0, dx, y0, dy, width, height, xspan, yspan, + maxIterations, output); +} +#endif diff --git a/examples/mandelbrot_tasks3d/mandelbrot_tasks_serial.cpp b/examples/mandelbrot_tasks3d/mandelbrot_tasks_serial.cpp new file mode 100644 index 00000000..a76fb5ca --- /dev/null +++ b/examples/mandelbrot_tasks3d/mandelbrot_tasks_serial.cpp @@ -0,0 +1,68 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +static int mandel(float c_re, float c_im, int count) { + float z_re = c_re, z_im = c_im; + int i; + for (i = 0; i < count; ++i) { + if (z_re * z_re + z_im * z_im > 4.f) + break; + + float new_re = z_re*z_re - z_im*z_im; + float new_im = 2.f * z_re * z_im; + z_re = c_re + new_re; + z_im = c_im + new_im; + } + + return i; +} + +void mandelbrot_serial(float x0, float y0, float x1, float y1, + int width, int height, int maxIterations, + int output[]) +{ + float dx = (x1 - x0) / width; + float dy = (y1 - y0) / height; + + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; ++i) { + float x = x0 + i * dx; + float y = y0 + j * dy; + + int index = (j * width + i); + output[index] = mandel(x, y, maxIterations); + } + } +} + diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp index c9c2fa7b..6bc60129 100644 --- a/examples/tasksys.cpp +++ b/examples/tasksys.cpp @@ -170,21 +170,41 @@ // Signature of ispc-generated 'task' functions typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount, - int taskIndex, int taskCount); + int taskIndex, int taskCount, + int taskIndex0, int taskIndex1, int taskIndex2, + int taskCount0, int taskCount1, int taskCount2); // Small structure used to hold the data for each task struct TaskInfo { TaskFuncType func; void *data; - int taskIndex, taskCount; + int taskIndex; + int taskCount3d[3]; #if defined(ISPC_IS_WINDOWS) event taskEvent; #endif -}; + int taskCount() const { return taskCount3d[0]*taskCount3d[1]*taskCount3d[2]; } + int taskIndex0() const + { + return taskIndex % taskCount3d[0]; + } + int taskIndex1() const + { + return ( taskIndex / taskCount3d[0] ) % taskCount3d[1]; + } + int taskIndex2() const + { + return taskIndex / ( taskCount3d[0]*taskCount3d[1] ); + } + int taskCount0() const { return taskCount3d[0]; } + int taskCount1() const { return taskCount3d[1]; } + int taskCount2() const { return taskCount3d[2]; } + TaskInfo() { assert(sizeof(TaskInfo) % 32 == 0); } +} __attribute__((aligned(32))); // ispc expects these functions to have C linkage / not be mangled extern "C" { - void ISPCLaunch(void **handlePtr, void *f, void *data, int count); + void ISPCLaunch(void **handlePtr, void *f, void *data, int countx,int county, int countz); void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment); void ISPCSync(void *handle); } @@ -518,7 +538,9 @@ lRunTask(void *ti) { // Actually run the task taskInfo->func(taskInfo->data, threadIndex, threadCount, - taskInfo->taskIndex, taskInfo->taskCount); + taskInfo->taskIndex, taskInfo->taskCount(), + taskInfo->taskIndex0(), taskInfo->taskIndex1(), taskInfo->taskIndex2(), + taskInfo->taskCount0(), taskInfo->taskCount1(), taskInfo->taskCount2()); } @@ -559,7 +581,9 @@ lRunTask(LPVOID param) { // will cause bugs in code that uses those. int threadIndex = 0; int threadCount = 1; - ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount); + ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), + ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), + ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); // Signal the event that this task is done ti->taskEvent.set(); @@ -660,7 +684,9 @@ lTaskEntry(void *arg) { DBG(fprintf(stderr, "running task %d from group %p\n", taskNumber, tg)); TaskInfo *myTask = tg->GetTaskInfo(taskNumber); myTask->func(myTask->data, threadIndex, threadCount, myTask->taskIndex, - myTask->taskCount); + myTask->taskCount(), + myTask->taskIndex0(), myTask->taskIndex1(), myTask->taskIndex2(), + myTask->taskCount0(), myTask->taskCount1(), myTask->taskCount2()); // // Decrement the "number of unfinished tasks" counter in the task @@ -861,7 +887,9 @@ TaskGroup::Sync() { // Do work for _myTask_ // // FIXME: bogus values for thread index/thread count here as well.. - myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount); + myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount(), + myTask->taskIndex0(), myTask->taskIndex1(), myTask->taskIndex2(), + myTask->taskCount0(), myTask->taskCount1(), myTask->taskCount2()); // // Decrement the number of unfinished tasks counter @@ -891,7 +919,9 @@ TaskGroup::Launch(int baseIndex, int count) { // Actually run the task. // Cilk does not expose the task -> thread mapping so we pretend it's 1:1 - ti->func(ti->data, ti->taskIndex, ti->taskCount, ti->taskIndex, ti->taskCount); + ti->func(ti->data, ti->taskIndex, ti->taskCount(), + ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), + ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); } } @@ -920,7 +950,9 @@ TaskGroup::Launch(int baseIndex, int count) { // Actually run the task. int threadIndex = omp_get_thread_num(); int threadCount = omp_get_num_threads(); - ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount); + ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), + ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), + ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); } } @@ -951,7 +983,9 @@ TaskGroup::Launch(int baseIndex, int count) { int threadIndex = ti->taskIndex; int threadCount = ti->taskCount; - ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount); + ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), + ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), + ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); }); } @@ -978,7 +1012,9 @@ TaskGroup::Launch(int baseIndex, int count) { // TBB does not expose the task -> thread mapping so we pretend it's 1:1 int threadIndex = ti->taskIndex; int threadCount = ti->taskCount; - ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount); + ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), + ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), + ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); }); } } @@ -1031,7 +1067,8 @@ FreeTaskGroup(TaskGroup *tg) { /////////////////////////////////////////////////////////////////////////// void -ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) { +ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int count1, int count2) { + const int count = count0*count1*count2; TaskGroup *taskGroup; if (*taskGroupPtr == NULL) { InitTaskSystem(); @@ -1047,7 +1084,9 @@ ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) { ti->func = (TaskFuncType)func; ti->data = data; ti->taskIndex = i; - ti->taskCount = count; + ti->taskCount3d[0] = count0; + ti->taskCount3d[1] = count1; + ti->taskCount3d[2] = count2; } taskGroup->Launch(baseIndex, count); } diff --git a/expr.cpp b/expr.cpp index 614cb5e5..60d9ce66 100644 --- a/expr.cpp +++ b/expr.cpp @@ -1660,6 +1660,64 @@ BinaryExpr::BinaryExpr(Op o, Expr *a, Expr *b, SourcePos p) arg1 = b; } +Expr *lCreateBinaryOperatorCall(const BinaryExpr::Op bop, + Expr *a0, Expr *a1, + const SourcePos &sp) +{ + if ((a0 == NULL) || (a1 == NULL)) { + return NULL; + } + Expr *arg0 = a0->TypeCheck(); + Expr *arg1 = a1->TypeCheck(); + if ((arg0 == NULL) || (arg1 == NULL)) { + return NULL; + } + const Type *type0 = arg0->GetType(); + const Type *type1 = arg1->GetType(); + + // If either operand is a reference, dereference it before we move + // forward + if (CastType(type0) != NULL) { + arg0 = new RefDerefExpr(arg0, arg0->pos); + type0 = arg0->GetType(); + } + if (CastType(type1) != NULL) { + arg1 = new RefDerefExpr(arg1, arg1->pos); + type1 = arg1->GetType(); + } + if ((type0 == NULL) || (type1 == NULL)) { + return NULL; + } + if (CastType(type0) != NULL || + CastType(type1) != NULL) { + std::string opName = std::string("operator") + lOpString(bop); + std::vector funs; + m->symbolTable->LookupFunction(opName.c_str(), &funs); + if (funs.size() == 0) { + Error(sp, "operator %s(%s, %s) is not defined.", + opName.c_str(), (type0->GetString()).c_str(), (type1->GetString()).c_str()); + return NULL; + } + Expr *func = new FunctionSymbolExpr(opName.c_str(), funs, sp); + ExprList *args = new ExprList(sp); + args->exprs.push_back(arg0); + args->exprs.push_back(arg1); + Expr *opCallExpr = new FunctionCallExpr(func, args, sp); + return opCallExpr; + } + return NULL; +} + + +Expr * MakeBinaryExpr(BinaryExpr::Op o, Expr *a, Expr *b, SourcePos p) { + Expr * op = lCreateBinaryOperatorCall(o, a, b, p); + if (op != NULL) { + return op; + } + op = new BinaryExpr(o, a, b, p); + return op; +} + /** Emit code for a && or || logical operator. In particular, the code here handles "short-circuit" evaluation, where the second expression @@ -2985,29 +3043,10 @@ AssignExpr::TypeCheck() { if (lvalueIsReference) lvalue = new RefDerefExpr(lvalue, lvalue->pos); - FunctionSymbolExpr *fse; - if ((fse = dynamic_cast(rvalue)) != NULL) { - // Special case to use the type of the LHS to resolve function - // overloads when we're assigning a function pointer where the - // function is overloaded. - const Type *lvalueType = lvalue->GetType(); - const FunctionType *ftype; - if (CastType(lvalueType) == NULL || - (ftype = CastType(lvalueType->GetBaseType())) == NULL) { - Error(lvalue->pos, "Can't assign function pointer to type \"%s\".", - lvalueType ? lvalueType->GetString().c_str() : ""); - return NULL; - } - - std::vector paramTypes; - for (int i = 0; i < ftype->GetNumParameters(); ++i) - paramTypes.push_back(ftype->GetParameterType(i)); - - if (!fse->ResolveOverloads(rvalue->pos, paramTypes)) { - Error(pos, "Unable to find overloaded function for function " - "pointer assignment."); - return NULL; - } + if (PossiblyResolveFunctionOverloads(rvalue, lvalue->GetType()) == false) { + Error(pos, "Unable to find overloaded function for function " + "pointer assignment."); + return NULL; } const Type *lhsType = lvalue->GetType(); @@ -3501,11 +3540,13 @@ SelectExpr::Print() const { // FunctionCallExpr FunctionCallExpr::FunctionCallExpr(Expr *f, ExprList *a, SourcePos p, - bool il, Expr *lce) + bool il, Expr *lce[3]) : Expr(p), isLaunch(il) { func = f; args = a; - launchCountExpr = lce; + launchCountExpr[0] = lce[0]; + launchCountExpr[1] = lce[1]; + launchCountExpr[2] = lce[2]; } @@ -3623,9 +3664,13 @@ FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const { llvm::Value *retVal = NULL; ctx->SetDebugPos(pos); if (ft->isTask) { - AssertPos(pos, launchCountExpr != NULL); - llvm::Value *launchCount = launchCountExpr->GetValue(ctx); - if (launchCount != NULL) + AssertPos(pos, launchCountExpr[0] != NULL); + llvm::Value *launchCount[3] = + { launchCountExpr[0]->GetValue(ctx), + launchCountExpr[1]->GetValue(ctx), + launchCountExpr[2]->GetValue(ctx) }; + + if (launchCount[0] != NULL) ctx->LaunchInst(callee, argVals, launchCount); } else @@ -3650,10 +3695,37 @@ FunctionCallExpr::GetLValue(FunctionEmitContext *ctx) const { return NULL; } } - + + +bool FullResolveOverloads(Expr * func, ExprList * args, + std::vector *argTypes, + std::vector *argCouldBeNULL, + std::vector *argIsConstant) { + for (unsigned int i = 0; i < args->exprs.size(); ++i) { + Expr *expr = args->exprs[i]; + if (expr == NULL) + return false; + const Type *t = expr->GetType(); + if (t == NULL) + return false; + argTypes->push_back(t); + argCouldBeNULL->push_back(lIsAllIntZeros(expr) || dynamic_cast(expr)); + argIsConstant->push_back(dynamic_cast(expr) || dynamic_cast(expr)); + } + return true; +} + const Type * FunctionCallExpr::GetType() const { + std::vector argTypes; + std::vector argCouldBeNULL, argIsConstant; + if (FullResolveOverloads(func, args, &argTypes, &argCouldBeNULL, &argIsConstant) == true) { + FunctionSymbolExpr *fse = dynamic_cast(func); + if (fse != NULL) { + fse->ResolveOverloads(args->pos, argTypes, &argCouldBeNULL, &argIsConstant); + } + } const FunctionType *ftype = lGetFunctionType(func); return ftype ? ftype->GetReturnType() : NULL; } @@ -3689,20 +3761,9 @@ FunctionCallExpr::TypeCheck() { std::vector argTypes; std::vector argCouldBeNULL, argIsConstant; - for (unsigned int i = 0; i < args->exprs.size(); ++i) { - Expr *expr = args->exprs[i]; - if (expr == NULL) - return NULL; - const Type *t = expr->GetType(); - if (t == NULL) - return NULL; - - argTypes.push_back(t); - argCouldBeNULL.push_back(lIsAllIntZeros(expr) || - dynamic_cast(expr)); - argIsConstant.push_back(dynamic_cast(expr) || - dynamic_cast(expr)); + if (FullResolveOverloads(func, args, &argTypes, &argCouldBeNULL, &argIsConstant) == false) { + return NULL; } FunctionSymbolExpr *fse = dynamic_cast(func); @@ -3732,14 +3793,17 @@ FunctionCallExpr::TypeCheck() { if (!isLaunch) Error(pos, "\"launch\" expression needed to call function " "with \"task\" qualifier."); - if (!launchCountExpr) + for (int k = 0; k < 3; k++) + { + if (!launchCountExpr[k]) return NULL; - launchCountExpr = - TypeConvertExpr(launchCountExpr, AtomicType::UniformInt32, - "task launch count"); - if (launchCountExpr == NULL) + launchCountExpr[k] = + TypeConvertExpr(launchCountExpr[k], AtomicType::UniformInt32, + "task launch count"); + if (launchCountExpr[k] == NULL) return NULL; + } } else { if (isLaunch) { @@ -3747,7 +3811,7 @@ FunctionCallExpr::TypeCheck() { "qualified function."); return NULL; } - AssertPos(pos, launchCountExpr == NULL); + AssertPos(pos, launchCountExpr[0] == NULL); } } else { @@ -7010,7 +7074,8 @@ TypeCastExpr::GetLValue(FunctionEmitContext *ctx) const { const Type * TypeCastExpr::GetType() const { - AssertPos(pos, type->HasUnboundVariability() == false); + // We have to switch off this assert after supporting of operators. + //AssertPos(pos, type->HasUnboundVariability() == false); return type; } @@ -8190,6 +8255,9 @@ FunctionSymbolExpr::ResolveOverloads(SourcePos argPos, const std::vector *argCouldBeNULL, const std::vector *argIsConstant) { const char *funName = candidateFunctions.front()->name.c_str(); + if (triedToResolve == true) { + return true; + } triedToResolve = true; diff --git a/expr.h b/expr.h index 42fdff45..0d46191b 100644 --- a/expr.h +++ b/expr.h @@ -246,7 +246,8 @@ public: class FunctionCallExpr : public Expr { public: FunctionCallExpr(Expr *func, ExprList *args, SourcePos p, - bool isLaunch = false, Expr *launchCountExpr = NULL); + bool isLaunch = false, + Expr *launchCountExpr[3] = (Expr*[3]){NULL, NULL, NULL}); llvm::Value *GetValue(FunctionEmitContext *ctx) const; llvm::Value *GetLValue(FunctionEmitContext *ctx) const; @@ -261,7 +262,7 @@ public: Expr *func; ExprList *args; bool isLaunch; - Expr *launchCountExpr; + Expr *launchCountExpr[3]; }; @@ -730,6 +731,8 @@ bool CanConvertTypes(const Type *fromType, const Type *toType, */ Expr *TypeConvertExpr(Expr *expr, const Type *toType, const char *errorMsgBase); +Expr * MakeBinaryExpr(BinaryExpr::Op o, Expr *a, Expr *b, SourcePos p); + /** Utility routine that emits code to initialize a symbol given an initializer expression. diff --git a/fail_db.txt b/fail_db.txt index f1aaaab2..9c43c7f0 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -1025,3 +1025,38 @@ ./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 clang++3.3 -O2 * ./tests/test-141.ispc runfail x86 avx2-i32x16 Mac LLVM 3.4 clang++3.3 -O2 * ./tests/test-141.ispc runfail x86-64 avx2-i32x16 Mac LLVM 3.4 clang++3.3 -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx2-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1.1-i64x4 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx2-i64x4 Windows LLVM 3.4 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx2-i64x4 Windows LLVM 3.4 cl -O2 * diff --git a/func.cpp b/func.cpp index b975049b..af2cc05a 100644 --- a/func.cpp +++ b/func.cpp @@ -132,9 +132,28 @@ Function::Function(Symbol *s, Stmt *c) { Assert(taskIndexSym); taskCountSym = m->symbolTable->LookupVariable("taskCount"); Assert(taskCountSym); + + taskIndexSym0 = m->symbolTable->LookupVariable("taskIndex0"); + Assert(taskIndexSym0); + taskIndexSym1 = m->symbolTable->LookupVariable("taskIndex1"); + Assert(taskIndexSym1); + taskIndexSym2 = m->symbolTable->LookupVariable("taskIndex2"); + Assert(taskIndexSym2); + + + taskCountSym0 = m->symbolTable->LookupVariable("taskCount0"); + Assert(taskCountSym0); + taskCountSym1 = m->symbolTable->LookupVariable("taskCount1"); + Assert(taskCountSym1); + taskCountSym2 = m->symbolTable->LookupVariable("taskCount2"); + Assert(taskCountSym2); } else + { threadIndexSym = threadCountSym = taskIndexSym = taskCountSym = NULL; + taskIndexSym0 = taskIndexSym1 = taskIndexSym2 = NULL; + taskCountSym0 = taskCountSym1 = taskCountSym2 = NULL; + } } @@ -225,6 +244,12 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, llvm::Value *threadCount = argIter++; llvm::Value *taskIndex = argIter++; llvm::Value *taskCount = argIter++; + llvm::Value *taskIndex0 = argIter++; + llvm::Value *taskIndex1 = argIter++; + llvm::Value *taskIndex2 = argIter++; + llvm::Value *taskCount0 = argIter++; + llvm::Value *taskCount1 = argIter++; + llvm::Value *taskCount2 = argIter++; // Copy the function parameter values from the structure into local // storage @@ -256,6 +281,20 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, taskCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount"); ctx->StoreInst(taskCount, taskCountSym->storagePtr); + + taskIndexSym0->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex0"); + ctx->StoreInst(taskIndex0, taskIndexSym0->storagePtr); + taskIndexSym1->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex1"); + ctx->StoreInst(taskIndex1, taskIndexSym1->storagePtr); + taskIndexSym2->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex2"); + ctx->StoreInst(taskIndex2, taskIndexSym2->storagePtr); + + taskCountSym0->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount0"); + ctx->StoreInst(taskCount0, taskCountSym0->storagePtr); + taskCountSym1->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount1"); + ctx->StoreInst(taskCount1, taskCountSym1->storagePtr); + taskCountSym2->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount2"); + ctx->StoreInst(taskCount2, taskCountSym2->storagePtr); } else { // Regular, non-task function diff --git a/func.h b/func.h index ac3e1447..88a96dbc 100644 --- a/func.h +++ b/func.h @@ -60,7 +60,10 @@ private: Stmt *code; Symbol *maskSymbol; Symbol *threadIndexSym, *threadCountSym; - Symbol *taskIndexSym, *taskCountSym; + Symbol *taskIndexSym, *taskCountSym; + Symbol *taskIndexSym0, *taskCountSym0; + Symbol *taskIndexSym1, *taskCountSym1; + Symbol *taskIndexSym2, *taskCountSym2; }; #endif // ISPC_FUNC_H diff --git a/ispc.cpp b/ispc.cpp index bd28fc51..96cf4447 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -515,6 +515,25 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : #if !defined(LLVM_3_1) // LLVM 3.2+ only this->m_hasRand = true; +#endif + } + else if (!strcasecmp(isa, "avx1.1-i64x4")) { + this->m_isa = Target::AVX11; + this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ + this->m_vectorWidth = 4; + this->m_attributes = "+avx,+popcnt,+cmov,+f16c" +#if defined(LLVM_3_4) + ",+rdrnd" +#else + ",+rdrand" +#endif + ; + this->m_maskingIsFree = false; + this->m_maskBitCount = 64; + this->m_hasHalf = true; +#if !defined(LLVM_3_1) + // LLVM 3.2+ only + this->m_hasRand = true; #endif } else if (!strcasecmp(isa, "avx2") || @@ -563,6 +582,29 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // LLVM 3.2+ only this->m_hasRand = true; this->m_hasGather = true; +#endif + } + else if (!strcasecmp(isa, "avx2-i64x4")) { + this->m_isa = Target::AVX2; + this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ + this->m_vectorWidth = 4; + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" +#if defined(LLVM_3_4) + ",+rdrnd" +#else + ",+rdrand" +#endif +#ifndef LLVM_3_1 + ",+fma" +#endif // !LLVM_3_1 + ; + this->m_maskingIsFree = false; + this->m_maskBitCount = 64; + this->m_hasHalf = true; +#if !defined(LLVM_3_1) + // LLVM 3.2+ only + this->m_hasRand = true; + this->m_hasGather = true; #endif } #ifdef ISPC_ARM_ENABLED @@ -740,8 +782,8 @@ Target::SupportedTargets() { "sse2-i32x4, sse2-i32x8, " "sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, " "avx1-i32x8, avx1-i32x16, avx1-i64x4, " - "avx1.1-i32x8, avx1.1-i32x16, " - "avx2-i32x8, avx2-i32x16, " + "avx1.1-i32x8, avx1.1-i32x16, avx1.1-i64x4 " + "avx2-i32x8, avx2-i32x16, avx2-i64x4, " "generic-x1, generic-x4, generic-x8, generic-x16, " "generic-x32, generic-x64"; } diff --git a/ispc.vcxproj b/ispc.vcxproj index 58fa5b08..b9a3b6c5 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -28,10 +28,14 @@ + + + + @@ -323,6 +327,24 @@ Building gen-bitcode-avx11-x2-64bit.cpp + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-i64x4.ll | python bitcode2cpp.py builtins\target-avx11-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx11-i64x4-32bit.cpp + $(Configuration)/gen-bitcode-avx11-i64x4-32bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll + Building gen-bitcode-avx11-i64x4-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11-i64x4.ll | python bitcode2cpp.py builtins\target-avx11-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx11-i64x4-64bit.cpp + $(Configuration)/gen-bitcode-avx11-i64x4-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll + Building gen-bitcode-avx11-i64x4-64bit.cpp + + Document @@ -359,6 +381,24 @@ Building gen-bitcode-avx2-x2-64bit.cpp + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-i64x4.ll | python bitcode2cpp.py builtins\target-avx2-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx2-i64x4-32bit.cpp + $(Configuration)/gen-bitcode-avx2-i64x4-32bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll + Building gen-bitcode-avx2-i64x4-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2-i64x4.ll | python bitcode2cpp.py builtins\target-avx2-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx2-i64x4-64bit.cpp + $(Configuration)/gen-bitcode-avx2-i64x4-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll + Building gen-bitcode-avx2-i64x4-64bit.cpp + + Document diff --git a/lex.ll b/lex.ll index 3655220f..87a80145 100644 --- a/lex.ll +++ b/lex.ll @@ -419,6 +419,14 @@ while { RT; return TOKEN_WHILE; } \"C\" { RT; return TOKEN_STRING_C_LITERAL; } \.\.\. { RT; return TOKEN_DOTDOTDOT; } +"operator*" { return TOKEN_IDENTIFIER; } +"operator+" { return TOKEN_IDENTIFIER; } +"operator-" { return TOKEN_IDENTIFIER; } +"operator<<" { return TOKEN_IDENTIFIER; } +"operator>>" { return TOKEN_IDENTIFIER; } +"operator/" { return TOKEN_IDENTIFIER; } +"operator%" { return TOKEN_IDENTIFIER; } + L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERAL; } {IDENT} { diff --git a/parse.yy b/parse.yy index 933a3455..9a0377c5 100644 --- a/parse.yy +++ b/parse.yy @@ -353,17 +353,75 @@ launch_expression : TOKEN_LAUNCH postfix_expression '(' argument_expression_list ')' { ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @2); - $$ = new FunctionCallExpr($2, $4, Union(@2, @5), true, oneExpr); + Expr *launchCount[3] = {oneExpr, oneExpr, oneExpr}; + $$ = new FunctionCallExpr($2, $4, Union(@2, @5), true, launchCount); } | TOKEN_LAUNCH postfix_expression '(' ')' { ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @2); - $$ = new FunctionCallExpr($2, new ExprList(Union(@3,@4)), Union(@2, @4), true, oneExpr); + Expr *launchCount[3] = {oneExpr, oneExpr, oneExpr}; + $$ = new FunctionCallExpr($2, new ExprList(Union(@3,@4)), Union(@2, @4), true, launchCount); } - | TOKEN_LAUNCH '[' expression ']' postfix_expression '(' argument_expression_list ')' - { $$ = new FunctionCallExpr($5, $7, Union(@5,@8), true, $3); } - | TOKEN_LAUNCH '[' expression ']' postfix_expression '(' ')' - { $$ = new FunctionCallExpr($5, new ExprList(Union(@5,@6)), Union(@5,@7), true, $3); } + + | TOKEN_LAUNCH '[' assignment_expression ']' postfix_expression '(' argument_expression_list ')' + { + ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @5); + Expr *launchCount[3] = {$3, oneExpr, oneExpr}; + $$ = new FunctionCallExpr($5, $7, Union(@5,@8), true, launchCount); + } + | TOKEN_LAUNCH '[' assignment_expression ']' postfix_expression '(' ')' + { + ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @5); + Expr *launchCount[3] = {$3, oneExpr, oneExpr}; + $$ = new FunctionCallExpr($5, new ExprList(Union(@5,@6)), Union(@5,@7), true, launchCount); + } + + | TOKEN_LAUNCH '[' assignment_expression ',' assignment_expression ']' postfix_expression '(' argument_expression_list ')' + { + ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @7); + Expr *launchCount[3] = {$3, $5, oneExpr}; + $$ = new FunctionCallExpr($7, $9, Union(@7,@10), true, launchCount); + } + | TOKEN_LAUNCH '[' assignment_expression ',' assignment_expression ']' postfix_expression '(' ')' + { + ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @7); + Expr *launchCount[3] = {$3, $5, oneExpr}; + $$ = new FunctionCallExpr($7, new ExprList(Union(@7,@8)), Union(@7,@9), true, launchCount); + } + | TOKEN_LAUNCH '[' assignment_expression ']' '[' assignment_expression ']' postfix_expression '(' argument_expression_list ')' + { + ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @8); + Expr *launchCount[3] = {$6, $3, oneExpr}; + $$ = new FunctionCallExpr($8, $10, Union(@8,@11), true, launchCount); + } + | TOKEN_LAUNCH '[' assignment_expression ']' '[' assignment_expression ']' postfix_expression '(' ')' + { + ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @8); + Expr *launchCount[3] = {$6, $3, oneExpr}; + $$ = new FunctionCallExpr($8, new ExprList(Union(@8,@9)), Union(@8,@10), true, launchCount); + } + + | TOKEN_LAUNCH '[' assignment_expression ',' assignment_expression ',' assignment_expression ']' postfix_expression '(' argument_expression_list ')' + { + Expr *launchCount[3] = {$3, $5, $7}; + $$ = new FunctionCallExpr($9, $11, Union(@9,@12), true, launchCount); + } + | TOKEN_LAUNCH '[' assignment_expression ',' assignment_expression ',' assignment_expression ']' postfix_expression '(' ')' + { + Expr *launchCount[3] = {$3, $5, $7}; + $$ = new FunctionCallExpr($9, new ExprList(Union(@9,@10)), Union(@9,@11), true, launchCount); + } + | TOKEN_LAUNCH '[' assignment_expression ']' '[' assignment_expression ']' '[' assignment_expression ']' postfix_expression '(' argument_expression_list ')' + { + Expr *launchCount[3] = {$9, $6, $3}; + $$ = new FunctionCallExpr($11, $13, Union(@11,@14), true, launchCount); + } + | TOKEN_LAUNCH '[' assignment_expression ']' '[' assignment_expression ']' '[' assignment_expression ']' postfix_expression '(' ')' + { + Expr *launchCount[3] = {$9, $6, $3}; + $$ = new FunctionCallExpr($11, new ExprList(Union(@11,@12)), Union(@11,@13), true, launchCount); + } + | TOKEN_LAUNCH '<' postfix_expression '(' argument_expression_list ')' '>' { @@ -377,13 +435,13 @@ launch_expression "around function call expression."); $$ = NULL; } - | TOKEN_LAUNCH '[' expression ']' '<' postfix_expression '(' argument_expression_list ')' '>' + | TOKEN_LAUNCH '[' assignment_expression ']' '<' postfix_expression '(' argument_expression_list ')' '>' { Error(Union(@5, @10), "\"launch\" expressions no longer take '<' '>' " "around function call expression."); $$ = NULL; } - | TOKEN_LAUNCH '[' expression ']' '<' postfix_expression '(' ')' '>' + | TOKEN_LAUNCH '[' assignment_expression ']' '<' postfix_expression '(' ')' '>' { Error(Union(@5, @9), "\"launch\" expressions no longer take '<' '>' " "around function call expression."); @@ -468,27 +526,27 @@ cast_expression multiplicative_expression : cast_expression | multiplicative_expression '*' cast_expression - { $$ = new BinaryExpr(BinaryExpr::Mul, $1, $3, Union(@1, @3)); } + { $$ = MakeBinaryExpr(BinaryExpr::Mul, $1, $3, Union(@1, @3)); } | multiplicative_expression '/' cast_expression - { $$ = new BinaryExpr(BinaryExpr::Div, $1, $3, Union(@1, @3)); } + { $$ = MakeBinaryExpr(BinaryExpr::Div, $1, $3, Union(@1, @3)); } | multiplicative_expression '%' cast_expression - { $$ = new BinaryExpr(BinaryExpr::Mod, $1, $3, Union(@1, @3)); } + { $$ = MakeBinaryExpr(BinaryExpr::Mod, $1, $3, Union(@1, @3)); } ; additive_expression : multiplicative_expression | additive_expression '+' multiplicative_expression - { $$ = new BinaryExpr(BinaryExpr::Add, $1, $3, Union(@1, @3)); } + { $$ = MakeBinaryExpr(BinaryExpr::Add, $1, $3, Union(@1, @3)); } | additive_expression '-' multiplicative_expression - { $$ = new BinaryExpr(BinaryExpr::Sub, $1, $3, Union(@1, @3)); } + { $$ = MakeBinaryExpr(BinaryExpr::Sub, $1, $3, Union(@1, @3)); } ; shift_expression : additive_expression | shift_expression TOKEN_LEFT_OP additive_expression - { $$ = new BinaryExpr(BinaryExpr::Shl, $1, $3, Union(@1,@3)); } + { $$ = MakeBinaryExpr(BinaryExpr::Shl, $1, $3, Union(@1, @3)); } | shift_expression TOKEN_RIGHT_OP additive_expression - { $$ = new BinaryExpr(BinaryExpr::Shr, $1, $3, Union(@1,@3)); } + { $$ = MakeBinaryExpr(BinaryExpr::Shr, $1, $3, Union(@1, @3)); } ; relational_expression @@ -2214,9 +2272,24 @@ static void lAddThreadIndexCountToSymbolTable(SourcePos pos) { Symbol *taskIndexSym = new Symbol("taskIndex", pos, type); m->symbolTable->AddVariable(taskIndexSym); - + Symbol *taskCountSym = new Symbol("taskCount", pos, type); m->symbolTable->AddVariable(taskCountSym); + + Symbol *taskIndexSym0 = new Symbol("taskIndex0", pos, type); + m->symbolTable->AddVariable(taskIndexSym0); + Symbol *taskIndexSym1 = new Symbol("taskIndex1", pos, type); + m->symbolTable->AddVariable(taskIndexSym1); + Symbol *taskIndexSym2 = new Symbol("taskIndex2", pos, type); + m->symbolTable->AddVariable(taskIndexSym2); + + + Symbol *taskCountSym0 = new Symbol("taskCount0", pos, type); + m->symbolTable->AddVariable(taskCountSym0); + Symbol *taskCountSym1 = new Symbol("taskCount1", pos, type); + m->symbolTable->AddVariable(taskCountSym1); + Symbol *taskCountSym2 = new Symbol("taskCount2", pos, type); + m->symbolTable->AddVariable(taskCountSym2); } diff --git a/run_tests.py b/run_tests.py index 803410b8..506d37a5 100755 --- a/run_tests.py +++ b/run_tests.py @@ -377,7 +377,12 @@ def file_check(compfails, runfails): temp3 = re.search("[0-9]*\.[0-9]*", temp2.group()) compiler_version = options.compiler_exe + temp3.group() else: - compiler_version = "cl" + compiler_version = "cl" + possible_compilers = ["g++4.4", "g++4.7", "clang++3.3", "cl"] + if not compiler_version in possible_compilers: + error("\n**********\nWe don't have history of fails for compiler " + + compiler_version + + "\nAll fails will be new!!!\n**********", 2) new_line = " "+options.arch.rjust(6)+" "+options.target.rjust(14)+" "+OS.rjust(7)+" "+llvm_version+" "+compiler_version.rjust(10)+" "+opt+" *\n" new_compfails = compfails[:] @@ -449,8 +454,9 @@ def verify(): check = [["g++", "clang++", "cl"],["-O0", "-O2"],["x86","x86-64"], ["Linux","Windows","Mac"],["LLVM 3.1","LLVM 3.2","LLVM 3.3","LLVM head"], ["sse2-i32x4", "sse2-i32x8", "sse4-i32x4", "sse4-i32x8", "sse4-i16x8", - "sse4-i8x16", "avx1-i32x8", "avx1-i32x16", "avx1-i64x4", "avx1.1-i32x8", "avx1.1-i32x16", - "avx2-i32x8", "avx2-i32x16", "generic-1", "generic-4", "generic-8", + "sse4-i8x16", "avx1-i32x8", "avx1-i32x16", "avx1-i64x4", "avx1.1-i32x8", + "avx1.1-i32x16", "avx1.1-i64x4", "avx2-i32x8", "avx2-i32x16", "avx2-i64x4", + "generic-1", "generic-4", "generic-8", "generic-16", "generic-32", "generic-64"]] for i in range (0,len(f_lines)): if f_lines[i][0] == "%": @@ -647,7 +653,8 @@ def run_tests(options1, args, print_version): if options.non_interactive == False: print_debug("\n", s, run_tests_log) - elapsed_time = time.time() - start_time + temp_time = (time.time() - start_time) + elapsed_time = time.strftime('%Hh%Mm%Ssec.', time.gmtime(temp_time)) while not qret.empty(): (c, r, skip) = qret.get() @@ -675,12 +682,16 @@ def run_tests(options1, args, print_version): if len(compile_error_files) == 0 and len(run_error_files) == 0: print_debug("No fails\n", s, run_tests_log) - R = file_check(compile_error_files, run_error_files) + if len(args) == 0: + R = file_check(compile_error_files, run_error_files) + else: + error("don't check new fails for incomplete suite of tests", 2) + R = 0 if options.time: - print_debug("Elapsed time: %d s\n" % elapsed_time, s, run_tests_log) + print_debug("Elapsed time: " + elapsed_time + "\n", s, run_tests_log) - return R + return [R, elapsed_time] from optparse import OptionParser diff --git a/tests/operators.ispc b/tests/operators.ispc new file mode 100644 index 00000000..95502bdd --- /dev/null +++ b/tests/operators.ispc @@ -0,0 +1,70 @@ + +export uniform int width() { return programCount; } + +struct S { + float a; +}; + +// References "struct&" were put in random order to test them. +struct S operator*(struct S& rr, struct S rv) { + struct S c; + c.a = rr.a + rv.a + 2; + return c; +} + +struct S operator/(struct S& rr, struct S& rv) { + struct S c; + c.a = rr.a - rr.a + 2; + return c; +} + +struct S operator%(struct S rr, struct S& rv) { + struct S c; + c.a = rr.a + rv.a + 2; + return c; +} + +struct S operator+(struct S rr, struct S rv) { + struct S c; + c.a = rr.a / rv.a + 3; + return c; +} + +struct S operator-(struct S rr, struct S& rv) { + struct S c; + c.a = rr.a + rv.a + 2; + return c; +} + +struct S operator>>(struct S& rr, struct S rv) { + struct S c; + c.a = rr.a / rv.a + 3; + return c; +} + +struct S operator<<(struct S& rr, struct S& rv) { + struct S c; + c.a = rr.a + rv.a + 2; + return c; +} + +struct S a, a1; +struct S b, b1; +struct S d1, d2, d3, d4, d5, d6, d7; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + a.a = aFOO[programIndex]; + b.a = -aFOO[programIndex]; + d1 = a * b; + d2 = a / b; + d3 = a % b; + d4 = a + b; + d5 = a - b; + d6 = a >> b; + d7 = a << b; + RET[programIndex] = d1.a + d2.a + d3.a + d4.a + d5.a + d6.a + d7.a; +} + +export void result(uniform float RET[4]) { + RET[programIndex] = 14; +} diff --git a/tests/operators1.ispc b/tests/operators1.ispc new file mode 100644 index 00000000..f52c4c35 --- /dev/null +++ b/tests/operators1.ispc @@ -0,0 +1,64 @@ + +export uniform int width() { return programCount; } + +struct S { + float a; +}; + +// References "struct&" were put in random order to test them. +struct S operator*(struct S& rr, struct S rv) { + struct S c; + c.a = rr.a + rv.a + 2; + return c; +} + +struct S operator/(struct S& rr, struct S& rv) { + struct S c; + c.a = rr.a - rr.a + 2; + return c; +} + +struct S operator%(struct S rr, struct S& rv) { + struct S c; + c.a = rr.a + rv.a + 2; + return c; +} + +struct S operator+(struct S rr, struct S rv) { + struct S c; + c.a = rr.a / rv.a + 3; + return c; +} + +struct S operator-(struct S rr, struct S& rv) { + struct S c; + c.a = rr.a + rv.a + 2; + return c; +} + +struct S operator>>(struct S& rr, struct S rv) { + struct S c; + c.a = rr.a / rv.a + 3; + return c; +} + +struct S operator<<(struct S& rr, struct S& rv) { + struct S c; + c.a = rr.a + rv.a + 2; + return c; +} + +struct S a; +struct S b; +struct S d; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + a.a = 5; + b.a = -5; + d = a * b + b / a - a << (b - b) - a; + RET[programIndex] = d.a; +} + +export void result(uniform float RET[4]) { + RET[programIndex] = 12; +} diff --git a/tests/operators2.ispc b/tests/operators2.ispc new file mode 100644 index 00000000..b732b24a --- /dev/null +++ b/tests/operators2.ispc @@ -0,0 +1,51 @@ +int off; + +export uniform int width() { return programCount; } + +struct S { + float a; +}; + +struct S operator+(struct S rr, struct S rv) { + struct S c; + c.a = rr.a / rv.a + 3; + if (off == 1) + c.a = 22; + return c; +} + +struct S operator/(struct S rr, struct S rv) { + struct S c; + c.a = rr.a + rv.a + 10; + if (off == 1) + c.a = 33; + return c; +} + +struct S a; +struct S b; +struct S d; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + int T = programIndex; + a.a = aFOO[programIndex]; + b.a = -aFOO[programIndex]; + if (programIndex == 3) + off = 1; + else + off = 0; + if (T % 2) + d = a + b; + else + d = a / b; + + RET[programIndex] = d.a; +} + +export void result(uniform float RET[4]) { + if (programIndex % 2) + RET[programIndex] = 2; + else + RET[programIndex] = 10; + RET[3] = 22; +} diff --git a/type.cpp b/type.cpp index 5fa1845b..516276f0 100644 --- a/type.cpp +++ b/type.cpp @@ -2961,6 +2961,12 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool removeMask) const { callTypes.push_back(LLVMTypes::Int32Type); // threadCount callTypes.push_back(LLVMTypes::Int32Type); // taskIndex callTypes.push_back(LLVMTypes::Int32Type); // taskCount + callTypes.push_back(LLVMTypes::Int32Type); // taskIndex0 + callTypes.push_back(LLVMTypes::Int32Type); // taskIndex1 + callTypes.push_back(LLVMTypes::Int32Type); // taskIndex2 + callTypes.push_back(LLVMTypes::Int32Type); // taskCount0 + callTypes.push_back(LLVMTypes::Int32Type); // taskCount1 + callTypes.push_back(LLVMTypes::Int32Type); // taskCount2 } else // Otherwise we already have the types of the arguments