From ee7e3679818d281c08cf1bcf50d5ae37ce2bbb1f Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Sat, 5 May 2012 15:13:11 -0700
Subject: [PATCH] Do global dead code elimination early in optimization.

This gives a 15-20% speedup in compilation time for simple programs (but
only ~2% for the big 21k monster program).
---
 builtins/util.m4 | 258 ++++++++++++++++++++++++++++++++++++++++++++++-
 opt.cpp          |   2 +
 2 files changed, 257 insertions(+), 3 deletions(-)

diff --git a/builtins/util.m4 b/builtins/util.m4
index 042b2ef5..023ca411 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -1,4 +1,4 @@
-;; Copyright (c) 2010-2011, Intel Corporation
+;; Copyright (c) 2010-2012, Intel Corporation
 ;; All rights reserved.
 ;;
 ;; Redistribution and use in source and binary forms, with or without
@@ -1661,6 +1661,258 @@
 declare <WIDTH x float> @__exp_varying_float(<WIDTH x float>) nounwind readnone
 declare float @__pow_uniform_float(float, float) nounwind readnone
 declare <WIDTH x float> @__pow_varying_float(<WIDTH x float>, <WIDTH x float>) nounwind readnone
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+declare void @__use8(<WIDTH x i8>)
+declare void @__use16(<WIDTH x i16>)
+declare void @__use32(<WIDTH x i32>)
+declare void @__use64(<WIDTH x i64>)
+
+;; This is a temporary function that will be removed at the end of
+;; compilation--the idea is that it calls out to all of the various
+;; functions / pseudo-function declarations that we need to keep around
+;; so that they are available to the various optimization passes. This
+;; then prevents those functions from being removed as dead code when
+;; we do early DCE...
+
+define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
+                               <WIDTH x i32> %v32, <WIDTH x i64> %v64,
+                               <WIDTH x MASK> %mask) {
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; loads
+  %ml8 = call <WIDTH x i8> @__masked_load_8(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %ml8)
+  %ml16 = call <WIDTH x i16> @__masked_load_16(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %ml16)
+  %ml32 = call <WIDTH x i32> @__masked_load_32(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %ml32)
+  %ml64 = call <WIDTH x i64> @__masked_load_64(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %ml64)
+
+  %lb8 = call <WIDTH x i8> @__load_and_broadcast_8(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %lb8)
+  %lb16 = call <WIDTH x i16> @__load_and_broadcast_16(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %lb16)
+  %lb32 = call <WIDTH x i32> @__load_and_broadcast_32(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %lb32)
+  %lb64 = call <WIDTH x i64> @__load_and_broadcast_64(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %lb64)
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; stores
+  %pv8 = bitcast i8 * %ptr to <WIDTH x i8> *
+  call void @__pseudo_masked_store_8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8,
+                                     <WIDTH x MASK> %mask)
+  %pv16 = bitcast i8 * %ptr to <WIDTH x i16> *
+  call void @__pseudo_masked_store_16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16,
+                                      <WIDTH x MASK> %mask)
+  %pv32 = bitcast i8 * %ptr to <WIDTH x i32> *
+  call void @__pseudo_masked_store_32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32,
+                                      <WIDTH x MASK> %mask)
+  %pv64 = bitcast i8 * %ptr to <WIDTH x i64> *
+  call void @__pseudo_masked_store_64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64,
+                                      <WIDTH x MASK> %mask)
+
+  call void @__masked_store_8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__masked_store_16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__masked_store_32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__masked_store_64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__masked_store_blend_8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8,
+                                    <WIDTH x MASK> %mask)
+  call void @__masked_store_blend_16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16,
+                                     <WIDTH x MASK> %mask)
+  call void @__masked_store_blend_32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32,
+                                     <WIDTH x MASK> %mask)
+  call void @__masked_store_blend_64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64,
+                                     <WIDTH x MASK> %mask)
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; gathers
+
+  %pg32_8 = call <WIDTH x i8> @__pseudo_gather32_8(<WIDTH x i32> %v32,
+                                                   <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %pg32_8)
+  %pg32_16 = call <WIDTH x i16> @__pseudo_gather32_16(<WIDTH x i32> %v32,
+                                                      <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %pg32_16)
+  %pg32_32 = call <WIDTH x i32> @__pseudo_gather32_32(<WIDTH x i32> %v32,
+                                                      <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %pg32_32)
+  %pg32_64 = call <WIDTH x i64> @__pseudo_gather32_64(<WIDTH x i32> %v32,
+                                                      <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %pg32_64)
+
+  %pg64_8 = call <WIDTH x i8> @__pseudo_gather64_8(<WIDTH x i64> %v64,
+                                                   <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %pg64_8)
+  %pg64_16 = call <WIDTH x i16> @__pseudo_gather64_16(<WIDTH x i64> %v64,
+                                                      <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %pg64_16)
+  %pg64_32 = call <WIDTH x i32> @__pseudo_gather64_32(<WIDTH x i64> %v64,
+                                                      <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %pg64_32)
+  %pg64_64 = call <WIDTH x i64> @__pseudo_gather64_64(<WIDTH x i64> %v64,
+                                                      <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %pg64_64)
+
+  %g32_8 = call <WIDTH x i8> @__gather32_i8(<WIDTH x i32> %v32,
+                                            <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %g32_8)
+  %g32_16 = call <WIDTH x i16> @__gather32_i16(<WIDTH x i32> %v32,
+                                               <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %g32_16)
+  %g32_32 = call <WIDTH x i32> @__gather32_i32(<WIDTH x i32> %v32,
+                                               <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %g32_32)
+  %g32_64 = call <WIDTH x i64> @__gather32_i64(<WIDTH x i32> %v32,
+                                               <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %g32_64)
+
+  %g64_8 = call <WIDTH x i8> @__gather64_i8(<WIDTH x i64> %v64,
+                                            <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %g64_8)
+  %g64_16 = call <WIDTH x i16> @__gather64_i16(<WIDTH x i64> %v64,
+                                               <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %g64_16)
+  %g64_32 = call <WIDTH x i32> @__gather64_i32(<WIDTH x i64> %v64,
+                                               <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %g64_32)
+  %g64_64 = call <WIDTH x i64> @__gather64_i64(<WIDTH x i64> %v64,
+                                               <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %g64_64)
+
+  %pgbo32_8 = call <WIDTH x i8>
+       @__pseudo_gather_base_offsets32_8(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                         <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %pgbo32_8)
+  %pgbo32_16 = call <WIDTH x i16>
+       @__pseudo_gather_base_offsets32_16(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                          <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %pgbo32_16)
+  %pgbo32_32 = call <WIDTH x i32>
+       @__pseudo_gather_base_offsets32_32(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                          <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %pgbo32_32)
+  %pgbo32_64 = call <WIDTH x i64>
+       @__pseudo_gather_base_offsets32_64(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                          <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %pgbo32_64)
+
+  %gbo32_8 = call <WIDTH x i8>
+       @__gather_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                   <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %gbo32_8)
+  %gbo32_16 = call <WIDTH x i16>
+       @__gather_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                    <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %gbo32_16)
+  %gbo32_32 = call <WIDTH x i32>
+       @__gather_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                    <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %gbo32_32)
+  %gbo32_64 = call <WIDTH x i64>
+       @__gather_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                    <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %gbo32_64)
+
+
+  %pgbo64_8 = call <WIDTH x i8>
+       @__pseudo_gather_base_offsets64_8(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                         <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %pgbo64_8)
+  %pgbo64_16 = call <WIDTH x i16>
+       @__pseudo_gather_base_offsets64_16(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                          <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %pgbo64_16)
+  %pgbo64_32 = call <WIDTH x i32>
+       @__pseudo_gather_base_offsets64_32(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                          <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %pgbo64_32)
+  %pgbo64_64 = call <WIDTH x i64>
+       @__pseudo_gather_base_offsets64_64(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                          <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %pgbo64_64)
+
+  %gbo64_8 = call <WIDTH x i8>
+       @__gather_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                   <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %gbo64_8)
+  %gbo64_16 = call <WIDTH x i16>
+       @__gather_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                    <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %gbo64_16)
+  %gbo64_32 = call <WIDTH x i32>
+       @__gather_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                    <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %gbo64_32)
+  %gbo64_64 = call <WIDTH x i64>
+       @__gather_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                    <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %gbo64_64)
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; scatters
+
+  call void @__pseudo_scatter32_8(<WIDTH x i32> %v32, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter32_16(<WIDTH x i32> %v32, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter32_32(<WIDTH x i32> %v32, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter32_64(<WIDTH x i32> %v32, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__pseudo_scatter64_8(<WIDTH x i64> %v64, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter64_16(<WIDTH x i64> %v64, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter64_32(<WIDTH x i64> %v64, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter64_64(<WIDTH x i64> %v64, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__scatter32_i8(<WIDTH x i32> %v32, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__scatter32_i16(<WIDTH x i32> %v32, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__scatter32_i32(<WIDTH x i32> %v32, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__scatter32_i64(<WIDTH x i32> %v32, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__scatter64_i8(<WIDTH x i64> %v64, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__scatter64_i16(<WIDTH x i64> %v64, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__scatter64_i32(<WIDTH x i64> %v64, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__scatter64_i64(<WIDTH x i64> %v64, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__pseudo_scatter_base_offsets32_8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                               <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter_base_offsets32_16(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                                <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter_base_offsets32_32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                                <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter_base_offsets32_64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                                <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__pseudo_scatter_base_offsets64_8(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                               <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter_base_offsets64_16(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                                <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter_base_offsets64_32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                                <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter_base_offsets64_64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                                <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__scatter_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                         <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__scatter_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                          <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__scatter_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                          <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__scatter_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                          <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__scatter_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                         <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__scatter_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                          <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__scatter_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                          <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__scatter_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                          <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  ret void
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; vector ops
@@ -2251,9 +2503,9 @@ define <$1 x $2> @__load_and_broadcast_$3(i8 *, <$1 x MASK> %mask) nounwind alwa
 ;; $4: alignment for elements of type $2 (4, 8, ...)
 
 define(`masked_load', `
-define <$1 x $2> @__masked_load_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
+define <$1 x $2> @__masked_load_$3(i8 *, <$1 x MASK> %mask) nounwind alwaysinline {
 entry:
-  %mm = call i32 @__movmsk(<$1 x i32> %mask)
+  %mm = call i32 @__movmsk(<$1 x MASK> %mask)
 
   ; if the first lane and the last lane are on, then it is safe to do a vector load
   ; of the whole thing--what the lanes in the middle want turns out to not matter...
diff --git a/opt.cpp b/opt.cpp
index 063be681..dc201367 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -446,6 +446,7 @@ Optimize(llvm::Module *module, int optLevel) {
     llvm::initializeTarget(*registry);
 
     bool runSROA = true;
+    optPM.add(llvm::createGlobalDCEPass());
 
     // Early optimizations to try to reduce the total amount of code to
     // work with if we can
@@ -3906,6 +3907,7 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
         "__scatter32_i32", "__scatter32_i64",
         "__scatter64_i8", "__scatter64_i16",
         "__scatter64_i32", "__scatter64_i64",
+        "__keep_funcs_live",
     };
 
     bool modifiedAny = false;
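Note on the mechanism (commentary, not part of the patch): the early createGlobalDCEPass() run throws away unused user code before the expensive passes execute, while the __keep_funcs_live definition keeps the masked load/store, gather, and scatter builtins referenced so the later ispc lowering passes can still rewrite calls to them; adding "__keep_funcs_live" to the MakeInternalFuncsStaticPass list then lets ordinary dead code elimination drop the keep-alive function once it is internal and unreferenced. The C++ sketch below only illustrates that ordering. It uses the plain LLVM 3.x-era legacy PassManager rather than ispc's optPM wrapper, the function name RunEarlyDCESketch is invented for illustration, and the rest of the pipeline is elided.

// Hypothetical sketch, not ispc code: early GlobalDCE plus a keep-alive
// function.  Header paths are the LLVM 3.x ones; newer releases use
// "llvm/IR/..." headers and llvm::legacy::PassManager instead.
#include "llvm/Function.h"
#include "llvm/Module.h"
#include "llvm/PassManager.h"
#include "llvm/Transforms/IPO.h"   // llvm::createGlobalDCEPass()

static void RunEarlyDCESketch(llvm::Module *module) {
    llvm::PassManager pm;

    // Early dead code elimination: anything the user program never calls is
    // discarded before the expensive passes run.  Because __keep_funcs_live
    // still references the masked load/store, gather, and scatter builtins,
    // they survive this pass and stay available for later rewriting.
    pm.add(llvm::createGlobalDCEPass());

    // ... the remainder of the optimization pipeline would be added here ...

    pm.run(*module);

    // After optimization the keep-alive function has no callers and no
    // further purpose, so it can simply be deleted.  The patch gets the same
    // effect by marking it internal in MakeInternalFuncsStaticPass so that a
    // later dead-code-elimination run can remove it.
    if (llvm::Function *keep = module->getFunction("__keep_funcs_live"))
        keep->eraseFromParent();
}

Bundling every reference into a single dummy function means only one extra symbol has to be special-cased later, which is why the opt.cpp side of the change is a one-line addition to the existing internal-functions list.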