From: Ben.Lippmeier@anu.edu.au Date: Wed, 5 Sep 2007 12:52:19 +0000 (+0000) Subject: Refactor MachRegs.trivColorable to do unboxed accumulation X-Git-Url: http://git.megacz.com/?p=ghc-hetmet.git;a=commitdiff_plain;h=a8312580d6f089d153d8af668484d4c2eb75e8a8 Refactor MachRegs.trivColorable to do unboxed accumulation trivColorable was soaking up total 31% time, 41% alloc when compiling SHA1.lhs with -O2 -fregs-graph on x86. Refactoring to use unboxed accumulators and walk directly over the UniqFM holding the set of conflicts reduces this to 17% time, 6% alloc. --- diff --git a/compiler/nativeGen/AsmCodeGen.lhs b/compiler/nativeGen/AsmCodeGen.lhs index 86363ed..c9f11d5 100644 --- a/compiler/nativeGen/AsmCodeGen.lhs +++ b/compiler/nativeGen/AsmCodeGen.lhs @@ -278,7 +278,7 @@ cmmNativeGen dflags us cmm -- graph coloring register allocation let ((alloced, regAllocStats), usAlloc) - = {-# SCC "regAlloc(color)" #-} + = {-# SCC "RegAlloc(color)" #-} initUs usLive $ Color.regAlloc generateRegAllocStats @@ -312,7 +312,7 @@ cmmNativeGen dflags us cmm else do -- do linear register allocation let ((alloced, regAllocStats), usAlloc) - = {-# SCC "regAlloc(linear)" #-} + = {-# SCC "RegAlloc(linear)" #-} initUs usLive $ liftM unzip $ mapUs Linear.regAlloc withLiveness diff --git a/compiler/nativeGen/GraphBase.hs b/compiler/nativeGen/GraphBase.hs index c4e9eb3..b980ba2 100644 --- a/compiler/nativeGen/GraphBase.hs +++ b/compiler/nativeGen/GraphBase.hs @@ -82,7 +82,7 @@ data Node k cls color , nodeConflicts :: UniqSet k -- | Colors that cannot be used by this node. - , nodeExclusions :: UniqSet color + , nodeExclusions :: UniqSet color -- | Colors that this node would prefer to be, in decending order. , nodePreference :: [color] diff --git a/compiler/nativeGen/GraphOps.hs b/compiler/nativeGen/GraphOps.hs index f620d8a..f918fd2 100644 --- a/compiler/nativeGen/GraphOps.hs +++ b/compiler/nativeGen/GraphOps.hs @@ -28,7 +28,6 @@ import UniqFM import Data.List hiding (union) import Data.Maybe - -- | Lookup a node from the graph. lookupNode :: Uniquable k @@ -447,6 +446,7 @@ setColor u color u +{-# INLINE adjustWithDefaultUFM #-} adjustWithDefaultUFM :: Uniquable k => (a -> a) -> a -> k @@ -458,7 +458,7 @@ adjustWithDefaultUFM f def k map map k def - +{-# INLINE adjustUFM #-} adjustUFM :: Uniquable k => (a -> a) diff --git a/compiler/nativeGen/MachRegs.lhs b/compiler/nativeGen/MachRegs.lhs index dd4962c..ee514f9 100644 --- a/compiler/nativeGen/MachRegs.lhs +++ b/compiler/nativeGen/MachRegs.lhs @@ -103,6 +103,9 @@ import Unique import UniqSet import Constants import FastTypes +import UniqFM + +import GHC.Exts #if powerpc_TARGET_ARCH import Data.Word ( Word8, Word16, Word32 ) @@ -444,24 +447,30 @@ instance Outputable Reg where -- NOTE: This only works for arcitectures with just RcInteger and RcDouble -- (which are disjoint) ie. x86, x86_64 and ppc -- + +-- BL 2007/09 +-- Doing a nice fold over the UniqSet makes trivColorable use +-- 32% of total compile time and 42% of total alloc when compiling SHA1.lhs from darcs. +{- trivColorable :: RegClass -> UniqSet Reg -> UniqSet Reg -> Bool trivColorable classN conflicts exclusions - = let + = let + + acc :: Reg -> (Int, Int) -> (Int, Int) acc r (cd, cf) = case regClass r of RcInteger -> (cd+1, cf) RcDouble -> (cd, cf+1) _ -> panic "MachRegs.trivColorable: reg class not handled" - tmp = foldUniqSet acc (0, 0) conflicts - (rsD, rsFP) = foldUniqSet acc tmp exclusions + tmp = foldUniqSet acc (0, 0) conflicts + (countInt, countFloat) = foldUniqSet acc tmp exclusions - squeese = worst rsD classN RcInteger - + worst rsFP classN RcDouble + squeese = worst countInt classN RcInteger + + worst countFloat classN RcDouble in squeese < allocatableRegsInClass classN - -- | Worst case displacement -- node N of classN has n neighbors of class C. -- @@ -480,6 +489,69 @@ worst n classN classC -> case classC of RcDouble -> min n (allocatableRegsInClass RcDouble) RcInteger -> 0 +-} + + +-- The number of allocatable regs is hard coded here so we can do a fast comparision +-- in trivColorable. It's ok if these numbers are _less_ than the actual number of +-- free regs, but they can't be more or the register conflict graph won't color. +-- +-- There is an allocatableRegsInClass :: RegClass -> Int, but doing the unboxing +-- is too slow for us here. +-- +-- Compare MachRegs.freeRegs and MachRegs.h to get these numbers. +-- +#if i386_TARGET_ARCH +#define ALLOCATABLE_REGS_INTEGER 3# +#define ALLOCATABLE_REGS_DOUBLE 6# +#endif + +#if x86_64_TARGET_ARCH +#define ALLOCATABLE_REGS_INTEGER 5# +#define ALLOCATABLE_REGS_DOUBLE 2# +#endif + +#if powerpc_TARGET_ARCH +#define ALLOCATABLE_REGS_INTEGER 16# +#define ALLOCATABLE_REGS_DOUBLE 26# +#endif + +{-# INLINE regClass #-} +trivColorable :: RegClass -> UniqSet Reg -> UniqSet Reg -> Bool +trivColorable classN conflicts exclusions + = {-# SCC "trivColorable" #-} + let + {-# INLINE isSqueesed #-} + isSqueesed cI cF ufm + = case ufm of + NodeUFM _ _ left right + -> case isSqueesed cI cF right of + (# s, cI', cF' #) + -> case s of + False -> isSqueesed cI' cF' left + True -> (# True, cI', cF' #) + + LeafUFM _ reg + -> case regClass reg of + RcInteger + -> case cI +# 1# of + cI' -> (# cI' >=# ALLOCATABLE_REGS_INTEGER, cI', cF #) + + RcDouble + -> case cF +# 1# of + cF' -> (# cF' >=# ALLOCATABLE_REGS_DOUBLE, cI, cF' #) + + EmptyUFM + -> (# False, cI, cF #) + + in case isSqueesed 0# 0# conflicts of + (# False, cI', cF' #) + -> case isSqueesed cI' cF' exclusions of + (# s, _, _ #) -> not s + + (# True, _, _ #) + -> False + -- ----------------------------------------------------------------------------- diff --git a/compiler/nativeGen/RegAllocColor.hs b/compiler/nativeGen/RegAllocColor.hs index 2e3d40e..0cd3923 100644 --- a/compiler/nativeGen/RegAllocColor.hs +++ b/compiler/nativeGen/RegAllocColor.hs @@ -79,11 +79,12 @@ regAlloc_spin dump (spinCount :: Int) triv regsFree slotsFree debug_codeGraphs c $$ text "slotsFree = " <> ppr (sizeUniqSet slotsFree)) -- build a conflict graph from the code. - graph <- buildGraph code + graph <- {-# SCC "BuildGraph" #-} buildGraph code -- build a map of how many instructions each reg lives for. -- this is lazy, it won't be computed unless we need to spill - let fmLife = plusUFMs_C (\(r1, l1) (_, l2) -> (r1, l1 + l2)) + + let fmLife = {-# SCC "LifetimeCount" #-} plusUFMs_C (\(r1, l1) (_, l2) -> (r1, l1 + l2)) $ map lifetimeCount code -- record startup state @@ -101,7 +102,7 @@ regAlloc_spin dump (spinCount :: Int) triv regsFree slotsFree debug_codeGraphs c -- try and color the graph let (graph_colored, rsSpill, rmCoalesce) - = Color.colorGraph regsFree triv spill graph + = {-# SCC "ColorGraph" #-} Color.colorGraph regsFree triv spill graph -- rewrite regs in the code that have been coalesced let patchF reg = case lookupUFM rmCoalesce reg of @@ -147,7 +148,7 @@ regAlloc_spin dump (spinCount :: Int) triv regsFree slotsFree debug_codeGraphs c -- spill the uncolored regs (code_spilled, slotsFree', spillStats) <- regSpill code_coalesced slotsFree rsSpill - + -- recalculate liveness let code_nat = map stripLive code_spilled code_relive <- mapM regLiveness code_nat diff --git a/compiler/nativeGen/RegAllocLinear.hs b/compiler/nativeGen/RegAllocLinear.hs index 5719328..b99abe3 100644 --- a/compiler/nativeGen/RegAllocLinear.hs +++ b/compiler/nativeGen/RegAllocLinear.hs @@ -224,6 +224,7 @@ emptyStackMap = StackMap [0..maxSpillSlots] emptyUFM getStackSlotFor :: StackMap -> Unique -> (StackMap,Int) getStackSlotFor (StackMap [] _) _ = panic "RegAllocLinear.getStackSlotFor: out of stack slots" + getStackSlotFor fs@(StackMap (freeSlot:stack') reserved) reg = case lookupUFM reserved reg of Just slot -> (fs,slot) diff --git a/compiler/utils/UniqFM.lhs b/compiler/utils/UniqFM.lhs index 3abf698..242fe22 100644 --- a/compiler/utils/UniqFM.lhs +++ b/compiler/utils/UniqFM.lhs @@ -21,7 +21,8 @@ Basically, the things need to be in class @Uniquable@, and we use the -- for details module UniqFM ( - UniqFM, -- abstract type + UniqFM(..), -- abstract type + -- (de-abstracted for MachRegs.trivColorable optimisation BL 2007/09) emptyUFM, unitUFM,