X-Git-Url: http://git.megacz.com/?p=ghc-hetmet.git;a=blobdiff_plain;f=compiler%2FnativeGen%2FRegAlloc%2FGraph%2FTrivColorable.hs;h=848b266116ccadf8696330d27010beaa6e468823;hp=df0460631396c3cf3366c19fb38b1c7c82ea9f42;hb=1f7ab811c4421458568b0ed900b496192fee885b;hpb=de29a9f02449359b70402f763ac7590673774124 diff --git a/compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs b/compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs index df04606..848b266 100644 --- a/compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs +++ b/compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs @@ -1,6 +1,7 @@ +{-# LANGUAGE BangPatterns #-} module RegAlloc.Graph.TrivColorable ( - trivColorable, + trivColorable, ) where @@ -14,163 +15,245 @@ import GraphBase import UniqFM import FastTypes - -{- --- allocatableRegs is allMachRegNos with the fixed-use regs removed. --- i.e., these are the regs for which we are prepared to allow the --- register allocator to attempt to map VRegs to. -allocatableRegs :: [RegNo] -allocatableRegs - = let isFree i = isFastTrue (freeReg i) - in filter isFree allMachRegNos +import Platform +import Panic --- | The number of regs in each class. --- We go via top level CAFs to ensure that we're not recomputing --- the length of these lists each time the fn is called. -allocatableRegsInClass :: RegClass -> Int -allocatableRegsInClass cls - = case cls of - RcInteger -> allocatableRegsInteger - RcDouble -> allocatableRegsDouble - RcFloat -> panic "Regs.allocatableRegsInClass: no match\n" +-- trivColorable --------------------------------------------------------------- -allocatableRegsInteger :: Int -allocatableRegsInteger - = length $ filter (\r -> regClass r == RcInteger) - $ map RealReg allocatableRegs - -allocatableRegsDouble :: Int -allocatableRegsDouble - = length $ filter (\r -> regClass r == RcDouble) - $ map RealReg allocatableRegs --} +-- trivColorable function for the graph coloring allocator +-- +-- This gets hammered by scanGraph during register allocation, +-- so needs to be fairly efficient. +-- +-- NOTE: This only works for arcitectures with just RcInteger and RcDouble +-- (which are disjoint) ie. x86, x86_64 and ppc +-- +-- The number of allocatable regs is hard coded in here so we can do +-- a fast comparision in trivColorable. +-- +-- It's ok if these numbers are _less_ than the actual number of free +-- regs, but they can't be more or the register conflict +-- graph won't color. +-- +-- If the graph doesn't color then the allocator will panic, but it won't +-- generate bad object code or anything nasty like that. +-- +-- There is an allocatableRegsInClass :: RegClass -> Int, but doing +-- the unboxing is too slow for us here. +-- TODO: Is that still true? Could we use allocatableRegsInClass +-- without losing performance now? +-- +-- Look at includes/stg/MachRegs.h to get the numbers. +-- --- trivColorable --------------------------------------------------------------- +-- Disjoint registers ---------------------------------------------------------- +-- +-- The definition has been unfolded into individual cases for speed. +-- Each architecture has a different register setup, so we use a +-- different regSqueeze function for each. +-- +accSqueeze + :: FastInt + -> FastInt + -> (reg -> FastInt) + -> UniqFM reg + -> FastInt + +accSqueeze count maxCount squeeze ufm = acc count (eltsUFM ufm) + where acc count [] = count + acc count _ | count >=# maxCount = count + acc count (r:rs) = acc (count +# squeeze r) rs + +{- Note [accSqueeze] +~~~~~~~~~~~~~~~~~~~~ +BL 2007/09 +Doing a nice fold over the UniqSet makes trivColorable use +32% of total compile time and 42% of total alloc when compiling SHA1.lhs from darcs. +Therefore the UniqFM is made non-abstract and we use custom fold. + +MS 2010/04 +When converting UniqFM to use Data.IntMap, the fold cannot use UniqFM internal +representation any more. But it is imperative that the assSqueeze stops +the folding if the count gets greater or equal to maxCount. We thus convert +UniqFM to a (lazy) list, do the fold and stops if necessary, which was +the most efficient variant tried. Benchmark compiling 10-times SHA1.lhs follows. +(original = previous implementation, folding = fold of the whole UFM, + lazyFold = the current implementation, + hackFold = using internal representation of Data.IntMap) + + original folding hackFold lazyFold + -O -fasm (used everywhere) 31.509s 30.387s 30.791s 30.603s + 100.00% 96.44% 97.72% 97.12% + -fregs-graph 67.938s 74.875s 62.673s 64.679s + 100.00% 110.21% 92.25% 95.20% + -fregs-iterative 89.761s 143.913s 81.075s 86.912s + 100.00% 160.33% 90.32% 96.83% + -fnew-codegen 38.225s 37.142s 37.551s 37.119s + 100.00% 97.17% 98.24% 97.11% + -fnew-codegen -fregs-graph 91.786s 91.51s 87.368s 86.88s + 100.00% 99.70% 95.19% 94.65% + -fnew-codegen -fregs-iterative 206.72s 343.632s 194.694s 208.677s + 100.00% 166.23% 94.18% 100.95% +-} --- trivColorable function for the graph coloring allocator --- This gets hammered by scanGraph during register allocation, --- so needs to be fairly efficient. +-- TODO: We shouldn't be using defaultTargetPlatform here. +-- We should be passing DynFlags in instead, and looking at +-- its targetPlatform. + +trivColorable + :: (RegClass -> VirtualReg -> FastInt) + -> (RegClass -> RealReg -> FastInt) + -> Triv VirtualReg RegClass RealReg + +trivColorable virtualRegSqueeze realRegSqueeze RcInteger conflicts exclusions + | let !cALLOCATABLE_REGS_INTEGER + = iUnbox (case platformArch defaultTargetPlatform of + ArchX86 -> 3 + ArchX86_64 -> 5 + ArchPPC -> 16 + ArchSPARC -> 14 + ArchPPC_64 -> panic "trivColorable ArchPPC_64" + ArchUnknown -> panic "trivColorable ArchUnknown") + , count2 <- accSqueeze (_ILIT(0)) cALLOCATABLE_REGS_INTEGER + (virtualRegSqueeze RcInteger) + conflicts + + , count3 <- accSqueeze count2 cALLOCATABLE_REGS_INTEGER + (realRegSqueeze RcInteger) + exclusions + + = count3 <# cALLOCATABLE_REGS_INTEGER + +trivColorable virtualRegSqueeze realRegSqueeze RcFloat conflicts exclusions + | let !cALLOCATABLE_REGS_FLOAT + = iUnbox (case platformArch defaultTargetPlatform of + ArchX86 -> 0 + ArchX86_64 -> 0 + ArchPPC -> 0 + ArchSPARC -> 22 + ArchPPC_64 -> panic "trivColorable ArchPPC_64" + ArchUnknown -> panic "trivColorable ArchUnknown") + , count2 <- accSqueeze (_ILIT(0)) cALLOCATABLE_REGS_FLOAT + (virtualRegSqueeze RcFloat) + conflicts + + , count3 <- accSqueeze count2 cALLOCATABLE_REGS_FLOAT + (realRegSqueeze RcFloat) + exclusions + + = count3 <# cALLOCATABLE_REGS_FLOAT + +trivColorable virtualRegSqueeze realRegSqueeze RcDouble conflicts exclusions + | let !cALLOCATABLE_REGS_DOUBLE + = iUnbox (case platformArch defaultTargetPlatform of + ArchX86 -> 6 + ArchX86_64 -> 0 + ArchPPC -> 26 + ArchSPARC -> 11 + ArchPPC_64 -> panic "trivColorable ArchPPC_64" + ArchUnknown -> panic "trivColorable ArchUnknown") + , count2 <- accSqueeze (_ILIT(0)) cALLOCATABLE_REGS_DOUBLE + (virtualRegSqueeze RcDouble) + conflicts + + , count3 <- accSqueeze count2 cALLOCATABLE_REGS_DOUBLE + (realRegSqueeze RcDouble) + exclusions + + = count3 <# cALLOCATABLE_REGS_DOUBLE + +trivColorable virtualRegSqueeze realRegSqueeze RcDoubleSSE conflicts exclusions + | let !cALLOCATABLE_REGS_SSE + = iUnbox (case platformArch defaultTargetPlatform of + ArchX86 -> 8 + ArchX86_64 -> 10 + ArchPPC -> 0 + ArchSPARC -> 0 + ArchPPC_64 -> panic "trivColorable ArchPPC_64" + ArchUnknown -> panic "trivColorable ArchUnknown") + , count2 <- accSqueeze (_ILIT(0)) cALLOCATABLE_REGS_SSE + (virtualRegSqueeze RcDoubleSSE) + conflicts + + , count3 <- accSqueeze count2 cALLOCATABLE_REGS_SSE + (realRegSqueeze RcDoubleSSE) + exclusions + + = count3 <# cALLOCATABLE_REGS_SSE + + +-- Specification Code ---------------------------------------------------------- -- --- NOTE: This only works for arcitectures with just RcInteger and RcDouble --- (which are disjoint) ie. x86, x86_64 and ppc +-- The trivColorable function for each particular architecture should +-- implement the following function, but faster. -- --- BL 2007/09 --- Doing a nice fold over the UniqSet makes trivColorable use --- 32% of total compile time and 42% of total alloc when compiling SHA1.lhs from darcs. {- trivColorable :: RegClass -> UniqSet Reg -> UniqSet Reg -> Bool trivColorable classN conflicts exclusions = let - acc :: Reg -> (Int, Int) -> (Int, Int) - acc r (cd, cf) - = case regClass r of - RcInteger -> (cd+1, cf) - RcDouble -> (cd, cf+1) - _ -> panic "Regs.trivColorable: reg class not handled" + acc :: Reg -> (Int, Int) -> (Int, Int) + acc r (cd, cf) + = case regClass r of + RcInteger -> (cd+1, cf) + RcFloat -> (cd, cf+1) + _ -> panic "Regs.trivColorable: reg class not handled" - tmp = foldUniqSet acc (0, 0) conflicts - (countInt, countFloat) = foldUniqSet acc tmp exclusions + tmp = foldUniqSet acc (0, 0) conflicts + (countInt, countFloat) = foldUniqSet acc tmp exclusions - squeese = worst countInt classN RcInteger - + worst countFloat classN RcDouble + squeese = worst countInt classN RcInteger + + worst countFloat classN RcFloat - in squeese < allocatableRegsInClass classN + in squeese < allocatableRegsInClass classN -- | Worst case displacement --- node N of classN has n neighbors of class C. +-- node N of classN has n neighbors of class C. -- --- We currently only have RcInteger and RcDouble, which don't conflict at all. --- This is a bit boring compared to what's in RegArchX86. +-- We currently only have RcInteger and RcDouble, which don't conflict at all. +-- This is a bit boring compared to what's in RegArchX86. -- worst :: Int -> RegClass -> RegClass -> Int worst n classN classC = case classN of - RcInteger - -> case classC of - RcInteger -> min n (allocatableRegsInClass RcInteger) - RcDouble -> 0 - - RcDouble - -> case classC of - RcDouble -> min n (allocatableRegsInClass RcDouble) - RcInteger -> 0 --} + RcInteger + -> case classC of + RcInteger -> min n (allocatableRegsInClass RcInteger) + RcFloat -> 0 + RcDouble + -> case classC of + RcFloat -> min n (allocatableRegsInClass RcFloat) + RcInteger -> 0 --- The number of allocatable regs is hard coded here so we can do a fast comparision --- in trivColorable. It's ok if these numbers are _less_ than the actual number of --- free regs, but they can't be more or the register conflict graph won't color. --- --- There is an allocatableRegsInClass :: RegClass -> Int, but doing the unboxing --- is too slow for us here. --- --- Compare Regs.freeRegs and MachRegs.h to get these numbers. --- -#if i386_TARGET_ARCH -#define ALLOCATABLE_REGS_INTEGER (_ILIT(3)) -#define ALLOCATABLE_REGS_DOUBLE (_ILIT(6)) -#define ALLOCATABLE_REGS_FLOAT (_ILIT(0)) - -#elif x86_64_TARGET_ARCH -#define ALLOCATABLE_REGS_INTEGER (_ILIT(5)) -#define ALLOCATABLE_REGS_DOUBLE (_ILIT(2)) -#define ALLOCATABLE_REGS_FLOAT (_ILIT(0)) - -#elif powerpc_TARGET_ARCH -#define ALLOCATABLE_REGS_INTEGER (_ILIT(16)) -#define ALLOCATABLE_REGS_DOUBLE (_ILIT(26)) -#define ALLOCATABLE_REGS_FLOAT (_ILIT(0)) - -#elif sparc_TARGET_ARCH -#define ALLOCATABLE_REGS_INTEGER (_ILIT(14)) -#define ALLOCATABLE_REGS_DOUBLE (_ILIT(8)) -#define ALLOCATABLE_REGS_FLOAT (_ILIT(6)) - -#else -#error ToDo: define ALLOCATABLE_REGS_INTEGER and ALLOCATABLE_REGS_DOUBLE -#endif - -trivColorable - :: (Reg -> RegClass) - -> Triv Reg RegClass Reg - -trivColorable regClass _ conflicts exclusions - = {-# SCC "trivColorable" #-} - let - isSqueesed cI cF ufm - = case ufm of - NodeUFM _ _ left right - -> case isSqueesed cI cF right of - (# s, cI', cF' #) - -> case s of - False -> isSqueesed cI' cF' left - True -> (# True, cI', cF' #) - - LeafUFM _ reg - -> case regClass reg of - RcInteger - -> case cI +# _ILIT(1) of - cI' -> (# cI' >=# ALLOCATABLE_REGS_INTEGER, cI', cF #) - - RcDouble - -> case cF +# _ILIT(1) of - cF' -> (# cF' >=# ALLOCATABLE_REGS_DOUBLE, cI, cF' #) - - RcFloat - -> case cF +# _ILIT(1) of - cF' -> (# cF' >=# ALLOCATABLE_REGS_FLOAT, cI, cF' #) - - EmptyUFM - -> (# False, cI, cF #) - - in case isSqueesed (_ILIT(0)) (_ILIT(0)) conflicts of - (# False, cI', cF' #) - -> case isSqueesed cI' cF' exclusions of - (# s, _, _ #) -> not s - - (# True, _, _ #) - -> False +-- allocatableRegs is allMachRegNos with the fixed-use regs removed. +-- i.e., these are the regs for which we are prepared to allow the +-- register allocator to attempt to map VRegs to. +allocatableRegs :: [RegNo] +allocatableRegs + = let isFree i = isFastTrue (freeReg i) + in filter isFree allMachRegNos + + +-- | The number of regs in each class. +-- We go via top level CAFs to ensure that we're not recomputing +-- the length of these lists each time the fn is called. +allocatableRegsInClass :: RegClass -> Int +allocatableRegsInClass cls + = case cls of + RcInteger -> allocatableRegsInteger + RcFloat -> allocatableRegsDouble + +allocatableRegsInteger :: Int +allocatableRegsInteger + = length $ filter (\r -> regClass r == RcInteger) + $ map RealReg allocatableRegs + +allocatableRegsFloat :: Int +allocatableRegsFloat + = length $ filter (\r -> regClass r == RcFloat + $ map RealReg allocatableRegs +-}