Whitespace only in nativeGen/RegAlloc/Graph/TrivColorable.hs

[ghc-hetmet.git] / compiler / nativeGen / RegAlloc / Graph / TrivColorable.hs
diff --git a/compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs b/compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs

index df04606..e286707 100644 (file)
--- a/compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs
+++ b/compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs
@@ -1,6 +1,7 @@
+{-# OPTIONS -fno-warn-unused-binds #-}
  
  module RegAlloc.Graph.TrivColorable (
  
  module RegAlloc.Graph.TrivColorable (
-       trivColorable,
+        trivColorable,
  )
  
  where
  )
  
  where
@@ -15,162 +16,235 @@ import GraphBase
  import UniqFM
  import FastTypes
  
  import UniqFM
  import FastTypes
  
-{-
--- allocatableRegs is allMachRegNos with the fixed-use regs removed.
--- i.e., these are the regs for which we are prepared to allow the
--- register allocator to attempt to map VRegs to.
-allocatableRegs :: [RegNo]
-allocatableRegs
-   = let isFree i = isFastTrue (freeReg i)
-     in  filter isFree allMachRegNos
-
-
--- | The number of regs in each class.
---     We go via top level CAFs to ensure that we're not recomputing
---     the length of these lists each time the fn is called.
-allocatableRegsInClass :: RegClass -> Int
-allocatableRegsInClass cls
- = case cls of
-       RcInteger       -> allocatableRegsInteger
-       RcDouble        -> allocatableRegsDouble
-       RcFloat         -> panic "Regs.allocatableRegsInClass: no match\n"
-
-allocatableRegsInteger :: Int
-allocatableRegsInteger 
-       = length $ filter (\r -> regClass r == RcInteger) 
-                $ map RealReg allocatableRegs
-
-allocatableRegsDouble :: Int
-allocatableRegsDouble
-       = length $ filter (\r -> regClass r == RcDouble) 
-                $ map RealReg allocatableRegs
--}
-
  
  -- trivColorable ---------------------------------------------------------------
  
  -- trivColorable function for the graph coloring allocator
  
  -- trivColorable ---------------------------------------------------------------
  
  -- trivColorable function for the graph coloring allocator
---     This gets hammered by scanGraph during register allocation,
---     so needs to be fairly efficient.
  --
  --
---     NOTE:   This only works for arcitectures with just RcInteger and RcDouble
---             (which are disjoint) ie. x86, x86_64 and ppc
+--      This gets hammered by scanGraph during register allocation,
+--      so needs to be fairly efficient.
  --
  --
-
---     BL 2007/09
---     Doing a nice fold over the UniqSet makes trivColorable use
---     32% of total compile time and 42% of total alloc when compiling SHA1.lhs from darcs.
-{-
-trivColorable :: RegClass -> UniqSet Reg -> UniqSet Reg -> Bool
-trivColorable classN conflicts exclusions
- = let
-
-       acc :: Reg -> (Int, Int) -> (Int, Int)
-       acc r (cd, cf)  
-        = case regClass r of
-               RcInteger       -> (cd+1, cf)
-               RcDouble        -> (cd,   cf+1)
-               _               -> panic "Regs.trivColorable: reg class not handled"
-
-       tmp                     = foldUniqSet acc (0, 0) conflicts
-       (countInt,  countFloat) = foldUniqSet acc tmp    exclusions
-
-       squeese         = worst countInt   classN RcInteger
-                       + worst countFloat classN RcDouble
-
-   in  squeese < allocatableRegsInClass classN
-
--- | Worst case displacement
---     node N of classN has n neighbors of class C.
+--      NOTE:   This only works for arcitectures with just RcInteger and RcDouble
+--              (which are disjoint) ie. x86, x86_64 and ppc
  --
  --
---     We currently only have RcInteger and RcDouble, which don't conflict at all.
---     This is a bit boring compared to what's in RegArchX86.
+--      The number of allocatable regs is hard coded here so we can do a fast
+--              comparision in trivColorable.
  --
  --
-worst :: Int -> RegClass -> RegClass -> Int
-worst n classN classC
- = case classN of
-       RcInteger
-        -> case classC of
-               RcInteger       -> min n (allocatableRegsInClass RcInteger)
-               RcDouble        -> 0
-               
-       RcDouble
-        -> case classC of
-               RcDouble        -> min n (allocatableRegsInClass RcDouble)
-               RcInteger       -> 0
--}
-
-
--- The number of allocatable regs is hard coded here so we can do a fast comparision
--- in trivColorable. It's ok if these numbers are _less_ than the actual number of
--- free regs, but they can't be more or the register conflict graph won't color.
+--      It's ok if these numbers are _less_ than the actual number of free regs,
+--              but they can't be more or the register conflict graph won't color.
+--
+--      If the graph doesn't color then the allocator will panic, but it won't
+--              generate bad object code or anything nasty like that.
  --
  --
--- There is an allocatableRegsInClass :: RegClass -> Int, but doing the unboxing
--- is too slow for us here.
+--      There is an allocatableRegsInClass :: RegClass -> Int, but doing the unboxing
+--      is too slow for us here.
  --
  --
--- Compare Regs.freeRegs  and MachRegs.h to get these numbers.
+--      Look at includes/stg/MachRegs.h to get these numbers.
  --
  --
+
  #if i386_TARGET_ARCH
  #define ALLOCATABLE_REGS_INTEGER (_ILIT(3))
  #define ALLOCATABLE_REGS_DOUBLE  (_ILIT(6))
  #define ALLOCATABLE_REGS_FLOAT   (_ILIT(0))
  #if i386_TARGET_ARCH
  #define ALLOCATABLE_REGS_INTEGER (_ILIT(3))
  #define ALLOCATABLE_REGS_DOUBLE  (_ILIT(6))
  #define ALLOCATABLE_REGS_FLOAT   (_ILIT(0))
+#define ALLOCATABLE_REGS_SSE     (_ILIT(8))
+
  
  #elif x86_64_TARGET_ARCH
  #define ALLOCATABLE_REGS_INTEGER (_ILIT(5))
  
  #elif x86_64_TARGET_ARCH
  #define ALLOCATABLE_REGS_INTEGER (_ILIT(5))
-#define ALLOCATABLE_REGS_DOUBLE  (_ILIT(2))
+#define ALLOCATABLE_REGS_DOUBLE  (_ILIT(0))
  #define ALLOCATABLE_REGS_FLOAT   (_ILIT(0))
  #define ALLOCATABLE_REGS_FLOAT   (_ILIT(0))
+#define ALLOCATABLE_REGS_SSE     (_ILIT(10))
  
  #elif powerpc_TARGET_ARCH
  #define ALLOCATABLE_REGS_INTEGER (_ILIT(16))
  #define ALLOCATABLE_REGS_DOUBLE  (_ILIT(26))
  #define ALLOCATABLE_REGS_FLOAT   (_ILIT(0))
  
  #elif powerpc_TARGET_ARCH
  #define ALLOCATABLE_REGS_INTEGER (_ILIT(16))
  #define ALLOCATABLE_REGS_DOUBLE  (_ILIT(26))
  #define ALLOCATABLE_REGS_FLOAT   (_ILIT(0))
+#define ALLOCATABLE_REGS_SSE     (_ILIT(0))
+
  
  #elif sparc_TARGET_ARCH
  #define ALLOCATABLE_REGS_INTEGER (_ILIT(14))
  
  #elif sparc_TARGET_ARCH
  #define ALLOCATABLE_REGS_INTEGER (_ILIT(14))
-#define ALLOCATABLE_REGS_DOUBLE  (_ILIT(8))
-#define ALLOCATABLE_REGS_FLOAT   (_ILIT(6))
+#define ALLOCATABLE_REGS_DOUBLE  (_ILIT(11))
+#define ALLOCATABLE_REGS_FLOAT   (_ILIT(22))
+#define ALLOCATABLE_REGS_SSE     (_ILIT(0))
+
  
  #else
  
  #else
-#error ToDo: define ALLOCATABLE_REGS_INTEGER and ALLOCATABLE_REGS_DOUBLE
+#error ToDo: choose which trivColorable function to use for this architecture.
  #endif
  
  #endif
  
-trivColorable 
-       :: (Reg -> RegClass) 
-       -> Triv Reg RegClass Reg
-       
-trivColorable regClass _ conflicts exclusions
- = {-# SCC "trivColorable" #-}
-   let
-       isSqueesed cI cF ufm
-         = case ufm of
-               NodeUFM _ _ left right
-                -> case isSqueesed cI cF right of
-                       (# s, cI', cF' #)
-                        -> case s of
-                               False   -> isSqueesed cI' cF' left
-                               True    -> (# True, cI', cF' #)
-
-               LeafUFM _ reg
-                -> case regClass reg of
-                       RcInteger
-                        -> case cI +# _ILIT(1) of
-                               cI' -> (# cI' >=# ALLOCATABLE_REGS_INTEGER, cI', cF #)
-
-                       RcDouble
-                        -> case cF +# _ILIT(1) of
-                               cF' -> (# cF' >=# ALLOCATABLE_REGS_DOUBLE,  cI, cF' #)
-
-                       RcFloat 
-                        -> case cF +# _ILIT(1) of
-                               cF' -> (# cF' >=# ALLOCATABLE_REGS_FLOAT,   cI, cF' #)
-
-               EmptyUFM
-                ->     (# False, cI, cF #)
-
-   in case isSqueesed (_ILIT(0)) (_ILIT(0)) conflicts of
-       (# False, cI', cF' #)
-        -> case isSqueesed cI' cF' exclusions of
-               (# s, _, _ #)   -> not s
-
-       (# True, _, _ #)
-        -> False
+
+
+-- Disjoint registers ----------------------------------------------------------
+--
+--      The definition has been unfolded into individual cases for speed.
+--      Each architecture has a different register setup, so we use a
+--      different regSqueeze function for each.
+--
+accSqueeze
+        :: FastInt
+        -> FastInt
+        -> (reg -> FastInt)
+        -> UniqFM reg
+        -> FastInt
+
+accSqueeze count maxCount squeeze ufm = acc count (eltsUFM ufm)
+  where acc count [] = count
+        acc count _ | count >=# maxCount = count
+        acc count (r:rs) = acc (count +# squeeze r) rs
+
+{- Note [accSqueeze]
+~~~~~~~~~~~~~~~~~~~~
+BL 2007/09
+Doing a nice fold over the UniqSet makes trivColorable use
+32% of total compile time and 42% of total alloc when compiling SHA1.lhs from darcs.
+Therefore the UniqFM is made non-abstract and we use custom fold.
+
+MS 2010/04
+When converting UniqFM to use Data.IntMap, the fold cannot use UniqFM internal
+representation any more. But it is imperative that the assSqueeze stops
+the folding if the count gets greater or equal to maxCount. We thus convert
+UniqFM to a (lazy) list, do the fold and stops if necessary, which was
+the most efficient variant tried. Benchmark compiling 10-times SHA1.lhs follows.
+(original = previous implementation, folding = fold of the whole UFM,
+ lazyFold = the current implementation,
+ hackFold = using internal representation of Data.IntMap)
+
+                                 original  folding   hackFold  lazyFold
+ -O -fasm (used everywhere)      31.509s   30.387s   30.791s   30.603s
+                                 100.00%   96.44%    97.72%    97.12%
+ -fregs-graph                    67.938s   74.875s   62.673s   64.679s
+                                 100.00%   110.21%   92.25%    95.20%
+ -fregs-iterative                89.761s   143.913s  81.075s   86.912s
+                                 100.00%   160.33%   90.32%    96.83%
+ -fnew-codegen                   38.225s   37.142s   37.551s   37.119s
+                                 100.00%   97.17%    98.24%    97.11%
+ -fnew-codegen -fregs-graph      91.786s   91.51s    87.368s   86.88s
+                                 100.00%   99.70%    95.19%    94.65%
+ -fnew-codegen -fregs-iterative  206.72s   343.632s  194.694s  208.677s
+                                 100.00%   166.23%   94.18%    100.95%
+-}
+
+trivColorable
+        :: (RegClass -> VirtualReg -> FastInt)
+        -> (RegClass -> RealReg    -> FastInt)
+        -> Triv VirtualReg RegClass RealReg
+
+trivColorable virtualRegSqueeze realRegSqueeze RcInteger conflicts exclusions
+        | count2        <- accSqueeze (_ILIT(0)) ALLOCATABLE_REGS_INTEGER
+                                (virtualRegSqueeze RcInteger)
+                                conflicts
+
+        , count3        <- accSqueeze  count2    ALLOCATABLE_REGS_INTEGER
+                                (realRegSqueeze   RcInteger)
+                                exclusions
+
+        = count3 <# ALLOCATABLE_REGS_INTEGER
+
+trivColorable virtualRegSqueeze realRegSqueeze RcFloat conflicts exclusions
+        | count2        <- accSqueeze (_ILIT(0)) ALLOCATABLE_REGS_FLOAT
+                                (virtualRegSqueeze RcFloat)
+                                conflicts
+
+        , count3        <- accSqueeze  count2    ALLOCATABLE_REGS_FLOAT
+                                (realRegSqueeze   RcFloat)
+                                exclusions
+
+        = count3 <# ALLOCATABLE_REGS_FLOAT
+
+trivColorable virtualRegSqueeze realRegSqueeze RcDouble conflicts exclusions
+        | count2        <- accSqueeze (_ILIT(0)) ALLOCATABLE_REGS_DOUBLE
+                                (virtualRegSqueeze RcDouble)
+                                conflicts
+
+        , count3        <- accSqueeze  count2    ALLOCATABLE_REGS_DOUBLE
+                                (realRegSqueeze   RcDouble)
+                                exclusions
+
+        = count3 <# ALLOCATABLE_REGS_DOUBLE
+
+trivColorable virtualRegSqueeze realRegSqueeze RcDoubleSSE conflicts exclusions
+        | count2        <- accSqueeze (_ILIT(0)) ALLOCATABLE_REGS_SSE
+                                (virtualRegSqueeze RcDoubleSSE)
+                                conflicts
+
+        , count3        <- accSqueeze  count2    ALLOCATABLE_REGS_SSE
+                                (realRegSqueeze   RcDoubleSSE)
+                                exclusions
+
+        = count3 <# ALLOCATABLE_REGS_SSE
+
+
+-- Specification Code ----------------------------------------------------------
+--
+--      The trivColorable function for each particular architecture should
+--      implement the following function, but faster.
+--
+
+{-
+trivColorable :: RegClass -> UniqSet Reg -> UniqSet Reg -> Bool
+trivColorable classN conflicts exclusions
+ = let
+
+        acc :: Reg -> (Int, Int) -> (Int, Int)
+        acc r (cd, cf)
+         = case regClass r of
+                RcInteger       -> (cd+1, cf)
+                RcFloat         -> (cd,   cf+1)
+                _               -> panic "Regs.trivColorable: reg class not handled"
+
+        tmp                     = foldUniqSet acc (0, 0) conflicts
+        (countInt,  countFloat) = foldUniqSet acc tmp    exclusions
+
+        squeese         = worst countInt   classN RcInteger
+                        + worst countFloat classN RcFloat
+
+   in   squeese < allocatableRegsInClass classN
+
+-- | Worst case displacement
+--      node N of classN has n neighbors of class C.
+--
+--      We currently only have RcInteger and RcDouble, which don't conflict at all.
+--      This is a bit boring compared to what's in RegArchX86.
+--
+worst :: Int -> RegClass -> RegClass -> Int
+worst n classN classC
+ = case classN of
+        RcInteger
+         -> case classC of
+                RcInteger       -> min n (allocatableRegsInClass RcInteger)
+                RcFloat         -> 0
+
+        RcDouble
+         -> case classC of
+                RcFloat         -> min n (allocatableRegsInClass RcFloat)
+                RcInteger       -> 0
+
+-- allocatableRegs is allMachRegNos with the fixed-use regs removed.
+-- i.e., these are the regs for which we are prepared to allow the
+-- register allocator to attempt to map VRegs to.
+allocatableRegs :: [RegNo]
+allocatableRegs
+   = let isFree i = isFastTrue (freeReg i)
+     in  filter isFree allMachRegNos
+
+
+-- | The number of regs in each class.
+--      We go via top level CAFs to ensure that we're not recomputing
+--      the length of these lists each time the fn is called.
+allocatableRegsInClass :: RegClass -> Int
+allocatableRegsInClass cls
+ = case cls of
+        RcInteger       -> allocatableRegsInteger
+        RcFloat         -> allocatableRegsDouble
+
+allocatableRegsInteger :: Int
+allocatableRegsInteger
+        = length $ filter (\r -> regClass r == RcInteger)
+                 $ map RealReg allocatableRegs
+
+allocatableRegsFloat :: Int
+allocatableRegsFloat
+        = length $ filter (\r -> regClass r == RcFloat
+                 $ map RealReg allocatableRegs
+-}