compiler/cmm/CmmStackLayout.hs

   1 {-# OPTIONS_GHC -XGADTs -XNoMonoLocalBinds #-}
   2 -- Norman likes local bindings
   3 -- If this module lives on I'd like to get rid of this flag in due course
   4
   5 -- Todo: remove
   6 {-# OPTIONS_GHC -fno-warn-warnings-deprecations #-}
   7
   8 {-# OPTIONS_GHC -fno-warn-incomplete-patterns #-}
   9 #if __GLASGOW_HASKELL__ >= 701
  10 -- GHC 7.0.1 improved incomplete pattern warnings with GADTs
  11 {-# OPTIONS_GHC -fwarn-incomplete-patterns #-}
  12 #endif
  13
  14 module CmmStackLayout
  15     ( SlotEnv, liveSlotAnal, liveSlotTransfers, removeLiveSlotDefs
  16     , getSpEntryMap, layout, manifestSP, igraph, areaBuilder
  17     , stubSlotsOnDeath ) -- to help crash early during debugging
  18 where
  19
  20 import Constants
  21 import Prelude hiding (succ, zip, unzip, last)
  22
  23 import BlockId
  24 import Cmm
  25 import CmmExpr
  26 import CmmProcPoint
  27 import Maybes
  28 import MkGraph (stackStubExpr)
  29 import Control.Monad
  30 import OptimizationFuel
  31 import Outputable
  32 import SMRep (ByteOff)
  33
  34 import Compiler.Hoopl
  35
  36 import Data.Map (Map)
  37 import qualified Data.Map as Map
  38 import qualified FiniteMap as Map
  39
  40 ------------------------------------------------------------------------
  41 --                    Stack Layout                                    --
  42 ------------------------------------------------------------------------
  43
  44 -- | Before we lay out the stack, we need to know something about the
  45 -- liveness of the stack slots. In particular, to decide whether we can
  46 -- reuse a stack location to hold multiple stack slots, we need to know
  47 -- when each of the stack slots is used.
  48 -- Although tempted to use something simpler, we really need a full interference
  49 -- graph. Consider the following case:
  50 --   case <...> of
  51 --     1 -> <spill x>; // y is dead out
  52 --     2 -> <spill y>; // x is dead out
  53 --     3 -> <spill x and y>
  54 -- If we consider the arms in order and we use just the deadness information given by a
  55 -- dataflow analysis, we might decide to allocate the stack slots for x and y
  56 -- to the same stack location, which will lead to incorrect code in the third arm.
  57 -- We won't make this mistake with an interference graph.
  58
  59 -- First, the liveness analysis.
  60 -- We represent a slot with an area, an offset into the area, and a width.
  61 -- Tracking the live slots is a bit tricky because there may be loads and stores
  62 -- into only a part of a stack slot (e.g. loading the low word of a 2-word long),
  63 -- e.g. Slot A 0 8 overlaps with Slot A 4 4.
  64 --
  65 -- The definition of a slot set is intended to reduce the number of overlap
  66 -- checks we have to make. There's no reason to check for overlap between
  67 -- slots in different areas, so we segregate the map by Area's.
  68 -- We expect few slots in each Area, so we collect them in an unordered list.
  69 -- To keep these lists short, any contiguous live slots are coalesced into
  70 -- a single slot, on insertion.
  71
  72 slotLattice :: DataflowLattice SubAreaSet
  73 slotLattice = DataflowLattice "live slots" Map.empty add
  74   where add _ (OldFact old) (NewFact new) = case Map.foldRightWithKey addArea (False, old) new of
  75                                               (change, x) -> (changeIf change, x)
  76         addArea a newSlots z = foldr (addSlot a) z newSlots
  77         addSlot a slot (changed, map) =
  78           let (c, live) = liveGen slot $ Map.findWithDefault [] a map
  79           in (c || changed, Map.insert a live map)
  80
  81 slotLatticeJoin :: [SubAreaSet] -> SubAreaSet
  82 slotLatticeJoin facts = foldr extend (fact_bot slotLattice) facts
  83   where extend fact res = snd $ fact_join slotLattice undefined (OldFact fact) (NewFact res)
  84
  85 type SlotEnv   = BlockEnv SubAreaSet
  86   -- The sub-areas live on entry to the block
  87
  88 liveSlotAnal :: CmmGraph -> FuelUniqSM SlotEnv
  89 liveSlotAnal g = liftM snd $ dataflowPassBwd g [] $ analBwd slotLattice liveSlotTransfers
  90
  91 -- Add the subarea s to the subareas in the list-set (possibly coalescing it with
  92 -- adjacent subareas), and also return whether s was a new addition.
  93 liveGen :: SubArea -> [SubArea] -> (Bool, [SubArea])
  94 liveGen s set = liveGen' s set []
  95   where liveGen' s [] z = (True, s : z)
  96         liveGen' s@(a, hi, w) (s'@(a', hi', w') : rst) z =
  97           if a /= a' || hi < lo' || lo > hi' then    -- no overlap
  98             liveGen' s rst (s' : z)
  99           else if s' `contains` s then               -- old contains new
 100             (False, set)
 101           else                                       -- overlap: coalesce the slots
 102             let new_hi = max hi hi'
 103                 new_lo = min lo lo'
 104             in liveGen' (a, new_hi, new_hi - new_lo) rst z
 105           where lo  = hi  - w  -- remember: areas grow down
 106                 lo' = hi' - w'
 107         contains (a, hi, w) (a', hi', w') =
 108           a == a' && hi >= hi' && hi - w <= hi' - w'
 109
 110 liveKill :: SubArea -> [SubArea] -> [SubArea]
 111 liveKill (a, hi, w) set = -- pprTrace "killing slots in area" (ppr a) $
 112                           liveKill' set []
 113   where liveKill' [] z = z
 114         liveKill' (s'@(a', hi', w') : rst) z =
 115           if a /= a' || hi < lo' || lo > hi' then    -- no overlap
 116             liveKill' rst (s' : z)
 117           else                                       -- overlap: split the old slot
 118             let z'  = if hi' > hi  then (a, hi', hi' - hi)  : z else z
 119                 z'' = if lo  > lo' then (a, lo,  lo  - lo') : z' else z'
 120             in liveKill' rst z''
 121           where lo  = hi  - w  -- remember: areas grow down
 122                 lo' = hi' - w'
 123
 124 -- Note: the stack slots that hold variables returned on the stack are not
 125 -- considered live in to the block -- we treat the first node as a definition site.
 126 -- BEWARE?: Am I being a little careless here in failing to check for the
 127 -- entry Id (which would use the CallArea Old).
 128 liveSlotTransfers :: BwdTransfer CmmNode SubAreaSet
 129 liveSlotTransfers = mkBTransfer3 frt mid lst
 130   where frt :: CmmNode C O -> SubAreaSet -> SubAreaSet
 131         frt (CmmEntry l) f = Map.delete (CallArea (Young l)) f
 132
 133         mid :: CmmNode O O -> SubAreaSet -> SubAreaSet
 134         mid n f = foldSlotsUsed addSlot (removeLiveSlotDefs f n) n
 135         lst :: CmmNode O C -> FactBase SubAreaSet -> SubAreaSet
 136         lst n f = liveInSlots n $ case n of
 137           CmmCall {cml_cont=Nothing, cml_args=args} -> add_area (CallArea Old) args out
 138           CmmCall {cml_cont=Just k, cml_args=args}  -> add_area (CallArea Old) args (add_area (CallArea (Young k)) args out)
 139           CmmForeignCall {succ=k, updfr=oldend}     -> add_area (CallArea Old) oldend (add_area (CallArea (Young k)) wORD_SIZE out)
 140           _                                         -> out
 141          where out = joinOutFacts slotLattice n f
 142                add_area _ n live | n == 0 = live
 143                add_area a n live = Map.insert a (snd $ liveGen (a, n, n) $ Map.findWithDefault [] a live) live
 144
 145 -- Slot sets: adding slots, removing slots, and checking for membership.
 146 liftToArea :: Area -> ([SubArea] -> [SubArea]) -> SubAreaSet -> SubAreaSet
 147 addSlot, removeSlot :: SubAreaSet -> SubArea -> SubAreaSet
 148 elemSlot            :: SubAreaSet -> SubArea -> Bool
 149 liftToArea a f map = Map.insert a (f (Map.findWithDefault [] a map)) map
 150 addSlot    live (a, i, w) = liftToArea a (snd . liveGen  (a, i, w)) live
 151 removeSlot live (a, i, w) = liftToArea a       (liveKill (a, i, w)) live
 152 elemSlot   live (a, i, w) =
 153   not $ fst $ liveGen  (a, i, w) (Map.findWithDefault [] a live)
 154
 155 removeLiveSlotDefs :: (DefinerOfSlots s, UserOfSlots s) => SubAreaSet -> s -> SubAreaSet
 156 removeLiveSlotDefs = foldSlotsDefd removeSlot
 157
 158 liveInSlots :: (DefinerOfSlots s, UserOfSlots s) => s -> SubAreaSet -> SubAreaSet
 159 liveInSlots x live = foldSlotsUsed addSlot (removeLiveSlotDefs live x) x
 160
 161 liveLastIn :: CmmNode O C -> (BlockId -> SubAreaSet) -> SubAreaSet
 162 liveLastIn l env = liveInSlots l (liveLastOut env l)
 163
 164 -- Don't forget to keep the outgoing parameters in the CallArea live,
 165 -- as well as the update frame.
 166 -- Note: We have to keep the update frame live at a call because of the
 167 -- case where the function doesn't return -- in that case, there won't
 168 -- be a return to keep the update frame live. We'd still better keep the
 169 -- info pointer in the update frame live at any call site;
 170 -- otherwise we could screw up the garbage collector.
 171 liveLastOut :: (BlockId -> SubAreaSet) -> CmmNode O C -> SubAreaSet
 172 liveLastOut env l =
 173   case l of
 174     CmmCall _ Nothing n _ _ ->
 175       add_area (CallArea Old) n out -- add outgoing args (includes upd frame)
 176     CmmCall _ (Just k) n _ _ ->
 177       add_area (CallArea Old) n (add_area (CallArea (Young k)) n out)
 178     CmmForeignCall { succ = k, updfr = oldend } ->
 179       add_area (CallArea Old) oldend (add_area (CallArea (Young k)) wORD_SIZE out)
 180     _ -> out
 181   where out = slotLatticeJoin $ map env $ successors l
 182         add_area _ n live | n == 0 = live
 183         add_area a n live =
 184           Map.insert a (snd $ liveGen (a, n, n) $ Map.findWithDefault [] a live) live
 185
 186 -- The liveness analysis must be precise: otherwise, we won't know if a definition
 187 -- should really kill a live-out stack slot.
 188 -- But the interference graph does not have to be precise -- it might decide that
 189 -- any live areas interfere. To maintain both a precise analysis and an imprecise
 190 -- interference graph, we need to convert the live-out stack slots to graph nodes
 191 -- at each and every instruction; rather than reconstruct a new list of nodes
 192 -- every time, I provide a function to fold over the nodes, which should be a
 193 -- reasonably efficient approach for the implementations we envision.
 194 -- Of course, it will probably be much easier to program if we just return a list...
 195 type Set x = Map x ()
 196 data IGraphBuilder n =
 197   Builder { foldNodes     :: forall z. SubArea -> (n -> z -> z) -> z -> z
 198           , _wordsOccupied :: AreaSizeMap -> AreaMap -> n -> [Int]
 199           }
 200
 201 areaBuilder :: IGraphBuilder Area
 202 areaBuilder = Builder fold words
 203   where fold (a, _, _) f z = f a z
 204         words areaSize areaMap a =
 205           case Map.lookup a areaMap of
 206             Just addr -> [addr .. addr + (Map.lookup a areaSize `orElse`
 207                                           pprPanic "wordsOccupied: unknown area" (ppr areaSize <+> ppr a))]
 208             Nothing   -> []
 209
 210 --slotBuilder :: IGraphBuilder (Area, Int)
 211 --slotBuilder = undefined
 212
 213 -- Now, we can build the interference graph.
 214 -- The usual story: a definition interferes with all live outs and all other
 215 -- definitions.
 216 type IGraph x = Map x (Set x)
 217 type IGPair x = (IGraph x, IGraphBuilder x)
 218 igraph :: (Ord x) => IGraphBuilder x -> SlotEnv -> CmmGraph -> IGraph x
 219 igraph builder env g = foldr interfere Map.empty (postorderDfs g)
 220   where foldN = foldNodes builder
 221         interfere block igraph = foldBlockNodesB3 (first, middle, last) block igraph
 222           where first _ (igraph, _) = igraph
 223                 middle node (igraph, liveOut) =
 224                   (addEdges igraph node liveOut, liveInSlots node liveOut)
 225                 last node igraph =
 226                   (addEdges igraph node $ liveLastOut env' node, liveLastIn node env')
 227
 228                 -- add edges between a def and the other defs and liveouts
 229                 addEdges igraph i out = fst $ foldSlotsDefd addDef (igraph, out) i
 230                 addDef (igraph, out) def@(a, _, _) =
 231                   (foldN def (addDefN out) igraph,
 232                    Map.insert a (snd $ liveGen def (Map.findWithDefault [] a out)) out)
 233                 addDefN out n igraph =
 234                   let addEdgeNO o igraph = foldN o addEdgeNN igraph
 235                       addEdgeNN n' igraph = addEdgeNN' n n' $ addEdgeNN' n' n igraph
 236                       addEdgeNN' n n' igraph = Map.insert n (Map.insert n' () set) igraph
 237                         where set = Map.findWithDefault Map.empty n igraph
 238                   in Map.foldRightWithKey (\ _ os igraph -> foldr addEdgeNO igraph os) igraph out
 239                 env' bid = mapLookup bid env `orElse` panic "unknown blockId in igraph"
 240
 241 -- Before allocating stack slots, we need to collect one more piece of information:
 242 -- what's the highest offset (in bytes) used in each Area?
 243 -- We'll need to allocate that much space for each Area.
 244
 245 -- Mapping of areas to area sizes (not offsets!)
 246 type AreaSizeMap = AreaMap
 247
 248 -- JD: WHY CAN'T THIS COME FROM THE slot-liveness info?
 249 getAreaSize :: ByteOff -> CmmGraph -> AreaSizeMap
 250   -- The domain of the returned mapping consists only of Areas
 251   -- used for (a) variable spill slots, and (b) parameter passing areas for calls
 252 getAreaSize entry_off g =
 253   foldGraphBlocks (foldBlockNodesF3 (first, add_regslots, last))
 254               (Map.singleton (CallArea Old) entry_off) g
 255   where first _  z = z
 256         last :: CmmNode O C -> Map Area Int -> Map Area Int
 257         last l@(CmmCall _ Nothing args res _) z  =  add_regslots l (add (add z area args) area res)
 258           where area = CallArea Old
 259         last l@(CmmCall _ (Just k) args res _) z =  add_regslots l (add (add z area args) area res)
 260           where area = CallArea (Young k)
 261         last l@(CmmForeignCall {succ = k}) z     =  add_regslots l (add z area wORD_SIZE)
 262           where area = CallArea (Young k)
 263         last l z                                 =  add_regslots l z
 264         add_regslots i z = foldSlotsUsed addSlot (foldSlotsDefd addSlot z i) i
 265         addSlot z (a@(RegSlot (LocalReg _ ty)), _, _) =
 266           add z a $ widthInBytes $ typeWidth ty
 267         addSlot z _ = z
 268         add z a off = Map.insert a (max off (Map.findWithDefault 0 a z)) z
 269         -- The 'max' is important.  Two calls, to f and g, might share a common
 270         -- continuation (and hence a common CallArea), but their number of overflow
 271         -- parameters might differ.
 272         -- EZY: Ought to use insert with combining function...
 273
 274
 275 -- Find the Stack slots occupied by the subarea's conflicts
 276 conflictSlots :: Ord x => IGPair x -> AreaSizeMap -> AreaMap -> SubArea -> Set Int
 277 conflictSlots (ig, Builder foldNodes wordsOccupied) areaSize areaMap subarea =
 278   foldNodes subarea foldNode Map.empty
 279   where foldNode n set = Map.foldRightWithKey conflict set $ Map.findWithDefault Map.empty n ig
 280         conflict n' () set = liveInSlots areaMap n' set
 281         -- Add stack slots occupied by igraph node n
 282         liveInSlots areaMap n set = foldr setAdd set (wordsOccupied areaSize areaMap n)
 283         setAdd w s = Map.insert w () s
 284
 285 -- Find any open space for 'area' on the stack, starting from the
 286 -- 'offset'.  If the area is a CallArea or a spill slot for a pointer,
 287 -- then it must be word-aligned.
 288 freeSlotFrom :: Ord x => IGPair x -> AreaSizeMap -> Int -> AreaMap -> Area -> Int
 289 freeSlotFrom ig areaSize offset areaMap area =
 290   let size = Map.lookup area areaSize `orElse` 0
 291       conflicts = conflictSlots ig areaSize areaMap (area, size, size)
 292       -- CallAreas and Ptrs need to be word-aligned (round up!)
 293       align = case area of CallArea _                                -> align'
 294                            RegSlot  r | isGcPtrType (localRegType r) -> align'
 295                            RegSlot  _                                -> id
 296       align' n = (n + (wORD_SIZE - 1)) `div` wORD_SIZE * wORD_SIZE
 297       -- Find a space big enough to hold the area
 298       findSpace curr 0 = curr
 299       findSpace curr cnt = -- part of target slot, # of bytes left to check
 300         if Map.member curr conflicts then
 301           findSpace (align (curr + size)) size -- try the next (possibly) open space
 302         else findSpace (curr - 1) (cnt - 1)
 303   in findSpace (align (offset + size)) size
 304
 305 -- Find an open space on the stack, and assign it to the area.
 306 allocSlotFrom :: Ord x => IGPair x -> AreaSizeMap -> Int -> AreaMap -> Area -> AreaMap
 307 allocSlotFrom ig areaSize from areaMap area =
 308   if Map.member area areaMap then areaMap
 309   else Map.insert area (freeSlotFrom ig areaSize from areaMap area) areaMap
 310
 311 -- Figure out all of the offsets from the slot location; this will be
 312 -- non-zero for procpoints.
 313 type SpEntryMap = BlockEnv Int
 314 getSpEntryMap :: Int -> CmmGraph -> SpEntryMap
 315 getSpEntryMap entry_off g@(CmmGraph {g_entry = entry})
 316     = foldGraphBlocks add_sp_off (mapInsert entry entry_off emptyBlockMap) g
 317   where add_sp_off :: CmmBlock -> BlockEnv Int -> BlockEnv Int
 318         add_sp_off b env =
 319           case lastNode b of
 320             CmmCall {cml_cont=Just succ, cml_ret_args=off} -> mapInsert succ off env
 321             CmmForeignCall {succ=succ}                     -> mapInsert succ wORD_SIZE env
 322             _                                              -> env
 323
 324 -- | Greedy stack layout.
 325 -- Compute liveness, build the interference graph, and allocate slots for the areas.
 326 -- We visit each basic block in a (generally) forward order.
 327
 328 -- At each instruction that names a register subarea r, we immediately allocate
 329 -- any available slot on the stack by the following procedure:
 330 --  1. Find the sub-areas S that conflict with r
 331 --  2. Find the stack slots used for S
 332 --  3. Choose a contiguous stack space s not in S (s must be large enough to hold r)
 333
 334 -- For a CallArea, we allocate the stack space only when we reach a function
 335 -- call that returns to the CallArea's blockId.
 336 -- Then, we allocate the Area subject to the following constraints:
 337 --   a) It must be younger than all the sub-areas that are live on entry to the block
 338 --         This constraint is only necessary for the successor of a call
 339 --   b) It must not overlap with any already-allocated Area with which it conflicts
 340 --         (ie at some point, not necessarily now, is live at the same time)
 341 --   Part (b) is just the 1,2,3 part above
 342
 343 -- Note: The stack pointer only has to be younger than the youngest live stack slot
 344 -- at proc points. Otherwise, the stack pointer can point anywhere.
 345
 346 layout :: ProcPointSet -> SpEntryMap -> SlotEnv -> ByteOff -> CmmGraph -> AreaMap
 347 -- The domain of the returned map includes an Area for EVERY block
 348 -- including each block that is not the successor of a call (ie is not a proc-point)
 349 -- That's how we return the info of what the SP should be at the entry of every non
 350 -- procpoint block.  However, note that procpoint blocks have their
 351 -- /slot/ stored, which is not necessarily the value of the SP on entry
 352 -- to the block (in fact, it probably isn't, due to argument passing).
 353 -- See [Procpoint Sp offset]
 354
 355 layout procPoints spEntryMap env entry_off g =
 356   let ig = (igraph areaBuilder env g, areaBuilder)
 357       env' bid = mapLookup bid env `orElse` panic "unknown blockId in igraph"
 358       areaSize = getAreaSize entry_off g
 359
 360       -- Find the youngest live stack slot that has already been allocated
 361       youngest_live :: AreaMap     -- Already allocated
 362                     -> SubAreaSet  -- Sub-areas live here
 363                     -> ByteOff     -- Offset of the youngest byte of any
 364                                    --    already-allocated, live sub-area
 365       youngest_live areaMap live = fold_subareas young_slot live 0
 366         where young_slot (a, o, _) z = case Map.lookup a areaMap of
 367                                          Just top -> max z $ top + o
 368                                          Nothing  -> z
 369               fold_subareas f m z = Map.foldRightWithKey (\_ s z -> foldr f z s) z m
 370
 371       -- Allocate space for spill slots and call areas
 372       allocVarSlot = allocSlotFrom ig areaSize 0
 373
 374       -- Update the successor's incoming SP.
 375       setSuccSPs inSp bid areaMap =
 376         case (Map.lookup area areaMap , mapLookup bid (toBlockMap g)) of
 377           (Just _, _) -> areaMap -- succ already knows incoming SP
 378           (Nothing, Just _) ->
 379             if setMember bid procPoints then
 380               let young = youngest_live areaMap $ env' bid
 381                   -- start = case returnOff stackInfo of Just b  -> max b young
 382                   --                                     Nothing -> young
 383                   start = young -- maybe wrong, but I don't understand
 384                                 -- why the preceding is necessary...
 385               in  allocSlotFrom ig areaSize start areaMap area
 386             else Map.insert area inSp areaMap
 387           (_, Nothing) -> panic "Block not found in cfg"
 388         where area = CallArea (Young bid)
 389
 390       layoutAreas areaMap block = foldBlockNodesF3 (flip const, allocMid, allocLast (entryLabel block)) block areaMap
 391       allocMid m areaMap = foldSlotsDefd alloc' (foldSlotsUsed alloc' areaMap m) m
 392       allocLast bid l areaMap =
 393         foldr (setSuccSPs inSp) areaMap' (successors l)
 394         where inSp = slot + spOffset -- [Procpoint Sp offset]
 395               -- If it's not in the map, we should use our previous
 396               -- calculation unchanged.
 397               spOffset = mapLookup bid spEntryMap `orElse` 0
 398               slot = expectJust "slot in" $ Map.lookup (CallArea (Young bid)) areaMap
 399               areaMap' = foldSlotsDefd alloc' (foldSlotsUsed alloc' areaMap l) l
 400       alloc' areaMap (a@(RegSlot _), _, _) = allocVarSlot areaMap a
 401       alloc' areaMap _ = areaMap
 402
 403       initMap = Map.insert (CallArea (Young (g_entry g))) 0
 404               . Map.insert (CallArea Old)                 0
 405               $ Map.empty
 406
 407       areaMap = foldl layoutAreas initMap (postorderDfs g)
 408   in -- pprTrace "ProcPoints" (ppr procPoints) $
 409      -- pprTrace "Area SizeMap" (ppr areaSize) $
 410      -- pprTrace "Entry offset" (ppr entry_off) $
 411      -- pprTrace "Area Map" (ppr areaMap) $
 412      areaMap
 413
 414 {- Note [Procpoint Sp offset]
 415
 416 The calculation of inSp is a little tricky.  (Un)fortunately, if you get
 417 it wrong, you will get inefficient but correct code.  You know you've
 418 got it wrong if the generated stack pointer bounces up and down for no
 419 good reason.
 420
 421 Why can't we just set inSp to the location of the slot?  (This is what
 422 the code used to do.)  The trouble is when we actually hit the proc
 423 point the start of the slot will not be the same as the actual Sp due
 424 to argument passing:
 425
 426   a:
 427       I32[(young<b> + 4)] = cde;
 428       // Stack pointer is moved to young end (bottom) of young<b> for call
 429       // +-------+
 430       // | arg 1 |
 431       // +-------+ <- Sp
 432       call (I32[foobar::I32])(...) returns to Just b (4) (4) with update frame 4;
 433   b:
 434       // After call, stack pointer is above the old end (top) of
 435       // young<b> (the difference is spOffset)
 436       // +-------+ <- Sp
 437       // | arg 1 |
 438       // +-------+
 439
 440 If we blithely set the Sp to be the same as the slot (the young end of
 441 young<b>), an adjustment will be necessary when we go to the next block.
 442 This is wasteful.  So, instead, for the next block after a procpoint,
 443 the actual Sp should be set to the same as the true Sp when we just
 444 entered the procpoint.  Then manifestSP will automatically do the right
 445 thing.
 446
 447 Questions you may ask:
 448
 449 1. Why don't we need to change the mapping for the procpoint itself?
 450    Because manifestSP does its own calculation of the true stack value,
 451    manifestSP will notice the discrepancy between the actual stack
 452    pointer and the slot start, and adjust all of its memory accesses
 453    accordingly.  So the only problem is when we adjust the Sp in
 454    preparation for the successor block; that's why this code is here and
 455    not in setSuccSPs.
 456
 457 2. Why don't we make the procpoint call area and the true offset match
 458    up?  If we did that, we would never use memory above the true value
 459    of the stack pointer, thus wasting all of the stack we used to store
 460    arguments.  You might think that some clever changes to the slot
 461    offsets, using negative offsets, might fix it, but this does not make
 462    semantic sense.
 463
 464 3. If manifestSP is already calculating the true stack value, why we can't
 465    do this trick inside manifestSP itself?  The reason is that if two
 466    branches join with inconsistent SPs, one of them has to be fixed: we
 467    can't know what the fix should be without already knowing what the
 468    chosen location of SP is on the next successor.  (This is
 469    the "succ already knows incoming SP" case), This calculation cannot
 470    be easily done in manifestSP, since it processes the nodes
 471    /backwards/.  So we need to have figured this out before we hit
 472    manifestSP.
 473 -}
 474
 475 -- After determining the stack layout, we can:
 476 -- 1. Replace references to stack Areas with addresses relative to the stack
 477 --    pointer.
 478 -- 2. Insert adjustments to the stack pointer to ensure that it is at a
 479 --    conventional location at each proc point.
 480 --    Because we don't take interrupts on the execution stack, we only need the
 481 --    stack pointer to be younger than the live values on the stack at proc points.
 482 -- 3. Compute the maximum stack offset used in the procedure and replace
 483 --    the stack high-water mark with that offset.
 484 manifestSP :: SpEntryMap -> AreaMap -> ByteOff -> CmmGraph -> FuelUniqSM CmmGraph
 485 manifestSP spEntryMap areaMap entry_off g@(CmmGraph {g_entry=entry}) =
 486   ofBlockMap entry `liftM` foldl replB (return mapEmpty) (postorderDfs g)
 487   where slot a = -- pprTrace "slot" (ppr a) $
 488                    Map.lookup a areaMap `orElse` panic "unallocated Area"
 489         slot' (Just id) = slot $ CallArea (Young id)
 490         slot' Nothing   = slot $ CallArea Old
 491         sp_high = maxSlot slot g
 492         proc_entry_sp = slot (CallArea Old) + entry_off
 493
 494         spOffset id = mapLookup id spEntryMap `orElse` 0
 495
 496         sp_on_entry id | id == entry = proc_entry_sp
 497         sp_on_entry id = slot' (Just id) + spOffset id
 498
 499         -- On entry to procpoints, the stack pointer is conventional;
 500         -- otherwise, we check the SP set by predecessors.
 501         replB :: FuelUniqSM (BlockEnv CmmBlock) -> CmmBlock -> FuelUniqSM (BlockEnv CmmBlock)
 502         replB blocks block =
 503           do let (head, middles, JustC tail :: MaybeC C (CmmNode O C)) = blockToNodeList block
 504                  middles' = map (middle spIn) middles
 505              bs <- replLast head middles' tail
 506              flip (foldr insertBlock) bs `liftM` blocks
 507           where spIn = sp_on_entry (entryLabel block)
 508
 509                 middle spOff m = mapExpDeep (replSlot spOff) m
 510                 -- XXX there shouldn't be any global registers in the
 511                 -- CmmCall, so there shouldn't be any slots in
 512                 -- CmmCall... check that...
 513                 last   spOff l = mapExpDeep (replSlot spOff) l
 514                 replSlot spOff (CmmStackSlot a i) = CmmRegOff (CmmGlobal Sp) (spOff - (slot a + i))
 515                 replSlot _ (CmmLit CmmHighStackMark) = -- replacing the high water mark
 516                   CmmLit (CmmInt (toInteger (max 0 (sp_high - proc_entry_sp))) (typeWidth bWord))
 517                 -- Invariant: Sp is always greater than SpLim.  Thus, if
 518                 -- the high water mark is zero, we can optimize away the
 519                 -- conditional branch.  Relies on dead code elimination
 520                 -- to get rid of the dead GC blocks.
 521                 -- EZY: Maybe turn this into a guard that checks if a
 522                 -- statement is stack-check ish?  Maybe we should make
 523                 -- an actual mach-op for it, so there's no chance of
 524                 -- mixing this up with something else...
 525                 replSlot _ (CmmMachOp (MO_U_Lt _)
 526                               [CmmMachOp (MO_Sub _)
 527                                          [ CmmReg (CmmGlobal Sp)
 528                                          , CmmLit (CmmInt 0 _)],
 529                                CmmReg (CmmGlobal SpLim)]) = CmmLit (CmmInt 0 wordWidth)
 530                 replSlot _ e = e
 531
 532                 replLast :: MaybeC C (CmmNode C O) -> [CmmNode O O] -> CmmNode O C -> FuelUniqSM [CmmBlock]
 533                 replLast h m l@(CmmCall _ k n _ _)       = updSp (slot' k + n) h m l
 534                 -- JD: LastForeignCall probably ought to have an outgoing
 535                 --     arg size, just like LastCall
 536                 replLast h m l@(CmmForeignCall {succ=k}) = updSp (slot' (Just k) + wORD_SIZE) h m l
 537                 replLast h m l@(CmmBranch k)             = updSp (sp_on_entry k) h m l
 538                 replLast h m l                           = uncurry (:) `liftM` foldr succ (return (b, [])) (successors l)
 539                   where b :: CmmBlock
 540                         b = updSp' spIn h m l
 541                         succ succId z =
 542                           let succSp = sp_on_entry succId in
 543                           if succSp /= spIn then
 544                             do (b,  bs)  <- z
 545                                (b', bs') <- insertBetween b (adjustSp succSp) succId
 546                                return (b', bs' ++ bs)
 547                           else z
 548
 549                 updSp sp h m l = return [updSp' sp h m l]
 550                 updSp' sp h m l | sp == spIn = blockOfNodeList (h, m, JustC $ last sp l)
 551                                 | otherwise  = blockOfNodeList (h, m ++ adjustSp sp, JustC $ last sp l)
 552                 adjustSp sp = [CmmAssign (CmmGlobal Sp) e]
 553                   where e = CmmMachOp (MO_Add wordWidth) [CmmReg (CmmGlobal Sp), off]
 554                         off = CmmLit $ CmmInt (toInteger $ spIn - sp) wordWidth
 555
 556
 557 -- To compute the stack high-water mark, we fold over the graph and
 558 -- compute the highest slot offset.
 559 maxSlot :: (Area -> Int) -> CmmGraph -> Int
 560 maxSlot slotOff g = foldGraphBlocks (foldBlockNodesF3 (flip const, highSlot, highSlot)) 0 g
 561   where highSlot i z = foldSlotsUsed add (foldSlotsDefd add z i) i
 562         add z (a, i, _) = max z (slotOff a + i)
 563
 564 -----------------------------------------------------------------------------
 565 -- | Sanity check: stub pointers immediately after they die
 566 -----------------------------------------------------------------------------
 567 -- This will miss stack slots that are last used in a Last node,
 568 -- but it should do pretty well...
 569
 570 stubSlotsOnDeath :: CmmGraph -> FuelUniqSM CmmGraph
 571 stubSlotsOnDeath g = liftM fst $ dataflowPassBwd g [] $ analRewBwd slotLattice
 572                                                                    liveSlotTransfers
 573                                                                    rewrites
 574     where rewrites = mkBRewrite3 frt mid lst
 575           frt _ _ = return Nothing
 576           mid m liveSlots = return $ foldSlotsUsed (stub liveSlots m) Nothing m
 577           lst _ _ = return Nothing
 578           stub liveSlots m rst subarea@(a, off, w) =
 579             if elemSlot liveSlots subarea then rst
 580             else let store = mkMiddle $ CmmStore (CmmStackSlot a off)
 581                                                  (stackStubExpr (widthFromBytes w))
 582                  in case rst of Nothing -> Just (mkMiddle m <*> store)
 583                                 Just g  -> Just (g <*> store)