compiler/cmm/CmmStackLayout.hs

   1 {-# OPTIONS_GHC -XGADTs -XNoMonoLocalBinds #-}
   2 -- Norman likes local bindings
   3 -- If this module lives on I'd like to get rid of this flag in due course
   4
   5 -- Todo: remove
   6 {-# OPTIONS_GHC -fno-warn-warnings-deprecations #-}
   7
   8 {-# OPTIONS_GHC -fno-warn-incomplete-patterns #-}
   9 #if __GLASGOW_HASKELL__ >= 701
  10 -- GHC 7.0.1 improved incomplete pattern warnings with GADTs
  11 {-# OPTIONS_GHC -fwarn-incomplete-patterns #-}
  12 #endif
  13
  14 module CmmStackLayout
  15     ( SlotEnv, liveSlotAnal, liveSlotTransfers, removeLiveSlotDefs
  16     , getSpEntryMap, layout, manifestSP, igraph, areaBuilder
  17     , stubSlotsOnDeath ) -- to help crash early during debugging
  18 where
  19
  20 import Constants
  21 import Prelude hiding (succ, zip, unzip, last)
  22
  23 import BlockId
  24 import Cmm
  25 import CmmExpr
  26 import CmmProcPoint
  27 import Maybes
  28 import MkGraph (stackStubExpr)
  29 import Control.Monad
  30 import OptimizationFuel
  31 import Outputable
  32 import SMRep (ByteOff)
  33
  34 import Compiler.Hoopl
  35
  36 import Data.Map (Map)
  37 import qualified Data.Map as Map
  38 import qualified FiniteMap as Map
  39
  40 ------------------------------------------------------------------------
  41 --                    Stack Layout                                    --
  42 ------------------------------------------------------------------------
  43
  44 -- | Before we lay out the stack, we need to know something about the
  45 -- liveness of the stack slots. In particular, to decide whether we can
  46 -- reuse a stack location to hold multiple stack slots, we need to know
  47 -- when each of the stack slots is used.
  48 -- Although tempted to use something simpler, we really need a full interference
  49 -- graph. Consider the following case:
  50 --   case <...> of
  51 --     1 -> <spill x>; // y is dead out
  52 --     2 -> <spill y>; // x is dead out
  53 --     3 -> <spill x and y>
  54 -- If we consider the arms in order and we use just the deadness information given by a
  55 -- dataflow analysis, we might decide to allocate the stack slots for x and y
  56 -- to the same stack location, which will lead to incorrect code in the third arm.
  57 -- We won't make this mistake with an interference graph.
  58
  59 -- First, the liveness analysis.
  60 -- We represent a slot with an area, an offset into the area, and a width.
  61 -- Tracking the live slots is a bit tricky because there may be loads and stores
  62 -- into only a part of a stack slot (e.g. loading the low word of a 2-word long),
  63 -- e.g. Slot A 0 8 overlaps with Slot A 4 4.
  64 --
  65 -- The definition of a slot set is intended to reduce the number of overlap
  66 -- checks we have to make. There's no reason to check for overlap between
  67 -- slots in different areas, so we segregate the map by Area's.
  68 -- We expect few slots in each Area, so we collect them in an unordered list.
  69 -- To keep these lists short, any contiguous live slots are coalesced into
  70 -- a single slot, on insertion.
  71
  72 slotLattice :: DataflowLattice SubAreaSet
  73 slotLattice = DataflowLattice "live slots" Map.empty add
  74   where add _ (OldFact old) (NewFact new) = case Map.foldRightWithKey addArea (False, old) new of
  75                                               (change, x) -> (changeIf change, x)
  76         addArea a newSlots z = foldr (addSlot a) z newSlots
  77         addSlot a slot (changed, map) =
  78           let (c, live) = liveGen slot $ Map.findWithDefault [] a map
  79           in (c || changed, Map.insert a live map)
  80
  81 slotLatticeJoin :: [SubAreaSet] -> SubAreaSet
  82 slotLatticeJoin facts = foldr extend (fact_bot slotLattice) facts
  83   where extend fact res = snd $ fact_join slotLattice undefined (OldFact fact) (NewFact res)
  84
  85 type SlotEnv   = BlockEnv SubAreaSet
  86   -- The sub-areas live on entry to the block
  87
  88 liveSlotAnal :: CmmGraph -> FuelUniqSM SlotEnv
  89 liveSlotAnal g = liftM snd $ dataflowPassBwd g [] $ analBwd slotLattice liveSlotTransfers
  90
  91 -- Add the subarea s to the subareas in the list-set (possibly coalescing it with
  92 -- adjacent subareas), and also return whether s was a new addition.
  93 liveGen :: SubArea -> [SubArea] -> (Bool, [SubArea])
  94 liveGen s set = liveGen' s set []
  95   where liveGen' s [] z = (True, s : z)
  96         liveGen' s@(a, hi, w) (s'@(a', hi', w') : rst) z =
  97           if a /= a' || hi < lo' || lo > hi' then    -- no overlap
  98             liveGen' s rst (s' : z)
  99           else if s' `contains` s then               -- old contains new
 100             (False, set)
 101           else                                       -- overlap: coalesce the slots
 102             let new_hi = max hi hi'
 103                 new_lo = min lo lo'
 104             in liveGen' (a, new_hi, new_hi - new_lo) rst z
 105           where lo  = hi  - w  -- remember: areas grow down
 106                 lo' = hi' - w'
 107         contains (a, hi, w) (a', hi', w') =
 108           a == a' && hi >= hi' && hi - w <= hi' - w'
 109
 110 liveKill :: SubArea -> [SubArea] -> [SubArea]
 111 liveKill (a, hi, w) set = -- pprTrace "killing slots in area" (ppr a) $
 112                           liveKill' set []
 113   where liveKill' [] z = z
 114         liveKill' (s'@(a', hi', w') : rst) z =
 115           if a /= a' || hi < lo' || lo > hi' then    -- no overlap
 116             liveKill' rst (s' : z)
 117           else                                       -- overlap: split the old slot
 118             let z'  = if hi' > hi  then (a, hi', hi' - hi)  : z else z
 119                 z'' = if lo  > lo' then (a, lo,  lo  - lo') : z' else z'
 120             in liveKill' rst z''
 121           where lo  = hi  - w  -- remember: areas grow down
 122                 lo' = hi' - w'
 123
 124 -- Note: the stack slots that hold variables returned on the stack are not
 125 -- considered live in to the block -- we treat the first node as a definition site.
 126 -- BEWARE?: Am I being a little careless here in failing to check for the
 127 -- entry Id (which would use the CallArea Old).
 128 liveSlotTransfers :: BwdTransfer CmmNode SubAreaSet
 129 liveSlotTransfers = mkBTransfer3 frt mid lst
 130   where frt :: CmmNode C O -> SubAreaSet -> SubAreaSet
 131         frt (CmmEntry l) f = Map.delete (CallArea (Young l)) f
 132
 133         mid :: CmmNode O O -> SubAreaSet -> SubAreaSet
 134         mid n f = foldSlotsUsed addSlot (removeLiveSlotDefs f n) n
 135         lst :: CmmNode O C -> FactBase SubAreaSet -> SubAreaSet
 136         lst n f = liveInSlots n $ case n of
 137           -- EZY: There's something fishy going on here: the old area is
 138           -- being kept alive too long.  In particular, the incoming
 139           -- parameters can be safely clobbered after they've been read
 140           -- out.
 141           CmmCall {cml_cont=Nothing, cml_args=args} -> add_area (CallArea Old) args out
 142           CmmCall {cml_cont=Just k, cml_args=args}  -> add_area (CallArea Old) args (add_area (CallArea (Young k)) args out)
 143           CmmForeignCall {succ=k, updfr=oldend}     -> add_area (CallArea Old) oldend (add_area (CallArea (Young k)) wORD_SIZE out)
 144           _                                         -> out
 145          where out = joinOutFacts slotLattice n f
 146                add_area _ n live | n == 0 = live
 147                add_area a n live = Map.insert a (snd $ liveGen (a, n, n) $ Map.findWithDefault [] a live) live
 148
 149 -- Slot sets: adding slots, removing slots, and checking for membership.
 150 liftToArea :: Area -> ([SubArea] -> [SubArea]) -> SubAreaSet -> SubAreaSet
 151 addSlot, removeSlot :: SubAreaSet -> SubArea -> SubAreaSet
 152 elemSlot            :: SubAreaSet -> SubArea -> Bool
 153 liftToArea a f map = Map.insert a (f (Map.findWithDefault [] a map)) map
 154 addSlot    live (a, i, w) = liftToArea a (snd . liveGen  (a, i, w)) live
 155 removeSlot live (a, i, w) = liftToArea a       (liveKill (a, i, w)) live
 156 elemSlot   live (a, i, w) =
 157   not $ fst $ liveGen  (a, i, w) (Map.findWithDefault [] a live)
 158
 159 removeLiveSlotDefs :: (DefinerOfSlots s, UserOfSlots s) => SubAreaSet -> s -> SubAreaSet
 160 removeLiveSlotDefs = foldSlotsDefd removeSlot
 161
 162 liveInSlots :: (DefinerOfSlots s, UserOfSlots s) => s -> SubAreaSet -> SubAreaSet
 163 liveInSlots x live = foldSlotsUsed addSlot (removeLiveSlotDefs live x) x
 164
 165 liveLastIn :: CmmNode O C -> (BlockId -> SubAreaSet) -> SubAreaSet
 166 liveLastIn l env = liveInSlots l (liveLastOut env l)
 167
 168 -- Don't forget to keep the outgoing parameters in the CallArea live,
 169 -- as well as the update frame.
 170 -- Note: We have to keep the update frame live at a call because of the
 171 -- case where the function doesn't return -- in that case, there won't
 172 -- be a return to keep the update frame live. We'd still better keep the
 173 -- info pointer in the update frame live at any call site;
 174 -- otherwise we could screw up the garbage collector.
 175 liveLastOut :: (BlockId -> SubAreaSet) -> CmmNode O C -> SubAreaSet
 176 liveLastOut env l =
 177   case l of
 178     CmmCall _ Nothing n _ _ ->
 179       add_area (CallArea Old) n out -- add outgoing args (includes upd frame)
 180     CmmCall _ (Just k) n _ _ ->
 181       add_area (CallArea Old) n (add_area (CallArea (Young k)) n out)
 182     CmmForeignCall { succ = k, updfr = oldend } ->
 183       add_area (CallArea Old) oldend (add_area (CallArea (Young k)) wORD_SIZE out)
 184     _ -> out
 185   where out = slotLatticeJoin $ map env $ successors l
 186         add_area _ n live | n == 0 = live
 187         add_area a n live =
 188           Map.insert a (snd $ liveGen (a, n, n) $ Map.findWithDefault [] a live) live
 189
 190 -- The liveness analysis must be precise: otherwise, we won't know if a definition
 191 -- should really kill a live-out stack slot.
 192 -- But the interference graph does not have to be precise -- it might decide that
 193 -- any live areas interfere. To maintain both a precise analysis and an imprecise
 194 -- interference graph, we need to convert the live-out stack slots to graph nodes
 195 -- at each and every instruction; rather than reconstruct a new list of nodes
 196 -- every time, I provide a function to fold over the nodes, which should be a
 197 -- reasonably efficient approach for the implementations we envision.
 198 -- Of course, it will probably be much easier to program if we just return a list...
 199 type Set x = Map x ()
 200 data IGraphBuilder n =
 201   Builder { foldNodes     :: forall z. SubArea -> (n -> z -> z) -> z -> z
 202           , _wordsOccupied :: AreaSizeMap -> AreaMap -> n -> [Int]
 203           }
 204
 205 areaBuilder :: IGraphBuilder Area
 206 areaBuilder = Builder fold words
 207   where fold (a, _, _) f z = f a z
 208         words areaSize areaMap a =
 209           case Map.lookup a areaMap of
 210             Just addr -> [addr .. addr + (Map.lookup a areaSize `orElse`
 211                                           pprPanic "wordsOccupied: unknown area" (ppr areaSize <+> ppr a))]
 212             Nothing   -> []
 213
 214 --slotBuilder :: IGraphBuilder (Area, Int)
 215 --slotBuilder = undefined
 216
 217 -- Now, we can build the interference graph.
 218 -- The usual story: a definition interferes with all live outs and all other
 219 -- definitions.
 220 type IGraph x = Map x (Set x)
 221 type IGPair x = (IGraph x, IGraphBuilder x)
 222 igraph :: (Ord x) => IGraphBuilder x -> SlotEnv -> CmmGraph -> IGraph x
 223 igraph builder env g = foldr interfere Map.empty (postorderDfs g)
 224   where foldN = foldNodes builder
 225         interfere block igraph = foldBlockNodesB3 (first, middle, last) block igraph
 226           where first _ (igraph, _) = igraph
 227                 middle node (igraph, liveOut) =
 228                   (addEdges igraph node liveOut, liveInSlots node liveOut)
 229                 last node igraph =
 230                   (addEdges igraph node $ liveLastOut env' node, liveLastIn node env')
 231
 232                 -- add edges between a def and the other defs and liveouts
 233                 addEdges igraph i out = fst $ foldSlotsDefd addDef (igraph, out) i
 234                 addDef (igraph, out) def@(a, _, _) =
 235                   (foldN def (addDefN out) igraph,
 236                    Map.insert a (snd $ liveGen def (Map.findWithDefault [] a out)) out)
 237                 addDefN out n igraph =
 238                   let addEdgeNO o igraph = foldN o addEdgeNN igraph
 239                       addEdgeNN n' igraph = addEdgeNN' n n' $ addEdgeNN' n' n igraph
 240                       addEdgeNN' n n' igraph = Map.insert n (Map.insert n' () set) igraph
 241                         where set = Map.findWithDefault Map.empty n igraph
 242                   in Map.foldRightWithKey (\ _ os igraph -> foldr addEdgeNO igraph os) igraph out
 243                 env' bid = mapLookup bid env `orElse` panic "unknown blockId in igraph"
 244
 245 -- Before allocating stack slots, we need to collect one more piece of information:
 246 -- what's the highest offset (in bytes) used in each Area?
 247 -- We'll need to allocate that much space for each Area.
 248
 249 -- Mapping of areas to area sizes (not offsets!)
 250 type AreaSizeMap = AreaMap
 251
 252 -- JD: WHY CAN'T THIS COME FROM THE slot-liveness info?
 253 getAreaSize :: ByteOff -> CmmGraph -> AreaSizeMap
 254   -- The domain of the returned mapping consists only of Areas
 255   -- used for (a) variable spill slots, and (b) parameter passing areas for calls
 256 getAreaSize entry_off g =
 257   foldGraphBlocks (foldBlockNodesF3 (first, add_regslots, last))
 258               (Map.singleton (CallArea Old) entry_off) g
 259   where first _  z = z
 260         last :: CmmNode O C -> Map Area Int -> Map Area Int
 261         last l@(CmmCall _ Nothing args res _) z  =  add_regslots l (add (add z area args) area res)
 262           where area = CallArea Old
 263         last l@(CmmCall _ (Just k) args res _) z =  add_regslots l (add (add z area args) area res)
 264           where area = CallArea (Young k)
 265         last l@(CmmForeignCall {succ = k}) z     =  add_regslots l (add z area wORD_SIZE)
 266           where area = CallArea (Young k)
 267         last l z                                 =  add_regslots l z
 268         add_regslots i z = foldSlotsUsed addSlot (foldSlotsDefd addSlot z i) i
 269         addSlot z (a@(RegSlot (LocalReg _ ty)), _, _) =
 270           add z a $ widthInBytes $ typeWidth ty
 271         addSlot z _ = z
 272         add z a off = Map.insert a (max off (Map.findWithDefault 0 a z)) z
 273         -- The 'max' is important.  Two calls, to f and g, might share a common
 274         -- continuation (and hence a common CallArea), but their number of overflow
 275         -- parameters might differ.
 276         -- EZY: Ought to use insert with combining function...
 277
 278
 279 -- Find the Stack slots occupied by the subarea's conflicts
 280 conflictSlots :: Ord x => IGPair x -> AreaSizeMap -> AreaMap -> SubArea -> Set Int
 281 conflictSlots (ig, Builder foldNodes wordsOccupied) areaSize areaMap subarea =
 282   foldNodes subarea foldNode Map.empty
 283   where foldNode n set = Map.foldRightWithKey conflict set $ Map.findWithDefault Map.empty n ig
 284         conflict n' () set = liveInSlots areaMap n' set
 285         -- Add stack slots occupied by igraph node n
 286         liveInSlots areaMap n set = foldr setAdd set (wordsOccupied areaSize areaMap n)
 287         setAdd w s = Map.insert w () s
 288
 289 -- Find any open space for 'area' on the stack, starting from the
 290 -- 'offset'.  If the area is a CallArea or a spill slot for a pointer,
 291 -- then it must be word-aligned.
 292 freeSlotFrom :: Ord x => IGPair x -> AreaSizeMap -> Int -> AreaMap -> Area -> Int
 293 freeSlotFrom ig areaSize offset areaMap area =
 294   let size = Map.lookup area areaSize `orElse` 0
 295       conflicts = conflictSlots ig areaSize areaMap (area, size, size)
 296       -- CallAreas and Ptrs need to be word-aligned (round up!)
 297       align = case area of CallArea _                                -> align'
 298                            RegSlot  r | isGcPtrType (localRegType r) -> align'
 299                            RegSlot  _                                -> id
 300       align' n = (n + (wORD_SIZE - 1)) `div` wORD_SIZE * wORD_SIZE
 301       -- Find a space big enough to hold the area
 302       findSpace curr 0 = curr
 303       findSpace curr cnt = -- part of target slot, # of bytes left to check
 304         if Map.member curr conflicts then
 305           findSpace (align (curr + size)) size -- try the next (possibly) open space
 306         else findSpace (curr - 1) (cnt - 1)
 307   in findSpace (align (offset + size)) size
 308
 309 -- Find an open space on the stack, and assign it to the area.
 310 allocSlotFrom :: Ord x => IGPair x -> AreaSizeMap -> Int -> AreaMap -> Area -> AreaMap
 311 allocSlotFrom ig areaSize from areaMap area =
 312   if Map.member area areaMap then areaMap
 313   else Map.insert area (freeSlotFrom ig areaSize from areaMap area) areaMap
 314
 315 -- Figure out all of the offsets from the slot location; this will be
 316 -- non-zero for procpoints.
 317 type SpEntryMap = BlockEnv Int
 318 getSpEntryMap :: Int -> CmmGraph -> SpEntryMap
 319 getSpEntryMap entry_off g@(CmmGraph {g_entry = entry})
 320     = foldGraphBlocks add_sp_off (mapInsert entry entry_off emptyBlockMap) g
 321   where add_sp_off :: CmmBlock -> BlockEnv Int -> BlockEnv Int
 322         add_sp_off b env =
 323           case lastNode b of
 324             CmmCall {cml_cont=Just succ, cml_ret_args=off} -> mapInsert succ off env
 325             CmmForeignCall {succ=succ}                     -> mapInsert succ wORD_SIZE env
 326             _                                              -> env
 327
 328 -- | Greedy stack layout.
 329 -- Compute liveness, build the interference graph, and allocate slots for the areas.
 330 -- We visit each basic block in a (generally) forward order.
 331
 332 -- At each instruction that names a register subarea r, we immediately allocate
 333 -- any available slot on the stack by the following procedure:
 334 --  1. Find the sub-areas S that conflict with r
 335 --  2. Find the stack slots used for S
 336 --  3. Choose a contiguous stack space s not in S (s must be large enough to hold r)
 337
 338 -- For a CallArea, we allocate the stack space only when we reach a function
 339 -- call that returns to the CallArea's blockId.
 340 -- Then, we allocate the Area subject to the following constraints:
 341 --   a) It must be younger than all the sub-areas that are live on entry to the block
 342 --         This constraint is only necessary for the successor of a call
 343 --   b) It must not overlap with any already-allocated Area with which it conflicts
 344 --         (ie at some point, not necessarily now, is live at the same time)
 345 --   Part (b) is just the 1,2,3 part above
 346
 347 -- Note: The stack pointer only has to be younger than the youngest live stack slot
 348 -- at proc points. Otherwise, the stack pointer can point anywhere.
 349
 350 layout :: ProcPointSet -> SpEntryMap -> SlotEnv -> ByteOff -> CmmGraph -> AreaMap
 351 -- The domain of the returned map includes an Area for EVERY block
 352 -- including each block that is not the successor of a call (ie is not a proc-point)
 353 -- That's how we return the info of what the SP should be at the entry of every non
 354 -- procpoint block.  However, note that procpoint blocks have their
 355 -- /slot/ stored, which is not necessarily the value of the SP on entry
 356 -- to the block (in fact, it probably isn't, due to argument passing).
 357 -- See [Procpoint Sp offset]
 358
 359 layout procPoints spEntryMap env entry_off g =
 360   let ig = (igraph areaBuilder env g, areaBuilder)
 361       env' bid = mapLookup bid env `orElse` panic "unknown blockId in igraph"
 362       areaSize = getAreaSize entry_off g
 363
 364       -- Find the youngest live stack slot that has already been allocated
 365       youngest_live :: AreaMap     -- Already allocated
 366                     -> SubAreaSet  -- Sub-areas live here
 367                     -> ByteOff     -- Offset of the youngest byte of any
 368                                    --    already-allocated, live sub-area
 369       youngest_live areaMap live = fold_subareas young_slot live 0
 370         where young_slot (a, o, _) z = case Map.lookup a areaMap of
 371                                          Just top -> max z $ top + o
 372                                          Nothing  -> z
 373               fold_subareas f m z = Map.foldRightWithKey (\_ s z -> foldr f z s) z m
 374
 375       -- Allocate space for spill slots and call areas
 376       allocVarSlot = allocSlotFrom ig areaSize 0
 377
 378       -- Update the successor's incoming SP.
 379       setSuccSPs inSp bid areaMap =
 380         case (Map.lookup area areaMap , mapLookup bid (toBlockMap g)) of
 381           (Just _, _) -> areaMap -- succ already knows incoming SP
 382           (Nothing, Just _) ->
 383             if setMember bid procPoints then
 384               let young = youngest_live areaMap $ env' bid
 385                   -- start = case returnOff stackInfo of Just b  -> max b young
 386                   --                                     Nothing -> young
 387                   start = young -- maybe wrong, but I don't understand
 388                                 -- why the preceding is necessary...
 389               in  allocSlotFrom ig areaSize start areaMap area
 390             else Map.insert area inSp areaMap
 391           (_, Nothing) -> panic "Block not found in cfg"
 392         where area = CallArea (Young bid)
 393
 394       layoutAreas areaMap block = foldBlockNodesF3 (flip const, allocMid, allocLast (entryLabel block)) block areaMap
 395       allocMid m areaMap = foldSlotsDefd alloc' (foldSlotsUsed alloc' areaMap m) m
 396       allocLast bid l areaMap =
 397         foldr (setSuccSPs inSp) areaMap' (successors l)
 398         where inSp = slot + spOffset -- [Procpoint Sp offset]
 399               -- If it's not in the map, we should use our previous
 400               -- calculation unchanged.
 401               spOffset = mapLookup bid spEntryMap `orElse` 0
 402               slot = expectJust "slot in" $ Map.lookup (CallArea (Young bid)) areaMap
 403               areaMap' = foldSlotsDefd alloc' (foldSlotsUsed alloc' areaMap l) l
 404       alloc' areaMap (a@(RegSlot _), _, _) = allocVarSlot areaMap a
 405       alloc' areaMap _ = areaMap
 406
 407       initMap = Map.insert (CallArea (Young (g_entry g))) 0
 408               . Map.insert (CallArea Old)                 0
 409               $ Map.empty
 410
 411       areaMap = foldl layoutAreas initMap (postorderDfs g)
 412   in -- pprTrace "ProcPoints" (ppr procPoints) $
 413      -- pprTrace "Area SizeMap" (ppr areaSize) $
 414      -- pprTrace "Entry offset" (ppr entry_off) $
 415      -- pprTrace "Area Map" (ppr areaMap) $
 416      areaMap
 417
 418 {- Note [Procpoint Sp offset]
 419
 420 The calculation of inSp is a little tricky.  (Un)fortunately, if you get
 421 it wrong, you will get inefficient but correct code.  You know you've
 422 got it wrong if the generated stack pointer bounces up and down for no
 423 good reason.
 424
 425 Why can't we just set inSp to the location of the slot?  (This is what
 426 the code used to do.)  The trouble is when we actually hit the proc
 427 point the start of the slot will not be the same as the actual Sp due
 428 to argument passing:
 429
 430   a:
 431       I32[(young<b> + 4)] = cde;
 432       // Stack pointer is moved to young end (bottom) of young<b> for call
 433       // +-------+
 434       // | arg 1 |
 435       // +-------+ <- Sp
 436       call (I32[foobar::I32])(...) returns to Just b (4) (4) with update frame 4;
 437   b:
 438       // After call, stack pointer is above the old end (top) of
 439       // young<b> (the difference is spOffset)
 440       // +-------+ <- Sp
 441       // | arg 1 |
 442       // +-------+
 443
 444 If we blithely set the Sp to be the same as the slot (the young end of
 445 young<b>), an adjustment will be necessary when we go to the next block.
 446 This is wasteful.  So, instead, for the next block after a procpoint,
 447 the actual Sp should be set to the same as the true Sp when we just
 448 entered the procpoint.  Then manifestSP will automatically do the right
 449 thing.
 450
 451 Questions you may ask:
 452
 453 1. Why don't we need to change the mapping for the procpoint itself?
 454    Because manifestSP does its own calculation of the true stack value,
 455    manifestSP will notice the discrepancy between the actual stack
 456    pointer and the slot start, and adjust all of its memory accesses
 457    accordingly.  So the only problem is when we adjust the Sp in
 458    preparation for the successor block; that's why this code is here and
 459    not in setSuccSPs.
 460
 461 2. Why don't we make the procpoint call area and the true offset match
 462    up?  If we did that, we would never use memory above the true value
 463    of the stack pointer, thus wasting all of the stack we used to store
 464    arguments.  You might think that some clever changes to the slot
 465    offsets, using negative offsets, might fix it, but this does not make
 466    semantic sense.
 467
 468 3. If manifestSP is already calculating the true stack value, why we can't
 469    do this trick inside manifestSP itself?  The reason is that if two
 470    branches join with inconsistent SPs, one of them has to be fixed: we
 471    can't know what the fix should be without already knowing what the
 472    chosen location of SP is on the next successor.  (This is
 473    the "succ already knows incoming SP" case), This calculation cannot
 474    be easily done in manifestSP, since it processes the nodes
 475    /backwards/.  So we need to have figured this out before we hit
 476    manifestSP.
 477 -}
 478
 479 -- After determining the stack layout, we can:
 480 -- 1. Replace references to stack Areas with addresses relative to the stack
 481 --    pointer.
 482 -- 2. Insert adjustments to the stack pointer to ensure that it is at a
 483 --    conventional location at each proc point.
 484 --    Because we don't take interrupts on the execution stack, we only need the
 485 --    stack pointer to be younger than the live values on the stack at proc points.
 486 -- 3. Compute the maximum stack offset used in the procedure and replace
 487 --    the stack high-water mark with that offset.
 488 manifestSP :: SpEntryMap -> AreaMap -> ByteOff -> CmmGraph -> FuelUniqSM CmmGraph
 489 manifestSP spEntryMap areaMap entry_off g@(CmmGraph {g_entry=entry}) =
 490   ofBlockMap entry `liftM` foldl replB (return mapEmpty) (postorderDfs g)
 491   where slot a = -- pprTrace "slot" (ppr a) $
 492                    Map.lookup a areaMap `orElse` panic "unallocated Area"
 493         slot' (Just id) = slot $ CallArea (Young id)
 494         slot' Nothing   = slot $ CallArea Old
 495         sp_high = maxSlot slot g
 496         proc_entry_sp = slot (CallArea Old) + entry_off
 497
 498         spOffset id = mapLookup id spEntryMap `orElse` 0
 499
 500         sp_on_entry id | id == entry = proc_entry_sp
 501         sp_on_entry id = slot' (Just id) + spOffset id
 502
 503         -- On entry to procpoints, the stack pointer is conventional;
 504         -- otherwise, we check the SP set by predecessors.
 505         replB :: FuelUniqSM (BlockEnv CmmBlock) -> CmmBlock -> FuelUniqSM (BlockEnv CmmBlock)
 506         replB blocks block =
 507           do let (head, middles, JustC tail :: MaybeC C (CmmNode O C)) = blockToNodeList block
 508                  middles' = map (middle spIn) middles
 509              bs <- replLast head middles' tail
 510              flip (foldr insertBlock) bs `liftM` blocks
 511           where spIn = sp_on_entry (entryLabel block)
 512
 513                 middle spOff m = mapExpDeep (replSlot spOff) m
 514                 -- XXX there shouldn't be any global registers in the
 515                 -- CmmCall, so there shouldn't be any slots in
 516                 -- CmmCall... check that...
 517                 last   spOff l = mapExpDeep (replSlot spOff) l
 518                 replSlot spOff (CmmStackSlot a i) = CmmRegOff (CmmGlobal Sp) (spOff - (slot a + i))
 519                 replSlot _ (CmmLit CmmHighStackMark) = -- replacing the high water mark
 520                   CmmLit (CmmInt (toInteger (max 0 (sp_high - proc_entry_sp))) (typeWidth bWord))
 521                 replSlot _ e = e
 522
 523                 replLast :: MaybeC C (CmmNode C O) -> [CmmNode O O] -> CmmNode O C -> FuelUniqSM [CmmBlock]
 524                 replLast h m l@(CmmCall _ k n _ _)       = updSp (slot' k + n) h m l
 525                 -- JD: LastForeignCall probably ought to have an outgoing
 526                 --     arg size, just like LastCall
 527                 replLast h m l@(CmmForeignCall {succ=k}) = updSp (slot' (Just k) + wORD_SIZE) h m l
 528                 replLast h m l@(CmmBranch k)             = updSp (sp_on_entry k) h m l
 529                 replLast h m l                           = uncurry (:) `liftM` foldr succ (return (b, [])) (successors l)
 530                   where b :: CmmBlock
 531                         b = updSp' spIn h m l
 532                         succ succId z =
 533                           let succSp = sp_on_entry succId in
 534                           if succSp /= spIn then
 535                             do (b,  bs)  <- z
 536                                (b', bs') <- insertBetween b (adjustSp succSp) succId
 537                                return (b', bs' ++ bs)
 538                           else z
 539
 540                 updSp sp h m l = return [updSp' sp h m l]
 541                 updSp' sp h m l | sp == spIn = blockOfNodeList (h, m, JustC $ last sp l)
 542                                 | otherwise  = blockOfNodeList (h, m ++ adjustSp sp, JustC $ last sp l)
 543                 adjustSp sp = [CmmAssign (CmmGlobal Sp) e]
 544                   where e = CmmMachOp (MO_Add wordWidth) [CmmReg (CmmGlobal Sp), off]
 545                         off = CmmLit $ CmmInt (toInteger $ spIn - sp) wordWidth
 546
 547
 548 -- To compute the stack high-water mark, we fold over the graph and
 549 -- compute the highest slot offset.
 550 maxSlot :: (Area -> Int) -> CmmGraph -> Int
 551 maxSlot slotOff g = foldGraphBlocks (foldBlockNodesF3 (flip const, highSlot, highSlot)) 0 g
 552   where highSlot i z = foldSlotsUsed add (foldSlotsDefd add z i) i
 553         add z (a, i, _) = max z (slotOff a + i)
 554
 555 -----------------------------------------------------------------------------
 556 -- | Sanity check: stub pointers immediately after they die
 557 -----------------------------------------------------------------------------
 558 -- This will miss stack slots that are last used in a Last node,
 559 -- but it should do pretty well...
 560
 561 stubSlotsOnDeath :: CmmGraph -> FuelUniqSM CmmGraph
 562 stubSlotsOnDeath g = liftM fst $ dataflowPassBwd g [] $ analRewBwd slotLattice
 563                                                                    liveSlotTransfers
 564                                                                    rewrites
 565     where rewrites = mkBRewrite3 frt mid lst
 566           frt _ _ = return Nothing
 567           mid m liveSlots = return $ foldSlotsUsed (stub liveSlots m) Nothing m
 568           lst _ _ = return Nothing
 569           stub liveSlots m rst subarea@(a, off, w) =
 570             if elemSlot liveSlots subarea then rst
 571             else let store = mkMiddle $ CmmStore (CmmStackSlot a off)
 572                                                  (stackStubExpr (widthFromBytes w))
 573                  in case rst of Nothing -> Just (mkMiddle m <*> store)
 574                                 Just g  -> Just (g <*> store)