rts/sm/Storage.c

   1 /* -----------------------------------------------------------------------------
   2  *
   3  * (c) The GHC Team, 1998-2008
   4  *
   5  * Storage manager front end
   6  *
   7  * Documentation on the architecture of the Storage Manager can be
   8  * found in the online commentary:
   9  *
  10  *   http://hackage.haskell.org/trac/ghc/wiki/Commentary/Rts/Storage
  11  *
  12  * ---------------------------------------------------------------------------*/
  13
  14 #include "PosixSource.h"
  15 #include "Rts.h"
  16 #include "RtsUtils.h"
  17 #include "RtsFlags.h"
  18 #include "Stats.h"
  19 #include "Hooks.h"
  20 #include "BlockAlloc.h"
  21 #include "MBlock.h"
  22 #include "Weak.h"
  23 #include "Sanity.h"
  24 #include "Arena.h"
  25 #include "OSThreads.h"
  26 #include "Capability.h"
  27 #include "Storage.h"
  28 #include "Schedule.h"
  29 #include "RetainerProfile.h"    // for counting memory blocks (memInventory)
  30 #include "OSMem.h"
  31 #include "Trace.h"
  32 #include "GC.h"
  33 #include "Evac.h"
  34
  35 #include <stdlib.h>
  36 #include <string.h>
  37
  38 /*
  39  * All these globals require sm_mutex to access in THREADED_RTS mode.
  40  */
  41 StgClosure    *caf_list         = NULL;
  42 StgClosure    *revertible_caf_list = NULL;
  43 rtsBool       keepCAFs;
  44
  45 bdescr *pinned_object_block;    /* allocate pinned objects into this block */
  46 nat alloc_blocks;               /* number of allocate()d blocks since GC */
  47 nat alloc_blocks_lim;           /* approximate limit on alloc_blocks */
  48
  49 generation *generations = NULL; /* all the generations */
  50 generation *g0          = NULL; /* generation 0, for convenience */
  51 generation *oldest_gen  = NULL; /* oldest generation, for convenience */
  52 step *g0s0              = NULL; /* generation 0, step 0, for convenience */
  53
  54 nat total_steps         = 0;
  55 step *all_steps         = NULL; /* single array of steps */
  56
  57 ullong total_allocated = 0;     /* total memory allocated during run */
  58
  59 nat n_nurseries         = 0;    /* == RtsFlags.ParFlags.nNodes, convenience */
  60 step *nurseries         = NULL; /* array of nurseries, >1 only if THREADED_RTS */
  61
  62 #ifdef THREADED_RTS
  63 /*
  64  * Storage manager mutex:  protects all the above state from
  65  * simultaneous access by two STG threads.
  66  */
  67 Mutex sm_mutex;
  68 /*
  69  * This mutex is used by atomicModifyMutVar# only
  70  */
  71 Mutex atomic_modify_mutvar_mutex;
  72 #endif
  73
  74
  75 /*
  76  * Forward references
  77  */
  78 static void *stgAllocForGMP   (size_t size_in_bytes);
  79 static void *stgReallocForGMP (void *ptr, size_t old_size, size_t new_size);
  80 static void  stgDeallocForGMP (void *ptr, size_t size);
  81
  82 static void
  83 initStep (step *stp, int g, int s)
  84 {
  85     stp->no = s;
  86     stp->abs_no = RtsFlags.GcFlags.steps * g + s;
  87     stp->blocks = NULL;
  88     stp->n_blocks = 0;
  89     stp->n_words = 0;
  90     stp->live_estimate = 0;
  91     stp->old_blocks = NULL;
  92     stp->n_old_blocks = 0;
  93     stp->gen = &generations[g];
  94     stp->gen_no = g;
  95     stp->large_objects = NULL;
  96     stp->n_large_blocks = 0;
  97     stp->scavenged_large_objects = NULL;
  98     stp->n_scavenged_large_blocks = 0;
  99     stp->mark = 0;
 100     stp->compact = 0;
 101     stp->bitmap = NULL;
 102 #ifdef THREADED_RTS
 103     initSpinLock(&stp->sync_todo);
 104     initSpinLock(&stp->sync_large_objects);
 105 #endif
 106     stp->threads = END_TSO_QUEUE;
 107     stp->old_threads = END_TSO_QUEUE;
 108 }
 109
 110 void
 111 initStorage( void )
 112 {
 113   nat g, s;
 114   generation *gen;
 115
 116   if (generations != NULL) {
 117       // multi-init protection
 118       return;
 119   }
 120
 121   initMBlocks();
 122
 123   /* Sanity check to make sure the LOOKS_LIKE_ macros appear to be
 124    * doing something reasonable.
 125    */
 126   /* We use the NOT_NULL variant or gcc warns that the test is always true */
 127   ASSERT(LOOKS_LIKE_INFO_PTR_NOT_NULL((StgWord)&stg_BLACKHOLE_info));
 128   ASSERT(LOOKS_LIKE_CLOSURE_PTR(&stg_dummy_ret_closure));
 129   ASSERT(!HEAP_ALLOCED(&stg_dummy_ret_closure));
 130
 131   if (RtsFlags.GcFlags.maxHeapSize != 0 &&
 132       RtsFlags.GcFlags.heapSizeSuggestion >
 133       RtsFlags.GcFlags.maxHeapSize) {
 134     RtsFlags.GcFlags.maxHeapSize = RtsFlags.GcFlags.heapSizeSuggestion;
 135   }
 136
 137   if (RtsFlags.GcFlags.maxHeapSize != 0 &&
 138       RtsFlags.GcFlags.minAllocAreaSize >
 139       RtsFlags.GcFlags.maxHeapSize) {
 140       errorBelch("maximum heap size (-M) is smaller than minimum alloc area size (-A)");
 141       RtsFlags.GcFlags.minAllocAreaSize = RtsFlags.GcFlags.maxHeapSize;
 142   }
 143
 144   initBlockAllocator();
 145
 146 #if defined(THREADED_RTS)
 147   initMutex(&sm_mutex);
 148   initMutex(&atomic_modify_mutvar_mutex);
 149 #endif
 150
 151   ACQUIRE_SM_LOCK;
 152
 153   /* allocate generation info array */
 154   generations = (generation *)stgMallocBytes(RtsFlags.GcFlags.generations
 155                                              * sizeof(struct generation_),
 156                                              "initStorage: gens");
 157
 158   /* allocate all the steps into an array.  It is important that we do
 159      it this way, because we need the invariant that two step pointers
 160      can be directly compared to see which is the oldest.
 161      Remember that the last generation has only one step. */
 162   total_steps = 1 + (RtsFlags.GcFlags.generations - 1) * RtsFlags.GcFlags.steps;
 163   all_steps   = stgMallocBytes(total_steps * sizeof(struct step_),
 164                                "initStorage: steps");
 165
 166   /* Initialise all generations */
 167   for(g = 0; g < RtsFlags.GcFlags.generations; g++) {
 168     gen = &generations[g];
 169     gen->no = g;
 170     gen->mut_list = allocBlock();
 171     gen->collections = 0;
 172     gen->par_collections = 0;
 173     gen->failed_promotions = 0;
 174     gen->max_blocks = 0;
 175   }
 176
 177   /* A couple of convenience pointers */
 178   g0 = &generations[0];
 179   oldest_gen = &generations[RtsFlags.GcFlags.generations-1];
 180
 181   /* Allocate step structures in each generation */
 182   if (RtsFlags.GcFlags.generations > 1) {
 183     /* Only for multiple-generations */
 184
 185     /* Oldest generation: one step */
 186     oldest_gen->n_steps = 1;
 187     oldest_gen->steps   = all_steps + (RtsFlags.GcFlags.generations - 1)
 188                                       * RtsFlags.GcFlags.steps;
 189
 190     /* set up all except the oldest generation with 2 steps */
 191     for(g = 0; g < RtsFlags.GcFlags.generations-1; g++) {
 192       generations[g].n_steps = RtsFlags.GcFlags.steps;
 193       generations[g].steps   = all_steps + g * RtsFlags.GcFlags.steps;
 194     }
 195
 196   } else {
 197     /* single generation, i.e. a two-space collector */
 198     g0->n_steps = 1;
 199     g0->steps   = all_steps;
 200   }
 201
 202 #ifdef THREADED_RTS
 203   n_nurseries = n_capabilities;
 204 #else
 205   n_nurseries = 1;
 206 #endif
 207   nurseries = stgMallocBytes (n_nurseries * sizeof(struct step_),
 208                               "initStorage: nurseries");
 209
 210   /* Initialise all steps */
 211   for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
 212     for (s = 0; s < generations[g].n_steps; s++) {
 213         initStep(&generations[g].steps[s], g, s);
 214     }
 215   }
 216
 217   for (s = 0; s < n_nurseries; s++) {
 218       initStep(&nurseries[s], 0, s);
 219   }
 220
 221   /* Set up the destination pointers in each younger gen. step */
 222   for (g = 0; g < RtsFlags.GcFlags.generations-1; g++) {
 223     for (s = 0; s < generations[g].n_steps-1; s++) {
 224       generations[g].steps[s].to = &generations[g].steps[s+1];
 225     }
 226     generations[g].steps[s].to = &generations[g+1].steps[0];
 227   }
 228   oldest_gen->steps[0].to = &oldest_gen->steps[0];
 229
 230   for (s = 0; s < n_nurseries; s++) {
 231       nurseries[s].to = generations[0].steps[0].to;
 232   }
 233
 234   /* The oldest generation has one step. */
 235   if (RtsFlags.GcFlags.compact || RtsFlags.GcFlags.sweep) {
 236       if (RtsFlags.GcFlags.generations == 1) {
 237           errorBelch("WARNING: compact/sweep is incompatible with -G1; disabled");
 238       } else {
 239           oldest_gen->steps[0].mark = 1;
 240           if (RtsFlags.GcFlags.compact)
 241               oldest_gen->steps[0].compact = 1;
 242       }
 243   }
 244
 245   generations[0].max_blocks = 0;
 246   g0s0 = &generations[0].steps[0];
 247
 248   /* The allocation area.  Policy: keep the allocation area
 249    * small to begin with, even if we have a large suggested heap
 250    * size.  Reason: we're going to do a major collection first, and we
 251    * don't want it to be a big one.  This vague idea is borne out by
 252    * rigorous experimental evidence.
 253    */
 254   allocNurseries();
 255
 256   weak_ptr_list = NULL;
 257   caf_list = NULL;
 258   revertible_caf_list = NULL;
 259
 260   /* initialise the allocate() interface */
 261   alloc_blocks = 0;
 262   alloc_blocks_lim = RtsFlags.GcFlags.minAllocAreaSize;
 263
 264   /* Tell GNU multi-precision pkg about our custom alloc functions */
 265   mp_set_memory_functions(stgAllocForGMP, stgReallocForGMP, stgDeallocForGMP);
 266
 267 #ifdef THREADED_RTS
 268   initSpinLock(&gc_alloc_block_sync);
 269   initSpinLock(&recordMutableGen_sync);
 270   whitehole_spin = 0;
 271 #endif
 272
 273   IF_DEBUG(gc, statDescribeGens());
 274
 275   RELEASE_SM_LOCK;
 276 }
 277
 278 void
 279 exitStorage (void)
 280 {
 281     stat_exit(calcAllocated());
 282 }
 283
 284 void
 285 freeStorage (void)
 286 {
 287     stgFree(g0s0); // frees all the steps
 288     stgFree(generations);
 289     freeAllMBlocks();
 290 #if defined(THREADED_RTS)
 291     closeMutex(&sm_mutex);
 292     closeMutex(&atomic_modify_mutvar_mutex);
 293 #endif
 294     stgFree(nurseries);
 295 }
 296
 297 /* -----------------------------------------------------------------------------
 298    CAF management.
 299
 300    The entry code for every CAF does the following:
 301
 302       - builds a CAF_BLACKHOLE in the heap
 303       - pushes an update frame pointing to the CAF_BLACKHOLE
 304       - invokes UPD_CAF(), which:
 305           - calls newCaf, below
 306           - updates the CAF with a static indirection to the CAF_BLACKHOLE
 307
 308    Why do we build a BLACKHOLE in the heap rather than just updating
 309    the thunk directly?  It's so that we only need one kind of update
 310    frame - otherwise we'd need a static version of the update frame too.
 311
 312    newCaf() does the following:
 313
 314       - it puts the CAF on the oldest generation's mut-once list.
 315         This is so that we can treat the CAF as a root when collecting
 316         younger generations.
 317
 318    For GHCI, we have additional requirements when dealing with CAFs:
 319
 320       - we must *retain* all dynamically-loaded CAFs ever entered,
 321         just in case we need them again.
 322       - we must be able to *revert* CAFs that have been evaluated, to
 323         their pre-evaluated form.
 324
 325       To do this, we use an additional CAF list.  When newCaf() is
 326       called on a dynamically-loaded CAF, we add it to the CAF list
 327       instead of the old-generation mutable list, and save away its
 328       old info pointer (in caf->saved_info) for later reversion.
 329
 330       To revert all the CAFs, we traverse the CAF list and reset the
 331       info pointer to caf->saved_info, then throw away the CAF list.
 332       (see GC.c:revertCAFs()).
 333
 334       -- SDM 29/1/01
 335
 336    -------------------------------------------------------------------------- */
 337
 338 void
 339 newCAF(StgClosure* caf)
 340 {
 341   ACQUIRE_SM_LOCK;
 342
 343   if(keepCAFs)
 344   {
 345     // HACK:
 346     // If we are in GHCi _and_ we are using dynamic libraries,
 347     // then we can't redirect newCAF calls to newDynCAF (see below),
 348     // so we make newCAF behave almost like newDynCAF.
 349     // The dynamic libraries might be used by both the interpreted
 350     // program and GHCi itself, so they must not be reverted.
 351     // This also means that in GHCi with dynamic libraries, CAFs are not
 352     // garbage collected. If this turns out to be a problem, we could
 353     // do another hack here and do an address range test on caf to figure
 354     // out whether it is from a dynamic library.
 355     ((StgIndStatic *)caf)->saved_info  = (StgInfoTable *)caf->header.info;
 356     ((StgIndStatic *)caf)->static_link = caf_list;
 357     caf_list = caf;
 358   }
 359   else
 360   {
 361     /* Put this CAF on the mutable list for the old generation.
 362     * This is a HACK - the IND_STATIC closure doesn't really have
 363     * a mut_link field, but we pretend it has - in fact we re-use
 364     * the STATIC_LINK field for the time being, because when we
 365     * come to do a major GC we won't need the mut_link field
 366     * any more and can use it as a STATIC_LINK.
 367     */
 368     ((StgIndStatic *)caf)->saved_info = NULL;
 369     recordMutableGen(caf, oldest_gen);
 370   }
 371
 372   RELEASE_SM_LOCK;
 373 }
 374
 375 // An alternate version of newCaf which is used for dynamically loaded
 376 // object code in GHCi.  In this case we want to retain *all* CAFs in
 377 // the object code, because they might be demanded at any time from an
 378 // expression evaluated on the command line.
 379 // Also, GHCi might want to revert CAFs, so we add these to the
 380 // revertible_caf_list.
 381 //
 382 // The linker hackily arranges that references to newCaf from dynamic
 383 // code end up pointing to newDynCAF.
 384 void
 385 newDynCAF(StgClosure *caf)
 386 {
 387     ACQUIRE_SM_LOCK;
 388
 389     ((StgIndStatic *)caf)->saved_info  = (StgInfoTable *)caf->header.info;
 390     ((StgIndStatic *)caf)->static_link = revertible_caf_list;
 391     revertible_caf_list = caf;
 392
 393     RELEASE_SM_LOCK;
 394 }
 395
 396 /* -----------------------------------------------------------------------------
 397    Nursery management.
 398    -------------------------------------------------------------------------- */
 399
 400 static bdescr *
 401 allocNursery (step *stp, bdescr *tail, nat blocks)
 402 {
 403     bdescr *bd;
 404     nat i;
 405
 406     // Allocate a nursery: we allocate fresh blocks one at a time and
 407     // cons them on to the front of the list, not forgetting to update
 408     // the back pointer on the tail of the list to point to the new block.
 409     for (i=0; i < blocks; i++) {
 410         // @LDV profiling
 411         /*
 412           processNursery() in LdvProfile.c assumes that every block group in
 413           the nursery contains only a single block. So, if a block group is
 414           given multiple blocks, change processNursery() accordingly.
 415         */
 416         bd = allocBlock();
 417         bd->link = tail;
 418         // double-link the nursery: we might need to insert blocks
 419         if (tail != NULL) {
 420             tail->u.back = bd;
 421         }
 422         bd->step = stp;
 423         bd->gen_no = 0;
 424         bd->flags = 0;
 425         bd->free = bd->start;
 426         tail = bd;
 427     }
 428     tail->u.back = NULL;
 429     return tail;
 430 }
 431
 432 static void
 433 assignNurseriesToCapabilities (void)
 434 {
 435 #ifdef THREADED_RTS
 436     nat i;
 437
 438     for (i = 0; i < n_nurseries; i++) {
 439         capabilities[i].r.rNursery        = &nurseries[i];
 440         capabilities[i].r.rCurrentNursery = nurseries[i].blocks;
 441         capabilities[i].r.rCurrentAlloc   = NULL;
 442     }
 443 #else /* THREADED_RTS */
 444     MainCapability.r.rNursery        = &nurseries[0];
 445     MainCapability.r.rCurrentNursery = nurseries[0].blocks;
 446     MainCapability.r.rCurrentAlloc   = NULL;
 447 #endif
 448 }
 449
 450 void
 451 allocNurseries( void )
 452 {
 453     nat i;
 454
 455     for (i = 0; i < n_nurseries; i++) {
 456         nurseries[i].blocks =
 457             allocNursery(&nurseries[i], NULL,
 458                          RtsFlags.GcFlags.minAllocAreaSize);
 459         nurseries[i].n_blocks    = RtsFlags.GcFlags.minAllocAreaSize;
 460         nurseries[i].old_blocks   = NULL;
 461         nurseries[i].n_old_blocks = 0;
 462     }
 463     assignNurseriesToCapabilities();
 464 }
 465
 466 void
 467 resetNurseries( void )
 468 {
 469     nat i;
 470     bdescr *bd;
 471     step *stp;
 472
 473     for (i = 0; i < n_nurseries; i++) {
 474         stp = &nurseries[i];
 475         for (bd = stp->blocks; bd; bd = bd->link) {
 476             bd->free = bd->start;
 477             ASSERT(bd->gen_no == 0);
 478             ASSERT(bd->step == stp);
 479             IF_DEBUG(sanity,memset(bd->start, 0xaa, BLOCK_SIZE));
 480         }
 481     }
 482     assignNurseriesToCapabilities();
 483 }
 484
 485 lnat
 486 countNurseryBlocks (void)
 487 {
 488     nat i;
 489     lnat blocks = 0;
 490
 491     for (i = 0; i < n_nurseries; i++) {
 492         blocks += nurseries[i].n_blocks;
 493     }
 494     return blocks;
 495 }
 496
 497 static void
 498 resizeNursery ( step *stp, nat blocks )
 499 {
 500   bdescr *bd;
 501   nat nursery_blocks;
 502
 503   nursery_blocks = stp->n_blocks;
 504   if (nursery_blocks == blocks) return;
 505
 506   if (nursery_blocks < blocks) {
 507       debugTrace(DEBUG_gc, "increasing size of nursery to %d blocks",
 508                  blocks);
 509     stp->blocks = allocNursery(stp, stp->blocks, blocks-nursery_blocks);
 510   }
 511   else {
 512     bdescr *next_bd;
 513
 514     debugTrace(DEBUG_gc, "decreasing size of nursery to %d blocks",
 515                blocks);
 516
 517     bd = stp->blocks;
 518     while (nursery_blocks > blocks) {
 519         next_bd = bd->link;
 520         next_bd->u.back = NULL;
 521         nursery_blocks -= bd->blocks; // might be a large block
 522         freeGroup(bd);
 523         bd = next_bd;
 524     }
 525     stp->blocks = bd;
 526     // might have gone just under, by freeing a large block, so make
 527     // up the difference.
 528     if (nursery_blocks < blocks) {
 529         stp->blocks = allocNursery(stp, stp->blocks, blocks-nursery_blocks);
 530     }
 531   }
 532
 533   stp->n_blocks = blocks;
 534   ASSERT(countBlocks(stp->blocks) == stp->n_blocks);
 535 }
 536
 537 //
 538 // Resize each of the nurseries to the specified size.
 539 //
 540 void
 541 resizeNurseriesFixed (nat blocks)
 542 {
 543     nat i;
 544     for (i = 0; i < n_nurseries; i++) {
 545         resizeNursery(&nurseries[i], blocks);
 546     }
 547 }
 548
 549 //
 550 // Resize the nurseries to the total specified size.
 551 //
 552 void
 553 resizeNurseries (nat blocks)
 554 {
 555     // If there are multiple nurseries, then we just divide the number
 556     // of available blocks between them.
 557     resizeNurseriesFixed(blocks / n_nurseries);
 558 }
 559
 560
 561 /* -----------------------------------------------------------------------------
 562    move_TSO is called to update the TSO structure after it has been
 563    moved from one place to another.
 564    -------------------------------------------------------------------------- */
 565
 566 void
 567 move_TSO (StgTSO *src, StgTSO *dest)
 568 {
 569     ptrdiff_t diff;
 570
 571     // relocate the stack pointer...
 572     diff = (StgPtr)dest - (StgPtr)src; // In *words*
 573     dest->sp = (StgPtr)dest->sp + diff;
 574 }
 575
 576 /* -----------------------------------------------------------------------------
 577    The allocate() interface
 578
 579    allocateInGen() function allocates memory directly into a specific
 580    generation.  It always succeeds, and returns a chunk of memory n
 581    words long.  n can be larger than the size of a block if necessary,
 582    in which case a contiguous block group will be allocated.
 583
 584    allocate(n) is equivalent to allocateInGen(g0).
 585    -------------------------------------------------------------------------- */
 586
 587 StgPtr
 588 allocateInGen (generation *g, lnat n)
 589 {
 590     step *stp;
 591     bdescr *bd;
 592     StgPtr ret;
 593
 594     ACQUIRE_SM_LOCK;
 595
 596     TICK_ALLOC_HEAP_NOCTR(n);
 597     CCS_ALLOC(CCCS,n);
 598
 599     stp = &g->steps[0];
 600
 601     if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_))
 602     {
 603         lnat req_blocks =  (lnat)BLOCK_ROUND_UP(n*sizeof(W_)) / BLOCK_SIZE;
 604
 605         // Attempting to allocate an object larger than maxHeapSize
 606         // should definitely be disallowed.  (bug #1791)
 607         if (RtsFlags.GcFlags.maxHeapSize > 0 &&
 608             req_blocks >= RtsFlags.GcFlags.maxHeapSize) {
 609             heapOverflow();
 610         }
 611
 612         bd = allocGroup(req_blocks);
 613         dbl_link_onto(bd, &stp->large_objects);
 614         stp->n_large_blocks += bd->blocks; // might be larger than req_blocks
 615         bd->gen_no  = g->no;
 616         bd->step = stp;
 617         bd->flags = BF_LARGE;
 618         bd->free = bd->start + n;
 619         ret = bd->start;
 620     }
 621     else
 622     {
 623         // small allocation (<LARGE_OBJECT_THRESHOLD) */
 624         bd = stp->blocks;
 625         if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) {
 626             bd = allocBlock();
 627             bd->gen_no = g->no;
 628             bd->step = stp;
 629             bd->flags = 0;
 630             bd->link = stp->blocks;
 631             stp->blocks = bd;
 632             stp->n_blocks++;
 633             alloc_blocks++;
 634         }
 635         ret = bd->free;
 636         bd->free += n;
 637     }
 638
 639     RELEASE_SM_LOCK;
 640
 641     return ret;
 642 }
 643
 644 StgPtr
 645 allocate (lnat n)
 646 {
 647     return allocateInGen(g0,n);
 648 }
 649
 650 lnat
 651 allocatedBytes( void )
 652 {
 653     lnat allocated;
 654
 655     allocated = alloc_blocks * BLOCK_SIZE_W;
 656     if (pinned_object_block != NULL) {
 657         allocated -= (pinned_object_block->start + BLOCK_SIZE_W) -
 658             pinned_object_block->free;
 659     }
 660
 661     return allocated;
 662 }
 663
 664 // split N blocks off the start of the given bdescr, returning the
 665 // remainder as a new block group.  We treat the remainder as if it
 666 // had been freshly allocated in generation 0.
 667 bdescr *
 668 splitLargeBlock (bdescr *bd, nat blocks)
 669 {
 670     bdescr *new_bd;
 671
 672     // subtract the original number of blocks from the counter first
 673     bd->step->n_large_blocks -= bd->blocks;
 674
 675     new_bd = splitBlockGroup (bd, blocks);
 676
 677     dbl_link_onto(new_bd, &g0s0->large_objects);
 678     g0s0->n_large_blocks += new_bd->blocks;
 679     new_bd->gen_no  = g0s0->no;
 680     new_bd->step    = g0s0;
 681     new_bd->flags   = BF_LARGE;
 682     new_bd->free    = bd->free;
 683
 684     // add the new number of blocks to the counter.  Due to the gaps
 685     // for block descriptor, new_bd->blocks + bd->blocks might not be
 686     // equal to the original bd->blocks, which is why we do it this way.
 687     bd->step->n_large_blocks += bd->blocks;
 688
 689     return new_bd;
 690 }
 691
 692 /* -----------------------------------------------------------------------------
 693    allocateLocal()
 694
 695    This allocates memory in the current thread - it is intended for
 696    use primarily from STG-land where we have a Capability.  It is
 697    better than allocate() because it doesn't require taking the
 698    sm_mutex lock in the common case.
 699
 700    Memory is allocated directly from the nursery if possible (but not
 701    from the current nursery block, so as not to interfere with
 702    Hp/HpLim).
 703    -------------------------------------------------------------------------- */
 704
 705 StgPtr
 706 allocateLocal (Capability *cap, lnat n)
 707 {
 708     bdescr *bd;
 709     StgPtr p;
 710
 711     if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) {
 712         return allocateInGen(g0,n);
 713     }
 714
 715     /* small allocation (<LARGE_OBJECT_THRESHOLD) */
 716
 717     TICK_ALLOC_HEAP_NOCTR(n);
 718     CCS_ALLOC(CCCS,n);
 719
 720     bd = cap->r.rCurrentAlloc;
 721     if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) {
 722
 723         // The CurrentAlloc block is full, we need to find another
 724         // one.  First, we try taking the next block from the
 725         // nursery:
 726         bd = cap->r.rCurrentNursery->link;
 727
 728         if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) {
 729             // The nursery is empty, or the next block is already
 730             // full: allocate a fresh block (we can't fail here).
 731             ACQUIRE_SM_LOCK;
 732             bd = allocBlock();
 733             cap->r.rNursery->n_blocks++;
 734             RELEASE_SM_LOCK;
 735             bd->gen_no = 0;
 736             bd->step = cap->r.rNursery;
 737             bd->flags = 0;
 738             // NO: alloc_blocks++;
 739             // calcAllocated() uses the size of the nursery, and we've
 740             // already bumpted nursery->n_blocks above.
 741         } else {
 742             // we have a block in the nursery: take it and put
 743             // it at the *front* of the nursery list, and use it
 744             // to allocate() from.
 745             cap->r.rCurrentNursery->link = bd->link;
 746             if (bd->link != NULL) {
 747                 bd->link->u.back = cap->r.rCurrentNursery;
 748             }
 749         }
 750         dbl_link_onto(bd, &cap->r.rNursery->blocks);
 751         cap->r.rCurrentAlloc = bd;
 752         IF_DEBUG(sanity, checkNurserySanity(cap->r.rNursery));
 753     }
 754     p = bd->free;
 755     bd->free += n;
 756     return p;
 757 }
 758
 759 /* ---------------------------------------------------------------------------
 760    Allocate a fixed/pinned object.
 761
 762    We allocate small pinned objects into a single block, allocating a
 763    new block when the current one overflows.  The block is chained
 764    onto the large_object_list of generation 0 step 0.
 765
 766    NOTE: The GC can't in general handle pinned objects.  This
 767    interface is only safe to use for ByteArrays, which have no
 768    pointers and don't require scavenging.  It works because the
 769    block's descriptor has the BF_LARGE flag set, so the block is
 770    treated as a large object and chained onto various lists, rather
 771    than the individual objects being copied.  However, when it comes
 772    to scavenge the block, the GC will only scavenge the first object.
 773    The reason is that the GC can't linearly scan a block of pinned
 774    objects at the moment (doing so would require using the
 775    mostly-copying techniques).  But since we're restricting ourselves
 776    to pinned ByteArrays, not scavenging is ok.
 777
 778    This function is called by newPinnedByteArray# which immediately
 779    fills the allocated memory with a MutableByteArray#.
 780    ------------------------------------------------------------------------- */
 781
 782 StgPtr
 783 allocatePinned( lnat n )
 784 {
 785     StgPtr p;
 786     bdescr *bd = pinned_object_block;
 787
 788     // If the request is for a large object, then allocate()
 789     // will give us a pinned object anyway.
 790     if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) {
 791         return allocate(n);
 792     }
 793
 794     ACQUIRE_SM_LOCK;
 795
 796     TICK_ALLOC_HEAP_NOCTR(n);
 797     CCS_ALLOC(CCCS,n);
 798
 799     // we always return 8-byte aligned memory.  bd->free must be
 800     // 8-byte aligned to begin with, so we just round up n to
 801     // the nearest multiple of 8 bytes.
 802     if (sizeof(StgWord) == 4) {
 803         n = (n+1) & ~1;
 804     }
 805
 806     // If we don't have a block of pinned objects yet, or the current
 807     // one isn't large enough to hold the new object, allocate a new one.
 808     if (bd == NULL || (bd->free + n) > (bd->start + BLOCK_SIZE_W)) {
 809         pinned_object_block = bd = allocBlock();
 810         dbl_link_onto(bd, &g0s0->large_objects);
 811         g0s0->n_large_blocks++;
 812         bd->gen_no = 0;
 813         bd->step   = g0s0;
 814         bd->flags  = BF_PINNED | BF_LARGE;
 815         bd->free   = bd->start;
 816         alloc_blocks++;
 817     }
 818
 819     p = bd->free;
 820     bd->free += n;
 821     RELEASE_SM_LOCK;
 822     return p;
 823 }
 824
 825 /* -----------------------------------------------------------------------------
 826    Write Barriers
 827    -------------------------------------------------------------------------- */
 828
 829 /*
 830    This is the write barrier for MUT_VARs, a.k.a. IORefs.  A
 831    MUT_VAR_CLEAN object is not on the mutable list; a MUT_VAR_DIRTY
 832    is.  When written to, a MUT_VAR_CLEAN turns into a MUT_VAR_DIRTY
 833    and is put on the mutable list.
 834 */
 835 void
 836 dirty_MUT_VAR(StgRegTable *reg, StgClosure *p)
 837 {
 838     Capability *cap = regTableToCapability(reg);
 839     bdescr *bd;
 840     if (p->header.info == &stg_MUT_VAR_CLEAN_info) {
 841         p->header.info = &stg_MUT_VAR_DIRTY_info;
 842         bd = Bdescr((StgPtr)p);
 843         if (bd->gen_no > 0) recordMutableCap(p,cap,bd->gen_no);
 844     }
 845 }
 846
 847 // Setting a TSO's link field with a write barrier.
 848 // It is *not* necessary to call this function when
 849 //    * setting the link field to END_TSO_QUEUE
 850 //    * putting a TSO on the blackhole_queue
 851 //    * setting the link field of the currently running TSO, as it
 852 //      will already be dirty.
 853 void
 854 setTSOLink (Capability *cap, StgTSO *tso, StgTSO *target)
 855 {
 856     bdescr *bd;
 857     if ((tso->flags & (TSO_DIRTY|TSO_LINK_DIRTY)) == 0) {
 858         tso->flags |= TSO_LINK_DIRTY;
 859         bd = Bdescr((StgPtr)tso);
 860         if (bd->gen_no > 0) recordMutableCap((StgClosure*)tso,cap,bd->gen_no);
 861     }
 862     tso->_link = target;
 863 }
 864
 865 void
 866 dirty_TSO (Capability *cap, StgTSO *tso)
 867 {
 868     bdescr *bd;
 869     if ((tso->flags & (TSO_DIRTY|TSO_LINK_DIRTY)) == 0) {
 870         bd = Bdescr((StgPtr)tso);
 871         if (bd->gen_no > 0) recordMutableCap((StgClosure*)tso,cap,bd->gen_no);
 872     }
 873     tso->flags |= TSO_DIRTY;
 874 }
 875
 876 /*
 877    This is the write barrier for MVARs.  An MVAR_CLEAN objects is not
 878    on the mutable list; a MVAR_DIRTY is.  When written to, a
 879    MVAR_CLEAN turns into a MVAR_DIRTY and is put on the mutable list.
 880    The check for MVAR_CLEAN is inlined at the call site for speed,
 881    this really does make a difference on concurrency-heavy benchmarks
 882    such as Chaneneos and cheap-concurrency.
 883 */
 884 void
 885 dirty_MVAR(StgRegTable *reg, StgClosure *p)
 886 {
 887     Capability *cap = regTableToCapability(reg);
 888     bdescr *bd;
 889     bd = Bdescr((StgPtr)p);
 890     if (bd->gen_no > 0) recordMutableCap(p,cap,bd->gen_no);
 891 }
 892
 893 /* -----------------------------------------------------------------------------
 894    Allocation functions for GMP.
 895
 896    These all use the allocate() interface - we can't have any garbage
 897    collection going on during a gmp operation, so we use allocate()
 898    which always succeeds.  The gmp operations which might need to
 899    allocate will ask the storage manager (via doYouWantToGC()) whether
 900    a garbage collection is required, in case we get into a loop doing
 901    only allocate() style allocation.
 902    -------------------------------------------------------------------------- */
 903
 904 static void *
 905 stgAllocForGMP (size_t size_in_bytes)
 906 {
 907   StgArrWords* arr;
 908   nat data_size_in_words, total_size_in_words;
 909
 910   /* round up to a whole number of words */
 911   data_size_in_words  = (size_in_bytes + sizeof(W_) + 1) / sizeof(W_);
 912   total_size_in_words = sizeofW(StgArrWords) + data_size_in_words;
 913
 914   /* allocate and fill it in. */
 915 #if defined(THREADED_RTS)
 916   arr = (StgArrWords *)allocateLocal(myTask()->cap, total_size_in_words);
 917 #else
 918   arr = (StgArrWords *)allocateLocal(&MainCapability, total_size_in_words);
 919 #endif
 920   SET_ARR_HDR(arr, &stg_ARR_WORDS_info, CCCS, data_size_in_words);
 921
 922   /* and return a ptr to the goods inside the array */
 923   return arr->payload;
 924 }
 925
 926 static void *
 927 stgReallocForGMP (void *ptr, size_t old_size, size_t new_size)
 928 {
 929     size_t min_size;
 930     void *new_stuff_ptr = stgAllocForGMP(new_size);
 931     nat i = 0;
 932     char *p = (char *) ptr;
 933     char *q = (char *) new_stuff_ptr;
 934
 935     min_size = old_size < new_size ? old_size : new_size;
 936     for (; i < min_size; i++, p++, q++) {
 937         *q = *p;
 938     }
 939
 940     return(new_stuff_ptr);
 941 }
 942
 943 static void
 944 stgDeallocForGMP (void *ptr STG_UNUSED,
 945                   size_t size STG_UNUSED)
 946 {
 947     /* easy for us: the garbage collector does the dealloc'n */
 948 }
 949
 950 /* -----------------------------------------------------------------------------
 951  * Stats and stuff
 952  * -------------------------------------------------------------------------- */
 953
 954 /* -----------------------------------------------------------------------------
 955  * calcAllocated()
 956  *
 957  * Approximate how much we've allocated: number of blocks in the
 958  * nursery + blocks allocated via allocate() - unused nusery blocks.
 959  * This leaves a little slop at the end of each block, and doesn't
 960  * take into account large objects (ToDo).
 961  * -------------------------------------------------------------------------- */
 962
 963 lnat
 964 calcAllocated( void )
 965 {
 966   nat allocated;
 967   bdescr *bd;
 968
 969   allocated = allocatedBytes();
 970   allocated += countNurseryBlocks() * BLOCK_SIZE_W;
 971
 972   {
 973 #ifdef THREADED_RTS
 974   nat i;
 975   for (i = 0; i < n_nurseries; i++) {
 976       Capability *cap;
 977       for ( bd = capabilities[i].r.rCurrentNursery->link;
 978             bd != NULL; bd = bd->link ) {
 979           allocated -= BLOCK_SIZE_W;
 980       }
 981       cap = &capabilities[i];
 982       if (cap->r.rCurrentNursery->free <
 983           cap->r.rCurrentNursery->start + BLOCK_SIZE_W) {
 984           allocated -= (cap->r.rCurrentNursery->start + BLOCK_SIZE_W)
 985               - cap->r.rCurrentNursery->free;
 986       }
 987   }
 988 #else
 989   bdescr *current_nursery = MainCapability.r.rCurrentNursery;
 990
 991   for ( bd = current_nursery->link; bd != NULL; bd = bd->link ) {
 992       allocated -= BLOCK_SIZE_W;
 993   }
 994   if (current_nursery->free < current_nursery->start + BLOCK_SIZE_W) {
 995       allocated -= (current_nursery->start + BLOCK_SIZE_W)
 996           - current_nursery->free;
 997   }
 998 #endif
 999   }
1000
1001   total_allocated += allocated;
1002   return allocated;
1003 }
1004
1005 /* Approximate the amount of live data in the heap.  To be called just
1006  * after garbage collection (see GarbageCollect()).
1007  */
1008 lnat
1009 calcLiveBlocks(void)
1010 {
1011   nat g, s;
1012   lnat live = 0;
1013   step *stp;
1014
1015   if (RtsFlags.GcFlags.generations == 1) {
1016       return g0s0->n_large_blocks + g0s0->n_blocks;
1017   }
1018
1019   for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
1020     for (s = 0; s < generations[g].n_steps; s++) {
1021       /* approximate amount of live data (doesn't take into account slop
1022        * at end of each block).
1023        */
1024       if (g == 0 && s == 0) {
1025           continue;
1026       }
1027       stp = &generations[g].steps[s];
1028       live += stp->n_large_blocks + stp->n_blocks;
1029     }
1030   }
1031   return live;
1032 }
1033
1034 lnat
1035 countOccupied(bdescr *bd)
1036 {
1037     lnat words;
1038
1039     words = 0;
1040     for (; bd != NULL; bd = bd->link) {
1041         ASSERT(bd->free <= bd->start + bd->blocks * BLOCK_SIZE_W);
1042         words += bd->free - bd->start;
1043     }
1044     return words;
1045 }
1046
1047 // Return an accurate count of the live data in the heap, excluding
1048 // generation 0.
1049 lnat
1050 calcLiveWords(void)
1051 {
1052     nat g, s;
1053     lnat live;
1054     step *stp;
1055
1056     if (RtsFlags.GcFlags.generations == 1) {
1057         return g0s0->n_words + countOccupied(g0s0->large_objects);
1058     }
1059
1060     live = 0;
1061     for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
1062         for (s = 0; s < generations[g].n_steps; s++) {
1063             if (g == 0 && s == 0) continue;
1064             stp = &generations[g].steps[s];
1065             live += stp->n_words + countOccupied(stp->large_objects);
1066         }
1067     }
1068     return live;
1069 }
1070
1071 /* Approximate the number of blocks that will be needed at the next
1072  * garbage collection.
1073  *
1074  * Assume: all data currently live will remain live.  Steps that will
1075  * be collected next time will therefore need twice as many blocks
1076  * since all the data will be copied.
1077  */
1078 extern lnat
1079 calcNeeded(void)
1080 {
1081     lnat needed = 0;
1082     nat g, s;
1083     step *stp;
1084
1085     for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
1086         for (s = 0; s < generations[g].n_steps; s++) {
1087             if (g == 0 && s == 0) { continue; }
1088             stp = &generations[g].steps[s];
1089
1090             // we need at least this much space
1091             needed += stp->n_blocks + stp->n_large_blocks;
1092
1093             // any additional space needed to collect this gen next time?
1094             if (g == 0 || // always collect gen 0
1095                 (generations[g].steps[0].n_blocks +
1096                  generations[g].steps[0].n_large_blocks
1097                  > generations[g].max_blocks)) {
1098                 // we will collect this gen next time
1099                 if (stp->mark) {
1100                     //  bitmap:
1101                     needed += stp->n_blocks / BITS_IN(W_);
1102                     //  mark stack:
1103                     needed += stp->n_blocks / 100;
1104                 }
1105                 if (stp->compact) {
1106                     continue; // no additional space needed for compaction
1107                 } else {
1108                     needed += stp->n_blocks;
1109                 }
1110             }
1111         }
1112     }
1113     return needed;
1114 }
1115
1116 /* ----------------------------------------------------------------------------
1117    Executable memory
1118
1119    Executable memory must be managed separately from non-executable
1120    memory.  Most OSs these days require you to jump through hoops to
1121    dynamically allocate executable memory, due to various security
1122    measures.
1123
1124    Here we provide a small memory allocator for executable memory.
1125    Memory is managed with a page granularity; we allocate linearly
1126    in the page, and when the page is emptied (all objects on the page
1127    are free) we free the page again, not forgetting to make it
1128    non-executable.
1129
1130    TODO: The inability to handle objects bigger than BLOCK_SIZE_W means that
1131          the linker cannot use allocateExec for loading object code files
1132          on Windows. Once allocateExec can handle larger objects, the linker
1133          should be modified to use allocateExec instead of VirtualAlloc.
1134    ------------------------------------------------------------------------- */
1135
1136 static bdescr *exec_block;
1137
1138 void *allocateExec (nat bytes)
1139 {
1140     void *ret;
1141     nat n;
1142
1143     ACQUIRE_SM_LOCK;
1144
1145     // round up to words.
1146     n  = (bytes + sizeof(W_) + 1) / sizeof(W_);
1147
1148     if (n+1 > BLOCK_SIZE_W) {
1149         barf("allocateExec: can't handle large objects");
1150     }
1151
1152     if (exec_block == NULL ||
1153         exec_block->free + n + 1 > exec_block->start + BLOCK_SIZE_W) {
1154         bdescr *bd;
1155         lnat pagesize = getPageSize();
1156         bd = allocGroup(stg_max(1, pagesize / BLOCK_SIZE));
1157         debugTrace(DEBUG_gc, "allocate exec block %p", bd->start);
1158         bd->gen_no = 0;
1159         bd->flags = BF_EXEC;
1160         bd->link = exec_block;
1161         if (exec_block != NULL) {
1162             exec_block->u.back = bd;
1163         }
1164         bd->u.back = NULL;
1165         setExecutable(bd->start, bd->blocks * BLOCK_SIZE, rtsTrue);
1166         exec_block = bd;
1167     }
1168     *(exec_block->free) = n;  // store the size of this chunk
1169     exec_block->gen_no += n;  // gen_no stores the number of words allocated
1170     ret = exec_block->free + 1;
1171     exec_block->free += n + 1;
1172
1173     RELEASE_SM_LOCK
1174     return ret;
1175 }
1176
1177 void freeExec (void *addr)
1178 {
1179     StgPtr p = (StgPtr)addr - 1;
1180     bdescr *bd = Bdescr((StgPtr)p);
1181
1182     if ((bd->flags & BF_EXEC) == 0) {
1183         barf("freeExec: not executable");
1184     }
1185
1186     if (*(StgPtr)p == 0) {
1187         barf("freeExec: already free?");
1188     }
1189
1190     ACQUIRE_SM_LOCK;
1191
1192     bd->gen_no -= *(StgPtr)p;
1193     *(StgPtr)p = 0;
1194
1195     if (bd->gen_no == 0) {
1196         // Free the block if it is empty, but not if it is the block at
1197         // the head of the queue.
1198         if (bd != exec_block) {
1199             debugTrace(DEBUG_gc, "free exec block %p", bd->start);
1200             dbl_link_remove(bd, &exec_block);
1201             setExecutable(bd->start, bd->blocks * BLOCK_SIZE, rtsFalse);
1202             freeGroup(bd);
1203         } else {
1204             bd->free = bd->start;
1205         }
1206     }
1207
1208     RELEASE_SM_LOCK
1209 }
1210
1211 /* -----------------------------------------------------------------------------
1212    Debugging
1213
1214    memInventory() checks for memory leaks by counting up all the
1215    blocks we know about and comparing that to the number of blocks
1216    allegedly floating around in the system.
1217    -------------------------------------------------------------------------- */
1218
1219 #ifdef DEBUG
1220
1221 // Useful for finding partially full blocks in gdb
1222 void findSlop(bdescr *bd);
1223 void findSlop(bdescr *bd)
1224 {
1225     lnat slop;
1226
1227     for (; bd != NULL; bd = bd->link) {
1228         slop = (bd->blocks * BLOCK_SIZE_W) - (bd->free - bd->start);
1229         if (slop > (1024/sizeof(W_))) {
1230             debugBelch("block at %p (bdescr %p) has %ldKB slop\n",
1231                        bd->start, bd, slop / (1024/sizeof(W_)));
1232         }
1233     }
1234 }
1235
1236 nat
1237 countBlocks(bdescr *bd)
1238 {
1239     nat n;
1240     for (n=0; bd != NULL; bd=bd->link) {
1241         n += bd->blocks;
1242     }
1243     return n;
1244 }
1245
1246 // (*1) Just like countBlocks, except that we adjust the count for a
1247 // megablock group so that it doesn't include the extra few blocks
1248 // that would be taken up by block descriptors in the second and
1249 // subsequent megablock.  This is so we can tally the count with the
1250 // number of blocks allocated in the system, for memInventory().
1251 static nat
1252 countAllocdBlocks(bdescr *bd)
1253 {
1254     nat n;
1255     for (n=0; bd != NULL; bd=bd->link) {
1256         n += bd->blocks;
1257         // hack for megablock groups: see (*1) above
1258         if (bd->blocks > BLOCKS_PER_MBLOCK) {
1259             n -= (MBLOCK_SIZE / BLOCK_SIZE - BLOCKS_PER_MBLOCK)
1260                 * (bd->blocks/(MBLOCK_SIZE/BLOCK_SIZE));
1261         }
1262     }
1263     return n;
1264 }
1265
1266 static lnat
1267 stepBlocks (step *stp)
1268 {
1269     ASSERT(countBlocks(stp->blocks) == stp->n_blocks);
1270     ASSERT(countBlocks(stp->large_objects) == stp->n_large_blocks);
1271     return stp->n_blocks + stp->n_old_blocks +
1272             countAllocdBlocks(stp->large_objects);
1273 }
1274
1275 // If memInventory() calculates that we have a memory leak, this
1276 // function will try to find the block(s) that are leaking by marking
1277 // all the ones that we know about, and search through memory to find
1278 // blocks that are not marked.  In the debugger this can help to give
1279 // us a clue about what kind of block leaked.  In the future we might
1280 // annotate blocks with their allocation site to give more helpful
1281 // info.
1282 static void
1283 findMemoryLeak (void)
1284 {
1285   nat g, s, i;
1286   for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
1287       for (i = 0; i < n_capabilities; i++) {
1288           markBlocks(capabilities[i].mut_lists[g]);
1289       }
1290       markBlocks(generations[g].mut_list);
1291       for (s = 0; s < generations[g].n_steps; s++) {
1292           markBlocks(generations[g].steps[s].blocks);
1293           markBlocks(generations[g].steps[s].large_objects);
1294       }
1295   }
1296
1297   for (i = 0; i < n_nurseries; i++) {
1298       markBlocks(nurseries[i].blocks);
1299       markBlocks(nurseries[i].large_objects);
1300   }
1301
1302 #ifdef PROFILING
1303   // TODO:
1304   // if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_RETAINER) {
1305   //    markRetainerBlocks();
1306   // }
1307 #endif
1308
1309   // count the blocks allocated by the arena allocator
1310   // TODO:
1311   // markArenaBlocks();
1312
1313   // count the blocks containing executable memory
1314   markBlocks(exec_block);
1315
1316   reportUnmarkedBlocks();
1317 }
1318
1319
1320 void
1321 memInventory (rtsBool show)
1322 {
1323   nat g, s, i;
1324   step *stp;
1325   lnat gen_blocks[RtsFlags.GcFlags.generations];
1326   lnat nursery_blocks, retainer_blocks,
1327        arena_blocks, exec_blocks;
1328   lnat live_blocks = 0, free_blocks = 0;
1329   rtsBool leak;
1330
1331   // count the blocks we current have
1332
1333   for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
1334       gen_blocks[g] = 0;
1335       for (i = 0; i < n_capabilities; i++) {
1336           gen_blocks[g] += countBlocks(capabilities[i].mut_lists[g]);
1337       }
1338       gen_blocks[g] += countAllocdBlocks(generations[g].mut_list);
1339       for (s = 0; s < generations[g].n_steps; s++) {
1340           stp = &generations[g].steps[s];
1341           gen_blocks[g] += stepBlocks(stp);
1342       }
1343   }
1344
1345   nursery_blocks = 0;
1346   for (i = 0; i < n_nurseries; i++) {
1347       nursery_blocks += stepBlocks(&nurseries[i]);
1348   }
1349
1350   retainer_blocks = 0;
1351 #ifdef PROFILING
1352   if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_RETAINER) {
1353       retainer_blocks = retainerStackBlocks();
1354   }
1355 #endif
1356
1357   // count the blocks allocated by the arena allocator
1358   arena_blocks = arenaBlocks();
1359
1360   // count the blocks containing executable memory
1361   exec_blocks = countAllocdBlocks(exec_block);
1362
1363   /* count the blocks on the free list */
1364   free_blocks = countFreeList();
1365
1366   live_blocks = 0;
1367   for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
1368       live_blocks += gen_blocks[g];
1369   }
1370   live_blocks += nursery_blocks +
1371                + retainer_blocks + arena_blocks + exec_blocks;
1372
1373 #define MB(n) (((n) * BLOCK_SIZE_W) / ((1024*1024)/sizeof(W_)))
1374
1375   leak = live_blocks + free_blocks != mblocks_allocated * BLOCKS_PER_MBLOCK;
1376
1377   if (show || leak)
1378   {
1379       if (leak) {
1380           debugBelch("Memory leak detected:\n");
1381       } else {
1382           debugBelch("Memory inventory:\n");
1383       }
1384       for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
1385           debugBelch("  gen %d blocks : %5lu blocks (%lu MB)\n", g,
1386                      gen_blocks[g], MB(gen_blocks[g]));
1387       }
1388       debugBelch("  nursery      : %5lu blocks (%lu MB)\n",
1389                  nursery_blocks, MB(nursery_blocks));
1390       debugBelch("  retainer     : %5lu blocks (%lu MB)\n",
1391                  retainer_blocks, MB(retainer_blocks));
1392       debugBelch("  arena blocks : %5lu blocks (%lu MB)\n",
1393                  arena_blocks, MB(arena_blocks));
1394       debugBelch("  exec         : %5lu blocks (%lu MB)\n",
1395                  exec_blocks, MB(exec_blocks));
1396       debugBelch("  free         : %5lu blocks (%lu MB)\n",
1397                  free_blocks, MB(free_blocks));
1398       debugBelch("  total        : %5lu blocks (%lu MB)\n",
1399                  live_blocks + free_blocks, MB(live_blocks+free_blocks));
1400       if (leak) {
1401           debugBelch("\n  in system    : %5lu blocks (%lu MB)\n",
1402                      mblocks_allocated * BLOCKS_PER_MBLOCK, mblocks_allocated);
1403       }
1404   }
1405
1406   if (leak) {
1407       debugBelch("\n");
1408       findMemoryLeak();
1409   }
1410   ASSERT(n_alloc_blocks == live_blocks);
1411   ASSERT(!leak);
1412 }
1413
1414
1415 /* Full heap sanity check. */
1416 void
1417 checkSanity( void )
1418 {
1419     nat g, s;
1420
1421     if (RtsFlags.GcFlags.generations == 1) {
1422         checkHeap(g0s0->blocks);
1423         checkChain(g0s0->large_objects);
1424     } else {
1425
1426         for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
1427             for (s = 0; s < generations[g].n_steps; s++) {
1428                 if (g == 0 && s == 0) { continue; }
1429                 ASSERT(countBlocks(generations[g].steps[s].blocks)
1430                        == generations[g].steps[s].n_blocks);
1431                 ASSERT(countBlocks(generations[g].steps[s].large_objects)
1432                        == generations[g].steps[s].n_large_blocks);
1433                 checkHeap(generations[g].steps[s].blocks);
1434                 checkChain(generations[g].steps[s].large_objects);
1435                 if (g > 0) {
1436                     checkMutableList(generations[g].mut_list, g);
1437                 }
1438             }
1439         }
1440
1441         for (s = 0; s < n_nurseries; s++) {
1442             ASSERT(countBlocks(nurseries[s].blocks)
1443                    == nurseries[s].n_blocks);
1444             ASSERT(countBlocks(nurseries[s].large_objects)
1445                    == nurseries[s].n_large_blocks);
1446         }
1447
1448         checkFreeListSanity();
1449     }
1450
1451 #if defined(THREADED_RTS)
1452     // check the stacks too in threaded mode, because we don't do a
1453     // full heap sanity check in this case (see checkHeap())
1454     checkGlobalTSOList(rtsTrue);
1455 #else
1456     checkGlobalTSOList(rtsFalse);
1457 #endif
1458 }
1459
1460 /* Nursery sanity check */
1461 void
1462 checkNurserySanity( step *stp )
1463 {
1464     bdescr *bd, *prev;
1465     nat blocks = 0;
1466
1467     prev = NULL;
1468     for (bd = stp->blocks; bd != NULL; bd = bd->link) {
1469         ASSERT(bd->u.back == prev);
1470         prev = bd;
1471         blocks += bd->blocks;
1472     }
1473     ASSERT(blocks == stp->n_blocks);
1474 }
1475
1476 // handy function for use in gdb, because Bdescr() is inlined.
1477 extern bdescr *_bdescr( StgPtr p );
1478
1479 bdescr *
1480 _bdescr( StgPtr p )
1481 {
1482     return Bdescr(p);
1483 }
1484
1485 #endif