rts/sm/GC.c

   1 /* -----------------------------------------------------------------------------
   2  *
   3  * (c) The GHC Team 1998-2008
   4  *
   5  * Generational garbage collector
   6  *
   7  * Documentation on the architecture of the Garbage Collector can be
   8  * found in the online commentary:
   9  *
  10  *   http://hackage.haskell.org/trac/ghc/wiki/Commentary/Rts/Storage/GC
  11  *
  12  * ---------------------------------------------------------------------------*/
  13
  14 #include "PosixSource.h"
  15 #include "Rts.h"
  16 #include "HsFFI.h"
  17
  18 #include "Storage.h"
  19 #include "RtsUtils.h"
  20 #include "Apply.h"
  21 #include "Updates.h"
  22 #include "Stats.h"
  23 #include "Schedule.h"
  24 #include "Sanity.h"
  25 #include "BlockAlloc.h"
  26 #include "ProfHeap.h"
  27 #include "Weak.h"
  28 #include "Prelude.h"
  29 #include "RtsSignals.h"
  30 #include "STM.h"
  31 #if defined(RTS_GTK_FRONTPANEL)
  32 #include "FrontPanel.h"
  33 #endif
  34 #include "Trace.h"
  35 #include "RetainerProfile.h"
  36 #include "LdvProfile.h"
  37 #include "RaiseAsync.h"
  38 #include "Papi.h"
  39 #include "Stable.h"
  40
  41 #include "GC.h"
  42 #include "GCThread.h"
  43 #include "Compact.h"
  44 #include "Evac.h"
  45 #include "Scav.h"
  46 #include "GCUtils.h"
  47 #include "MarkStack.h"
  48 #include "MarkWeak.h"
  49 #include "Sparks.h"
  50 #include "Sweep.h"
  51
  52 #include <string.h> // for memset()
  53 #include <unistd.h>
  54
  55 /* -----------------------------------------------------------------------------
  56    Global variables
  57    -------------------------------------------------------------------------- */
  58
  59 /* STATIC OBJECT LIST.
  60  *
  61  * During GC:
  62  * We maintain a linked list of static objects that are still live.
  63  * The requirements for this list are:
  64  *
  65  *  - we need to scan the list while adding to it, in order to
  66  *    scavenge all the static objects (in the same way that
  67  *    breadth-first scavenging works for dynamic objects).
  68  *
  69  *  - we need to be able to tell whether an object is already on
  70  *    the list, to break loops.
  71  *
  72  * Each static object has a "static link field", which we use for
  73  * linking objects on to the list.  We use a stack-type list, consing
  74  * objects on the front as they are added (this means that the
  75  * scavenge phase is depth-first, not breadth-first, but that
  76  * shouldn't matter).
  77  *
  78  * A separate list is kept for objects that have been scavenged
  79  * already - this is so that we can zero all the marks afterwards.
  80  *
  81  * An object is on the list if its static link field is non-zero; this
  82  * means that we have to mark the end of the list with '1', not NULL.
  83  *
  84  * Extra notes for generational GC:
  85  *
  86  * Each generation has a static object list associated with it.  When
  87  * collecting generations up to N, we treat the static object lists
  88  * from generations > N as roots.
  89  *
  90  * We build up a static object list while collecting generations 0..N,
  91  * which is then appended to the static object list of generation N+1.
  92  */
  93
  94 /* N is the oldest generation being collected, where the generations
  95  * are numbered starting at 0.  A major GC (indicated by the major_gc
  96  * flag) is when we're collecting all generations.  We only attempt to
  97  * deal with static objects and GC CAFs when doing a major GC.
  98  */
  99 nat N;
 100 rtsBool major_gc;
 101
 102 /* Data used for allocation area sizing.
 103  */
 104 static lnat g0s0_pcnt_kept = 30; // percentage of g0s0 live at last minor GC
 105
 106 /* Mut-list stats */
 107 #ifdef DEBUG
 108 nat mutlist_MUTVARS,
 109     mutlist_MUTARRS,
 110     mutlist_MVARS,
 111     mutlist_OTHERS;
 112 #endif
 113
 114 /* Thread-local data for each GC thread
 115  */
 116 gc_thread **gc_threads = NULL;
 117
 118 #if !defined(THREADED_RTS)
 119 StgWord8 the_gc_thread[sizeof(gc_thread) + 64 * sizeof(step_workspace)];
 120 #endif
 121
 122 // Number of threads running in *this* GC.  Affects how many
 123 // step->todos[] lists we have to look in to find work.
 124 nat n_gc_threads;
 125
 126 // For stats:
 127 long copied;        // *words* copied & scavenged during this GC
 128
 129 rtsBool work_stealing;
 130
 131 DECLARE_GCT
 132
 133 /* -----------------------------------------------------------------------------
 134    Static function declarations
 135    -------------------------------------------------------------------------- */
 136
 137 static void mark_root               (void *user, StgClosure **root);
 138 static void zero_static_object_list (StgClosure* first_static);
 139 static nat  initialise_N            (rtsBool force_major_gc);
 140 static void init_collected_gen      (nat g, nat threads);
 141 static void init_uncollected_gen    (nat g, nat threads);
 142 static void init_gc_thread          (gc_thread *t);
 143 static void update_task_list        (void);
 144 static void resize_generations      (void);
 145 static void resize_nursery          (void);
 146 static void start_gc_threads        (void);
 147 static void scavenge_until_all_done (void);
 148 static StgWord inc_running          (void);
 149 static StgWord dec_running          (void);
 150 static void wakeup_gc_threads       (nat n_threads, nat me);
 151 static void shutdown_gc_threads     (nat n_threads, nat me);
 152
 153 #if 0 && defined(DEBUG)
 154 static void gcCAFs                  (void);
 155 #endif
 156
 157 /* -----------------------------------------------------------------------------
 158    The mark stack.
 159    -------------------------------------------------------------------------- */
 160
 161 bdescr *mark_stack_top_bd; // topmost block in the mark stack
 162 bdescr *mark_stack_bd;     // current block in the mark stack
 163 StgPtr mark_sp;            // pointer to the next unallocated mark stack entry
 164
 165 /* -----------------------------------------------------------------------------
 166    GarbageCollect: the main entry point to the garbage collector.
 167
 168    Locks held: all capabilities are held throughout GarbageCollect().
 169    -------------------------------------------------------------------------- */
 170
 171 void
 172 GarbageCollect (rtsBool force_major_gc,
 173                 nat gc_type USED_IF_THREADS,
 174                 Capability *cap)
 175 {
 176   bdescr *bd;
 177   step *stp;
 178   lnat live, allocated, max_copied, avg_copied, slop;
 179   gc_thread *saved_gct;
 180   nat g, s, t, n;
 181
 182   // necessary if we stole a callee-saves register for gct:
 183   saved_gct = gct;
 184
 185 #ifdef PROFILING
 186   CostCentreStack *prev_CCS;
 187 #endif
 188
 189   ACQUIRE_SM_LOCK;
 190
 191 #if defined(RTS_USER_SIGNALS)
 192   if (RtsFlags.MiscFlags.install_signal_handlers) {
 193     // block signals
 194     blockUserSignals();
 195   }
 196 #endif
 197
 198   ASSERT(sizeof(step_workspace) == 16 * sizeof(StgWord));
 199   // otherwise adjust the padding in step_workspace.
 200
 201   // tell the stats department that we've started a GC
 202   stat_startGC();
 203
 204   // tell the STM to discard any cached closures it's hoping to re-use
 205   stmPreGCHook();
 206
 207   // lock the StablePtr table
 208   stablePtrPreGC();
 209
 210 #ifdef DEBUG
 211   mutlist_MUTVARS = 0;
 212   mutlist_MUTARRS = 0;
 213   mutlist_OTHERS = 0;
 214 #endif
 215
 216   // attribute any costs to CCS_GC
 217 #ifdef PROFILING
 218   prev_CCS = CCCS;
 219   CCCS = CCS_GC;
 220 #endif
 221
 222   /* Approximate how much we allocated.
 223    * Todo: only when generating stats?
 224    */
 225   allocated = calcAllocated();
 226
 227   /* Figure out which generation to collect
 228    */
 229   n = initialise_N(force_major_gc);
 230
 231 #if defined(THREADED_RTS)
 232   work_stealing = RtsFlags.ParFlags.parGcLoadBalancingEnabled &&
 233                   N >= RtsFlags.ParFlags.parGcLoadBalancingGen;
 234       // It's not always a good idea to do load balancing in parallel
 235       // GC.  In particular, for a parallel program we don't want to
 236       // lose locality by moving cached data into another CPU's cache
 237       // (this effect can be quite significant).
 238       //
 239       // We could have a more complex way to deterimine whether to do
 240       // work stealing or not, e.g. it might be a good idea to do it
 241       // if the heap is big.  For now, we just turn it on or off with
 242       // a flag.
 243 #endif
 244
 245   /* Start threads, so they can be spinning up while we finish initialisation.
 246    */
 247   start_gc_threads();
 248
 249 #if defined(THREADED_RTS)
 250   /* How many threads will be participating in this GC?
 251    * We don't try to parallelise minor GCs (unless the user asks for
 252    * it with +RTS -gn0), or mark/compact/sweep GC.
 253    */
 254   if (gc_type == PENDING_GC_PAR) {
 255       n_gc_threads = RtsFlags.ParFlags.nNodes;
 256   } else {
 257       n_gc_threads = 1;
 258   }
 259 #else
 260   n_gc_threads = 1;
 261 #endif
 262
 263   debugTrace(DEBUG_gc, "GC (gen %d): %d KB to collect, %ld MB in use, using %d thread(s)",
 264         N, n * (BLOCK_SIZE / 1024), mblocks_allocated, n_gc_threads);
 265
 266 #ifdef RTS_GTK_FRONTPANEL
 267   if (RtsFlags.GcFlags.frontpanel) {
 268       updateFrontPanelBeforeGC(N);
 269   }
 270 #endif
 271
 272 #ifdef DEBUG
 273   // check for memory leaks if DEBUG is on
 274   memInventory(DEBUG_gc);
 275 #endif
 276
 277   // check sanity *before* GC
 278   IF_DEBUG(sanity, checkSanity(rtsTrue));
 279
 280   // Initialise all our gc_thread structures
 281   for (t = 0; t < n_gc_threads; t++) {
 282       init_gc_thread(gc_threads[t]);
 283   }
 284
 285   // Initialise all the generations/steps that we're collecting.
 286   for (g = 0; g <= N; g++) {
 287       init_collected_gen(g,n_gc_threads);
 288   }
 289
 290   // Initialise all the generations/steps that we're *not* collecting.
 291   for (g = N+1; g < RtsFlags.GcFlags.generations; g++) {
 292       init_uncollected_gen(g,n_gc_threads);
 293   }
 294
 295   /* Allocate a mark stack if we're doing a major collection.
 296    */
 297   if (major_gc && oldest_gen->steps[0].mark) {
 298       mark_stack_bd     = allocBlock();
 299       mark_stack_top_bd = mark_stack_bd;
 300       mark_stack_bd->link = NULL;
 301       mark_stack_bd->u.back = NULL;
 302       mark_sp           = mark_stack_bd->start;
 303   } else {
 304       mark_stack_bd     = NULL;
 305       mark_stack_top_bd = NULL;
 306       mark_sp           = NULL;
 307   }
 308
 309   // this is the main thread
 310 #ifdef THREADED_RTS
 311   if (n_gc_threads == 1) {
 312       SET_GCT(gc_threads[0]);
 313   } else {
 314       SET_GCT(gc_threads[cap->no]);
 315   }
 316 #else
 317 SET_GCT(gc_threads[0]);
 318 #endif
 319
 320   /* -----------------------------------------------------------------------
 321    * follow all the roots that we know about:
 322    */
 323
 324   // the main thread is running: this prevents any other threads from
 325   // exiting prematurely, so we can start them now.
 326   // NB. do this after the mutable lists have been saved above, otherwise
 327   // the other GC threads will be writing into the old mutable lists.
 328   inc_running();
 329   wakeup_gc_threads(n_gc_threads, gct->thread_index);
 330
 331   // Mutable lists from each generation > N
 332   // we want to *scavenge* these roots, not evacuate them: they're not
 333   // going to move in this GC.
 334   // Also do them in reverse generation order, for the usual reason:
 335   // namely to reduce the likelihood of spurious old->new pointers.
 336   //
 337   for (g = RtsFlags.GcFlags.generations-1; g > N; g--) {
 338       scavenge_mutable_list(generations[g].saved_mut_list, &generations[g]);
 339       freeChain_sync(generations[g].saved_mut_list);
 340       generations[g].saved_mut_list = NULL;
 341
 342   }
 343
 344   // scavenge the capability-private mutable lists.  This isn't part
 345   // of markSomeCapabilities() because markSomeCapabilities() can only
 346   // call back into the GC via mark_root() (due to the gct register
 347   // variable).
 348   if (n_gc_threads == 1) {
 349       for (n = 0; n < n_capabilities; n++) {
 350           scavenge_capability_mut_lists(&capabilities[n]);
 351       }
 352   } else {
 353       scavenge_capability_mut_lists(&capabilities[gct->thread_index]);
 354   }
 355
 356   // follow roots from the CAF list (used by GHCi)
 357   gct->evac_step = 0;
 358   markCAFs(mark_root, gct);
 359
 360   // follow all the roots that the application knows about.
 361   gct->evac_step = 0;
 362   markSomeCapabilities(mark_root, gct, gct->thread_index, n_gc_threads,
 363                        rtsTrue/*prune sparks*/);
 364
 365 #if defined(RTS_USER_SIGNALS)
 366   // mark the signal handlers (signals should be already blocked)
 367   markSignalHandlers(mark_root, gct);
 368 #endif
 369
 370   // Mark the weak pointer list, and prepare to detect dead weak pointers.
 371   markWeakPtrList();
 372   initWeakForGC();
 373
 374   // Mark the stable pointer table.
 375   markStablePtrTable(mark_root, gct);
 376
 377   /* -------------------------------------------------------------------------
 378    * Repeatedly scavenge all the areas we know about until there's no
 379    * more scavenging to be done.
 380    */
 381   for (;;)
 382   {
 383       scavenge_until_all_done();
 384       // The other threads are now stopped.  We might recurse back to
 385       // here, but from now on this is the only thread.
 386
 387       // if any blackholes are alive, make the threads that wait on
 388       // them alive too.
 389       if (traverseBlackholeQueue()) {
 390           inc_running();
 391           continue;
 392       }
 393
 394       // must be last...  invariant is that everything is fully
 395       // scavenged at this point.
 396       if (traverseWeakPtrList()) { // returns rtsTrue if evaced something
 397           inc_running();
 398           continue;
 399       }
 400
 401       // If we get to here, there's really nothing left to do.
 402       break;
 403   }
 404
 405   shutdown_gc_threads(n_gc_threads, gct->thread_index);
 406
 407   // Update pointers from the Task list
 408   update_task_list();
 409
 410   // Now see which stable names are still alive.
 411   gcStablePtrTable();
 412
 413 #ifdef PROFILING
 414   // We call processHeapClosureForDead() on every closure destroyed during
 415   // the current garbage collection, so we invoke LdvCensusForDead().
 416   if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_LDV
 417       || RtsFlags.ProfFlags.bioSelector != NULL)
 418     LdvCensusForDead(N);
 419 #endif
 420
 421   // NO MORE EVACUATION AFTER THIS POINT!
 422
 423   // Two-space collector: free the old to-space.
 424   // g0s0->old_blocks is the old nursery
 425   // g0s0->blocks is to-space from the previous GC
 426   if (RtsFlags.GcFlags.generations == 1) {
 427       if (g0->steps[0].blocks != NULL) {
 428           freeChain(g0->steps[0].blocks);
 429           g0->steps[0].blocks = NULL;
 430       }
 431   }
 432
 433   // For each workspace, in each thread, move the copied blocks to the step
 434   {
 435       gc_thread *thr;
 436       step_workspace *ws;
 437       bdescr *prev, *next;
 438
 439       for (t = 0; t < n_gc_threads; t++) {
 440           thr = gc_threads[t];
 441
 442           // not step 0
 443           if (RtsFlags.GcFlags.generations == 1) {
 444               s = 0;
 445           } else {
 446               s = 1;
 447           }
 448           for (; s < total_steps; s++) {
 449               ws = &thr->steps[s];
 450
 451               // Push the final block
 452               if (ws->todo_bd) {
 453                   push_scanned_block(ws->todo_bd, ws);
 454               }
 455
 456               ASSERT(gct->scan_bd == NULL);
 457               ASSERT(countBlocks(ws->scavd_list) == ws->n_scavd_blocks);
 458
 459               prev = NULL;
 460               for (bd = ws->scavd_list; bd != NULL; bd = bd->link) {
 461                   ws->step->n_words += bd->free - bd->start;
 462                   prev = bd;
 463               }
 464               if (prev != NULL) {
 465                   prev->link = ws->step->blocks;
 466                   ws->step->blocks = ws->scavd_list;
 467               }
 468               ws->step->n_blocks += ws->n_scavd_blocks;
 469           }
 470       }
 471
 472       // Add all the partial blocks *after* we've added all the full
 473       // blocks.  This is so that we can grab the partial blocks back
 474       // again and try to fill them up in the next GC.
 475       for (t = 0; t < n_gc_threads; t++) {
 476           thr = gc_threads[t];
 477
 478           // not step 0
 479           if (RtsFlags.GcFlags.generations == 1) {
 480               s = 0;
 481           } else {
 482               s = 1;
 483           }
 484           for (; s < total_steps; s++) {
 485               ws = &thr->steps[s];
 486
 487               prev = NULL;
 488               for (bd = ws->part_list; bd != NULL; bd = next) {
 489                   next = bd->link;
 490                   if (bd->free == bd->start) {
 491                       if (prev == NULL) {
 492                           ws->part_list = next;
 493                       } else {
 494                           prev->link = next;
 495                       }
 496                       freeGroup(bd);
 497                       ws->n_part_blocks--;
 498                   } else {
 499                       ws->step->n_words += bd->free - bd->start;
 500                       prev = bd;
 501                   }
 502               }
 503               if (prev != NULL) {
 504                   prev->link = ws->step->blocks;
 505                   ws->step->blocks = ws->part_list;
 506               }
 507               ws->step->n_blocks += ws->n_part_blocks;
 508
 509               ASSERT(countBlocks(ws->step->blocks) == ws->step->n_blocks);
 510               ASSERT(countOccupied(ws->step->blocks) == ws->step->n_words);
 511           }
 512       }
 513   }
 514
 515   // Finally: compact or sweep the oldest generation.
 516   if (major_gc && oldest_gen->steps[0].mark) {
 517       if (oldest_gen->steps[0].compact)
 518           compact(gct->scavenged_static_objects);
 519       else
 520           sweep(&oldest_gen->steps[0]);
 521   }
 522
 523   /* run through all the generations/steps and tidy up
 524    */
 525   copied = 0;
 526   max_copied = 0;
 527   avg_copied = 0;
 528   {
 529       nat i;
 530       for (i=0; i < n_gc_threads; i++) {
 531           if (n_gc_threads > 1) {
 532               debugTrace(DEBUG_gc,"thread %d:", i);
 533               debugTrace(DEBUG_gc,"   copied           %ld", gc_threads[i]->copied * sizeof(W_));
 534               debugTrace(DEBUG_gc,"   scanned          %ld", gc_threads[i]->scanned * sizeof(W_));
 535               debugTrace(DEBUG_gc,"   any_work         %ld", gc_threads[i]->any_work);
 536               debugTrace(DEBUG_gc,"   no_work          %ld", gc_threads[i]->no_work);
 537               debugTrace(DEBUG_gc,"   scav_find_work %ld",   gc_threads[i]->scav_find_work);
 538           }
 539           copied += gc_threads[i]->copied;
 540           max_copied = stg_max(gc_threads[i]->copied, max_copied);
 541       }
 542       if (n_gc_threads == 1) {
 543           max_copied = 0;
 544           avg_copied = 0;
 545       } else {
 546           avg_copied = copied;
 547       }
 548   }
 549
 550   for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
 551
 552     if (g == N) {
 553       generations[g].collections++; // for stats
 554       if (n_gc_threads > 1) generations[g].par_collections++;
 555     }
 556
 557     // Count the mutable list as bytes "copied" for the purposes of
 558     // stats.  Every mutable list is copied during every GC.
 559     if (g > 0) {
 560         nat mut_list_size = 0;
 561         for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) {
 562             mut_list_size += bd->free - bd->start;
 563         }
 564         for (n = 0; n < n_capabilities; n++) {
 565             for (bd = capabilities[n].mut_lists[g];
 566                  bd != NULL; bd = bd->link) {
 567                 mut_list_size += bd->free - bd->start;
 568             }
 569         }
 570         copied +=  mut_list_size;
 571
 572         debugTrace(DEBUG_gc,
 573                    "mut_list_size: %lu (%d vars, %d arrays, %d MVARs, %d others)",
 574                    (unsigned long)(mut_list_size * sizeof(W_)),
 575                    mutlist_MUTVARS, mutlist_MUTARRS, mutlist_MVARS, mutlist_OTHERS);
 576     }
 577
 578     for (s = 0; s < generations[g].n_steps; s++) {
 579       bdescr *next, *prev;
 580       stp = &generations[g].steps[s];
 581
 582       // for generations we collected...
 583       if (g <= N) {
 584
 585         /* free old memory and shift to-space into from-space for all
 586          * the collected steps (except the allocation area).  These
 587          * freed blocks will probaby be quickly recycled.
 588          */
 589         if (!(g == 0 && s == 0 && RtsFlags.GcFlags.generations > 1)) {
 590             if (stp->mark)
 591             {
 592                 // tack the new blocks on the end of the existing blocks
 593                 if (stp->old_blocks != NULL) {
 594
 595                     prev = NULL;
 596                     for (bd = stp->old_blocks; bd != NULL; bd = next) {
 597
 598                         next = bd->link;
 599
 600                         if (!(bd->flags & BF_MARKED))
 601                         {
 602                             if (prev == NULL) {
 603                                 stp->old_blocks = next;
 604                             } else {
 605                                 prev->link = next;
 606                             }
 607                             freeGroup(bd);
 608                             stp->n_old_blocks--;
 609                         }
 610                         else
 611                         {
 612                             stp->n_words += bd->free - bd->start;
 613
 614                             // NB. this step might not be compacted next
 615                             // time, so reset the BF_MARKED flags.
 616                             // They are set before GC if we're going to
 617                             // compact.  (search for BF_MARKED above).
 618                             bd->flags &= ~BF_MARKED;
 619
 620                             // between GCs, all blocks in the heap except
 621                             // for the nursery have the BF_EVACUATED flag set.
 622                             bd->flags |= BF_EVACUATED;
 623
 624                             prev = bd;
 625                         }
 626                     }
 627
 628                     if (prev != NULL) {
 629                         prev->link = stp->blocks;
 630                         stp->blocks = stp->old_blocks;
 631                     }
 632                 }
 633                 // add the new blocks to the block tally
 634                 stp->n_blocks += stp->n_old_blocks;
 635                 ASSERT(countBlocks(stp->blocks) == stp->n_blocks);
 636                 ASSERT(countOccupied(stp->blocks) == stp->n_words);
 637             }
 638             else // not copacted
 639             {
 640                 freeChain(stp->old_blocks);
 641             }
 642             stp->old_blocks = NULL;
 643             stp->n_old_blocks = 0;
 644         }
 645
 646         /* LARGE OBJECTS.  The current live large objects are chained on
 647          * scavenged_large, having been moved during garbage
 648          * collection from large_objects.  Any objects left on the
 649          * large_objects list are therefore dead, so we free them here.
 650          */
 651         freeChain(stp->large_objects);
 652         stp->large_objects  = stp->scavenged_large_objects;
 653         stp->n_large_blocks = stp->n_scavenged_large_blocks;
 654         ASSERT(countBlocks(stp->large_objects) == stp->n_large_blocks);
 655       }
 656       else // for older generations...
 657       {
 658         /* For older generations, we need to append the
 659          * scavenged_large_object list (i.e. large objects that have been
 660          * promoted during this GC) to the large_object list for that step.
 661          */
 662         for (bd = stp->scavenged_large_objects; bd; bd = next) {
 663           next = bd->link;
 664           dbl_link_onto(bd, &stp->large_objects);
 665         }
 666
 667         // add the new blocks we promoted during this GC
 668         stp->n_large_blocks += stp->n_scavenged_large_blocks;
 669         ASSERT(countBlocks(stp->large_objects) == stp->n_large_blocks);
 670       }
 671     }
 672   }
 673
 674   // update the max size of older generations after a major GC
 675   resize_generations();
 676
 677   // Calculate the amount of live data for stats.
 678   live = calcLiveWords();
 679
 680   // Free the small objects allocated via allocate(), since this will
 681   // all have been copied into G0S1 now.
 682   if (RtsFlags.GcFlags.generations > 1) {
 683       if (g0->steps[0].blocks != NULL) {
 684           freeChain(g0->steps[0].blocks);
 685           g0->steps[0].blocks = NULL;
 686       }
 687       g0->steps[0].n_blocks = 0;
 688       g0->steps[0].n_words = 0;
 689   }
 690   alloc_blocks_lim = RtsFlags.GcFlags.minAllocAreaSize;
 691
 692   // Start a new pinned_object_block
 693   for (n = 0; n < n_capabilities; n++) {
 694       capabilities[n].pinned_object_block = NULL;
 695   }
 696
 697   // Free the mark stack.
 698   if (mark_stack_top_bd != NULL) {
 699       debugTrace(DEBUG_gc, "mark stack: %d blocks",
 700                  countBlocks(mark_stack_top_bd));
 701       freeChain(mark_stack_top_bd);
 702   }
 703
 704   // Free any bitmaps.
 705   for (g = 0; g <= N; g++) {
 706       for (s = 0; s < generations[g].n_steps; s++) {
 707           stp = &generations[g].steps[s];
 708           if (stp->bitmap != NULL) {
 709               freeGroup(stp->bitmap);
 710               stp->bitmap = NULL;
 711           }
 712       }
 713   }
 714
 715   resize_nursery();
 716
 717  // mark the garbage collected CAFs as dead
 718 #if 0 && defined(DEBUG) // doesn't work at the moment
 719   if (major_gc) { gcCAFs(); }
 720 #endif
 721
 722 #ifdef PROFILING
 723   // resetStaticObjectForRetainerProfiling() must be called before
 724   // zeroing below.
 725   if (n_gc_threads > 1) {
 726       barf("profiling is currently broken with multi-threaded GC");
 727       // ToDo: fix the gct->scavenged_static_objects below
 728   }
 729   resetStaticObjectForRetainerProfiling(gct->scavenged_static_objects);
 730 #endif
 731
 732   // zero the scavenged static object list
 733   if (major_gc) {
 734       nat i;
 735       for (i = 0; i < n_gc_threads; i++) {
 736           zero_static_object_list(gc_threads[i]->scavenged_static_objects);
 737       }
 738   }
 739
 740   // Reset the nursery
 741   resetNurseries();
 742
 743   // start any pending finalizers
 744   RELEASE_SM_LOCK;
 745   scheduleFinalizers(cap, old_weak_ptr_list);
 746   ACQUIRE_SM_LOCK;
 747
 748   // send exceptions to any threads which were about to die
 749   RELEASE_SM_LOCK;
 750   resurrectThreads(resurrected_threads);
 751   performPendingThrowTos(exception_threads);
 752   ACQUIRE_SM_LOCK;
 753
 754   // Update the stable pointer hash table.
 755   updateStablePtrTable(major_gc);
 756
 757   // check sanity after GC
 758   IF_DEBUG(sanity, checkSanity(rtsTrue));
 759
 760   // extra GC trace info
 761   IF_DEBUG(gc, statDescribeGens());
 762
 763 #ifdef DEBUG
 764   // symbol-table based profiling
 765   /*  heapCensus(to_blocks); */ /* ToDo */
 766 #endif
 767
 768   // restore enclosing cost centre
 769 #ifdef PROFILING
 770   CCCS = prev_CCS;
 771 #endif
 772
 773 #ifdef DEBUG
 774   // check for memory leaks if DEBUG is on
 775   memInventory(DEBUG_gc);
 776 #endif
 777
 778 #ifdef RTS_GTK_FRONTPANEL
 779   if (RtsFlags.GcFlags.frontpanel) {
 780       updateFrontPanelAfterGC( N, live );
 781   }
 782 #endif
 783
 784   // ok, GC over: tell the stats department what happened.
 785   slop = calcLiveBlocks() * BLOCK_SIZE_W - live;
 786   stat_endGC(allocated, live, copied, N, max_copied, avg_copied, slop);
 787
 788   // unlock the StablePtr table
 789   stablePtrPostGC();
 790
 791   // Guess which generation we'll collect *next* time
 792   initialise_N(force_major_gc);
 793
 794 #if defined(RTS_USER_SIGNALS)
 795   if (RtsFlags.MiscFlags.install_signal_handlers) {
 796     // unblock signals again
 797     unblockUserSignals();
 798   }
 799 #endif
 800
 801   RELEASE_SM_LOCK;
 802
 803   SET_GCT(saved_gct);
 804 }
 805
 806 /* -----------------------------------------------------------------------------
 807    Figure out which generation to collect, initialise N and major_gc.
 808
 809    Also returns the total number of blocks in generations that will be
 810    collected.
 811    -------------------------------------------------------------------------- */
 812
 813 static nat
 814 initialise_N (rtsBool force_major_gc)
 815 {
 816     int g;
 817     nat s, blocks, blocks_total;
 818
 819     blocks = 0;
 820     blocks_total = 0;
 821
 822     if (force_major_gc) {
 823         N = RtsFlags.GcFlags.generations - 1;
 824     } else {
 825         N = 0;
 826     }
 827
 828     for (g = RtsFlags.GcFlags.generations - 1; g >= 0; g--) {
 829         blocks = 0;
 830         for (s = 0; s < generations[g].n_steps; s++) {
 831             blocks += generations[g].steps[s].n_words / BLOCK_SIZE_W;
 832             blocks += generations[g].steps[s].n_large_blocks;
 833         }
 834         if (blocks >= generations[g].max_blocks) {
 835             N = stg_max(N,g);
 836         }
 837         if ((nat)g <= N) {
 838             blocks_total += blocks;
 839         }
 840     }
 841
 842     blocks_total += countNurseryBlocks();
 843
 844     major_gc = (N == RtsFlags.GcFlags.generations-1);
 845     return blocks_total;
 846 }
 847
 848 /* -----------------------------------------------------------------------------
 849    Initialise the gc_thread structures.
 850    -------------------------------------------------------------------------- */
 851
 852 #define GC_THREAD_INACTIVE             0
 853 #define GC_THREAD_STANDING_BY          1
 854 #define GC_THREAD_RUNNING              2
 855 #define GC_THREAD_WAITING_TO_CONTINUE  3
 856
 857 static void
 858 new_gc_thread (nat n, gc_thread *t)
 859 {
 860     nat s;
 861     step_workspace *ws;
 862
 863 #ifdef THREADED_RTS
 864     t->id = 0;
 865     initSpinLock(&t->gc_spin);
 866     initSpinLock(&t->mut_spin);
 867     ACQUIRE_SPIN_LOCK(&t->gc_spin);
 868     t->wakeup = GC_THREAD_INACTIVE;  // starts true, so we can wait for the
 869                           // thread to start up, see wakeup_gc_threads
 870 #endif
 871
 872     t->thread_index = n;
 873     t->free_blocks = NULL;
 874     t->gc_count = 0;
 875
 876     init_gc_thread(t);
 877
 878 #ifdef USE_PAPI
 879     t->papi_events = -1;
 880 #endif
 881
 882     for (s = 0; s < total_steps; s++)
 883     {
 884         ws = &t->steps[s];
 885         ws->step = &all_steps[s];
 886         ASSERT(s == ws->step->abs_no);
 887         ws->my_gct = t;
 888
 889         ws->todo_bd = NULL;
 890         ws->todo_q = newWSDeque(128);
 891         ws->todo_overflow = NULL;
 892         ws->n_todo_overflow = 0;
 893
 894         ws->part_list = NULL;
 895         ws->n_part_blocks = 0;
 896
 897         ws->scavd_list = NULL;
 898         ws->n_scavd_blocks = 0;
 899     }
 900 }
 901
 902
 903 void
 904 initGcThreads (void)
 905 {
 906     if (gc_threads == NULL) {
 907 #if defined(THREADED_RTS)
 908         nat i;
 909         gc_threads = stgMallocBytes (RtsFlags.ParFlags.nNodes *
 910                                      sizeof(gc_thread*),
 911                                      "alloc_gc_threads");
 912
 913         for (i = 0; i < RtsFlags.ParFlags.nNodes; i++) {
 914             gc_threads[i] =
 915                 stgMallocBytes(sizeof(gc_thread) + total_steps * sizeof(step_workspace),
 916                                "alloc_gc_threads");
 917
 918             new_gc_thread(i, gc_threads[i]);
 919         }
 920 #else
 921         gc_threads = stgMallocBytes (sizeof(gc_thread*),"alloc_gc_threads");
 922         gc_threads[0] = gct;
 923         new_gc_thread(0,gc_threads[0]);
 924 #endif
 925     }
 926 }
 927
 928 void
 929 freeGcThreads (void)
 930 {
 931     nat s;
 932     if (gc_threads != NULL) {
 933 #if defined(THREADED_RTS)
 934         nat i;
 935         for (i = 0; i < n_capabilities; i++) {
 936             for (s = 0; s < total_steps; s++)
 937             {
 938                 freeWSDeque(gc_threads[i]->steps[s].todo_q);
 939             }
 940             stgFree (gc_threads[i]);
 941         }
 942         stgFree (gc_threads);
 943 #else
 944         for (s = 0; s < total_steps; s++)
 945         {
 946             freeWSDeque(gc_threads[0]->steps[s].todo_q);
 947         }
 948         stgFree (gc_threads);
 949 #endif
 950         gc_threads = NULL;
 951     }
 952 }
 953
 954 /* ----------------------------------------------------------------------------
 955    Start GC threads
 956    ------------------------------------------------------------------------- */
 957
 958 static volatile StgWord gc_running_threads;
 959
 960 static StgWord
 961 inc_running (void)
 962 {
 963     StgWord new;
 964     new = atomic_inc(&gc_running_threads);
 965     ASSERT(new <= n_gc_threads);
 966     return new;
 967 }
 968
 969 static StgWord
 970 dec_running (void)
 971 {
 972     ASSERT(gc_running_threads != 0);
 973     return atomic_dec(&gc_running_threads);
 974 }
 975
 976 static rtsBool
 977 any_work (void)
 978 {
 979     int s;
 980     step_workspace *ws;
 981
 982     gct->any_work++;
 983
 984     write_barrier();
 985
 986     // scavenge objects in compacted generation
 987     if (mark_stack_bd != NULL && !mark_stack_empty()) {
 988         return rtsTrue;
 989     }
 990
 991     // Check for global work in any step.  We don't need to check for
 992     // local work, because we have already exited scavenge_loop(),
 993     // which means there is no local work for this thread.
 994     for (s = total_steps-1; s >= 0; s--) {
 995         if (s == 0 && RtsFlags.GcFlags.generations > 1) {
 996             continue;
 997         }
 998         ws = &gct->steps[s];
 999         if (ws->todo_large_objects) return rtsTrue;
1000         if (!looksEmptyWSDeque(ws->todo_q)) return rtsTrue;
1001         if (ws->todo_overflow) return rtsTrue;
1002     }
1003
1004 #if defined(THREADED_RTS)
1005     if (work_stealing) {
1006         nat n;
1007         // look for work to steal
1008         for (n = 0; n < n_gc_threads; n++) {
1009             if (n == gct->thread_index) continue;
1010             for (s = total_steps-1; s >= 0; s--) {
1011                 ws = &gc_threads[n]->steps[s];
1012                 if (!looksEmptyWSDeque(ws->todo_q)) return rtsTrue;
1013             }
1014         }
1015     }
1016 #endif
1017
1018     gct->no_work++;
1019
1020     return rtsFalse;
1021 }
1022
1023 static void
1024 scavenge_until_all_done (void)
1025 {
1026     nat r;
1027
1028
1029 loop:
1030     traceEvent(&capabilities[gct->thread_index], EVENT_GC_WORK);
1031
1032 #if defined(THREADED_RTS)
1033     if (n_gc_threads > 1) {
1034         scavenge_loop();
1035     } else {
1036         scavenge_loop1();
1037     }
1038 #else
1039     scavenge_loop();
1040 #endif
1041
1042     // scavenge_loop() only exits when there's no work to do
1043     r = dec_running();
1044
1045     traceEvent(&capabilities[gct->thread_index], EVENT_GC_IDLE);
1046
1047     debugTrace(DEBUG_gc, "%d GC threads still running", r);
1048
1049     while (gc_running_threads != 0) {
1050         // usleep(1);
1051         if (any_work()) {
1052             inc_running();
1053             goto loop;
1054         }
1055         // any_work() does not remove the work from the queue, it
1056         // just checks for the presence of work.  If we find any,
1057         // then we increment gc_running_threads and go back to
1058         // scavenge_loop() to perform any pending work.
1059     }
1060
1061     traceEvent(&capabilities[gct->thread_index], EVENT_GC_DONE);
1062 }
1063
1064 #if defined(THREADED_RTS)
1065
1066 void
1067 gcWorkerThread (Capability *cap)
1068 {
1069     gc_thread *saved_gct;
1070
1071     // necessary if we stole a callee-saves register for gct:
1072     saved_gct = gct;
1073
1074     cap->in_gc = rtsTrue;
1075
1076     gct = gc_threads[cap->no];
1077     gct->id = osThreadId();
1078
1079     // Wait until we're told to wake up
1080     RELEASE_SPIN_LOCK(&gct->mut_spin);
1081     gct->wakeup = GC_THREAD_STANDING_BY;
1082     debugTrace(DEBUG_gc, "GC thread %d standing by...", gct->thread_index);
1083     ACQUIRE_SPIN_LOCK(&gct->gc_spin);
1084
1085 #ifdef USE_PAPI
1086     // start performance counters in this thread...
1087     if (gct->papi_events == -1) {
1088         papi_init_eventset(&gct->papi_events);
1089     }
1090     papi_thread_start_gc1_count(gct->papi_events);
1091 #endif
1092
1093     // Every thread evacuates some roots.
1094     gct->evac_step = 0;
1095     markSomeCapabilities(mark_root, gct, gct->thread_index, n_gc_threads,
1096                          rtsTrue/*prune sparks*/);
1097     scavenge_capability_mut_lists(&capabilities[gct->thread_index]);
1098
1099     scavenge_until_all_done();
1100
1101 #ifdef USE_PAPI
1102     // count events in this thread towards the GC totals
1103     papi_thread_stop_gc1_count(gct->papi_events);
1104 #endif
1105
1106     // Wait until we're told to continue
1107     RELEASE_SPIN_LOCK(&gct->gc_spin);
1108     gct->wakeup = GC_THREAD_WAITING_TO_CONTINUE;
1109     debugTrace(DEBUG_gc, "GC thread %d waiting to continue...",
1110                gct->thread_index);
1111     ACQUIRE_SPIN_LOCK(&gct->mut_spin);
1112     debugTrace(DEBUG_gc, "GC thread %d on my way...", gct->thread_index);
1113
1114     SET_GCT(saved_gct);
1115 }
1116
1117 #endif
1118
1119 #if defined(THREADED_RTS)
1120
1121 void
1122 waitForGcThreads (Capability *cap USED_IF_THREADS)
1123 {
1124     nat n_threads = RtsFlags.ParFlags.nNodes;
1125     nat me = cap->no;
1126     nat i, j;
1127     rtsBool retry = rtsTrue;
1128
1129     while(retry) {
1130         for (i=0; i < n_threads; i++) {
1131             if (i == me) continue;
1132             if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) {
1133                 prodCapability(&capabilities[i], cap->running_task);
1134             }
1135         }
1136         for (j=0; j < 10; j++) {
1137             retry = rtsFalse;
1138             for (i=0; i < n_threads; i++) {
1139                 if (i == me) continue;
1140                 write_barrier();
1141                 setContextSwitches();
1142                 if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) {
1143                     retry = rtsTrue;
1144                 }
1145             }
1146             if (!retry) break;
1147             yieldThread();
1148         }
1149     }
1150 }
1151
1152 #endif // THREADED_RTS
1153
1154 static void
1155 start_gc_threads (void)
1156 {
1157 #if defined(THREADED_RTS)
1158     gc_running_threads = 0;
1159 #endif
1160 }
1161
1162 static void
1163 wakeup_gc_threads (nat n_threads USED_IF_THREADS, nat me USED_IF_THREADS)
1164 {
1165 #if defined(THREADED_RTS)
1166     nat i;
1167     for (i=0; i < n_threads; i++) {
1168         if (i == me) continue;
1169         inc_running();
1170         debugTrace(DEBUG_gc, "waking up gc thread %d", i);
1171         if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) barf("wakeup_gc_threads");
1172
1173         gc_threads[i]->wakeup = GC_THREAD_RUNNING;
1174         ACQUIRE_SPIN_LOCK(&gc_threads[i]->mut_spin);
1175         RELEASE_SPIN_LOCK(&gc_threads[i]->gc_spin);
1176     }
1177 #endif
1178 }
1179
1180 // After GC is complete, we must wait for all GC threads to enter the
1181 // standby state, otherwise they may still be executing inside
1182 // any_work(), and may even remain awake until the next GC starts.
1183 static void
1184 shutdown_gc_threads (nat n_threads USED_IF_THREADS, nat me USED_IF_THREADS)
1185 {
1186 #if defined(THREADED_RTS)
1187     nat i;
1188     for (i=0; i < n_threads; i++) {
1189         if (i == me) continue;
1190         while (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE) { write_barrier(); }
1191     }
1192 #endif
1193 }
1194
1195 #if defined(THREADED_RTS)
1196 void
1197 releaseGCThreads (Capability *cap USED_IF_THREADS)
1198 {
1199     nat n_threads = RtsFlags.ParFlags.nNodes;
1200     nat me = cap->no;
1201     nat i;
1202     for (i=0; i < n_threads; i++) {
1203         if (i == me) continue;
1204         if (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE)
1205             barf("releaseGCThreads");
1206
1207         gc_threads[i]->wakeup = GC_THREAD_INACTIVE;
1208         ACQUIRE_SPIN_LOCK(&gc_threads[i]->gc_spin);
1209         RELEASE_SPIN_LOCK(&gc_threads[i]->mut_spin);
1210     }
1211 }
1212 #endif
1213
1214 /* ----------------------------------------------------------------------------
1215    Initialise a generation that is to be collected
1216    ------------------------------------------------------------------------- */
1217
1218 static void
1219 init_collected_gen (nat g, nat n_threads)
1220 {
1221     nat s, t, i;
1222     step_workspace *ws;
1223     step *stp;
1224     bdescr *bd;
1225
1226     // Throw away the current mutable list.  Invariant: the mutable
1227     // list always has at least one block; this means we can avoid a
1228     // check for NULL in recordMutable().
1229     if (g != 0) {
1230         freeChain(generations[g].mut_list);
1231         generations[g].mut_list = allocBlock();
1232         for (i = 0; i < n_capabilities; i++) {
1233             freeChain(capabilities[i].mut_lists[g]);
1234             capabilities[i].mut_lists[g] = allocBlock();
1235         }
1236     }
1237
1238     if (g == 0) {
1239         for (i = 0; i < n_capabilities; i++) {
1240             stp = &nurseries[i];
1241             stp->old_threads = stp->threads;
1242             stp->threads = END_TSO_QUEUE;
1243         }
1244     }
1245
1246     for (s = 0; s < generations[g].n_steps; s++) {
1247
1248         // generation 0, step 0 doesn't need to-space, unless -G1
1249         if (g == 0 && s == 0 && RtsFlags.GcFlags.generations > 1) {
1250             continue;
1251         }
1252
1253         stp = &generations[g].steps[s];
1254         ASSERT(stp->gen_no == g);
1255
1256         // we'll construct a new list of threads in this step
1257         // during GC, throw away the current list.
1258         stp->old_threads = stp->threads;
1259         stp->threads = END_TSO_QUEUE;
1260
1261         // deprecate the existing blocks
1262         stp->old_blocks   = stp->blocks;
1263         stp->n_old_blocks = stp->n_blocks;
1264         stp->blocks       = NULL;
1265         stp->n_blocks     = 0;
1266         stp->n_words      = 0;
1267         stp->live_estimate = 0;
1268
1269         // initialise the large object queues.
1270         stp->scavenged_large_objects = NULL;
1271         stp->n_scavenged_large_blocks = 0;
1272
1273         // mark the small objects as from-space
1274         for (bd = stp->old_blocks; bd; bd = bd->link) {
1275             bd->flags &= ~BF_EVACUATED;
1276         }
1277
1278         // mark the large objects as from-space
1279         for (bd = stp->large_objects; bd; bd = bd->link) {
1280             bd->flags &= ~BF_EVACUATED;
1281         }
1282
1283         // for a compacted step, we need to allocate the bitmap
1284         if (stp->mark) {
1285             nat bitmap_size; // in bytes
1286             bdescr *bitmap_bdescr;
1287             StgWord *bitmap;
1288
1289             bitmap_size = stp->n_old_blocks * BLOCK_SIZE / (sizeof(W_)*BITS_PER_BYTE);
1290
1291             if (bitmap_size > 0) {
1292                 bitmap_bdescr = allocGroup((lnat)BLOCK_ROUND_UP(bitmap_size)
1293                                            / BLOCK_SIZE);
1294                 stp->bitmap = bitmap_bdescr;
1295                 bitmap = bitmap_bdescr->start;
1296
1297                 debugTrace(DEBUG_gc, "bitmap_size: %d, bitmap: %p",
1298                            bitmap_size, bitmap);
1299
1300                 // don't forget to fill it with zeros!
1301                 memset(bitmap, 0, bitmap_size);
1302
1303                 // For each block in this step, point to its bitmap from the
1304                 // block descriptor.
1305                 for (bd=stp->old_blocks; bd != NULL; bd = bd->link) {
1306                     bd->u.bitmap = bitmap;
1307                     bitmap += BLOCK_SIZE_W / (sizeof(W_)*BITS_PER_BYTE);
1308
1309                     // Also at this point we set the BF_MARKED flag
1310                     // for this block.  The invariant is that
1311                     // BF_MARKED is always unset, except during GC
1312                     // when it is set on those blocks which will be
1313                     // compacted.
1314                     if (!(bd->flags & BF_FRAGMENTED)) {
1315                         bd->flags |= BF_MARKED;
1316                     }
1317                 }
1318             }
1319         }
1320     }
1321
1322     // For each GC thread, for each step, allocate a "todo" block to
1323     // store evacuated objects to be scavenged, and a block to store
1324     // evacuated objects that do not need to be scavenged.
1325     for (t = 0; t < n_threads; t++) {
1326         for (s = 0; s < generations[g].n_steps; s++) {
1327
1328             // we don't copy objects into g0s0, unless -G0
1329             if (g==0 && s==0 && RtsFlags.GcFlags.generations > 1) continue;
1330
1331             ws = &gc_threads[t]->steps[g * RtsFlags.GcFlags.steps + s];
1332
1333             ws->todo_large_objects = NULL;
1334
1335             ws->part_list = NULL;
1336             ws->n_part_blocks = 0;
1337
1338             // allocate the first to-space block; extra blocks will be
1339             // chained on as necessary.
1340             ws->todo_bd = NULL;
1341             ASSERT(looksEmptyWSDeque(ws->todo_q));
1342             alloc_todo_block(ws,0);
1343
1344             ws->todo_overflow = NULL;
1345             ws->n_todo_overflow = 0;
1346
1347             ws->scavd_list = NULL;
1348             ws->n_scavd_blocks = 0;
1349         }
1350     }
1351 }
1352
1353
1354 /* ----------------------------------------------------------------------------
1355    Initialise a generation that is *not* to be collected
1356    ------------------------------------------------------------------------- */
1357
1358 static void
1359 init_uncollected_gen (nat g, nat threads)
1360 {
1361     nat s, t, n;
1362     step_workspace *ws;
1363     step *stp;
1364     bdescr *bd;
1365
1366     // save the current mutable lists for this generation, and
1367     // allocate a fresh block for each one.  We'll traverse these
1368     // mutable lists as roots early on in the GC.
1369     generations[g].saved_mut_list = generations[g].mut_list;
1370     generations[g].mut_list = allocBlock();
1371     for (n = 0; n < n_capabilities; n++) {
1372         capabilities[n].saved_mut_lists[g] = capabilities[n].mut_lists[g];
1373         capabilities[n].mut_lists[g] = allocBlock();
1374     }
1375
1376     for (s = 0; s < generations[g].n_steps; s++) {
1377         stp = &generations[g].steps[s];
1378         stp->scavenged_large_objects = NULL;
1379         stp->n_scavenged_large_blocks = 0;
1380     }
1381
1382     for (s = 0; s < generations[g].n_steps; s++) {
1383
1384         stp = &generations[g].steps[s];
1385
1386         for (t = 0; t < threads; t++) {
1387             ws = &gc_threads[t]->steps[g * RtsFlags.GcFlags.steps + s];
1388
1389             ASSERT(looksEmptyWSDeque(ws->todo_q));
1390             ws->todo_large_objects = NULL;
1391
1392             ws->part_list = NULL;
1393             ws->n_part_blocks = 0;
1394
1395             ws->scavd_list = NULL;
1396             ws->n_scavd_blocks = 0;
1397
1398             // If the block at the head of the list in this generation
1399             // is less than 3/4 full, then use it as a todo block.
1400             if (stp->blocks && isPartiallyFull(stp->blocks))
1401             {
1402                 ws->todo_bd = stp->blocks;
1403                 ws->todo_free = ws->todo_bd->free;
1404                 ws->todo_lim = ws->todo_bd->start + BLOCK_SIZE_W;
1405                 stp->blocks = stp->blocks->link;
1406                 stp->n_blocks -= 1;
1407                 stp->n_words -= ws->todo_bd->free - ws->todo_bd->start;
1408                 ws->todo_bd->link = NULL;
1409                 // we must scan from the current end point.
1410                 ws->todo_bd->u.scan = ws->todo_bd->free;
1411             }
1412             else
1413             {
1414                 ws->todo_bd = NULL;
1415                 alloc_todo_block(ws,0);
1416             }
1417         }
1418
1419         // deal out any more partial blocks to the threads' part_lists
1420         t = 0;
1421         while (stp->blocks && isPartiallyFull(stp->blocks))
1422         {
1423             bd = stp->blocks;
1424             stp->blocks = bd->link;
1425             ws = &gc_threads[t]->steps[g * RtsFlags.GcFlags.steps + s];
1426             bd->link = ws->part_list;
1427             ws->part_list = bd;
1428             ws->n_part_blocks += 1;
1429             bd->u.scan = bd->free;
1430             stp->n_blocks -= 1;
1431             stp->n_words -= bd->free - bd->start;
1432             t++;
1433             if (t == n_gc_threads) t = 0;
1434         }
1435     }
1436 }
1437
1438 /* -----------------------------------------------------------------------------
1439    Initialise a gc_thread before GC
1440    -------------------------------------------------------------------------- */
1441
1442 static void
1443 init_gc_thread (gc_thread *t)
1444 {
1445     t->static_objects = END_OF_STATIC_LIST;
1446     t->scavenged_static_objects = END_OF_STATIC_LIST;
1447     t->scan_bd = NULL;
1448     t->mut_lists = capabilities[t->thread_index].mut_lists;
1449     t->evac_step = 0;
1450     t->failed_to_evac = rtsFalse;
1451     t->eager_promotion = rtsTrue;
1452     t->thunk_selector_depth = 0;
1453     t->copied = 0;
1454     t->scanned = 0;
1455     t->any_work = 0;
1456     t->no_work = 0;
1457     t->scav_find_work = 0;
1458 }
1459
1460 /* -----------------------------------------------------------------------------
1461    Function we pass to evacuate roots.
1462    -------------------------------------------------------------------------- */
1463
1464 static void
1465 mark_root(void *user USED_IF_THREADS, StgClosure **root)
1466 {
1467     // we stole a register for gct, but this function is called from
1468     // *outside* the GC where the register variable is not in effect,
1469     // so we need to save and restore it here.  NB. only call
1470     // mark_root() from the main GC thread, otherwise gct will be
1471     // incorrect.
1472     gc_thread *saved_gct;
1473     saved_gct = gct;
1474     SET_GCT(user);
1475
1476     evacuate(root);
1477
1478     SET_GCT(saved_gct);
1479 }
1480
1481 /* -----------------------------------------------------------------------------
1482    Initialising the static object & mutable lists
1483    -------------------------------------------------------------------------- */
1484
1485 static void
1486 zero_static_object_list(StgClosure* first_static)
1487 {
1488   StgClosure* p;
1489   StgClosure* link;
1490   const StgInfoTable *info;
1491
1492   for (p = first_static; p != END_OF_STATIC_LIST; p = link) {
1493     info = get_itbl(p);
1494     link = *STATIC_LINK(info, p);
1495     *STATIC_LINK(info,p) = NULL;
1496   }
1497 }
1498
1499 /* ----------------------------------------------------------------------------
1500    Update the pointers from the task list
1501
1502    These are treated as weak pointers because we want to allow a main
1503    thread to get a BlockedOnDeadMVar exception in the same way as any
1504    other thread.  Note that the threads should all have been retained
1505    by GC by virtue of being on the all_threads list, we're just
1506    updating pointers here.
1507    ------------------------------------------------------------------------- */
1508
1509 static void
1510 update_task_list (void)
1511 {
1512     Task *task;
1513     StgTSO *tso;
1514     for (task = all_tasks; task != NULL; task = task->all_link) {
1515         if (!task->stopped && task->tso) {
1516             ASSERT(task->tso->bound == task);
1517             tso = (StgTSO *) isAlive((StgClosure *)task->tso);
1518             if (tso == NULL) {
1519                 barf("task %p: main thread %d has been GC'd",
1520 #ifdef THREADED_RTS
1521                      (void *)task->id,
1522 #else
1523                      (void *)task,
1524 #endif
1525                      task->tso->id);
1526             }
1527             task->tso = tso;
1528         }
1529     }
1530 }
1531
1532 /* ----------------------------------------------------------------------------
1533    Reset the sizes of the older generations when we do a major
1534    collection.
1535
1536    CURRENT STRATEGY: make all generations except zero the same size.
1537    We have to stay within the maximum heap size, and leave a certain
1538    percentage of the maximum heap size available to allocate into.
1539    ------------------------------------------------------------------------- */
1540
1541 static void
1542 resize_generations (void)
1543 {
1544     nat g;
1545
1546     if (major_gc && RtsFlags.GcFlags.generations > 1) {
1547         nat live, size, min_alloc, words;
1548         nat max  = RtsFlags.GcFlags.maxHeapSize;
1549         nat gens = RtsFlags.GcFlags.generations;
1550
1551         // live in the oldest generations
1552         if (oldest_gen->steps[0].live_estimate != 0) {
1553             words = oldest_gen->steps[0].live_estimate;
1554         } else {
1555             words = oldest_gen->steps[0].n_words;
1556         }
1557         live = (words + BLOCK_SIZE_W - 1) / BLOCK_SIZE_W +
1558             oldest_gen->steps[0].n_large_blocks;
1559
1560         // default max size for all generations except zero
1561         size = stg_max(live * RtsFlags.GcFlags.oldGenFactor,
1562                        RtsFlags.GcFlags.minOldGenSize);
1563
1564         if (RtsFlags.GcFlags.heapSizeSuggestionAuto) {
1565             RtsFlags.GcFlags.heapSizeSuggestion = size;
1566         }
1567
1568         // minimum size for generation zero
1569         min_alloc = stg_max((RtsFlags.GcFlags.pcFreeHeap * max) / 200,
1570                             RtsFlags.GcFlags.minAllocAreaSize);
1571
1572         // Auto-enable compaction when the residency reaches a
1573         // certain percentage of the maximum heap size (default: 30%).
1574         if (RtsFlags.GcFlags.generations > 1 &&
1575             (RtsFlags.GcFlags.compact ||
1576              (max > 0 &&
1577               oldest_gen->steps[0].n_blocks >
1578               (RtsFlags.GcFlags.compactThreshold * max) / 100))) {
1579             oldest_gen->steps[0].mark = 1;
1580             oldest_gen->steps[0].compact = 1;
1581 //        debugBelch("compaction: on\n", live);
1582         } else {
1583             oldest_gen->steps[0].mark = 0;
1584             oldest_gen->steps[0].compact = 0;
1585 //        debugBelch("compaction: off\n", live);
1586         }
1587
1588         if (RtsFlags.GcFlags.sweep) {
1589             oldest_gen->steps[0].mark = 1;
1590         }
1591
1592         // if we're going to go over the maximum heap size, reduce the
1593         // size of the generations accordingly.  The calculation is
1594         // different if compaction is turned on, because we don't need
1595         // to double the space required to collect the old generation.
1596         if (max != 0) {
1597
1598             // this test is necessary to ensure that the calculations
1599             // below don't have any negative results - we're working
1600             // with unsigned values here.
1601             if (max < min_alloc) {
1602                 heapOverflow();
1603             }
1604
1605             if (oldest_gen->steps[0].compact) {
1606                 if ( (size + (size - 1) * (gens - 2) * 2) + min_alloc > max ) {
1607                     size = (max - min_alloc) / ((gens - 1) * 2 - 1);
1608                 }
1609             } else {
1610                 if ( (size * (gens - 1) * 2) + min_alloc > max ) {
1611                     size = (max - min_alloc) / ((gens - 1) * 2);
1612                 }
1613             }
1614
1615             if (size < live) {
1616                 heapOverflow();
1617             }
1618         }
1619
1620 #if 0
1621         debugBelch("live: %d, min_alloc: %d, size : %d, max = %d\n", live,
1622                    min_alloc, size, max);
1623 #endif
1624
1625         for (g = 0; g < gens; g++) {
1626             generations[g].max_blocks = size;
1627         }
1628     }
1629 }
1630
1631 /* -----------------------------------------------------------------------------
1632    Calculate the new size of the nursery, and resize it.
1633    -------------------------------------------------------------------------- */
1634
1635 static void
1636 resize_nursery (void)
1637 {
1638     lnat min_nursery = RtsFlags.GcFlags.minAllocAreaSize * n_capabilities;
1639
1640     if (RtsFlags.GcFlags.generations == 1)
1641     {   // Two-space collector:
1642         nat blocks;
1643
1644         /* set up a new nursery.  Allocate a nursery size based on a
1645          * function of the amount of live data (by default a factor of 2)
1646          * Use the blocks from the old nursery if possible, freeing up any
1647          * left over blocks.
1648          *
1649          * If we get near the maximum heap size, then adjust our nursery
1650          * size accordingly.  If the nursery is the same size as the live
1651          * data (L), then we need 3L bytes.  We can reduce the size of the
1652          * nursery to bring the required memory down near 2L bytes.
1653          *
1654          * A normal 2-space collector would need 4L bytes to give the same
1655          * performance we get from 3L bytes, reducing to the same
1656          * performance at 2L bytes.
1657          */
1658         blocks = generations[0].steps[0].n_blocks;
1659
1660         if ( RtsFlags.GcFlags.maxHeapSize != 0 &&
1661              blocks * RtsFlags.GcFlags.oldGenFactor * 2 >
1662              RtsFlags.GcFlags.maxHeapSize )
1663         {
1664             long adjusted_blocks;  // signed on purpose
1665             int pc_free;
1666
1667             adjusted_blocks = (RtsFlags.GcFlags.maxHeapSize - 2 * blocks);
1668
1669             debugTrace(DEBUG_gc, "near maximum heap size of 0x%x blocks, blocks = %d, adjusted to %ld",
1670                        RtsFlags.GcFlags.maxHeapSize, blocks, adjusted_blocks);
1671
1672             pc_free = adjusted_blocks * 100 / RtsFlags.GcFlags.maxHeapSize;
1673             if (pc_free < RtsFlags.GcFlags.pcFreeHeap) /* might even * be < 0 */
1674             {
1675                 heapOverflow();
1676             }
1677             blocks = adjusted_blocks;
1678         }
1679         else
1680         {
1681             blocks *= RtsFlags.GcFlags.oldGenFactor;
1682             if (blocks < min_nursery)
1683             {
1684                 blocks = min_nursery;
1685             }
1686         }
1687         resizeNurseries(blocks);
1688     }
1689     else  // Generational collector
1690     {
1691         /*
1692          * If the user has given us a suggested heap size, adjust our
1693          * allocation area to make best use of the memory available.
1694          */
1695         if (RtsFlags.GcFlags.heapSizeSuggestion)
1696         {
1697             long blocks;
1698             nat needed = calcNeeded();  // approx blocks needed at next GC
1699
1700             /* Guess how much will be live in generation 0 step 0 next time.
1701              * A good approximation is obtained by finding the
1702              * percentage of g0s0 that was live at the last minor GC.
1703              *
1704              * We have an accurate figure for the amount of copied data in
1705              * 'copied', but we must convert this to a number of blocks, with
1706              * a small adjustment for estimated slop at the end of a block
1707              * (- 10 words).
1708              */
1709             if (N == 0)
1710             {
1711                 g0s0_pcnt_kept = ((copied / (BLOCK_SIZE_W - 10)) * 100)
1712                     / countNurseryBlocks();
1713             }
1714
1715             /* Estimate a size for the allocation area based on the
1716              * information available.  We might end up going slightly under
1717              * or over the suggested heap size, but we should be pretty
1718              * close on average.
1719              *
1720              * Formula:            suggested - needed
1721              *                ----------------------------
1722              *                    1 + g0s0_pcnt_kept/100
1723              *
1724              * where 'needed' is the amount of memory needed at the next
1725              * collection for collecting all steps except g0s0.
1726              */
1727             blocks =
1728                 (((long)RtsFlags.GcFlags.heapSizeSuggestion - (long)needed) * 100) /
1729                 (100 + (long)g0s0_pcnt_kept);
1730
1731             if (blocks < (long)min_nursery) {
1732                 blocks = min_nursery;
1733             }
1734
1735             resizeNurseries((nat)blocks);
1736         }
1737         else
1738         {
1739             // we might have added extra large blocks to the nursery, so
1740             // resize back to minAllocAreaSize again.
1741             resizeNurseriesFixed(RtsFlags.GcFlags.minAllocAreaSize);
1742         }
1743     }
1744 }
1745
1746 /* -----------------------------------------------------------------------------
1747    Sanity code for CAF garbage collection.
1748
1749    With DEBUG turned on, we manage a CAF list in addition to the SRT
1750    mechanism.  After GC, we run down the CAF list and blackhole any
1751    CAFs which have been garbage collected.  This means we get an error
1752    whenever the program tries to enter a garbage collected CAF.
1753
1754    Any garbage collected CAFs are taken off the CAF list at the same
1755    time.
1756    -------------------------------------------------------------------------- */
1757
1758 #if 0 && defined(DEBUG)
1759
1760 static void
1761 gcCAFs(void)
1762 {
1763   StgClosure*  p;
1764   StgClosure** pp;
1765   const StgInfoTable *info;
1766   nat i;
1767
1768   i = 0;
1769   p = caf_list;
1770   pp = &caf_list;
1771
1772   while (p != NULL) {
1773
1774     info = get_itbl(p);
1775
1776     ASSERT(info->type == IND_STATIC);
1777
1778     if (STATIC_LINK(info,p) == NULL) {
1779         debugTrace(DEBUG_gccafs, "CAF gc'd at 0x%04lx", (long)p);
1780         // black hole it
1781         SET_INFO(p,&stg_BLACKHOLE_info);
1782         p = STATIC_LINK2(info,p);
1783         *pp = p;
1784     }
1785     else {
1786       pp = &STATIC_LINK2(info,p);
1787       p = *pp;
1788       i++;
1789     }
1790
1791   }
1792
1793   debugTrace(DEBUG_gccafs, "%d CAFs live", i);
1794 }
1795 #endif