From: Simon Marlow <marlowsd@gmail.com>
Date: Wed, 2 Feb 2011 15:49:55 +0000 (+0000)
Subject: GC refactoring and cleanup
X-Git-Url: http://git.megacz.com/?p=ghc-hetmet.git;a=commitdiff_plain;h=18896fa2b06844407fd1e0d3f85cd3db97a96ff4

GC refactoring and cleanup
Now we keep any partially-full blocks in the gc_thread[] structs after
each GC, rather than moving them to the generation.  This should give
us slightly better locality (though I wasn't able to measure any
difference).

Also in this patch: better sanity checking with THREADED.
---

diff --git a/includes/rts/storage/GC.h b/includes/rts/storage/GC.h
index 7cee670..bbed216 100644
--- a/includes/rts/storage/GC.h
+++ b/includes/rts/storage/GC.h
@@ -87,7 +87,7 @@ typedef struct generation_ {
 #if defined(THREADED_RTS)
     char pad[128];                      // make sure the following is
                                         // on a separate cache line.
-    SpinLock     sync_large_objects;    // lock for large_objects
+    SpinLock     sync;                  // lock for large_objects
                                         //    and scavenged_large_objects
 #endif
 
@@ -101,9 +101,6 @@ typedef struct generation_ {
     unsigned int n_old_blocks;		// number of blocks in from-space
     unsigned int live_estimate;         // for sweeping: estimate of live data
     
-    bdescr *     part_blocks;           // partially-full scanned blocks
-    unsigned int n_part_blocks;         // count of above
-
     bdescr *     scavenged_large_objects;  // live large objs after GC (d-link)
     unsigned int n_scavenged_large_blocks; // size (not count) of above
 
diff --git a/rts/RaiseAsync.c b/rts/RaiseAsync.c
index 550f703..775505f 100644
--- a/rts/RaiseAsync.c
+++ b/rts/RaiseAsync.c
@@ -592,7 +592,7 @@ removeFromMVarBlockedQueue (StgTSO *tso)
 
     if (mvar->head == q) {
         mvar->head = q->link;
-        q->header.info = &stg_IND_info;
+        OVERWRITE_INFO(q, &stg_IND_info);
         if (mvar->tail == q) {
             mvar->tail = (StgMVarTSOQueue*)END_TSO_QUEUE;
         }
@@ -602,10 +602,10 @@ removeFromMVarBlockedQueue (StgTSO *tso)
         // we lose the tail pointer when the GC shorts out the IND.
         // So we use MSG_NULL as a kind of non-dupable indirection;
         // these are ignored by takeMVar/putMVar.
-        q->header.info = &stg_MSG_NULL_info;
+        OVERWRITE_INFO(q, &stg_MSG_NULL_info);
     }
     else {
-        q->header.info = &stg_IND_info;
+        OVERWRITE_INFO(q, &stg_IND_info);
     }
 
     // revoke the MVar operation
diff --git a/rts/Stats.c b/rts/Stats.c
index cbd02cd..4b9f6d8 100644
--- a/rts/Stats.c
+++ b/rts/Stats.c
@@ -710,7 +710,7 @@ stat_exit(int alloc)
                 statsPrintf("gc_alloc_block_sync: %"FMT_Word64"\n", gc_alloc_block_sync.spin);
                 statsPrintf("whitehole_spin: %"FMT_Word64"\n", whitehole_spin);
                 for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
-                    statsPrintf("gen[%d].sync_large_objects: %"FMT_Word64"\n", g, generations[g].sync_large_objects.spin);
+                    statsPrintf("gen[%d].sync: %"FMT_Word64"\n", g, generations[g].sync.spin);
                 }
             }
 #endif
@@ -772,8 +772,9 @@ void
 statDescribeGens(void)
 {
   nat g, mut, lge, i;
-  lnat live, slop;
+  lnat gen_slop;
   lnat tot_live, tot_slop;
+  lnat gen_live, gen_blocks;
   bdescr *bd;
   generation *gen;
   
@@ -785,25 +786,32 @@ statDescribeGens(void)
 
   tot_live = 0;
   tot_slop = 0;
+
   for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+      gen = &generations[g];
+
+      for (bd = gen->large_objects, lge = 0; bd; bd = bd->link) {
+          lge++;
+      }
+
+      gen_live   = genLiveWords(gen);
+      gen_blocks = genLiveBlocks(gen);
+
       mut = 0;
       for (i = 0; i < n_capabilities; i++) {
           mut += countOccupied(capabilities[i].mut_lists[g]);
+          gen_live   += gcThreadLiveWords(i,g);
+          gen_blocks += gcThreadLiveBlocks(i,g);
       }
 
-      gen = &generations[g];
-
       debugBelch("%5d %7d %9d", g, gen->max_blocks, mut);
 
-      for (bd = gen->large_objects, lge = 0; bd; bd = bd->link) {
-          lge++;
-      }
-      live = gen->n_words + countOccupied(gen->large_objects);
-      slop = (gen->n_blocks + gen->n_large_blocks) * BLOCK_SIZE_W - live;
-      debugBelch("%8d %8d %8ld %8ld\n", gen->n_blocks, lge,
-                 live*sizeof(W_), slop*sizeof(W_));
-      tot_live += live;
-      tot_slop += slop;
+      gen_slop = gen_blocks * BLOCK_SIZE_W - gen_live;
+
+      debugBelch("%8ld %8d %8ld %8ld\n", gen_blocks, lge,
+                 gen_live*sizeof(W_), gen_slop*sizeof(W_));
+      tot_live += gen_live;
+      tot_slop += gen_slop;
   }
   debugBelch("----------------------------------------------------------\n");
   debugBelch("%41s%8ld %8ld\n","",tot_live*sizeof(W_),tot_slop*sizeof(W_));
diff --git a/rts/Threads.c b/rts/Threads.c
index dcb916a..3e1c5cf 100644
--- a/rts/Threads.c
+++ b/rts/Threads.c
@@ -628,7 +628,7 @@ threadStackOverflow (Capability *cap, StgTSO *tso)
     // will be discarded after the first overflow, being replaced by a
     // non-moving 32k chunk.
     if (old_stack->sp == old_stack->stack + old_stack->stack_size) {
-        frame->next_chunk = new_stack;
+        frame->next_chunk = (StgStack*)END_TSO_QUEUE; // dummy
     }
 
     tso->stackobj = new_stack;
diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c
index 18ace21..d049f98 100644
--- a/rts/sm/Evac.c
+++ b/rts/sm/Evac.c
@@ -248,7 +248,7 @@ evacuate_large(StgPtr p)
   bd = Bdescr(p);
   gen = bd->gen;
   gen_no = bd->gen_no;
-  ACQUIRE_SPIN_LOCK(&gen->sync_large_objects);
+  ACQUIRE_SPIN_LOCK(&gen->sync);
 
   // already evacuated? 
   if (bd->flags & BF_EVACUATED) { 
@@ -259,7 +259,7 @@ evacuate_large(StgPtr p)
 	gct->failed_to_evac = rtsTrue;
 	TICK_GC_FAILED_PROMOTION();
     }
-    RELEASE_SPIN_LOCK(&gen->sync_large_objects);
+    RELEASE_SPIN_LOCK(&gen->sync);
     return;
   }
 
@@ -297,16 +297,16 @@ evacuate_large(StgPtr p)
   // them straight on the scavenged_large_objects list.
   if (bd->flags & BF_PINNED) {
       ASSERT(get_itbl((StgClosure *)p)->type == ARR_WORDS);
-      if (new_gen != gen) { ACQUIRE_SPIN_LOCK(&new_gen->sync_large_objects); }
+      if (new_gen != gen) { ACQUIRE_SPIN_LOCK(&new_gen->sync); }
       dbl_link_onto(bd, &new_gen->scavenged_large_objects);
       new_gen->n_scavenged_large_blocks += bd->blocks;
-      if (new_gen != gen) { RELEASE_SPIN_LOCK(&new_gen->sync_large_objects); }
+      if (new_gen != gen) { RELEASE_SPIN_LOCK(&new_gen->sync); }
   } else {
       bd->link = ws->todo_large_objects;
       ws->todo_large_objects = bd;
   }
 
-  RELEASE_SPIN_LOCK(&gen->sync_large_objects);
+  RELEASE_SPIN_LOCK(&gen->sync);
 }
 
 /* ----------------------------------------------------------------------------
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index c990835..221f24a 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -137,8 +137,8 @@ DECLARE_GCT
 static void mark_root               (void *user, StgClosure **root);
 static void zero_static_object_list (StgClosure* first_static);
 static nat  initialise_N            (rtsBool force_major_gc);
-static void init_collected_gen      (nat g, nat threads);
-static void init_uncollected_gen    (nat g, nat threads);
+static void prepare_collected_gen   (generation *gen);
+static void prepare_uncollected_gen (generation *gen);
 static void init_gc_thread          (gc_thread *t);
 static void resize_generations      (void);
 static void resize_nursery          (void);
@@ -148,6 +148,7 @@ static StgWord inc_running          (void);
 static StgWord dec_running          (void);
 static void wakeup_gc_threads       (nat n_threads, nat me);
 static void shutdown_gc_threads     (nat n_threads, nat me);
+static void collect_gct_blocks      (void);
 
 #if 0 && defined(DEBUG)
 static void gcCAFs                  (void);
@@ -174,7 +175,7 @@ GarbageCollect (rtsBool force_major_gc,
 {
   bdescr *bd;
   generation *gen;
-  lnat live, allocated, max_copied, avg_copied, slop;
+  lnat live_blocks, live_words, allocated, max_copied, avg_copied;
   gc_thread *saved_gct;
   nat g, t, n;
 
@@ -274,7 +275,7 @@ GarbageCollect (rtsBool force_major_gc,
 #endif
 
   // check sanity *before* GC
-  IF_DEBUG(sanity, checkSanity(rtsTrue));
+  IF_DEBUG(sanity, checkSanity(rtsFalse /* before GC */, major_gc));
 
   // Initialise all our gc_thread structures
   for (t = 0; t < n_gc_threads; t++) {
@@ -283,12 +284,11 @@ GarbageCollect (rtsBool force_major_gc,
 
   // Initialise all the generations/steps that we're collecting.
   for (g = 0; g <= N; g++) {
-      init_collected_gen(g,n_gc_threads);
+      prepare_collected_gen(&generations[g]);
   }
-  
   // Initialise all the generations/steps that we're *not* collecting.
   for (g = N+1; g < RtsFlags.GcFlags.generations; g++) {
-      init_uncollected_gen(g,n_gc_threads);
+      prepare_uncollected_gen(&generations[g]);
   }
 
   /* Allocate a mark stack if we're doing a major collection.
@@ -420,76 +420,6 @@ SET_GCT(gc_threads[0]);
       }
   }
 
-  // For each workspace, in each thread, move the copied blocks to the step
-  {
-      gc_thread *thr;
-      gen_workspace *ws;
-      bdescr *prev, *next;
-
-      for (t = 0; t < n_gc_threads; t++) {
-	  thr = gc_threads[t];
-
-          for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
-              ws = &thr->gens[g];
-
-              // Push the final block
-              if (ws->todo_bd) { 
-                  push_scanned_block(ws->todo_bd, ws);
-              }
-
-              ASSERT(gct->scan_bd == NULL);
-              ASSERT(countBlocks(ws->scavd_list) == ws->n_scavd_blocks);
-              
-              prev = NULL;
-              for (bd = ws->scavd_list; bd != NULL; bd = bd->link) {
-                  ws->gen->n_words += bd->free - bd->start;
-                  prev = bd;
-              }
-              if (prev != NULL) {
-                  prev->link = ws->gen->blocks;
-                  ws->gen->blocks = ws->scavd_list;
-              } 
-              ws->gen->n_blocks += ws->n_scavd_blocks;
-          }
-      }
-
-      // Add all the partial blocks *after* we've added all the full
-      // blocks.  This is so that we can grab the partial blocks back
-      // again and try to fill them up in the next GC.
-      for (t = 0; t < n_gc_threads; t++) {
-	  thr = gc_threads[t];
-
-          for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
-              ws = &thr->gens[g];
-
-              prev = NULL;
-              for (bd = ws->part_list; bd != NULL; bd = next) {
-                  next = bd->link;
-                  if (bd->free == bd->start) {
-                      if (prev == NULL) {
-                          ws->part_list = next;
-                      } else {
-                          prev->link = next;
-                      }
-                      freeGroup(bd);
-                      ws->n_part_blocks--;
-                  } else {
-                      ws->gen->n_words += bd->free - bd->start;
-                      prev = bd;
-                  }
-              }
-              if (prev != NULL) {
-                  prev->link = ws->gen->blocks;
-                  ws->gen->blocks = ws->part_list;
-              }
-              ws->gen->n_blocks += ws->n_part_blocks;
-
-              ASSERT(countBlocks(ws->gen->blocks) == ws->gen->n_blocks);
-              ASSERT(countOccupied(ws->gen->blocks) == ws->gen->n_words);
-	  }
-      }
-  }
-
   // Finally: compact or sweep the oldest generation.
   if (major_gc && oldest_gen->mark) {
       if (oldest_gen->compact) 
@@ -498,8 +428,6 @@ SET_GCT(gc_threads[0]);
           sweep(oldest_gen);
   }
 
-  /* run through all the generations/steps and tidy up 
-   */
   copied = 0;
   max_copied = 0;
   avg_copied = 0;
@@ -525,6 +453,16 @@ SET_GCT(gc_threads[0]);
       }
   }
 
+  // Run through all the generations/steps and tidy up.
+  // We're going to:
+  //   - count the amount of "live" data (live_words, live_blocks)
+  //   - count the amount of "copied" data in this GC (copied)
+  //   - free from-space
+  //   - make to-space the new from-space (set BF_EVACUATED on all blocks)
+  //
+  live_words = 0;
+  live_blocks = 0;
+
   for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
 
     if (g == N) {
@@ -622,7 +560,6 @@ SET_GCT(gc_threads[0]);
         gen->large_objects  = gen->scavenged_large_objects;
         gen->n_large_blocks = gen->n_scavenged_large_blocks;
         gen->n_new_large_words = 0;
-        ASSERT(countBlocks(gen->large_objects) == gen->n_large_blocks);
     }
     else // for generations > N
     {
@@ -637,16 +574,31 @@ SET_GCT(gc_threads[0]);
         
 	// add the new blocks we promoted during this GC 
 	gen->n_large_blocks += gen->n_scavenged_large_blocks;
-        ASSERT(countBlocks(gen->large_objects) == gen->n_large_blocks);
+    }
+
+    ASSERT(countBlocks(gen->large_objects) == gen->n_large_blocks);
+
+    gen->scavenged_large_objects = NULL;
+    gen->n_scavenged_large_blocks = 0;
+
+    // Count "live" data
+    live_words  += genLiveWords(gen);
+    live_blocks += genLiveBlocks(gen);
+
+    // add in the partial blocks in the gen_workspaces, but ignore gen 0
+    // if this is a local GC (we can't count another capability's part_list)
+    {
+        nat i;
+        for (i = 0; i < n_capabilities; i++) {
+            live_words  += gcThreadLiveWords(i, gen->no);
+            live_blocks += gcThreadLiveBlocks(i, gen->no);
+        }
     }
   } // for all generations
 
   // update the max size of older generations after a major GC
   resize_generations();
   
-  // Calculate the amount of live data for stats.
-  live = calcLiveWords();
-
   // Start a new pinned_object_block
   for (n = 0; n < n_capabilities; n++) {
       capabilities[n].pinned_object_block = NULL;
@@ -698,11 +650,6 @@ SET_GCT(gc_threads[0]);
       }
   }
 
-  // send exceptions to any threads which were about to die
-  RELEASE_SM_LOCK;
-  resurrectThreads(resurrected_threads);
-  ACQUIRE_SM_LOCK;
-
   // Update the stable pointer hash table.
   updateStablePtrTable(major_gc);
 
@@ -717,6 +664,17 @@ SET_GCT(gc_threads[0]);
   scheduleFinalizers(cap, old_weak_ptr_list);
   ACQUIRE_SM_LOCK;
 
+  // check sanity after GC
+  // before resurrectThreads(), because that might overwrite some
+  // closures, which will cause problems with THREADED where we don't
+  // fill slop.
+  IF_DEBUG(sanity, checkSanity(rtsTrue /* after GC */, major_gc));
+
+  // send exceptions to any threads which were about to die
+  RELEASE_SM_LOCK;
+  resurrectThreads(resurrected_threads);
+  ACQUIRE_SM_LOCK;
+
   if (major_gc) {
       nat need, got;
       need = BLOCKS_TO_MBLOCKS(n_alloc_blocks);
@@ -730,10 +688,7 @@ SET_GCT(gc_threads[0]);
       }
   }
 
-  // check sanity after GC
-  IF_DEBUG(sanity, checkSanity(rtsTrue));
-
-  // extra GC trace info 
+  // extra GC trace info
   IF_DEBUG(gc, statDescribeGens());
 
 #ifdef DEBUG
@@ -758,8 +713,8 @@ SET_GCT(gc_threads[0]);
 #endif
 
   // ok, GC over: tell the stats department what happened. 
-  slop = calcLiveBlocks() * BLOCK_SIZE_W - live;
-  stat_endGC(allocated, live, copied, N, max_copied, avg_copied, slop);
+  stat_endGC(allocated, live_words, copied, N, max_copied, avg_copied,
+             live_blocks * BLOCK_SIZE_W - live_words /* slop */);
 
   // Guess which generation we'll collect *next* time
   initialise_N(force_major_gc);
@@ -858,7 +813,21 @@ new_gc_thread (nat n, gc_thread *t)
         ASSERT(g == ws->gen->no);
         ws->my_gct = t;
         
-        ws->todo_bd = NULL;
+        // We want to call
+        //   alloc_todo_block(ws,0);
+        // but can't, because it uses gct which isn't set up at this point.
+        // Hence, allocate a block for todo_bd manually:
+        {
+            bdescr *bd = allocBlock(); // no lock, locks aren't initialised yet
+            initBdescr(bd, ws->gen, ws->gen->to);
+            bd->flags = BF_EVACUATED;
+            bd->u.scan = bd->free = bd->start;
+
+            ws->todo_bd = bd;
+            ws->todo_free = bd->free;
+            ws->todo_lim = bd->start + BLOCK_SIZE_W;
+        }
+
         ws->todo_q = newWSDeque(128);
         ws->todo_overflow = NULL;
         ws->n_todo_overflow = 0;
@@ -1012,6 +981,8 @@ loop:
     scavenge_loop();
 #endif
 
+    collect_gct_blocks();
+
     // scavenge_loop() only exits when there's no work to do
     r = dec_running();
     
@@ -1197,16 +1168,16 @@ releaseGCThreads (Capability *cap USED_IF_THREADS)
    ------------------------------------------------------------------------- */
 
 static void
-init_collected_gen (nat g, nat n_threads)
+prepare_collected_gen (generation *gen)
 {
-    nat t, i;
+    nat i, g, n;
     gen_workspace *ws;
-    generation *gen;
-    bdescr *bd;
+    bdescr *bd, *next;
 
     // Throw away the current mutable list.  Invariant: the mutable
     // list always has at least one block; this means we can avoid a
     // check for NULL in recordMutable().
+    g = gen->no;
     if (g != 0) {
         for (i = 0; i < n_capabilities; i++) {
 	    freeChain(capabilities[i].mut_lists[g]);
@@ -1231,9 +1202,35 @@ init_collected_gen (nat g, nat n_threads)
     gen->live_estimate = 0;
 
     // initialise the large object queues.
-    gen->scavenged_large_objects = NULL;
-    gen->n_scavenged_large_blocks = 0;
-    
+    ASSERT(gen->scavenged_large_objects == NULL);
+    ASSERT(gen->n_scavenged_large_blocks == 0);
+
+    // grab all the partial blocks stashed in the gc_thread workspaces and
+    // move them to the old_blocks list of this gen.
+    for (n = 0; n < n_capabilities; n++) {
+        ws = &gc_threads[n]->gens[gen->no];
+
+        for (bd = ws->part_list; bd != NULL; bd = next) {
+            next = bd->link;
+            bd->link = gen->old_blocks;
+            gen->old_blocks = bd;
+            gen->n_old_blocks += bd->blocks;
+        }
+        ws->part_list = NULL;
+        ws->n_part_blocks = 0;
+
+        ASSERT(ws->scavd_list == NULL);
+        ASSERT(ws->n_scavd_blocks == 0);
+
+        if (ws->todo_free != ws->todo_bd->start) {
+            ws->todo_bd->free = ws->todo_free;
+            ws->todo_bd->link = gen->old_blocks;
+            gen->old_blocks = ws->todo_bd;
+            gen->n_old_blocks += ws->todo_bd->blocks;
+            alloc_todo_block(ws,0); // always has one block.
+        }
+    }
+
     // mark the small objects as from-space
     for (bd = gen->old_blocks; bd; bd = bd->link) {
         bd->flags &= ~BF_EVACUATED;
@@ -1285,106 +1282,83 @@ init_collected_gen (nat g, nat n_threads)
             }
         }
     }
-
-    // For each GC thread, for each step, allocate a "todo" block to
-    // store evacuated objects to be scavenged, and a block to store
-    // evacuated objects that do not need to be scavenged.
-    for (t = 0; t < n_threads; t++) {
-        ws = &gc_threads[t]->gens[g];
-        
-        ws->todo_large_objects = NULL;
-        
-        ws->part_list = NULL;
-        ws->n_part_blocks = 0;
-        
-        // allocate the first to-space block; extra blocks will be
-        // chained on as necessary.
-        ws->todo_bd = NULL;
-        ASSERT(looksEmptyWSDeque(ws->todo_q));
-        alloc_todo_block(ws,0);
-        
-        ws->todo_overflow = NULL;
-        ws->n_todo_overflow = 0;
-        
-        ws->scavd_list = NULL;
-        ws->n_scavd_blocks = 0;
-    }
 }
 
 
 /* ----------------------------------------------------------------------------
+   Save the mutable lists in saved_mut_lists
+   ------------------------------------------------------------------------- */
+
+static void
+stash_mut_list (Capability *cap, nat gen_no)
+{
+    cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no];
+    cap->mut_lists[gen_no] = allocBlock_sync();
+}
+
+/* ----------------------------------------------------------------------------
    Initialise a generation that is *not* to be collected 
    ------------------------------------------------------------------------- */
 
 static void
-init_uncollected_gen (nat g, nat threads)
+prepare_uncollected_gen (generation *gen)
 {
-    nat t, n;
-    gen_workspace *ws;
-    generation *gen;
-    bdescr *bd;
+    nat i;
+
+
+    ASSERT(gen->no > 0);
 
     // save the current mutable lists for this generation, and
     // allocate a fresh block for each one.  We'll traverse these
     // mutable lists as roots early on in the GC.
-    for (n = 0; n < n_capabilities; n++) {
-        capabilities[n].saved_mut_lists[g] = capabilities[n].mut_lists[g];
-        capabilities[n].mut_lists[g] = allocBlock();
+    for (i = 0; i < n_capabilities; i++) {
+        stash_mut_list(&capabilities[i], gen->no);
     }
 
-    gen = &generations[g];
+    ASSERT(gen->scavenged_large_objects == NULL);
+    ASSERT(gen->n_scavenged_large_blocks == 0);
+}
 
-    gen->scavenged_large_objects = NULL;
-    gen->n_scavenged_large_blocks = 0;
+/* -----------------------------------------------------------------------------
+   Collect the completed blocks from a GC thread and attach them to
+   the generation.
+   -------------------------------------------------------------------------- */
 
-    for (t = 0; t < threads; t++) {
-        ws = &gc_threads[t]->gens[g];
-	    
-        ASSERT(looksEmptyWSDeque(ws->todo_q));
-        ws->todo_large_objects = NULL;
-        
-        ws->part_list = NULL;
-        ws->n_part_blocks = 0;
+static void
+collect_gct_blocks (void)
+{
+    nat g;
+    gen_workspace *ws;
+    bdescr *bd, *prev;
+    
+    for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+        ws = &gct->gens[g];
         
-        ws->scavd_list = NULL;
-        ws->n_scavd_blocks = 0;
+        // there may still be a block attached to ws->todo_bd;
+        // leave it there to use next time.
+
+        if (ws->scavd_list != NULL) {
+            ACQUIRE_SPIN_LOCK(&ws->gen->sync);
+
+            ASSERT(gct->scan_bd == NULL);
+            ASSERT(countBlocks(ws->scavd_list) == ws->n_scavd_blocks);
         
-        // If the block at the head of the list in this generation
-        // is less than 3/4 full, then use it as a todo block.
-        if (gen->blocks && isPartiallyFull(gen->blocks))
-        {
-            ws->todo_bd = gen->blocks;
-            ws->todo_free = ws->todo_bd->free;
-            ws->todo_lim = ws->todo_bd->start + BLOCK_SIZE_W;
-            gen->blocks = gen->blocks->link;
-            gen->n_blocks -= 1;
-            gen->n_words -= ws->todo_bd->free - ws->todo_bd->start;
-            ws->todo_bd->link = NULL;
-            // we must scan from the current end point.
-            ws->todo_bd->u.scan = ws->todo_bd->free;
-        } 
-        else
-        {
-            ws->todo_bd = NULL;
-            alloc_todo_block(ws,0);
-        }
-    }
+            prev = NULL;
+            for (bd = ws->scavd_list; bd != NULL; bd = bd->link) {
+                ws->gen->n_words += bd->free - bd->start;
+                prev = bd;
+            }
+            if (prev != NULL) {
+                prev->link = ws->gen->blocks;
+                ws->gen->blocks = ws->scavd_list;
+            } 
+            ws->gen->n_blocks += ws->n_scavd_blocks;
 
-    // deal out any more partial blocks to the threads' part_lists
-    t = 0;
-    while (gen->blocks && isPartiallyFull(gen->blocks))
-    {
-        bd = gen->blocks;
-        gen->blocks = bd->link;
-        ws = &gc_threads[t]->gens[g];
-        bd->link = ws->part_list;
-        ws->part_list = bd;
-        ws->n_part_blocks += 1;
-        bd->u.scan = bd->free;
-        gen->n_blocks -= 1;
-        gen->n_words -= bd->free - bd->start;
-        t++;
-        if (t == n_gc_threads) t = 0;
+            ws->scavd_list = NULL;
+            ws->n_scavd_blocks = 0;
+
+            RELEASE_SPIN_LOCK(&ws->gen->sync);
+        }
     }
 }
 
diff --git a/rts/sm/Sanity.c b/rts/sm/Sanity.c
index 65a70fa..8ebb9a2 100644
--- a/rts/sm/Sanity.c
+++ b/rts/sm/Sanity.c
@@ -21,6 +21,7 @@
 #include "RtsUtils.h"
 #include "sm/Storage.h"
 #include "sm/BlockAlloc.h"
+#include "GCThread.h"
 #include "Sanity.h"
 #include "Schedule.h"
 #include "Apply.h"
@@ -468,17 +469,10 @@ checkClosure( StgClosure* p )
    all the objects in the remainder of the chain.
    -------------------------------------------------------------------------- */
 
-void 
-checkHeap(bdescr *bd)
+void checkHeapChain (bdescr *bd)
 {
     StgPtr p;
 
-#if defined(THREADED_RTS)
-    // heap sanity checking doesn't work with SMP, because we can't
-    // zero the slop (see Updates.h).
-    return;
-#endif
-
     for (; bd != NULL; bd = bd->link) {
         if(!(bd->flags & BF_SWEPT)) {
             p = bd->start;
@@ -496,7 +490,7 @@ checkHeap(bdescr *bd)
     }
 }
 
-void 
+void
 checkHeapChunk(StgPtr start, StgPtr end)
 {
   StgPtr p;
@@ -587,6 +581,24 @@ checkGlobalTSOList (rtsBool checkTSOs)
               ASSERT(Bdescr((P_)tso)->gen_no == 0 || (tso->flags & TSO_MARKED));
               tso->flags &= ~TSO_MARKED;
           }
+
+          {
+              StgStack *stack;
+              StgUnderflowFrame *frame;
+
+              stack = tso->stackobj;
+              while (1) {
+                  if (stack->dirty & 1) {
+                      ASSERT(Bdescr((P_)stack)->gen_no == 0 || (stack->dirty & TSO_MARKED));
+                      stack->dirty &= ~TSO_MARKED;
+                  }
+                  frame = (StgUnderflowFrame*) (stack->stack + stack->stack_size
+                                                - sizeofW(StgUnderflowFrame));
+                  if (frame->info != &stg_stack_underflow_frame_info
+                      || frame->next_chunk == (StgStack*)END_TSO_QUEUE) break;
+                  stack = frame->next_chunk;
+              }
+          }
       }
   }
 }
@@ -595,7 +607,7 @@ checkGlobalTSOList (rtsBool checkTSOs)
    Check mutable list sanity.
    -------------------------------------------------------------------------- */
 
-void
+static void
 checkMutableList( bdescr *mut_bd, nat gen )
 {
     bdescr *bd;
@@ -605,25 +617,37 @@ checkMutableList( bdescr *mut_bd, nat gen )
     for (bd = mut_bd; bd != NULL; bd = bd->link) {
 	for (q = bd->start; q < bd->free; q++) {
 	    p = (StgClosure *)*q;
-	    ASSERT(!HEAP_ALLOCED(p) || Bdescr((P_)p)->gen_no == gen);
-            if (get_itbl(p)->type == TSO) {
+            ASSERT(!HEAP_ALLOCED(p) || Bdescr((P_)p)->gen_no == gen);
+            checkClosure(p);
+
+            switch (get_itbl(p)->type) {
+            case TSO:
                 ((StgTSO *)p)->flags |= TSO_MARKED;
+                break;
+            case STACK:
+                ((StgStack *)p)->dirty |= TSO_MARKED;
+                break;
             }
-	}
+        }
     }
 }
 
-void
-checkMutableLists (rtsBool checkTSOs)
+static void
+checkLocalMutableLists (nat cap_no)
 {
-    nat g, i;
+    nat g;
+    for (g = 1; g < RtsFlags.GcFlags.generations; g++) {
+        checkMutableList(capabilities[cap_no].mut_lists[g], g);
+    }
+}
 
-    for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
-        for (i = 0; i < n_capabilities; i++) {
-            checkMutableList(capabilities[i].mut_lists[g], g);
-        }
+static void
+checkMutableLists (void)
+{
+    nat i;
+    for (i = 0; i < n_capabilities; i++) {
+        checkLocalMutableLists(i);
     }
-    checkGlobalTSOList(checkTSOs);
 }
 
 /*
@@ -677,7 +701,8 @@ checkNurserySanity (nursery *nursery)
 
     prev = NULL;
     for (bd = nursery->blocks; bd != NULL; bd = bd->link) {
-	ASSERT(bd->u.back == prev);
+        ASSERT(bd->gen == g0);
+        ASSERT(bd->u.back == prev);
 	prev = bd;
 	blocks += bd->blocks;
     }
@@ -685,41 +710,59 @@ checkNurserySanity (nursery *nursery)
     ASSERT(blocks == nursery->n_blocks);
 }
 
+static void checkGeneration (generation *gen, 
+                             rtsBool after_major_gc USED_IF_THREADS)
+{
+    nat n;
+    gen_workspace *ws;
+
+    ASSERT(countBlocks(gen->blocks) == gen->n_blocks);
+    ASSERT(countBlocks(gen->large_objects) == gen->n_large_blocks);
+
+#if defined(THREADED_RTS)
+    // heap sanity checking doesn't work with SMP, because we can't
+    // zero the slop (see Updates.h).  However, we can sanity-check
+    // the heap after a major gc, because there is no slop.
+    if (!after_major_gc) return;
+#endif
+
+    checkHeapChain(gen->blocks);
+
+    for (n = 0; n < n_capabilities; n++) {
+        ws = &gc_threads[n]->gens[gen->no];
+        checkHeapChain(ws->todo_bd);
+        checkHeapChain(ws->part_list);
+        checkHeapChain(ws->scavd_list);
+    }
+
+    checkLargeObjects(gen->large_objects);
+}
 
 /* Full heap sanity check. */
-void
-checkSanity( rtsBool check_heap )
+static void checkFullHeap (rtsBool after_major_gc)
 {
     nat g, n;
 
     for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
-        ASSERT(countBlocks(generations[g].blocks)
-               == generations[g].n_blocks);
-        ASSERT(countBlocks(generations[g].large_objects)
-                   == generations[g].n_large_blocks);
-        if (check_heap) {
-            checkHeap(generations[g].blocks);
-        }
-        checkLargeObjects(generations[g].large_objects);
+        checkGeneration(&generations[g], after_major_gc);
     }
-    
     for (n = 0; n < n_capabilities; n++) {
         checkNurserySanity(&nurseries[n]);
     }
-    
+}
+
+void checkSanity (rtsBool after_gc, rtsBool major_gc)
+{
+    checkFullHeap(after_gc && major_gc);
+
     checkFreeListSanity();
 
-#if defined(THREADED_RTS)
     // always check the stacks in threaded mode, because checkHeap()
     // does nothing in this case.
-    checkMutableLists(rtsTrue);
-#else
-    if (check_heap) {
-        checkMutableLists(rtsFalse);
-    } else {
-        checkMutableLists(rtsTrue);
+    if (after_gc) {
+        checkMutableLists();
+        checkGlobalTSOList(rtsTrue);
     }
-#endif
 }
 
 // If memInventory() calculates that we have a memory leak, this
@@ -732,18 +775,21 @@ checkSanity( rtsBool check_heap )
 static void
 findMemoryLeak (void)
 {
-  nat g, i;
-  for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
-      for (i = 0; i < n_capabilities; i++) {
-	  markBlocks(capabilities[i].mut_lists[g]);
-      }
-      markBlocks(generations[g].blocks);
-      markBlocks(generations[g].large_objects);
-  }
+    nat g, i;
+    for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+        for (i = 0; i < n_capabilities; i++) {
+            markBlocks(capabilities[i].mut_lists[g]);
+            markBlocks(gc_threads[i]->gens[g].part_list);
+            markBlocks(gc_threads[i]->gens[g].scavd_list);
+            markBlocks(gc_threads[i]->gens[g].todo_bd);
+        }
+        markBlocks(generations[g].blocks);
+        markBlocks(generations[g].large_objects);
+    }
 
-  for (i = 0; i < n_capabilities; i++) {
-      markBlocks(nurseries[i].blocks);
-  }
+    for (i = 0; i < n_capabilities; i++) {
+        markBlocks(nurseries[i].blocks);
+    }
 
 #ifdef PROFILING
   // TODO:
@@ -823,6 +869,9 @@ memInventory (rtsBool show)
       gen_blocks[g] = 0;
       for (i = 0; i < n_capabilities; i++) {
 	  gen_blocks[g] += countBlocks(capabilities[i].mut_lists[g]);
+          gen_blocks[g] += countBlocks(gc_threads[i]->gens[g].part_list);
+          gen_blocks[g] += countBlocks(gc_threads[i]->gens[g].scavd_list);
+          gen_blocks[g] += countBlocks(gc_threads[i]->gens[g].todo_bd);
       }	  
       gen_blocks[g] += genBlocks(&generations[g]);
   }
diff --git a/rts/sm/Sanity.h b/rts/sm/Sanity.h
index 602be54..f302bc2 100644
--- a/rts/sm/Sanity.h
+++ b/rts/sm/Sanity.h
@@ -21,9 +21,9 @@
 # endif
 
 /* debugging routines */
-void checkSanity        ( rtsBool check_heap );
+void checkSanity        ( rtsBool after_gc, rtsBool major_gc );
 void checkNurserySanity ( nursery *nursery );
-void checkHeap          ( bdescr *bd );
+void checkHeapChain     ( bdescr *bd );
 void checkHeapChunk     ( StgPtr start, StgPtr end );
 void checkLargeObjects  ( bdescr *bd );
 void checkTSO           ( StgTSO* tso );
@@ -33,9 +33,6 @@ void checkStackChunk    ( StgPtr sp, StgPtr stack_end );
 StgOffset checkStackFrame ( StgPtr sp );
 StgOffset checkClosure  ( StgClosure* p );
 
-void checkMutableList   ( bdescr *bd, nat gen );
-void checkMutableLists  ( rtsBool checkTSOs );
-
 void checkRunQueue      (Capability *cap);
 
 void memInventory (rtsBool show);
diff --git a/rts/sm/Scav.c b/rts/sm/Scav.c
index 9ac152a..d77734f 100644
--- a/rts/sm/Scav.c
+++ b/rts/sm/Scav.c
@@ -1755,10 +1755,10 @@ scavenge_large (gen_workspace *ws)
 	// the front when evacuating.
 	ws->todo_large_objects = bd->link;
 	
-	ACQUIRE_SPIN_LOCK(&ws->gen->sync_large_objects);
+        ACQUIRE_SPIN_LOCK(&ws->gen->sync);
 	dbl_link_onto(bd, &ws->gen->scavenged_large_objects);
 	ws->gen->n_scavenged_large_blocks += bd->blocks;
-	RELEASE_SPIN_LOCK(&ws->gen->sync_large_objects);
+        RELEASE_SPIN_LOCK(&ws->gen->sync);
 	
 	p = bd->start;
 	if (scavenge_one(p)) {
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index 1b8a720..6c11065 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -15,6 +15,7 @@
 #include "Rts.h"
 
 #include "Storage.h"
+#include "GCThread.h"
 #include "RtsUtils.h"
 #include "Stats.h"
 #include "BlockAlloc.h"
@@ -84,7 +85,7 @@ initGeneration (generation *gen, int g)
     gen->compact = 0;
     gen->bitmap = NULL;
 #ifdef THREADED_RTS
-    initSpinLock(&gen->sync_large_objects);
+    initSpinLock(&gen->sync);
 #endif
     gen->threads = END_TSO_QUEUE;
     gen->old_threads = END_TSO_QUEUE;
@@ -766,7 +767,6 @@ lnat
 calcAllocated (rtsBool include_nurseries)
 {
   nat allocated = 0;
-  bdescr *bd;
   nat i;
 
   // When called from GC.c, we already have the allocation count for
@@ -775,9 +775,7 @@ calcAllocated (rtsBool include_nurseries)
   if (include_nurseries)
   {
       for (i = 0; i < n_capabilities; i++) {
-          for (bd = nurseries[i].blocks; bd; bd = bd->link) {
-              allocated += (lnat)(bd->free - bd->start);
-          }
+          allocated += countOccupied(nurseries[i].blocks);
       }
   }
 
@@ -787,25 +785,6 @@ calcAllocated (rtsBool include_nurseries)
   return allocated;
 }  
 
-/* Approximate the amount of live data in the heap.  To be called just
- * after garbage collection (see GarbageCollect()).
- */
-lnat calcLiveBlocks (void)
-{
-  nat g;
-  lnat live = 0;
-  generation *gen;
-
-  for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
-      /* approximate amount of live data (doesn't take into account slop
-       * at end of each block).
-       */
-      gen = &generations[g];
-      live += gen->n_large_blocks + gen->n_blocks;
-  }
-  return live;
-}
-
 lnat countOccupied (bdescr *bd)
 {
     lnat words;
@@ -818,6 +797,38 @@ lnat countOccupied (bdescr *bd)
     return words;
 }
 
+lnat genLiveWords (generation *gen)
+{
+    return gen->n_words + countOccupied(gen->large_objects);
+}
+
+lnat genLiveBlocks (generation *gen)
+{
+    return gen->n_blocks + gen->n_large_blocks;
+}
+
+lnat gcThreadLiveWords (nat i, nat g)
+{
+    lnat words;
+
+    words   = countOccupied(gc_threads[i]->gens[g].todo_bd);
+    words  += countOccupied(gc_threads[i]->gens[g].part_list);
+    words  += countOccupied(gc_threads[i]->gens[g].scavd_list);
+
+    return words;
+}
+
+lnat gcThreadLiveBlocks (nat i, nat g)
+{
+    lnat blocks;
+
+    blocks  = countBlocks(gc_threads[i]->gens[g].todo_bd);
+    blocks += gc_threads[i]->gens[g].n_part_blocks;
+    blocks += gc_threads[i]->gens[g].n_scavd_blocks;
+
+    return blocks;
+}
+
 // Return an accurate count of the live data in the heap, excluding
 // generation 0.
 lnat calcLiveWords (void)
@@ -828,8 +839,19 @@ lnat calcLiveWords (void)
     
     live = 0;
     for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
-        gen = &generations[g];
-        live += gen->n_words + countOccupied(gen->large_objects);
+        live += genLiveWords(&generations[g]);
+    }
+    return live;
+}
+
+lnat calcLiveBlocks (void)
+{
+    nat g;
+    lnat live;
+
+    live = 0;
+    for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+        live += genLiveBlocks(&generations[g]);
     }
     return live;
 }
diff --git a/rts/sm/Storage.h b/rts/sm/Storage.h
index 8927ed6..d463d1a 100644
--- a/rts/sm/Storage.h
+++ b/rts/sm/Storage.h
@@ -87,11 +87,18 @@ lnat     countNurseryBlocks   ( void );
    -------------------------------------------------------------------------- */
 
 lnat    calcAllocated  (rtsBool count_nurseries);
-lnat    calcLiveBlocks (void);
-lnat    calcLiveWords  (void);
 lnat    countOccupied  (bdescr *bd);
 lnat    calcNeeded     (void);
 
+lnat    gcThreadLiveWords  (nat i, nat g);
+lnat    gcThreadLiveBlocks (nat i, nat g);
+
+lnat    genLiveWords  (generation *gen);
+lnat    genLiveBlocks (generation *gen);
+
+lnat    calcLiveBlocks (void);
+lnat    calcLiveWords  (void);
+
 /* ----------------------------------------------------------------------------
    Storage manager internal APIs and globals
    ------------------------------------------------------------------------- */