From 5270423a6afe69f1dc57e5e5a474812182718d40 Mon Sep 17 00:00:00 2001
From: Simon Marlow
Date: Tue, 1 Dec 2009 16:03:21 +0000
Subject: [PATCH] Make allocatePinned use local storage, and other refactorings

This is a batch of refactoring to remove some of the GC's global
state, as we move towards CPU-local GC.

 - allocateLocal() now allocates large objects into the local nursery,
   rather than taking a global lock and allocating them in gen 0
   step 0.

 - allocatePinned() was still allocating from global storage and
   taking a lock each time; now it uses local storage.
   (mallocForeignPtrBytes should be faster with -threaded).

 - We had a gen 0 step 0, distinct from the nurseries, which are
   stored in a separate nurseries[] array.  This is slightly strange.
   I removed the g0s0 global that pointed to gen 0 step 0, and removed
   all uses of it.  I think now we don't use gen 0 step 0 at all,
   except possibly when there is only one generation.  Possibly more
   tidying up is needed here.

 - I removed the global allocate() function, and renamed
   allocateLocal() to allocate().

 - the alloc_blocks global is gone.  MAYBE_GC() and doYouWantToGC()
   now check the local nursery only.
---
 includes/Cmm.h                |    7 +-
 includes/mkDerivedConstants.c |    3 +
 includes/rts/storage/GC.h     |   36 ++---
 rts/Capability.c              |    1 +
 rts/Capability.h              |    3 +
 rts/Interpreter.c             |   20 +--
 rts/Linker.c                  |    3 +-
 rts/PrimOps.cmm               |   10 +-
 rts/ProfHeap.c                |    2 +-
 rts/RaiseAsync.c              |    4 +-
 rts/RtsAPI.c                  |   34 ++--
 rts/STM.c                     |   12 +-
 rts/Schedule.c                |    4 +-
 rts/Threads.c                 |    6 +-
 rts/Weak.c                    |    2 +-
 rts/sm/GC.c                   |   64 +++++---
 rts/sm/MarkWeak.c             |  224 ++++++++++++++++--------------
 rts/sm/Storage.c              |  304 +++++++++++++++--------------------
 rts/sm/Storage.h              |   11 +-
 19 files changed, 343 insertions(+), 407 deletions(-)

diff --git a/includes/Cmm.h b/includes/Cmm.h
index aba5c2e..59081e2 100644
--- a/includes/Cmm.h
+++ b/includes/Cmm.h
@@ -380,11 +380,12 @@
     HP_CHK_GEN(alloc,liveness,reentry);        \
     TICK_ALLOC_HEAP_NOCTR(alloc);

-// allocateLocal() allocates from the nursery, so we check to see
+// allocate() allocates from the nursery, so we check to see
 // whether the nursery is nearly empty in any function that uses
-// allocateLocal() - this includes many of the primops.
+// allocate() - this includes many of the primops.
#define MAYBE_GC(liveness,reentry) \ - if (bdescr_link(CurrentNursery) == NULL || CInt[alloc_blocks] >= CInt[alloc_blocks_lim]) { \ + if (bdescr_link(CurrentNursery) == NULL || \ + step_n_large_blocks(StgRegTable_rNursery(BaseReg)) >= CInt[alloc_blocks_lim]) { \ R9 = liveness; \ R10 = reentry; \ HpAlloc = 0; \ diff --git a/includes/mkDerivedConstants.c b/includes/mkDerivedConstants.c index b6d0106..ddd2e65 100644 --- a/includes/mkDerivedConstants.c +++ b/includes/mkDerivedConstants.c @@ -230,6 +230,7 @@ main(int argc, char *argv[]) field_offset(StgRegTable, rCurrentNursery); field_offset(StgRegTable, rHpAlloc); struct_field(StgRegTable, rRet); + struct_field(StgRegTable, rNursery); def_offset("stgEagerBlackholeInfo", FUN_OFFSET(stgEagerBlackholeInfo)); def_offset("stgGCEnter1", FUN_OFFSET(stgGCEnter1)); @@ -249,6 +250,8 @@ main(int argc, char *argv[]) struct_size(generation); struct_field(generation, mut_list); + struct_field(step, n_large_blocks); + struct_size(CostCentreStack); struct_field(CostCentreStack, ccsID); struct_field(CostCentreStack, mem_alloc); diff --git a/includes/rts/storage/GC.h b/includes/rts/storage/GC.h index aa05313..1cd57c9 100644 --- a/includes/rts/storage/GC.h +++ b/includes/rts/storage/GC.h @@ -75,10 +75,6 @@ typedef struct step_ { // ------------------------------------ // Fields below are used during GC only - // During GC, if we are collecting this step, blocks and n_blocks - // are copied into the following two fields. After GC, these blocks - // are freed. - #if defined(THREADED_RTS) char pad[128]; // make sure the following is // on a separate cache line. @@ -89,6 +85,9 @@ typedef struct step_ { int mark; // mark (not copy)? (old gen only) int compact; // compact (not sweep)? (old gen only) + // During GC, if we are collecting this step, blocks and n_blocks + // are copied into the following two fields. After GC, these blocks + // are freed. bdescr * old_blocks; // bdescr of first from-space block unsigned int n_old_blocks; // number of blocks in from-space unsigned int live_estimate; // for sweeping: estimate of live data @@ -125,7 +124,6 @@ typedef struct generation_ { extern generation * generations; extern generation * g0; -extern step * g0s0; extern generation * oldest_gen; extern step * all_steps; extern nat total_steps; @@ -133,21 +131,14 @@ extern nat total_steps; /* ----------------------------------------------------------------------------- Generic allocation - StgPtr allocateInGen(generation *g, nat n) - Allocates a chunk of contiguous store - n words long in generation g, - returning a pointer to the first word. - Always succeeds. - - StgPtr allocate(nat n) Equaivalent to allocateInGen(g0) - - StgPtr allocateLocal(Capability *cap, nat n) + StgPtr allocate(Capability *cap, nat n) Allocates memory from the nursery in the current Capability. This can be done without taking a global lock, unlike allocate(). - StgPtr allocatePinned(nat n) Allocates a chunk of contiguous store + StgPtr allocatePinned(Capability *cap, nat n) + Allocates a chunk of contiguous store n words long, which is at a fixed address (won't be moved by GC). Returns a pointer to the first word. @@ -163,27 +154,16 @@ extern nat total_steps; allocatePinned, for the benefit of the ticky-ticky profiler. - rtsBool doYouWantToGC(void) Returns True if the storage manager is - ready to perform a GC, False otherwise. - - lnat allocatedBytes(void) Returns the number of bytes allocated - via allocate() since the last GC. - Used in the reporting of statistics. 
- -------------------------------------------------------------------------- */ -StgPtr allocate ( lnat n ); -StgPtr allocateInGen ( generation *g, lnat n ); -StgPtr allocateLocal ( Capability *cap, lnat n ); -StgPtr allocatePinned ( lnat n ); -lnat allocatedBytes ( void ); +StgPtr allocate ( Capability *cap, lnat n ); +StgPtr allocatePinned ( Capability *cap, lnat n ); /* memory allocator for executable memory */ void * allocateExec(unsigned int len, void **exec_addr); void freeExec (void *p); // Used by GC checks in external .cmm code: -extern nat alloc_blocks; extern nat alloc_blocks_lim; /* ----------------------------------------------------------------------------- diff --git a/rts/Capability.c b/rts/Capability.c index 95050ba..0012c24 100644 --- a/rts/Capability.c +++ b/rts/Capability.c @@ -253,6 +253,7 @@ initCapability( Capability *cap, nat i ) cap->free_trec_headers = NO_TREC; cap->transaction_tokens = 0; cap->context_switch = 0; + cap->pinned_object_block = NULL; } /* --------------------------------------------------------------------------- diff --git a/rts/Capability.h b/rts/Capability.h index 3f01bf3..ff6e368 100644 --- a/rts/Capability.h +++ b/rts/Capability.h @@ -69,6 +69,9 @@ struct Capability_ { bdescr **mut_lists; bdescr **saved_mut_lists; // tmp use during GC + // block for allocating pinned objects into + bdescr *pinned_object_block; + // Context switch flag. We used to have one global flag, now one // per capability. Locks required : none (conflicts are harmless) int context_switch; diff --git a/rts/Interpreter.c b/rts/Interpreter.c index 339d4d8..5197510 100644 --- a/rts/Interpreter.c +++ b/rts/Interpreter.c @@ -89,7 +89,7 @@ STATIC_INLINE StgPtr allocate_NONUPD (Capability *cap, int n_words) { - return allocateLocal(cap, stg_max(sizeofW(StgHeader)+MIN_PAYLOAD_SIZE, n_words)); + return allocate(cap, stg_max(sizeofW(StgHeader)+MIN_PAYLOAD_SIZE, n_words)); } int rts_stop_next_breakpoint = 0; @@ -604,7 +604,7 @@ do_apply: else /* arity > n */ { // build a new PAP and return it. StgPAP *new_pap; - new_pap = (StgPAP *)allocateLocal(cap, PAP_sizeW(pap->n_args + m)); + new_pap = (StgPAP *)allocate(cap, PAP_sizeW(pap->n_args + m)); SET_HDR(new_pap,&stg_PAP_info,CCCS); new_pap->arity = pap->arity - n; new_pap->n_args = pap->n_args + m; @@ -649,7 +649,7 @@ do_apply: // build a PAP and return it. 
StgPAP *pap; nat i; - pap = (StgPAP *)allocateLocal(cap, PAP_sizeW(m)); + pap = (StgPAP *)allocate(cap, PAP_sizeW(m)); SET_HDR(pap, &stg_PAP_info,CCCS); pap->arity = arity - n; pap->fun = obj; @@ -718,7 +718,7 @@ do_apply: run_BCO_return: // Heap check - if (doYouWantToGC()) { + if (doYouWantToGC(cap)) { Sp--; Sp[0] = (W_)&stg_enter_info; RETURN_TO_SCHEDULER(ThreadInterpret, HeapOverflow); } @@ -729,7 +729,7 @@ run_BCO_return: run_BCO_return_unboxed: // Heap check - if (doYouWantToGC()) { + if (doYouWantToGC(cap)) { RETURN_TO_SCHEDULER(ThreadInterpret, HeapOverflow); } // Stack checks aren't necessary at return points, the stack use @@ -747,7 +747,7 @@ run_BCO_fun: ); // Heap check - if (doYouWantToGC()) { + if (doYouWantToGC(cap)) { Sp -= 2; Sp[1] = (W_)obj; Sp[0] = (W_)&stg_apply_interp_info; // placeholder, really @@ -863,7 +863,7 @@ run_BCO: // stg_apply_interp_info pointer and a pointer to // the BCO size_words = BCO_BITMAP_SIZE(obj) + 2; - new_aps = (StgAP_STACK *) allocateLocal(cap, AP_STACK_sizeW(size_words)); + new_aps = (StgAP_STACK *) allocate(cap, AP_STACK_sizeW(size_words)); SET_HDR(new_aps,&stg_AP_STACK_info,CCS_SYSTEM); new_aps->size = size_words; new_aps->fun = &stg_dummy_ret_closure; @@ -1082,7 +1082,7 @@ run_BCO: case bci_ALLOC_AP: { StgAP* ap; int n_payload = BCO_NEXT; - ap = (StgAP*)allocateLocal(cap, AP_sizeW(n_payload)); + ap = (StgAP*)allocate(cap, AP_sizeW(n_payload)); Sp[-1] = (W_)ap; ap->n_args = n_payload; SET_HDR(ap, &stg_AP_info, CCS_SYSTEM/*ToDo*/) @@ -1093,7 +1093,7 @@ run_BCO: case bci_ALLOC_AP_NOUPD: { StgAP* ap; int n_payload = BCO_NEXT; - ap = (StgAP*)allocateLocal(cap, AP_sizeW(n_payload)); + ap = (StgAP*)allocate(cap, AP_sizeW(n_payload)); Sp[-1] = (W_)ap; ap->n_args = n_payload; SET_HDR(ap, &stg_AP_NOUPD_info, CCS_SYSTEM/*ToDo*/) @@ -1105,7 +1105,7 @@ run_BCO: StgPAP* pap; int arity = BCO_NEXT; int n_payload = BCO_NEXT; - pap = (StgPAP*)allocateLocal(cap, PAP_sizeW(n_payload)); + pap = (StgPAP*)allocate(cap, PAP_sizeW(n_payload)); Sp[-1] = (W_)pap; pap->n_args = n_payload; pap->arity = arity; diff --git a/rts/Linker.c b/rts/Linker.c index 2dcd21b..2412864 100644 --- a/rts/Linker.c +++ b/rts/Linker.c @@ -943,9 +943,8 @@ typedef struct _RtsSymbolVal { SymI_HasProto(stg_writeTVarzh) \ SymI_HasProto(stg_yieldzh) \ SymI_NeedsProto(stg_interp_constr_entry) \ - SymI_HasProto(alloc_blocks) \ SymI_HasProto(alloc_blocks_lim) \ - SymI_HasProto(allocateLocal) \ + SymI_HasProto(allocate) \ SymI_HasProto(allocateExec) \ SymI_HasProto(freeExec) \ SymI_HasProto(getAllocations) \ diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm index ac6de81..5e762b1 100644 --- a/rts/PrimOps.cmm +++ b/rts/PrimOps.cmm @@ -58,7 +58,7 @@ stg_newByteArrayzh n = R1; payload_words = ROUNDUP_BYTES_TO_WDS(n); words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words; - ("ptr" p) = foreign "C" allocateLocal(MyCapability() "ptr",words) []; + ("ptr" p) = foreign "C" allocate(MyCapability() "ptr",words) []; TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0); SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]); StgArrWords_words(p) = payload_words; @@ -85,7 +85,7 @@ stg_newPinnedByteArrayzh /* Now we convert to a number of words: */ words = ROUNDUP_BYTES_TO_WDS(bytes); - ("ptr" p) = foreign "C" allocatePinned(words) []; + ("ptr" p) = foreign "C" allocatePinned(MyCapability() "ptr", words) []; TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0); /* Now we need to move p forward so that the payload is aligned @@ -117,7 +117,7 @@ stg_newAlignedPinnedByteArrayzh /* Now we convert to a number of words: */ 
words = ROUNDUP_BYTES_TO_WDS(bytes); - ("ptr" p) = foreign "C" allocatePinned(words) []; + ("ptr" p) = foreign "C" allocatePinned(MyCapability() "ptr", words) []; TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0); /* Now we need to move p forward so that the payload is aligned @@ -139,7 +139,7 @@ stg_newArrayzh MAYBE_GC(R2_PTR,stg_newArrayzh); words = BYTES_TO_WDS(SIZEOF_StgMutArrPtrs) + n; - ("ptr" arr) = foreign "C" allocateLocal(MyCapability() "ptr",words) [R2]; + ("ptr" arr) = foreign "C" allocate(MyCapability() "ptr",words) [R2]; TICK_ALLOC_PRIM(SIZEOF_StgMutArrPtrs, WDS(n), 0); SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, W_[CCCS]); @@ -356,7 +356,7 @@ stg_mkWeakForeignEnvzh payload_words = 4; words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words; - ("ptr" p) = foreign "C" allocateLocal(MyCapability() "ptr", words) []; + ("ptr" p) = foreign "C" allocate(MyCapability() "ptr", words) []; TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0); SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]); diff --git a/rts/ProfHeap.c b/rts/ProfHeap.c index 69dd798..b9fc7b3 100644 --- a/rts/ProfHeap.c +++ b/rts/ProfHeap.c @@ -1086,7 +1086,7 @@ heapCensus( void ) // Traverse the heap, collecting the census info if (RtsFlags.GcFlags.generations == 1) { - heapCensusChain( census, g0s0->blocks ); + heapCensusChain( census, g0->steps[0].blocks ); } else { for (g = 0; g < RtsFlags.GcFlags.generations; g++) { for (s = 0; s < generations[g].n_steps; s++) { diff --git a/rts/RaiseAsync.c b/rts/RaiseAsync.c index a0f78ee..fad2803 100644 --- a/rts/RaiseAsync.c +++ b/rts/RaiseAsync.c @@ -792,7 +792,7 @@ raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception, // fun field. // words = frame - sp - 1; - ap = (StgAP_STACK *)allocateLocal(cap,AP_STACK_sizeW(words)); + ap = (StgAP_STACK *)allocate(cap,AP_STACK_sizeW(words)); ap->size = words; ap->fun = (StgClosure *)sp[0]; @@ -856,7 +856,7 @@ raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception, // we've got an exception to raise, so let's pass it to the // handler in this frame. 
// - raise = (StgThunk *)allocateLocal(cap,sizeofW(StgThunk)+1); + raise = (StgThunk *)allocate(cap,sizeofW(StgThunk)+1); TICK_ALLOC_SE_THK(1,0); SET_HDR(raise,&stg_raise_info,cf->header.prof.ccs); raise->payload[0] = exception; diff --git a/rts/RtsAPI.c b/rts/RtsAPI.c index 54d1e75..c4babca 100644 --- a/rts/RtsAPI.c +++ b/rts/RtsAPI.c @@ -28,7 +28,7 @@ HaskellObj rts_mkChar (Capability *cap, HsChar c) { - StgClosure *p = (StgClosure *)allocateLocal(cap, CONSTR_sizeW(0,1)); + StgClosure *p = (StgClosure *)allocate(cap, CONSTR_sizeW(0,1)); SET_HDR(p, Czh_con_info, CCS_SYSTEM); p->payload[0] = (StgClosure *)(StgWord)(StgChar)c; return p; @@ -37,7 +37,7 @@ rts_mkChar (Capability *cap, HsChar c) HaskellObj rts_mkInt (Capability *cap, HsInt i) { - StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1)); + StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1)); SET_HDR(p, Izh_con_info, CCS_SYSTEM); p->payload[0] = (StgClosure *)(StgInt)i; return p; @@ -46,7 +46,7 @@ rts_mkInt (Capability *cap, HsInt i) HaskellObj rts_mkInt8 (Capability *cap, HsInt8 i) { - StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1)); + StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1)); SET_HDR(p, I8zh_con_info, CCS_SYSTEM); /* Make sure we mask out the bits above the lowest 8 */ p->payload[0] = (StgClosure *)(StgInt)i; @@ -56,7 +56,7 @@ rts_mkInt8 (Capability *cap, HsInt8 i) HaskellObj rts_mkInt16 (Capability *cap, HsInt16 i) { - StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1)); + StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1)); SET_HDR(p, I16zh_con_info, CCS_SYSTEM); /* Make sure we mask out the relevant bits */ p->payload[0] = (StgClosure *)(StgInt)i; @@ -66,7 +66,7 @@ rts_mkInt16 (Capability *cap, HsInt16 i) HaskellObj rts_mkInt32 (Capability *cap, HsInt32 i) { - StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1)); + StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1)); SET_HDR(p, I32zh_con_info, CCS_SYSTEM); p->payload[0] = (StgClosure *)(StgInt)i; return p; @@ -75,7 +75,7 @@ rts_mkInt32 (Capability *cap, HsInt32 i) HaskellObj rts_mkInt64 (Capability *cap, HsInt64 i) { - StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,2)); + StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,2)); SET_HDR(p, I64zh_con_info, CCS_SYSTEM); ASSIGN_Int64((P_)&(p->payload[0]), i); return p; @@ -84,7 +84,7 @@ rts_mkInt64 (Capability *cap, HsInt64 i) HaskellObj rts_mkWord (Capability *cap, HsWord i) { - StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1)); + StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1)); SET_HDR(p, Wzh_con_info, CCS_SYSTEM); p->payload[0] = (StgClosure *)(StgWord)i; return p; @@ -94,7 +94,7 @@ HaskellObj rts_mkWord8 (Capability *cap, HsWord8 w) { /* see rts_mkInt* comments */ - StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1)); + StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1)); SET_HDR(p, W8zh_con_info, CCS_SYSTEM); p->payload[0] = (StgClosure *)(StgWord)(w & 0xff); return p; @@ -104,7 +104,7 @@ HaskellObj rts_mkWord16 (Capability *cap, HsWord16 w) { /* see rts_mkInt* comments */ - StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1)); + StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1)); SET_HDR(p, W16zh_con_info, CCS_SYSTEM); p->payload[0] = (StgClosure *)(StgWord)(w & 0xffff); return p; @@ -114,7 +114,7 @@ HaskellObj rts_mkWord32 (Capability *cap, HsWord32 w) { /* see rts_mkInt* comments */ - StgClosure *p = (StgClosure 
*)allocateLocal(cap,CONSTR_sizeW(0,1)); + StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1)); SET_HDR(p, W32zh_con_info, CCS_SYSTEM); p->payload[0] = (StgClosure *)(StgWord)(w & 0xffffffff); return p; @@ -123,7 +123,7 @@ rts_mkWord32 (Capability *cap, HsWord32 w) HaskellObj rts_mkWord64 (Capability *cap, HsWord64 w) { - StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,2)); + StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,2)); /* see mk_Int8 comment */ SET_HDR(p, W64zh_con_info, CCS_SYSTEM); ASSIGN_Word64((P_)&(p->payload[0]), w); @@ -134,7 +134,7 @@ rts_mkWord64 (Capability *cap, HsWord64 w) HaskellObj rts_mkFloat (Capability *cap, HsFloat f) { - StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1)); + StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1)); SET_HDR(p, Fzh_con_info, CCS_SYSTEM); ASSIGN_FLT((P_)p->payload, (StgFloat)f); return p; @@ -143,7 +143,7 @@ rts_mkFloat (Capability *cap, HsFloat f) HaskellObj rts_mkDouble (Capability *cap, HsDouble d) { - StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,sizeofW(StgDouble))); + StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,sizeofW(StgDouble))); SET_HDR(p, Dzh_con_info, CCS_SYSTEM); ASSIGN_DBL((P_)p->payload, (StgDouble)d); return p; @@ -152,7 +152,7 @@ rts_mkDouble (Capability *cap, HsDouble d) HaskellObj rts_mkStablePtr (Capability *cap, HsStablePtr s) { - StgClosure *p = (StgClosure *)allocateLocal(cap,sizeofW(StgHeader)+1); + StgClosure *p = (StgClosure *)allocate(cap,sizeofW(StgHeader)+1); SET_HDR(p, StablePtr_con_info, CCS_SYSTEM); p->payload[0] = (StgClosure *)s; return p; @@ -161,7 +161,7 @@ rts_mkStablePtr (Capability *cap, HsStablePtr s) HaskellObj rts_mkPtr (Capability *cap, HsPtr a) { - StgClosure *p = (StgClosure *)allocateLocal(cap,sizeofW(StgHeader)+1); + StgClosure *p = (StgClosure *)allocate(cap,sizeofW(StgHeader)+1); SET_HDR(p, Ptr_con_info, CCS_SYSTEM); p->payload[0] = (StgClosure *)a; return p; @@ -170,7 +170,7 @@ rts_mkPtr (Capability *cap, HsPtr a) HaskellObj rts_mkFunPtr (Capability *cap, HsFunPtr a) { - StgClosure *p = (StgClosure *)allocateLocal(cap,sizeofW(StgHeader)+1); + StgClosure *p = (StgClosure *)allocate(cap,sizeofW(StgHeader)+1); SET_HDR(p, FunPtr_con_info, CCS_SYSTEM); p->payload[0] = (StgClosure *)a; return p; @@ -197,7 +197,7 @@ rts_apply (Capability *cap, HaskellObj f, HaskellObj arg) { StgThunk *ap; - ap = (StgThunk *)allocateLocal(cap,sizeofW(StgThunk) + 2); + ap = (StgThunk *)allocate(cap,sizeofW(StgThunk) + 2); SET_HDR(ap, (StgInfoTable *)&stg_ap_2_upd_info, CCS_SYSTEM); ap->payload[0] = f; ap->payload[1] = arg; diff --git a/rts/STM.c b/rts/STM.c index 7921a67..ed5a722 100644 --- a/rts/STM.c +++ b/rts/STM.c @@ -412,7 +412,7 @@ static void unpark_waiters_on(Capability *cap, StgTVar *s) { static StgInvariantCheckQueue *new_stg_invariant_check_queue(Capability *cap, StgAtomicInvariant *invariant) { StgInvariantCheckQueue *result; - result = (StgInvariantCheckQueue *)allocateLocal(cap, sizeofW(StgInvariantCheckQueue)); + result = (StgInvariantCheckQueue *)allocate(cap, sizeofW(StgInvariantCheckQueue)); SET_HDR (result, &stg_INVARIANT_CHECK_QUEUE_info, CCS_SYSTEM); result -> invariant = invariant; result -> my_execution = NO_TREC; @@ -422,7 +422,7 @@ static StgInvariantCheckQueue *new_stg_invariant_check_queue(Capability *cap, static StgTVarWatchQueue *new_stg_tvar_watch_queue(Capability *cap, StgClosure *closure) { StgTVarWatchQueue *result; - result = (StgTVarWatchQueue *)allocateLocal(cap, 
sizeofW(StgTVarWatchQueue)); + result = (StgTVarWatchQueue *)allocate(cap, sizeofW(StgTVarWatchQueue)); SET_HDR (result, &stg_TVAR_WATCH_QUEUE_info, CCS_SYSTEM); result -> closure = closure; return result; @@ -430,7 +430,7 @@ static StgTVarWatchQueue *new_stg_tvar_watch_queue(Capability *cap, static StgTRecChunk *new_stg_trec_chunk(Capability *cap) { StgTRecChunk *result; - result = (StgTRecChunk *)allocateLocal(cap, sizeofW(StgTRecChunk)); + result = (StgTRecChunk *)allocate(cap, sizeofW(StgTRecChunk)); SET_HDR (result, &stg_TREC_CHUNK_info, CCS_SYSTEM); result -> prev_chunk = END_STM_CHUNK_LIST; result -> next_entry_idx = 0; @@ -440,7 +440,7 @@ static StgTRecChunk *new_stg_trec_chunk(Capability *cap) { static StgTRecHeader *new_stg_trec_header(Capability *cap, StgTRecHeader *enclosing_trec) { StgTRecHeader *result; - result = (StgTRecHeader *) allocateLocal(cap, sizeofW(StgTRecHeader)); + result = (StgTRecHeader *) allocate(cap, sizeofW(StgTRecHeader)); SET_HDR (result, &stg_TREC_HEADER_info, CCS_SYSTEM); result -> enclosing_trec = enclosing_trec; @@ -1175,7 +1175,7 @@ void stmAddInvariantToCheck(Capability *cap, // 1. Allocate an StgAtomicInvariant, set last_execution to NO_TREC // to signal that this is a new invariant in the current atomic block - invariant = (StgAtomicInvariant *) allocateLocal(cap, sizeofW(StgAtomicInvariant)); + invariant = (StgAtomicInvariant *) allocate(cap, sizeofW(StgAtomicInvariant)); TRACE("%p : stmAddInvariantToCheck allocated invariant=%p", trec, invariant); SET_HDR (invariant, &stg_ATOMIC_INVARIANT_info, CCS_SYSTEM); invariant -> code = code; @@ -1657,7 +1657,7 @@ void stmWriteTVar(Capability *cap, StgTVar *stmNewTVar(Capability *cap, StgClosure *new_value) { StgTVar *result; - result = (StgTVar *)allocateLocal(cap, sizeofW(StgTVar)); + result = (StgTVar *)allocate(cap, sizeofW(StgTVar)); SET_HDR (result, &stg_TVAR_info, CCS_SYSTEM); result -> current_value = new_value; result -> first_watch_queue_entry = END_STM_WATCH_QUEUE; diff --git a/rts/Schedule.c b/rts/Schedule.c index bb36f9b..3ae1fe0 100644 --- a/rts/Schedule.c +++ b/rts/Schedule.c @@ -2262,7 +2262,7 @@ threadStackOverflow(Capability *cap, StgTSO *tso) "increasing stack size from %ld words to %d.", (long)tso->stack_size, new_stack_size); - dest = (StgTSO *)allocateLocal(cap,new_tso_size); + dest = (StgTSO *)allocate(cap,new_tso_size); TICK_ALLOC_TSO(new_stack_size,0); /* copy the TSO block and the old stack into the new area */ @@ -2533,7 +2533,7 @@ raiseExceptionHelper (StgRegTable *reg, StgTSO *tso, StgClosure *exception) // Only create raise_closure if we need to. 
if (raise_closure == NULL) { raise_closure = - (StgThunk *)allocateLocal(cap,sizeofW(StgThunk)+1); + (StgThunk *)allocate(cap,sizeofW(StgThunk)+1); SET_HDR(raise_closure, &stg_raise_info, CCCS); raise_closure->payload[0] = exception; } diff --git a/rts/Threads.c b/rts/Threads.c index 3b209ea..9867c1c 100644 --- a/rts/Threads.c +++ b/rts/Threads.c @@ -63,7 +63,7 @@ createThread(Capability *cap, nat size) } size = round_to_mblocks(size); - tso = (StgTSO *)allocateLocal(cap, size); + tso = (StgTSO *)allocate(cap, size); stack_size = size - TSO_STRUCT_SIZEW; TICK_ALLOC_TSO(stack_size, 0); @@ -102,8 +102,8 @@ createThread(Capability *cap, nat size) */ ACQUIRE_LOCK(&sched_mutex); tso->id = next_thread_id++; // while we have the mutex - tso->global_link = g0s0->threads; - g0s0->threads = tso; + tso->global_link = cap->r.rNursery->threads; + cap->r.rNursery->threads = tso; RELEASE_LOCK(&sched_mutex); // ToDo: report the stack size in the event? diff --git a/rts/Weak.c b/rts/Weak.c index f5c3a62..f5e918a 100644 --- a/rts/Weak.c +++ b/rts/Weak.c @@ -120,7 +120,7 @@ scheduleFinalizers(Capability *cap, StgWeak *list) debugTrace(DEBUG_weak, "weak: batching %d finalizers", n); - arr = (StgMutArrPtrs *)allocateLocal(cap, sizeofW(StgMutArrPtrs) + n); + arr = (StgMutArrPtrs *)allocate(cap, sizeofW(StgMutArrPtrs) + n); TICK_ALLOC_PRIM(sizeofW(StgMutArrPtrs), n, 0); SET_HDR(arr, &stg_MUT_ARR_PTRS_FROZEN_info, CCS_SYSTEM); arr->ptrs = n; diff --git a/rts/sm/GC.c b/rts/sm/GC.c index 0593bd7..3f556ab 100644 --- a/rts/sm/GC.c +++ b/rts/sm/GC.c @@ -425,9 +425,9 @@ SET_GCT(gc_threads[0]); // g0s0->old_blocks is the old nursery // g0s0->blocks is to-space from the previous GC if (RtsFlags.GcFlags.generations == 1) { - if (g0s0->blocks != NULL) { - freeChain(g0s0->blocks); - g0s0->blocks = NULL; + if (g0->steps[0].blocks != NULL) { + freeChain(g0->steps[0].blocks); + g0->steps[0].blocks = NULL; } } @@ -646,18 +646,13 @@ SET_GCT(gc_threads[0]); /* LARGE OBJECTS. The current live large objects are chained on * scavenged_large, having been moved during garbage - * collection from large_objects. Any objects left on + * collection from large_objects. Any objects left on the * large_objects list are therefore dead, so we free them here. */ - for (bd = stp->large_objects; bd != NULL; bd = next) { - next = bd->link; - freeGroup(bd); - bd = next; - } - + freeChain(stp->large_objects); stp->large_objects = stp->scavenged_large_objects; stp->n_large_blocks = stp->n_scavenged_large_blocks; - + ASSERT(countBlocks(stp->large_objects) == stp->n_large_blocks); } else // for older generations... { @@ -672,6 +667,7 @@ SET_GCT(gc_threads[0]); // add the new blocks we promoted during this GC stp->n_large_blocks += stp->n_scavenged_large_blocks; + ASSERT(countBlocks(stp->large_objects) == stp->n_large_blocks); } } } @@ -685,18 +681,19 @@ SET_GCT(gc_threads[0]); // Free the small objects allocated via allocate(), since this will // all have been copied into G0S1 now. if (RtsFlags.GcFlags.generations > 1) { - if (g0s0->blocks != NULL) { - freeChain(g0s0->blocks); - g0s0->blocks = NULL; + if (g0->steps[0].blocks != NULL) { + freeChain(g0->steps[0].blocks); + g0->steps[0].blocks = NULL; } - g0s0->n_blocks = 0; - g0s0->n_words = 0; + g0->steps[0].n_blocks = 0; + g0->steps[0].n_words = 0; } - alloc_blocks = 0; alloc_blocks_lim = RtsFlags.GcFlags.minAllocAreaSize; // Start a new pinned_object_block - pinned_object_block = NULL; + for (n = 0; n < n_capabilities; n++) { + capabilities[n].pinned_object_block = NULL; + } // Free the mark stack. 
if (mark_stack_top_bd != NULL) { @@ -932,14 +929,23 @@ initGcThreads (void) void freeGcThreads (void) { + nat s; if (gc_threads != NULL) { #if defined(THREADED_RTS) nat i; - for (i = 0; i < RtsFlags.ParFlags.nNodes; i++) { + for (i = 0; i < n_capabilities; i++) { + for (s = 0; s < total_steps; s++) + { + freeWSDeque(gc_threads[i]->steps[s].todo_q); + } stgFree (gc_threads[i]); } stgFree (gc_threads); #else + for (s = 0; s < total_steps; s++) + { + freeWSDeque(gc_threads[0]->steps[s].todo_q); + } stgFree (gc_threads); #endif gc_threads = NULL; @@ -1230,8 +1236,21 @@ init_collected_gen (nat g, nat n_threads) } } + if (g == 0) { + for (i = 0; i < n_capabilities; i++) { + stp = &nurseries[i]; + stp->old_threads = stp->threads; + stp->threads = END_TSO_QUEUE; + } + } + for (s = 0; s < generations[g].n_steps; s++) { + // generation 0, step 0 doesn't need to-space, unless -G1 + if (g == 0 && s == 0 && RtsFlags.GcFlags.generations > 1) { + continue; + } + stp = &generations[g].steps[s]; ASSERT(stp->gen_no == g); @@ -1240,11 +1259,6 @@ init_collected_gen (nat g, nat n_threads) stp->old_threads = stp->threads; stp->threads = END_TSO_QUEUE; - // generation 0, step 0 doesn't need to-space - if (g == 0 && s == 0 && RtsFlags.GcFlags.generations > 1) { - continue; - } - // deprecate the existing blocks stp->old_blocks = stp->blocks; stp->n_old_blocks = stp->n_blocks; @@ -1642,7 +1656,7 @@ resize_nursery (void) * performance we get from 3L bytes, reducing to the same * performance at 2L bytes. */ - blocks = g0s0->n_blocks; + blocks = generations[0].steps[0].n_blocks; if ( RtsFlags.GcFlags.maxHeapSize != 0 && blocks * RtsFlags.GcFlags.oldGenFactor * 2 > diff --git a/rts/sm/MarkWeak.c b/rts/sm/MarkWeak.c index 4f0a7a4..2f5964f 100644 --- a/rts/sm/MarkWeak.c +++ b/rts/sm/MarkWeak.c @@ -21,6 +21,7 @@ #include "Trace.h" #include "Schedule.h" #include "Weak.h" +#include "Storage.h" /* ----------------------------------------------------------------------------- Weak Pointers @@ -82,6 +83,9 @@ StgTSO *resurrected_threads; // List of blocked threads found to have pending throwTos StgTSO *exception_threads; +static void resurrectUnreachableThreads (step *stp); +static rtsBool tidyThreadList (step *stp); + void initWeakForGC(void) { @@ -182,85 +186,23 @@ traverseWeakPtrList(void) return rtsTrue; case WeakThreads: - /* Now deal with the all_threads list, which behaves somewhat like + /* Now deal with the step->threads lists, which behave somewhat like * the weak ptr list. If we discover any threads that are about to * become garbage, we wake them up and administer an exception. */ - { - StgTSO *t, *tmp, *next, **prev; - nat g, s; - step *stp; + { + nat g, s, n; - // Traverse thread lists for generations we collected... - for (g = 0; g <= N; g++) { - for (s = 0; s < generations[g].n_steps; s++) { - stp = &generations[g].steps[s]; - - prev = &stp->old_threads; - - for (t = stp->old_threads; t != END_TSO_QUEUE; t = next) { - - tmp = (StgTSO *)isAlive((StgClosure *)t); - - if (tmp != NULL) { - t = tmp; - } - - ASSERT(get_itbl(t)->type == TSO); - if (t->what_next == ThreadRelocated) { - next = t->_link; - *prev = next; - continue; - } - - next = t->global_link; - - // This is a good place to check for blocked - // exceptions. 
It might be the case that a thread is - // blocked on delivering an exception to a thread that - // is also blocked - we try to ensure that this - // doesn't happen in throwTo(), but it's too hard (or - // impossible) to close all the race holes, so we - // accept that some might get through and deal with - // them here. A GC will always happen at some point, - // even if the system is otherwise deadlocked. - // - // If an unreachable thread has blocked - // exceptions, we really want to perform the - // blocked exceptions rather than throwing - // BlockedIndefinitely exceptions. This is the - // only place we can discover such threads. - // The target thread might even be - // ThreadFinished or ThreadKilled. Bugs here - // will only be seen when running on a - // multiprocessor. - if (t->blocked_exceptions != END_TSO_QUEUE) { - if (tmp == NULL) { - evacuate((StgClosure **)&t); - flag = rtsTrue; - } - t->global_link = exception_threads; - exception_threads = t; - *prev = next; - continue; - } - - if (tmp == NULL) { - // not alive (yet): leave this thread on the - // old_all_threads list. - prev = &(t->global_link); - } - else { - // alive - *prev = next; - - // move this thread onto the correct threads list. - step *new_step; - new_step = Bdescr((P_)t)->step; - t->global_link = new_step->threads; - new_step->threads = t; - } - } + // Traverse thread lists for generations we collected... + for (n = 0; n < n_capabilities; n++) { + if (tidyThreadList(&nurseries[n])) { + flag = rtsTrue; + } + } + for (g = 0; g <= N; g++) { + for (s = 0; s < generations[g].n_steps; s++) { + if (tidyThreadList(&generations[g].steps[s])) { + flag = rtsTrue; } } } @@ -272,36 +214,18 @@ traverseWeakPtrList(void) /* And resurrect any threads which were about to become garbage. */ { - nat g, s; - step *stp; - StgTSO *t, *tmp, *next; + nat g, s, n; + for (n = 0; n < n_capabilities; n++) { + resurrectUnreachableThreads(&nurseries[n]); + } for (g = 0; g <= N; g++) { for (s = 0; s < generations[g].n_steps; s++) { - stp = &generations[g].steps[s]; - - for (t = stp->old_threads; t != END_TSO_QUEUE; t = next) { - next = t->global_link; - - // ThreadFinished and ThreadComplete: we have to keep - // these on the all_threads list until they - // become garbage, because they might get - // pending exceptions. - switch (t->what_next) { - case ThreadKilled: - case ThreadComplete: - continue; - default: - tmp = t; - evacuate((StgClosure **)&tmp); - tmp->global_link = resurrected_threads; - resurrected_threads = tmp; - } - } + resurrectUnreachableThreads(&generations[g].steps[s]); } } } - + /* Finally, we can update the blackhole_queue. This queue * simply strings together TSOs blocked on black holes, it is * not intended to keep anything alive. Hence, we do not follow @@ -316,15 +240,113 @@ traverseWeakPtrList(void) ASSERT(*pt != NULL); } } - + weak_stage = WeakDone; // *now* we're done, return rtsTrue; // but one more round of scavenging, please - + } + default: barf("traverse_weak_ptr_list"); return rtsTrue; } +} + + static void resurrectUnreachableThreads (step *stp) +{ + StgTSO *t, *tmp, *next; + + for (t = stp->old_threads; t != END_TSO_QUEUE; t = next) { + next = t->global_link; + + // ThreadFinished and ThreadComplete: we have to keep + // these on the all_threads list until they + // become garbage, because they might get + // pending exceptions. 
+ switch (t->what_next) { + case ThreadKilled: + case ThreadComplete: + continue; + default: + tmp = t; + evacuate((StgClosure **)&tmp); + tmp->global_link = resurrected_threads; + resurrected_threads = tmp; + } + } +} + +static rtsBool tidyThreadList (step *stp) +{ + StgTSO *t, *tmp, *next, **prev; + rtsBool flag = rtsFalse; + prev = &stp->old_threads; + + for (t = stp->old_threads; t != END_TSO_QUEUE; t = next) { + + tmp = (StgTSO *)isAlive((StgClosure *)t); + + if (tmp != NULL) { + t = tmp; + } + + ASSERT(get_itbl(t)->type == TSO); + if (t->what_next == ThreadRelocated) { + next = t->_link; + *prev = next; + continue; + } + + next = t->global_link; + + // This is a good place to check for blocked + // exceptions. It might be the case that a thread is + // blocked on delivering an exception to a thread that + // is also blocked - we try to ensure that this + // doesn't happen in throwTo(), but it's too hard (or + // impossible) to close all the race holes, so we + // accept that some might get through and deal with + // them here. A GC will always happen at some point, + // even if the system is otherwise deadlocked. + // + // If an unreachable thread has blocked + // exceptions, we really want to perform the + // blocked exceptions rather than throwing + // BlockedIndefinitely exceptions. This is the + // only place we can discover such threads. + // The target thread might even be + // ThreadFinished or ThreadKilled. Bugs here + // will only be seen when running on a + // multiprocessor. + if (t->blocked_exceptions != END_TSO_QUEUE) { + if (tmp == NULL) { + evacuate((StgClosure **)&t); + flag = rtsTrue; + } + t->global_link = exception_threads; + exception_threads = t; + *prev = next; + continue; + } + + if (tmp == NULL) { + // not alive (yet): leave this thread on the + // old_all_threads list. + prev = &(t->global_link); + } + else { + // alive + *prev = next; + + // move this thread onto the correct threads list. + step *new_step; + new_step = Bdescr((P_)t)->step; + t->global_link = new_step->threads; + new_step->threads = t; + } + } + + return flag; } /* ----------------------------------------------------------------------------- diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c index 73ef53f..5d371b9 100644 --- a/rts/sm/Storage.c +++ b/rts/sm/Storage.c @@ -40,16 +40,14 @@ StgClosure *caf_list = NULL; StgClosure *revertible_caf_list = NULL; rtsBool keepCAFs; -bdescr *pinned_object_block; /* allocate pinned objects into this block */ -nat alloc_blocks; /* number of allocate()d blocks since GC */ -nat alloc_blocks_lim; /* approximate limit on alloc_blocks */ +nat alloc_blocks_lim; /* GC if n_large_blocks in any nursery + * reaches this. */ static bdescr *exec_block; generation *generations = NULL; /* all the generations */ generation *g0 = NULL; /* generation 0, for convenience */ generation *oldest_gen = NULL; /* oldest generation, for convenience */ -step *g0s0 = NULL; /* generation 0, step 0, for convenience */ nat total_steps = 0; step *all_steps = NULL; /* single array of steps */ @@ -143,14 +141,6 @@ initStorage( void ) * sizeof(struct generation_), "initStorage: gens"); - /* allocate all the steps into an array. It is important that we do - it this way, because we need the invariant that two step pointers - can be directly compared to see which is the oldest. - Remember that the last generation has only one step. 
*/ - total_steps = 1 + (RtsFlags.GcFlags.generations - 1) * RtsFlags.GcFlags.steps; - all_steps = stgMallocBytes(total_steps * sizeof(struct step_), - "initStorage: steps"); - /* Initialise all generations */ for(g = 0; g < RtsFlags.GcFlags.generations; g++) { gen = &generations[g]; @@ -166,6 +156,14 @@ initStorage( void ) g0 = &generations[0]; oldest_gen = &generations[RtsFlags.GcFlags.generations-1]; + /* allocate all the steps into an array. It is important that we do + it this way, because we need the invariant that two step pointers + can be directly compared to see which is the oldest. + Remember that the last generation has only one step. */ + total_steps = 1 + (RtsFlags.GcFlags.generations - 1) * RtsFlags.GcFlags.steps; + all_steps = stgMallocBytes(total_steps * sizeof(struct step_), + "initStorage: steps"); + /* Allocate step structures in each generation */ if (RtsFlags.GcFlags.generations > 1) { /* Only for multiple-generations */ @@ -187,11 +185,7 @@ initStorage( void ) g0->steps = all_steps; } -#ifdef THREADED_RTS n_nurseries = n_capabilities; -#else - n_nurseries = 1; -#endif nurseries = stgMallocBytes (n_nurseries * sizeof(struct step_), "initStorage: nurseries"); @@ -231,7 +225,6 @@ initStorage( void ) } generations[0].max_blocks = 0; - g0s0 = &generations[0].steps[0]; /* The allocation area. Policy: keep the allocation area * small to begin with, even if we have a large suggested heap @@ -246,7 +239,6 @@ initStorage( void ) revertible_caf_list = NULL; /* initialise the allocate() interface */ - alloc_blocks = 0; alloc_blocks_lim = RtsFlags.GcFlags.minAllocAreaSize; exec_block = NULL; @@ -274,7 +266,7 @@ exitStorage (void) void freeStorage (void) { - stgFree(g0s0); // frees all the steps + stgFree(all_steps); // frees all the steps stgFree(generations); freeAllMBlocks(); #if defined(THREADED_RTS) @@ -423,7 +415,6 @@ allocNursery (step *stp, bdescr *tail, nat blocks) static void assignNurseriesToCapabilities (void) { -#ifdef THREADED_RTS nat i; for (i = 0; i < n_nurseries; i++) { @@ -431,11 +422,6 @@ assignNurseriesToCapabilities (void) capabilities[i].r.rCurrentNursery = nurseries[i].blocks; capabilities[i].r.rCurrentAlloc = NULL; } -#else /* THREADED_RTS */ - MainCapability.r.rNursery = &nurseries[0]; - MainCapability.r.rCurrentNursery = nurseries[0].blocks; - MainCapability.r.rCurrentAlloc = NULL; -#endif } static void @@ -469,6 +455,10 @@ resetNurseries( void ) ASSERT(bd->step == stp); IF_DEBUG(sanity,memset(bd->start, 0xaa, BLOCK_SIZE)); } + // these large objects are dead, since we have just GC'd + freeChain(stp->large_objects); + stp->large_objects = NULL; + stp->n_large_blocks = 0; } assignNurseriesToCapabilities(); } @@ -481,6 +471,7 @@ countNurseryBlocks (void) for (i = 0; i < n_nurseries; i++) { blocks += nurseries[i].n_blocks; + blocks += nurseries[i].n_large_blocks; } return blocks; } @@ -565,129 +556,46 @@ move_TSO (StgTSO *src, StgTSO *dest) } /* ----------------------------------------------------------------------------- - The allocate() interface - - allocateInGen() function allocates memory directly into a specific - generation. It always succeeds, and returns a chunk of memory n - words long. n can be larger than the size of a block if necessary, - in which case a contiguous block group will be allocated. - - allocate(n) is equivalent to allocateInGen(g0). + split N blocks off the front of the given bdescr, returning the + new block group. We add the remainder to the large_blocks list + in the same step as the original block. 
-------------------------------------------------------------------------- */ -StgPtr -allocateInGen (generation *g, lnat n) -{ - step *stp; - bdescr *bd; - StgPtr ret; - - ACQUIRE_SM_LOCK; - - TICK_ALLOC_HEAP_NOCTR(n); - CCS_ALLOC(CCCS,n); - - stp = &g->steps[0]; - - if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) - { - lnat req_blocks = (lnat)BLOCK_ROUND_UP(n*sizeof(W_)) / BLOCK_SIZE; - - // Attempting to allocate an object larger than maxHeapSize - // should definitely be disallowed. (bug #1791) - if (RtsFlags.GcFlags.maxHeapSize > 0 && - req_blocks >= RtsFlags.GcFlags.maxHeapSize) { - heapOverflow(); - // heapOverflow() doesn't exit (see #2592), but we aren't - // in a position to do a clean shutdown here: we - // either have to allocate the memory or exit now. - // Allocating the memory would be bad, because the user - // has requested that we not exceed maxHeapSize, so we - // just exit. - stg_exit(EXIT_HEAPOVERFLOW); - } - - bd = allocGroup(req_blocks); - dbl_link_onto(bd, &stp->large_objects); - stp->n_large_blocks += bd->blocks; // might be larger than req_blocks - alloc_blocks += bd->blocks; - initBdescr(bd, stp); - bd->flags = BF_LARGE; - bd->free = bd->start + n; - ret = bd->start; - } - else - { - // small allocation (blocks; - if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) { - bd = allocBlock(); - initBdescr(bd, stp); - bd->flags = 0; - bd->link = stp->blocks; - stp->blocks = bd; - stp->n_blocks++; - alloc_blocks++; - } - ret = bd->free; - bd->free += n; - } - - RELEASE_SM_LOCK; - - return ret; -} - -StgPtr -allocate (lnat n) -{ - return allocateInGen(g0,n); -} - -lnat -allocatedBytes( void ) -{ - lnat allocated; - - allocated = alloc_blocks * BLOCK_SIZE_W; - if (pinned_object_block != NULL) { - allocated -= (pinned_object_block->start + BLOCK_SIZE_W) - - pinned_object_block->free; - } - - return allocated; -} - -// split N blocks off the front of the given bdescr, returning the -// new block group. We treat the remainder as if it -// had been freshly allocated in generation 0. bdescr * splitLargeBlock (bdescr *bd, nat blocks) { bdescr *new_bd; + ACQUIRE_SM_LOCK; + + ASSERT(countBlocks(bd->step->large_objects) == bd->step->n_large_blocks); + // subtract the original number of blocks from the counter first bd->step->n_large_blocks -= bd->blocks; new_bd = splitBlockGroup (bd, blocks); - - dbl_link_onto(new_bd, &g0s0->large_objects); - g0s0->n_large_blocks += new_bd->blocks; - initBdescr(new_bd, g0s0); - new_bd->flags = BF_LARGE; + initBdescr(new_bd, bd->step); + new_bd->flags = BF_LARGE | (bd->flags & BF_EVACUATED); + // if new_bd is in an old generation, we have to set BF_EVACUATED new_bd->free = bd->free; + dbl_link_onto(new_bd, &bd->step->large_objects); + ASSERT(new_bd->free <= new_bd->start + new_bd->blocks * BLOCK_SIZE_W); // add the new number of blocks to the counter. Due to the gaps - // for block descriptor, new_bd->blocks + bd->blocks might not be + // for block descriptors, new_bd->blocks + bd->blocks might not be // equal to the original bd->blocks, which is why we do it this way. - bd->step->n_large_blocks += bd->blocks; + bd->step->n_large_blocks += bd->blocks + new_bd->blocks; + + ASSERT(countBlocks(bd->step->large_objects) == bd->step->n_large_blocks); + + RELEASE_SM_LOCK; return new_bd; } /* ----------------------------------------------------------------------------- - allocateLocal() + allocate() This allocates memory in the current thread - it is intended for use primarily from STG-land where we have a Capability. 
It is @@ -700,13 +608,38 @@ splitLargeBlock (bdescr *bd, nat blocks) -------------------------------------------------------------------------- */ StgPtr -allocateLocal (Capability *cap, lnat n) +allocate (Capability *cap, lnat n) { bdescr *bd; StgPtr p; + step *stp; if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) { - return allocateInGen(g0,n); + lnat req_blocks = (lnat)BLOCK_ROUND_UP(n*sizeof(W_)) / BLOCK_SIZE; + + // Attempting to allocate an object larger than maxHeapSize + // should definitely be disallowed. (bug #1791) + if (RtsFlags.GcFlags.maxHeapSize > 0 && + req_blocks >= RtsFlags.GcFlags.maxHeapSize) { + heapOverflow(); + // heapOverflow() doesn't exit (see #2592), but we aren't + // in a position to do a clean shutdown here: we + // either have to allocate the memory or exit now. + // Allocating the memory would be bad, because the user + // has requested that we not exceed maxHeapSize, so we + // just exit. + stg_exit(EXIT_HEAPOVERFLOW); + } + + stp = &nurseries[cap->no]; + + bd = allocGroup(req_blocks); + dbl_link_onto(bd, &stp->large_objects); + stp->n_large_blocks += bd->blocks; // might be larger than req_blocks + initBdescr(bd, stp); + bd->flags = BF_LARGE; + bd->free = bd->start + n; + return bd->start; } /* small allocation (r.rNursery); bd->flags = 0; - // NO: alloc_blocks++; - // calcAllocated() uses the size of the nursery, and we've - // already bumpted nursery->n_blocks above. We'll GC - // pretty quickly now anyway, because MAYBE_GC() will + // If we had to allocate a new block, then we'll GC + // pretty quickly now, because MAYBE_GC() will // notice that CurrentNursery->link is NULL. } else { // we have a block in the nursery: take it and put @@ -778,39 +709,41 @@ allocateLocal (Capability *cap, lnat n) ------------------------------------------------------------------------- */ StgPtr -allocatePinned( lnat n ) +allocatePinned (Capability *cap, lnat n) { StgPtr p; - bdescr *bd = pinned_object_block; + bdescr *bd; + step *stp; // If the request is for a large object, then allocate() // will give us a pinned object anyway. if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) { - p = allocate(n); + p = allocate(cap, n); Bdescr(p)->flags |= BF_PINNED; return p; } - ACQUIRE_SM_LOCK; - TICK_ALLOC_HEAP_NOCTR(n); CCS_ALLOC(CCCS,n); + bd = cap->pinned_object_block; + // If we don't have a block of pinned objects yet, or the current // one isn't large enough to hold the new object, allocate a new one. 
if (bd == NULL || (bd->free + n) > (bd->start + BLOCK_SIZE_W)) { - pinned_object_block = bd = allocBlock(); - dbl_link_onto(bd, &g0s0->large_objects); - g0s0->n_large_blocks++; - initBdescr(bd, g0s0); + ACQUIRE_SM_LOCK + cap->pinned_object_block = bd = allocBlock(); + RELEASE_SM_LOCK + stp = &nurseries[cap->no]; + dbl_link_onto(bd, &stp->large_objects); + stp->n_large_blocks++; + initBdescr(bd, stp); bd->flags = BF_PINNED | BF_LARGE; bd->free = bd->start; - alloc_blocks++; } p = bd->free; bd->free += n; - RELEASE_SM_LOCK; return p; } @@ -900,14 +833,11 @@ calcAllocated( void ) { nat allocated; bdescr *bd; + nat i; - allocated = allocatedBytes(); - allocated += countNurseryBlocks() * BLOCK_SIZE_W; + allocated = countNurseryBlocks() * BLOCK_SIZE_W; - { -#ifdef THREADED_RTS - nat i; - for (i = 0; i < n_nurseries; i++) { + for (i = 0; i < n_capabilities; i++) { Capability *cap; for ( bd = capabilities[i].r.rCurrentNursery->link; bd != NULL; bd = bd->link ) { @@ -919,18 +849,10 @@ calcAllocated( void ) allocated -= (cap->r.rCurrentNursery->start + BLOCK_SIZE_W) - cap->r.rCurrentNursery->free; } - } -#else - bdescr *current_nursery = MainCapability.r.rCurrentNursery; - - for ( bd = current_nursery->link; bd != NULL; bd = bd->link ) { - allocated -= BLOCK_SIZE_W; - } - if (current_nursery->free < current_nursery->start + BLOCK_SIZE_W) { - allocated -= (current_nursery->start + BLOCK_SIZE_W) - - current_nursery->free; - } -#endif + if (cap->pinned_object_block != NULL) { + allocated -= (cap->pinned_object_block->start + BLOCK_SIZE_W) - + cap->pinned_object_block->free; + } } total_allocated += allocated; @@ -947,16 +869,12 @@ calcLiveBlocks(void) lnat live = 0; step *stp; - if (RtsFlags.GcFlags.generations == 1) { - return g0s0->n_large_blocks + g0s0->n_blocks; - } - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { for (s = 0; s < generations[g].n_steps; s++) { /* approximate amount of live data (doesn't take into account slop * at end of each block). 
*/ - if (g == 0 && s == 0) { + if (g == 0 && s == 0 && RtsFlags.GcFlags.generations > 1) { continue; } stp = &generations[g].steps[s]; @@ -988,14 +906,10 @@ calcLiveWords(void) lnat live; step *stp; - if (RtsFlags.GcFlags.generations == 1) { - return g0s0->n_words + countOccupied(g0s0->large_objects); - } - live = 0; for (g = 0; g < RtsFlags.GcFlags.generations; g++) { for (s = 0; s < generations[g].n_steps; s++) { - if (g == 0 && s == 0) continue; + if (g == 0 && s == 0 && RtsFlags.GcFlags.generations > 1) continue; stp = &generations[g].steps[s]; live += stp->n_words + countOccupied(stp->large_objects); } @@ -1384,32 +1298,28 @@ checkSanity( void ) { nat g, s; - if (RtsFlags.GcFlags.generations == 1) { - checkHeap(g0s0->blocks); - checkLargeObjects(g0s0->large_objects); - } else { - - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - for (s = 0; s < generations[g].n_steps; s++) { - if (g == 0 && s == 0) { continue; } - ASSERT(countBlocks(generations[g].steps[s].blocks) - == generations[g].steps[s].n_blocks); - ASSERT(countBlocks(generations[g].steps[s].large_objects) - == generations[g].steps[s].n_large_blocks); - checkHeap(generations[g].steps[s].blocks); - checkLargeObjects(generations[g].steps[s].large_objects); - } - } - - for (s = 0; s < n_nurseries; s++) { - ASSERT(countBlocks(nurseries[s].blocks) - == nurseries[s].n_blocks); - ASSERT(countBlocks(nurseries[s].large_objects) - == nurseries[s].n_large_blocks); - } - - checkFreeListSanity(); + for (g = 0; g < RtsFlags.GcFlags.generations; g++) { + for (s = 0; s < generations[g].n_steps; s++) { + if (g == 0 && s == 0 && RtsFlags.GcFlags.generations > 1) { + continue; + } + ASSERT(countBlocks(generations[g].steps[s].blocks) + == generations[g].steps[s].n_blocks); + ASSERT(countBlocks(generations[g].steps[s].large_objects) + == generations[g].steps[s].n_large_blocks); + checkHeap(generations[g].steps[s].blocks); + checkLargeObjects(generations[g].steps[s].large_objects); + } + } + + for (s = 0; s < n_nurseries; s++) { + ASSERT(countBlocks(nurseries[s].blocks) + == nurseries[s].n_blocks); + ASSERT(countBlocks(nurseries[s].large_objects) + == nurseries[s].n_large_blocks); } + + checkFreeListSanity(); #if defined(THREADED_RTS) // check the stacks too in threaded mode, because we don't do a diff --git a/rts/sm/Storage.h b/rts/sm/Storage.h index 5ddcbdc..30bdf54 100644 --- a/rts/sm/Storage.h +++ b/rts/sm/Storage.h @@ -9,6 +9,8 @@ #ifndef SM_STORAGE_H #define SM_STORAGE_H +#include "Capability.h" + BEGIN_RTS_PRIVATE /* ----------------------------------------------------------------------------- @@ -23,12 +25,11 @@ void freeStorage(void); Storage manager state -------------------------------------------------------------------------- */ -extern bdescr * pinned_object_block; - INLINE_HEADER rtsBool -doYouWantToGC( void ) +doYouWantToGC( Capability *cap ) { - return (alloc_blocks >= alloc_blocks_lim); + return (cap->r.rCurrentNursery->link == NULL || + cap->r.rNursery->n_large_blocks >= alloc_blocks_lim); } /* for splitting blocks groups in two */ @@ -120,6 +121,8 @@ void dirty_MVAR(StgRegTable *reg, StgClosure *p); Nursery manipulation -------------------------------------------------------------------------- */ +extern step *nurseries; + void resetNurseries ( void ); void resizeNurseries ( nat blocks ); void resizeNurseriesFixed ( nat blocks ); -- 1.7.10.4
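
The common theme of this patch (bump allocation from capability-local blocks, taking the storage-manager lock only on the slow path, and a GC trigger that looks only at the local allocation area) can be illustrated outside the RTS. Below is a minimal, self-contained C sketch of that idea, not the RTS implementation: the names Cap, Block, alloc_pinned, want_gc and the BLOCK_WORDS size are invented for this example, and real block descriptors, steps, nurseries and the SM lock are elided.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BLOCK_WORDS 1024                  /* words per allocation block (made up) */

typedef struct Block_ {
    size_t free;                          /* index of next free word */
    struct Block_ *link;                  /* chain of blocks owned by one cap */
    size_t words[BLOCK_WORDS];            /* payload */
} Block;

typedef struct Cap_ {
    Block *pinned_object_block;           /* current pinned-object block */
    size_t n_blocks;                      /* blocks taken since the last "GC" */
} Cap;

/* Stand-in for the global block allocator; in the RTS this is the only
 * call that would need the storage-manager lock. */
static Block *alloc_block(void)
{
    Block *b = calloc(1, sizeof(Block));
    if (b == NULL) { perror("calloc"); exit(1); }
    return b;
}

/* Bump-allocate n words of pinned storage from the capability's current
 * block, starting a fresh block only when the current one is full. */
static size_t *alloc_pinned(Cap *cap, size_t n)
{
    Block *b = cap->pinned_object_block;
    if (b == NULL || b->free + n > BLOCK_WORDS) {
        Block *fresh = alloc_block();     /* slow path: the only locked step */
        fresh->link = b;
        cap->pinned_object_block = fresh;
        cap->n_blocks++;
        b = fresh;
    }
    size_t *p = &b->words[b->free];
    b->free += n;
    return p;
}

/* Local GC trigger: look only at this capability's own allocation,
 * analogous to MAYBE_GC()/doYouWantToGC() checking the local nursery. */
static int want_gc(const Cap *cap, size_t block_limit)
{
    return cap->n_blocks >= block_limit;
}

int main(void)
{
    Cap cap = { NULL, 0 };
    for (int i = 0; i < 5000; i++) {
        size_t *p = alloc_pinned(&cap, 8);      /* an 8-word pinned object */
        memset(p, 0, 8 * sizeof(size_t));
        if (want_gc(&cap, 32)) {
            printf("after %d objects: %zu local blocks, time to collect\n",
                   i + 1, cap.n_blocks);
            break;
        }
    }
    return 0;
}

Run standalone, this prints one line once the 32-block limit is reached. The design point carried over from the patch is that the fast path touches only state owned by the calling capability; global synchronisation is confined to fetching a fresh block, which is why per-capability pinned allocation (and hence mallocForeignPtrBytes) should get cheaper under -threaded.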