Keep the remembered sets local to each thread during parallel GC

author Simon Marlow <marlowsd@gmail.com>

Mon, 12 Jan 2009 12:10:24 +0000 (12:10 +0000)

committer Simon Marlow <marlowsd@gmail.com>

Mon, 12 Jan 2009 12:10:24 +0000 (12:10 +0000)
author Simon Marlow <marlowsd@gmail.com>
Mon, 12 Jan 2009 12:10:24 +0000 (12:10 +0000)
committer Simon Marlow <marlowsd@gmail.com>
Mon, 12 Jan 2009 12:10:24 +0000 (12:10 +0000)
diff --git a/includes/Storage.h b/includes/Storage.h

index 0a7aae6..f43eb79 100644 (file)
--- a/includes/Storage.h
+++ b/includes/Storage.h
@@ -245,7 +245,6 @@ extern void GarbageCollect(rtsBool force_major_gc, nat gc_type, Capability *cap)
  #if defined(THREADED_RTS)
  extern Mutex sm_mutex;
  extern Mutex atomic_modify_mutvar_mutex;
-extern SpinLock recordMutableGen_sync;
  #endif
  
  #if defined(THREADED_RTS)
@@ -258,63 +257,40 @@ extern SpinLock recordMutableGen_sync;
  #define ASSERT_SM_LOCK()
  #endif
  
+#if !IN_STG_CODE
+
  INLINE_HEADER void
-recordMutableGen(StgClosure *p, generation *gen)
+recordMutableGen(StgClosure *p, nat gen_no)
  {
      bdescr *bd;
  
-    bd = gen->mut_list;
+    bd = generations[gen_no].mut_list;
      if (bd->free >= bd->start + BLOCK_SIZE_W) {
         bdescr *new_bd;
         new_bd = allocBlock();
         new_bd->link = bd;
         bd = new_bd;
-       gen->mut_list = bd;
+       generations[gen_no].mut_list = bd;
      }
      *bd->free++ = (StgWord)p;
  
  }
  
  INLINE_HEADER void
-recordMutableGenLock(StgClosure *p, generation *gen)
+recordMutableGenLock(StgClosure *p, nat gen_no)
  {
      ACQUIRE_SM_LOCK;
-    recordMutableGen(p,gen);
+    recordMutableGen(p,gen_no);
      RELEASE_SM_LOCK;
  }
  
-extern bdescr *allocBlock_sync(void);
-
-// Version of recordMutableGen() for use in parallel GC.  The same as
-// recordMutableGen(), except that we surround it with a spinlock and
-// call the spinlock version of allocBlock().
-INLINE_HEADER void
-recordMutableGen_GC(StgClosure *p, generation *gen)
-{
-    bdescr *bd;
-
-    ACQUIRE_SPIN_LOCK(&recordMutableGen_sync);
-
-    bd = gen->mut_list;
-    if (bd->free >= bd->start + BLOCK_SIZE_W) {
-       bdescr *new_bd;
-       new_bd = allocBlock_sync();
-       new_bd->link = bd;
-       bd = new_bd;
-       gen->mut_list = bd;
-    }
-    *bd->free++ = (StgWord)p;
-
-    RELEASE_SPIN_LOCK(&recordMutableGen_sync);
-}
-
  INLINE_HEADER void
  recordMutable(StgClosure *p)
  {
      bdescr *bd;
      ASSERT(closure_MUTABLE(p));
      bd = Bdescr((P_)p);
-    if (bd->gen_no > 0) recordMutableGen(p, &RTS_DEREF(generations)[bd->gen_no]);
+    if (bd->gen_no > 0) recordMutableGen(p, bd->gen_no);
  }
  
  INLINE_HEADER void
@@ -325,6 +301,8 @@ recordMutableLock(StgClosure *p)
      RELEASE_SM_LOCK;
  }
  
+#endif // !IN_STG_CODE
+
  /* -----------------------------------------------------------------------------
     The CAF table - used to let us revert CAFs in GHCi
     -------------------------------------------------------------------------- */
diff --git a/rts/Capability.c b/rts/Capability.c

index 7c6ceb5..a81d710 100644 (file)
--- a/rts/Capability.c
+++ b/rts/Capability.c
@@ -217,6 +217,9 @@ initCapability( Capability *cap, nat i )
      cap->mut_lists  = stgMallocBytes(sizeof(bdescr *) *
                                      RtsFlags.GcFlags.generations,
                                      "initCapability");
+    cap->saved_mut_lists = stgMallocBytes(sizeof(bdescr *) *
+                                          RtsFlags.GcFlags.generations,
+                                          "initCapability");
  
      for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
         cap->mut_lists[g] = NULL;
diff --git a/rts/Capability.h b/rts/Capability.h

index ba0695c..77132e3 100644 (file)
--- a/rts/Capability.h
+++ b/rts/Capability.h
@@ -65,10 +65,12 @@ struct Capability_ {
      Task *suspended_ccalling_tasks;
  
      // One mutable list per generation, so we don't need to take any
-    // locks when updating an old-generation thunk.  These
-    // mini-mut-lists are moved onto the respective gen->mut_list at
-    // each GC.
+    // locks when updating an old-generation thunk.  This also lets us
+    // keep track of which closures this CPU has been mutating, so we
+    // can traverse them using the right thread during GC and avoid
+    // unnecessarily moving the data from one cache to another.
      bdescr **mut_lists;
+    bdescr **saved_mut_lists; // tmp use during GC
  
      // Context switch flag. We used to have one global flag, now one 
      // per capability. Locks required  : none (conflicts are harmless)
diff --git a/rts/Stats.c b/rts/Stats.c

index 3ec5794..c43806f 100644 (file)
--- a/rts/Stats.c
+++ b/rts/Stats.c
@@ -712,7 +712,6 @@ stat_exit(int alloc)
              {
                  nat g, s;
                  
-                statsPrintf("recordMutableGen_sync: %"FMT_Word64"\n", recordMutableGen_sync.spin);
                  statsPrintf("gc_alloc_block_sync: %"FMT_Word64"\n", gc_alloc_block_sync.spin);
                  statsPrintf("whitehole_spin: %"FMT_Word64"\n", whitehole_spin);
                  for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
diff --git a/rts/Updates.h b/rts/Updates.h

index 98e9e5a..10fa09b 100644 (file)
--- a/rts/Updates.h
+++ b/rts/Updates.h
@@ -268,7 +268,7 @@ no_slop:
      write_barrier();                                           \
      bd = Bdescr((P_)p1);                                       \
      if (bd->gen_no != 0) {                                     \
-      recordMutableGenLock(p1, &generations[bd->gen_no]);      \
+      recordMutableGenLock(p1, bd->gen_no);                    \
        SET_INFO(p1, &stg_IND_OLDGEN_info);                      \
        TICK_UPD_OLD_IND();                                      \
        and_then;                                                        \
diff --git a/rts/sm/Compact.c b/rts/sm/Compact.c

index fcd7cb1..6758cfa 100644 (file)
--- a/rts/sm/Compact.c
+++ b/rts/sm/Compact.c
@@ -976,11 +976,20 @@ compact(StgClosure *static_objects)
      for (g = 1; g < RtsFlags.GcFlags.generations; g++) {
         bdescr *bd;
         StgPtr p;
+        nat n;
         for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) {
             for (p = bd->start; p < bd->free; p++) {
                 thread((StgClosure **)p);
             }
         }
+        for (n = 0; n < n_capabilities; n++) {
+            for (bd = capabilities[n].mut_lists[g]; 
+                 bd != NULL; bd = bd->link) {
+                for (p = bd->start; p < bd->free; p++) {
+                    thread((StgClosure **)p);
+                }
+            }
+        }
      }
  
      // the global thread list
diff --git a/rts/sm/GC.c b/rts/sm/GC.c

index bf2464b..2af5fa1 100644 (file)
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -125,10 +125,6 @@ nat n_gc_threads;
  // For stats:
  long copied;        // *words* copied & scavenged during this GC
  
-#ifdef THREADED_RTS
-SpinLock recordMutableGen_sync;
-#endif
-
  DECLARE_GCT
  
  /* -----------------------------------------------------------------------------
@@ -314,17 +310,7 @@ GarbageCollect (rtsBool force_major_gc,
  
    /* -----------------------------------------------------------------------
     * follow all the roots that we know about:
-   *   - mutable lists from each generation > N
-   * we want to *scavenge* these roots, not evacuate them: they're not
-   * going to move in this GC.
-   * Also do them in reverse generation order, for the usual reason:
-   * namely to reduce the likelihood of spurious old->new pointers.
     */
-  for (g = RtsFlags.GcFlags.generations-1; g > N; g--) {
-      generations[g].saved_mut_list = generations[g].mut_list;
-      generations[g].mut_list = allocBlock(); 
-      // mut_list always has at least one block.
-  }
  
    // the main thread is running: this prevents any other threads from
    // exiting prematurely, so we can start them now.
@@ -333,8 +319,29 @@ GarbageCollect (rtsBool force_major_gc,
    inc_running();
    wakeup_gc_threads(n_gc_threads, gct->thread_index);
  
+  // Mutable lists from each generation > N
+  // we want to *scavenge* these roots, not evacuate them: they're not
+  // going to move in this GC.
+  // Also do them in reverse generation order, for the usual reason:
+  // namely to reduce the likelihood of spurious old->new pointers.
+  //
    for (g = RtsFlags.GcFlags.generations-1; g > N; g--) {
-      scavenge_mutable_list(&generations[g]);
+      scavenge_mutable_list(generations[g].saved_mut_list, &generations[g]);
+      freeChain_sync(generations[g].saved_mut_list);
+      generations[g].saved_mut_list = NULL;
+
+  }
+
+  // scavenge the capability-private mutable lists.  This isn't part
+  // of markSomeCapabilities() because markSomeCapabilities() can only
+  // call back into the GC via mark_root() (due to the gct register
+  // variable).
+  if (n_gc_threads == 1) {
+      for (n = 0; n < n_capabilities; n++) {
+          scavenge_capability_mut_lists(&capabilities[n]);
+      }
+  } else {
+      scavenge_capability_mut_lists(&capabilities[gct->thread_index]);
    }
  
    // follow roots from the CAF list (used by GHCi)
@@ -545,6 +552,12 @@ GarbageCollect (rtsBool force_major_gc,
         for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) {
             mut_list_size += bd->free - bd->start;
         }
+        for (n = 0; n < n_capabilities; n++) {
+            for (bd = capabilities[n].mut_lists[g]; 
+                 bd != NULL; bd = bd->link) {
+                mut_list_size += bd->free - bd->start;
+            }
+        }
         copied +=  mut_list_size;
  
         debugTrace(DEBUG_gc,
@@ -1037,6 +1050,7 @@ gcWorkerThread (Capability *cap)
      gct->evac_step = 0;
      markSomeCapabilities(mark_root, gct, gct->thread_index, n_gc_threads,
                           rtsTrue/*prune sparks*/);
+    scavenge_capability_mut_lists(&capabilities[gct->thread_index]);
  
      scavenge_until_all_done();
      
@@ -1287,11 +1301,21 @@ init_collected_gen (nat g, nat n_threads)
  static void
  init_uncollected_gen (nat g, nat threads)
  {
-    nat s, t, i;
+    nat s, t, n;
      step_workspace *ws;
      step *stp;
      bdescr *bd;
  
+    // save the current mutable lists for this generation, and
+    // allocate a fresh block for each one.  We'll traverse these
+    // mutable lists as roots early on in the GC.
+    generations[g].saved_mut_list = generations[g].mut_list;
+    generations[g].mut_list = allocBlock(); 
+    for (n = 0; n < n_capabilities; n++) {
+        capabilities[n].saved_mut_lists[g] = capabilities[n].mut_lists[g];
+        capabilities[n].mut_lists[g] = allocBlock();
+    }
+
      for (s = 0; s < generations[g].n_steps; s++) {
         stp = &generations[g].steps[s];
         stp->scavenged_large_objects = NULL;
@@ -1352,19 +1376,6 @@ init_uncollected_gen (nat g, nat threads)
              if (t == n_gc_threads) t = 0;
          }
      }
-
-
-    // Move the private mutable lists from each capability onto the
-    // main mutable list for the generation.
-    for (i = 0; i < n_capabilities; i++) {
-       for (bd = capabilities[i].mut_lists[g]; 
-            bd->link != NULL; bd = bd->link) {
-           /* nothing */
-       }
-       bd->link = generations[g].mut_list;
-       generations[g].mut_list = capabilities[i].mut_lists[g];
-       capabilities[i].mut_lists[g] = allocBlock();
-    }
  }
  
  /* -----------------------------------------------------------------------------
@@ -1377,6 +1388,7 @@ init_gc_thread (gc_thread *t)
      t->static_objects = END_OF_STATIC_LIST;
      t->scavenged_static_objects = END_OF_STATIC_LIST;
      t->scan_bd = NULL;
+    t->mut_lists = capabilities[t->thread_index].mut_lists;
      t->evac_step = 0;
      t->failed_to_evac = rtsFalse;
      t->eager_promotion = rtsTrue;
diff --git a/rts/sm/GCThread.h b/rts/sm/GCThread.h

index 1d0a05c..3ee2757 100644 (file)
--- a/rts/sm/GCThread.h
+++ b/rts/sm/GCThread.h
@@ -131,6 +131,14 @@ typedef struct gc_thread_ {
      // block that is currently being scanned
      bdescr *     scan_bd;
  
+    // Remembered sets on this CPU.  Each GC thread has its own
+    // private per-generation remembered sets, so it can add an item
+    // to the remembered set without taking a lock.  The mut_lists
+    // array on a gc_thread is the same as the one on the
+    // corresponding Capability; we stash it here too for easy access
+    // during GC; see recordMutableGen_GC().
+    bdescr **    mut_lists;
+
      // --------------------
      // evacuate flags
  
diff --git a/rts/sm/GCUtils.h b/rts/sm/GCUtils.h

index 32810b6..6948b2f 100644 (file)
--- a/rts/sm/GCUtils.h
+++ b/rts/sm/GCUtils.h
@@ -34,3 +34,22 @@ isPartiallyFull(bdescr *bd)
  #if DEBUG
  void printMutableList (generation *gen);
  #endif
+
+// Version of recordMutableGen for use during GC.  This uses the
+// mutable lists attached to the current gc_thread structure, which
+// are the same as the mutable lists on the Capability.
+INLINE_HEADER void
+recordMutableGen_GC (StgClosure *p, nat gen_no)
+{
+    bdescr *bd;
+
+    bd = gct->mut_lists[gen_no];
+    if (bd->free >= bd->start + BLOCK_SIZE_W) {
+       bdescr *new_bd;
+       new_bd = allocBlock_sync();
+       new_bd->link = bd;
+       bd = new_bd;
+       gct->mut_lists[gen_no] = bd;
+    }
+    *bd->free++ = (StgWord)p;
+}
diff --git a/rts/sm/Scav.c b/rts/sm/Scav.c

index d396b9f..9d1a0b6 100644 (file)
--- a/rts/sm/Scav.c
+++ b/rts/sm/Scav.c
@@ -25,6 +25,7 @@
  #include "Trace.h"
  #include "LdvProfile.h"
  #include "Sanity.h"
+#include "Capability.h"
  
  static void scavenge_stack (StgPtr p, StgPtr stack_end);
  
@@ -36,7 +37,8 @@ static void scavenge_large_bitmap (StgPtr p,
  # define evacuate(a) evacuate1(a)
  # define recordMutableGen_GC(a,b) recordMutableGen(a,b)
  # define scavenge_loop(a) scavenge_loop1(a)
-# define scavenge_mutable_list(g) scavenge_mutable_list1(g)
+# define scavenge_mutable_list(bd,g) scavenge_mutable_list1(bd,g)
+# define scavenge_capability_mut_lists(cap) scavenge_capability_mut_Lists1(cap)
  #endif
  
  /* -----------------------------------------------------------------------------
@@ -67,6 +69,8 @@ scavengeTSO (StgTSO *tso)
          return;
      }
  
+    debugTrace(DEBUG_gc,"scavenging thread %d",tso->id);
+
      saved_eager = gct->eager_promotion;
      gct->eager_promotion = rtsFalse;
  
@@ -692,7 +696,7 @@ scavenge_block (bdescr *bd)
      if (gct->failed_to_evac) {
         gct->failed_to_evac = rtsFalse;
         if (bd->gen_no > 0) {
-           recordMutableGen_GC((StgClosure *)q, &generations[bd->gen_no]);
+           recordMutableGen_GC((StgClosure *)q, bd->gen_no);
         }
      }
    }
@@ -1047,7 +1051,7 @@ linear_scan:
         if (gct->failed_to_evac) {
             gct->failed_to_evac = rtsFalse;
             if (gct->evac_step) {
-               recordMutableGen_GC((StgClosure *)q, gct->evac_step->gen);
+               recordMutableGen_GC((StgClosure *)q, gct->evac_step->gen_no);
             }
         }
         
@@ -1419,13 +1423,10 @@ scavenge_one(StgPtr p)
     -------------------------------------------------------------------------- */
  
  void
-scavenge_mutable_list(generation *gen)
+scavenge_mutable_list(bdescr *bd, generation *gen)
  {
-    bdescr *bd;
      StgPtr p, q;
  
-    bd = gen->saved_mut_list;
-
      gct->evac_step = &gen->steps[0];
      for (; bd != NULL; bd = bd->link) {
         for (q = bd->start; q < bd->free; q++) {
@@ -1456,12 +1457,12 @@ scavenge_mutable_list(generation *gen)
             // definitely doesn't point into a young generation.
             // Clean objects don't need to be scavenged.  Some clean
             // objects (MUT_VAR_CLEAN) are not kept on the mutable
-           // list at all; others, such as MUT_ARR_PTRS_CLEAN and
-           // TSO, are always on the mutable list.
+           // list at all; others, such as MUT_ARR_PTRS_CLEAN
+           // are always on the mutable list.
             //
             switch (get_itbl((StgClosure *)p)->type) {
             case MUT_ARR_PTRS_CLEAN:
-               recordMutableGen_GC((StgClosure *)p,gen);
+               recordMutableGen_GC((StgClosure *)p,gen->no);
                 continue;
             case TSO: {
                 StgTSO *tso = (StgTSO *)p;
@@ -1472,7 +1473,7 @@ scavenge_mutable_list(generation *gen)
  
                      scavenge_TSO_link(tso);
                      if (gct->failed_to_evac) {
-                        recordMutableGen_GC((StgClosure *)p,gen);
+                        recordMutableGen_GC((StgClosure *)p,gen->no);
                          gct->failed_to_evac = rtsFalse;
                      } else {
                          tso->flags &= ~TSO_LINK_DIRTY;
@@ -1487,14 +1488,28 @@ scavenge_mutable_list(generation *gen)
             if (scavenge_one(p)) {
                 // didn't manage to promote everything, so put the
                 // object back on the list.
-               recordMutableGen_GC((StgClosure *)p,gen);
+               recordMutableGen_GC((StgClosure *)p,gen->no);
             }
         }
      }
+}
  
-    // free the old mut_list
-    freeChain_sync(gen->saved_mut_list);
-    gen->saved_mut_list = NULL;
+void
+scavenge_capability_mut_lists (Capability *cap)
+{
+    nat g;
+
+    /* Mutable lists from each generation > N
+     * we want to *scavenge* these roots, not evacuate them: they're not
+     * going to move in this GC.
+     * Also do them in reverse generation order, for the usual reason:
+     * namely to reduce the likelihood of spurious old->new pointers.
+     */
+    for (g = RtsFlags.GcFlags.generations-1; g > N; g--) {
+        scavenge_mutable_list(cap->saved_mut_lists[g], &generations[g]);
+        freeChain_sync(cap->saved_mut_lists[g]);
+        cap->saved_mut_lists[g] = NULL;
+    }
  }
  
  /* -----------------------------------------------------------------------------
@@ -1560,7 +1575,7 @@ scavenge_static(void)
          */
         if (gct->failed_to_evac) {
           gct->failed_to_evac = rtsFalse;
-         recordMutableGen_GC((StgClosure *)p,oldest_gen);
+         recordMutableGen_GC((StgClosure *)p,oldest_gen->no);
         }
         break;
        }
@@ -1834,7 +1849,7 @@ scavenge_large (step_workspace *ws)
         p = bd->start;
         if (scavenge_one(p)) {
             if (ws->step->gen_no > 0) {
-               recordMutableGen_GC((StgClosure *)p, ws->step->gen);
+               recordMutableGen_GC((StgClosure *)p, ws->step->gen_no);
             }
         }
  
diff --git a/rts/sm/Scav.h b/rts/sm/Scav.h

index 244073e..df774cd 100644 (file)
--- a/rts/sm/Scav.h
+++ b/rts/sm/Scav.h
@@ -12,9 +12,11 @@
   * ---------------------------------------------------------------------------*/
  
  void    scavenge_loop (void);
-void    scavenge_mutable_list (generation *g);
+void    scavenge_mutable_list (bdescr *bd, generation *gen);
+void    scavenge_capability_mut_lists (Capability *cap);
  
  #ifdef THREADED_RTS
  void    scavenge_loop1 (void);
-void    scavenge_mutable_list1 (generation *g);
+void    scavenge_mutable_list1 (bdescr *bd, generation *gen);
+void    scavenge_capability_mut_Lists1 (Capability *cap);
  #endif
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c

index 6fa90cf..6f6d591 100644 (file)
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -272,7 +272,6 @@ initStorage( void )
  
  #ifdef THREADED_RTS
    initSpinLock(&gc_alloc_block_sync);
-  initSpinLock(&recordMutableGen_sync);
    whitehole_spin = 0;
  #endif
  
@@ -376,7 +375,7 @@ newCAF(StgClosure* caf)
      * any more and can use it as a STATIC_LINK.
      */
      ((StgIndStatic *)caf)->saved_info = NULL;
-    recordMutableGen(caf, oldest_gen);
+    recordMutableGen(caf, oldest_gen->no);
    }
    
    RELEASE_SM_LOCK;
author	Simon Marlow <marlowsd@gmail.com>
	Mon, 12 Jan 2009 12:10:24 +0000 (12:10 +0000)
committer	Simon Marlow <marlowsd@gmail.com>
	Mon, 12 Jan 2009 12:10:24 +0000 (12:10 +0000)
includes/Storage.h		patch \| blob \| history
rts/Capability.c		patch \| blob \| history
rts/Capability.h		patch \| blob \| history
rts/Stats.c		patch \| blob \| history
rts/Updates.h		patch \| blob \| history
rts/sm/Compact.c		patch \| blob \| history
rts/sm/GC.c		patch \| blob \| history
rts/sm/GCThread.h		patch \| blob \| history
rts/sm/GCUtils.h		patch \| blob \| history
rts/sm/Scav.c		patch \| blob \| history
rts/sm/Scav.h		patch \| blob \| history
rts/sm/Storage.c		patch \| blob \| history