This turns out to be quite vital for parallel programs:
- The way we discover which threads to traverse is by finding
dirty threads via the remembered sets (aka mutable lists).
- A dirty thread will be on the remembered set of the capability
that was running it, and we really want to traverse that thread's
stack using the GC thread for the capability, because it is in
that CPU's cache. If we get this wrong, we get penalised badly by
the memory system.
Previously we had per-capability mutable lists but they were
aggregated before GC and traversed by just one of the GC threads.
This resulted in very poor performance particularly for parallel
programs with deep stacks.
Now we keep per-capability remembered sets throughout GC, which also
removes a lock (recordMutableGen_sync).
12 files changed:
#if defined(THREADED_RTS)
extern Mutex sm_mutex;
extern Mutex atomic_modify_mutvar_mutex;
#if defined(THREADED_RTS)
extern Mutex sm_mutex;
extern Mutex atomic_modify_mutvar_mutex;
-extern SpinLock recordMutableGen_sync;
#endif
#if defined(THREADED_RTS)
#endif
#if defined(THREADED_RTS)
#define ASSERT_SM_LOCK()
#endif
#define ASSERT_SM_LOCK()
#endif
-recordMutableGen(StgClosure *p, generation *gen)
+recordMutableGen(StgClosure *p, nat gen_no)
+ bd = generations[gen_no].mut_list;
if (bd->free >= bd->start + BLOCK_SIZE_W) {
bdescr *new_bd;
new_bd = allocBlock();
new_bd->link = bd;
bd = new_bd;
if (bd->free >= bd->start + BLOCK_SIZE_W) {
bdescr *new_bd;
new_bd = allocBlock();
new_bd->link = bd;
bd = new_bd;
+ generations[gen_no].mut_list = bd;
}
*bd->free++ = (StgWord)p;
}
INLINE_HEADER void
}
*bd->free++ = (StgWord)p;
}
INLINE_HEADER void
-recordMutableGenLock(StgClosure *p, generation *gen)
+recordMutableGenLock(StgClosure *p, nat gen_no)
- recordMutableGen(p,gen);
+ recordMutableGen(p,gen_no);
-extern bdescr *allocBlock_sync(void);
-
-// Version of recordMutableGen() for use in parallel GC. The same as
-// recordMutableGen(), except that we surround it with a spinlock and
-// call the spinlock version of allocBlock().
-INLINE_HEADER void
-recordMutableGen_GC(StgClosure *p, generation *gen)
-{
- bdescr *bd;
-
- ACQUIRE_SPIN_LOCK(&recordMutableGen_sync);
-
- bd = gen->mut_list;
- if (bd->free >= bd->start + BLOCK_SIZE_W) {
- bdescr *new_bd;
- new_bd = allocBlock_sync();
- new_bd->link = bd;
- bd = new_bd;
- gen->mut_list = bd;
- }
- *bd->free++ = (StgWord)p;
-
- RELEASE_SPIN_LOCK(&recordMutableGen_sync);
-}
-
INLINE_HEADER void
recordMutable(StgClosure *p)
{
bdescr *bd;
ASSERT(closure_MUTABLE(p));
bd = Bdescr((P_)p);
INLINE_HEADER void
recordMutable(StgClosure *p)
{
bdescr *bd;
ASSERT(closure_MUTABLE(p));
bd = Bdescr((P_)p);
- if (bd->gen_no > 0) recordMutableGen(p, &RTS_DEREF(generations)[bd->gen_no]);
+ if (bd->gen_no > 0) recordMutableGen(p, bd->gen_no);
+#endif // !IN_STG_CODE
+
/* -----------------------------------------------------------------------------
The CAF table - used to let us revert CAFs in GHCi
-------------------------------------------------------------------------- */
/* -----------------------------------------------------------------------------
The CAF table - used to let us revert CAFs in GHCi
-------------------------------------------------------------------------- */
cap->mut_lists = stgMallocBytes(sizeof(bdescr *) *
RtsFlags.GcFlags.generations,
"initCapability");
cap->mut_lists = stgMallocBytes(sizeof(bdescr *) *
RtsFlags.GcFlags.generations,
"initCapability");
+ cap->saved_mut_lists = stgMallocBytes(sizeof(bdescr *) *
+ RtsFlags.GcFlags.generations,
+ "initCapability");
for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
cap->mut_lists[g] = NULL;
for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
cap->mut_lists[g] = NULL;
Task *suspended_ccalling_tasks;
// One mutable list per generation, so we don't need to take any
Task *suspended_ccalling_tasks;
// One mutable list per generation, so we don't need to take any
- // locks when updating an old-generation thunk. These
- // mini-mut-lists are moved onto the respective gen->mut_list at
- // each GC.
+ // locks when updating an old-generation thunk. This also lets us
+ // keep track of which closures this CPU has been mutating, so we
+ // can traverse them using the right thread during GC and avoid
+ // unnecessarily moving the data from one cache to another.
+ bdescr **saved_mut_lists; // tmp use during GC
// Context switch flag. We used to have one global flag, now one
// per capability. Locks required : none (conflicts are harmless)
// Context switch flag. We used to have one global flag, now one
// per capability. Locks required : none (conflicts are harmless)
- statsPrintf("recordMutableGen_sync: %"FMT_Word64"\n", recordMutableGen_sync.spin);
statsPrintf("gc_alloc_block_sync: %"FMT_Word64"\n", gc_alloc_block_sync.spin);
statsPrintf("whitehole_spin: %"FMT_Word64"\n", whitehole_spin);
for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
statsPrintf("gc_alloc_block_sync: %"FMT_Word64"\n", gc_alloc_block_sync.spin);
statsPrintf("whitehole_spin: %"FMT_Word64"\n", whitehole_spin);
for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
write_barrier(); \
bd = Bdescr((P_)p1); \
if (bd->gen_no != 0) { \
write_barrier(); \
bd = Bdescr((P_)p1); \
if (bd->gen_no != 0) { \
- recordMutableGenLock(p1, &generations[bd->gen_no]); \
+ recordMutableGenLock(p1, bd->gen_no); \
SET_INFO(p1, &stg_IND_OLDGEN_info); \
TICK_UPD_OLD_IND(); \
and_then; \
SET_INFO(p1, &stg_IND_OLDGEN_info); \
TICK_UPD_OLD_IND(); \
and_then; \
for (g = 1; g < RtsFlags.GcFlags.generations; g++) {
bdescr *bd;
StgPtr p;
for (g = 1; g < RtsFlags.GcFlags.generations; g++) {
bdescr *bd;
StgPtr p;
for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) {
for (p = bd->start; p < bd->free; p++) {
thread((StgClosure **)p);
}
}
for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) {
for (p = bd->start; p < bd->free; p++) {
thread((StgClosure **)p);
}
}
+ for (n = 0; n < n_capabilities; n++) {
+ for (bd = capabilities[n].mut_lists[g];
+ bd != NULL; bd = bd->link) {
+ for (p = bd->start; p < bd->free; p++) {
+ thread((StgClosure **)p);
+ }
+ }
+ }
}
// the global thread list
}
// the global thread list
// For stats:
long copied; // *words* copied & scavenged during this GC
// For stats:
long copied; // *words* copied & scavenged during this GC
-#ifdef THREADED_RTS
-SpinLock recordMutableGen_sync;
-#endif
-
DECLARE_GCT
/* -----------------------------------------------------------------------------
DECLARE_GCT
/* -----------------------------------------------------------------------------
/* -----------------------------------------------------------------------
* follow all the roots that we know about:
/* -----------------------------------------------------------------------
* follow all the roots that we know about:
- * - mutable lists from each generation > N
- * we want to *scavenge* these roots, not evacuate them: they're not
- * going to move in this GC.
- * Also do them in reverse generation order, for the usual reason:
- * namely to reduce the likelihood of spurious old->new pointers.
- for (g = RtsFlags.GcFlags.generations-1; g > N; g--) {
- generations[g].saved_mut_list = generations[g].mut_list;
- generations[g].mut_list = allocBlock();
- // mut_list always has at least one block.
- }
// the main thread is running: this prevents any other threads from
// exiting prematurely, so we can start them now.
// the main thread is running: this prevents any other threads from
// exiting prematurely, so we can start them now.
inc_running();
wakeup_gc_threads(n_gc_threads, gct->thread_index);
inc_running();
wakeup_gc_threads(n_gc_threads, gct->thread_index);
+ // Mutable lists from each generation > N
+ // we want to *scavenge* these roots, not evacuate them: they're not
+ // going to move in this GC.
+ // Also do them in reverse generation order, for the usual reason:
+ // namely to reduce the likelihood of spurious old->new pointers.
+ //
for (g = RtsFlags.GcFlags.generations-1; g > N; g--) {
for (g = RtsFlags.GcFlags.generations-1; g > N; g--) {
- scavenge_mutable_list(&generations[g]);
+ scavenge_mutable_list(generations[g].saved_mut_list, &generations[g]);
+ freeChain_sync(generations[g].saved_mut_list);
+ generations[g].saved_mut_list = NULL;
+
+ }
+
+ // scavenge the capability-private mutable lists. This isn't part
+ // of markSomeCapabilities() because markSomeCapabilities() can only
+ // call back into the GC via mark_root() (due to the gct register
+ // variable).
+ if (n_gc_threads == 1) {
+ for (n = 0; n < n_capabilities; n++) {
+ scavenge_capability_mut_lists(&capabilities[n]);
+ }
+ } else {
+ scavenge_capability_mut_lists(&capabilities[gct->thread_index]);
}
// follow roots from the CAF list (used by GHCi)
}
// follow roots from the CAF list (used by GHCi)
for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) {
mut_list_size += bd->free - bd->start;
}
for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) {
mut_list_size += bd->free - bd->start;
}
+ for (n = 0; n < n_capabilities; n++) {
+ for (bd = capabilities[n].mut_lists[g];
+ bd != NULL; bd = bd->link) {
+ mut_list_size += bd->free - bd->start;
+ }
+ }
copied += mut_list_size;
debugTrace(DEBUG_gc,
copied += mut_list_size;
debugTrace(DEBUG_gc,
gct->evac_step = 0;
markSomeCapabilities(mark_root, gct, gct->thread_index, n_gc_threads,
rtsTrue/*prune sparks*/);
gct->evac_step = 0;
markSomeCapabilities(mark_root, gct, gct->thread_index, n_gc_threads,
rtsTrue/*prune sparks*/);
+ scavenge_capability_mut_lists(&capabilities[gct->thread_index]);
scavenge_until_all_done();
scavenge_until_all_done();
static void
init_uncollected_gen (nat g, nat threads)
{
static void
init_uncollected_gen (nat g, nat threads)
{
step_workspace *ws;
step *stp;
bdescr *bd;
step_workspace *ws;
step *stp;
bdescr *bd;
+ // save the current mutable lists for this generation, and
+ // allocate a fresh block for each one. We'll traverse these
+ // mutable lists as roots early on in the GC.
+ generations[g].saved_mut_list = generations[g].mut_list;
+ generations[g].mut_list = allocBlock();
+ for (n = 0; n < n_capabilities; n++) {
+ capabilities[n].saved_mut_lists[g] = capabilities[n].mut_lists[g];
+ capabilities[n].mut_lists[g] = allocBlock();
+ }
+
for (s = 0; s < generations[g].n_steps; s++) {
stp = &generations[g].steps[s];
stp->scavenged_large_objects = NULL;
for (s = 0; s < generations[g].n_steps; s++) {
stp = &generations[g].steps[s];
stp->scavenged_large_objects = NULL;
if (t == n_gc_threads) t = 0;
}
}
if (t == n_gc_threads) t = 0;
}
}
-
-
- // Move the private mutable lists from each capability onto the
- // main mutable list for the generation.
- for (i = 0; i < n_capabilities; i++) {
- for (bd = capabilities[i].mut_lists[g];
- bd->link != NULL; bd = bd->link) {
- /* nothing */
- }
- bd->link = generations[g].mut_list;
- generations[g].mut_list = capabilities[i].mut_lists[g];
- capabilities[i].mut_lists[g] = allocBlock();
- }
}
/* -----------------------------------------------------------------------------
}
/* -----------------------------------------------------------------------------
t->static_objects = END_OF_STATIC_LIST;
t->scavenged_static_objects = END_OF_STATIC_LIST;
t->scan_bd = NULL;
t->static_objects = END_OF_STATIC_LIST;
t->scavenged_static_objects = END_OF_STATIC_LIST;
t->scan_bd = NULL;
+ t->mut_lists = capabilities[t->thread_index].mut_lists;
t->evac_step = 0;
t->failed_to_evac = rtsFalse;
t->eager_promotion = rtsTrue;
t->evac_step = 0;
t->failed_to_evac = rtsFalse;
t->eager_promotion = rtsTrue;
// block that is currently being scanned
bdescr * scan_bd;
// block that is currently being scanned
bdescr * scan_bd;
+ // Remembered sets on this CPU. Each GC thread has its own
+ // private per-generation remembered sets, so it can add an item
+ // to the remembered set without taking a lock. The mut_lists
+ // array on a gc_thread is the same as the one on the
+ // corresponding Capability; we stash it here too for easy access
+ // during GC; see recordMutableGen_GC().
+ bdescr ** mut_lists;
+
// --------------------
// evacuate flags
// --------------------
// evacuate flags
#if DEBUG
void printMutableList (generation *gen);
#endif
#if DEBUG
void printMutableList (generation *gen);
#endif
+
+// Version of recordMutableGen for use during GC. This uses the
+// mutable lists attached to the current gc_thread structure, which
+// are the same as the mutable lists on the Capability.
+INLINE_HEADER void
+recordMutableGen_GC (StgClosure *p, nat gen_no)
+{
+ bdescr *bd;
+
+ bd = gct->mut_lists[gen_no];
+ if (bd->free >= bd->start + BLOCK_SIZE_W) {
+ bdescr *new_bd;
+ new_bd = allocBlock_sync();
+ new_bd->link = bd;
+ bd = new_bd;
+ gct->mut_lists[gen_no] = bd;
+ }
+ *bd->free++ = (StgWord)p;
+}
#include "Trace.h"
#include "LdvProfile.h"
#include "Sanity.h"
#include "Trace.h"
#include "LdvProfile.h"
#include "Sanity.h"
static void scavenge_stack (StgPtr p, StgPtr stack_end);
static void scavenge_stack (StgPtr p, StgPtr stack_end);
# define evacuate(a) evacuate1(a)
# define recordMutableGen_GC(a,b) recordMutableGen(a,b)
# define scavenge_loop(a) scavenge_loop1(a)
# define evacuate(a) evacuate1(a)
# define recordMutableGen_GC(a,b) recordMutableGen(a,b)
# define scavenge_loop(a) scavenge_loop1(a)
-# define scavenge_mutable_list(g) scavenge_mutable_list1(g)
+# define scavenge_mutable_list(bd,g) scavenge_mutable_list1(bd,g)
+# define scavenge_capability_mut_lists(cap) scavenge_capability_mut_Lists1(cap)
#endif
/* -----------------------------------------------------------------------------
#endif
/* -----------------------------------------------------------------------------
+ debugTrace(DEBUG_gc,"scavenging thread %d",tso->id);
+
saved_eager = gct->eager_promotion;
gct->eager_promotion = rtsFalse;
saved_eager = gct->eager_promotion;
gct->eager_promotion = rtsFalse;
if (gct->failed_to_evac) {
gct->failed_to_evac = rtsFalse;
if (bd->gen_no > 0) {
if (gct->failed_to_evac) {
gct->failed_to_evac = rtsFalse;
if (bd->gen_no > 0) {
- recordMutableGen_GC((StgClosure *)q, &generations[bd->gen_no]);
+ recordMutableGen_GC((StgClosure *)q, bd->gen_no);
if (gct->failed_to_evac) {
gct->failed_to_evac = rtsFalse;
if (gct->evac_step) {
if (gct->failed_to_evac) {
gct->failed_to_evac = rtsFalse;
if (gct->evac_step) {
- recordMutableGen_GC((StgClosure *)q, gct->evac_step->gen);
+ recordMutableGen_GC((StgClosure *)q, gct->evac_step->gen_no);
-------------------------------------------------------------------------- */
void
-------------------------------------------------------------------------- */
void
-scavenge_mutable_list(generation *gen)
+scavenge_mutable_list(bdescr *bd, generation *gen)
- bd = gen->saved_mut_list;
-
gct->evac_step = &gen->steps[0];
for (; bd != NULL; bd = bd->link) {
for (q = bd->start; q < bd->free; q++) {
gct->evac_step = &gen->steps[0];
for (; bd != NULL; bd = bd->link) {
for (q = bd->start; q < bd->free; q++) {
// definitely doesn't point into a young generation.
// Clean objects don't need to be scavenged. Some clean
// objects (MUT_VAR_CLEAN) are not kept on the mutable
// definitely doesn't point into a young generation.
// Clean objects don't need to be scavenged. Some clean
// objects (MUT_VAR_CLEAN) are not kept on the mutable
- // list at all; others, such as MUT_ARR_PTRS_CLEAN and
- // TSO, are always on the mutable list.
+ // list at all; others, such as MUT_ARR_PTRS_CLEAN
+ // are always on the mutable list.
//
switch (get_itbl((StgClosure *)p)->type) {
case MUT_ARR_PTRS_CLEAN:
//
switch (get_itbl((StgClosure *)p)->type) {
case MUT_ARR_PTRS_CLEAN:
- recordMutableGen_GC((StgClosure *)p,gen);
+ recordMutableGen_GC((StgClosure *)p,gen->no);
continue;
case TSO: {
StgTSO *tso = (StgTSO *)p;
continue;
case TSO: {
StgTSO *tso = (StgTSO *)p;
scavenge_TSO_link(tso);
if (gct->failed_to_evac) {
scavenge_TSO_link(tso);
if (gct->failed_to_evac) {
- recordMutableGen_GC((StgClosure *)p,gen);
+ recordMutableGen_GC((StgClosure *)p,gen->no);
gct->failed_to_evac = rtsFalse;
} else {
tso->flags &= ~TSO_LINK_DIRTY;
gct->failed_to_evac = rtsFalse;
} else {
tso->flags &= ~TSO_LINK_DIRTY;
if (scavenge_one(p)) {
// didn't manage to promote everything, so put the
// object back on the list.
if (scavenge_one(p)) {
// didn't manage to promote everything, so put the
// object back on the list.
- recordMutableGen_GC((StgClosure *)p,gen);
+ recordMutableGen_GC((StgClosure *)p,gen->no);
- // free the old mut_list
- freeChain_sync(gen->saved_mut_list);
- gen->saved_mut_list = NULL;
+void
+scavenge_capability_mut_lists (Capability *cap)
+{
+ nat g;
+
+ /* Mutable lists from each generation > N
+ * we want to *scavenge* these roots, not evacuate them: they're not
+ * going to move in this GC.
+ * Also do them in reverse generation order, for the usual reason:
+ * namely to reduce the likelihood of spurious old->new pointers.
+ */
+ for (g = RtsFlags.GcFlags.generations-1; g > N; g--) {
+ scavenge_mutable_list(cap->saved_mut_lists[g], &generations[g]);
+ freeChain_sync(cap->saved_mut_lists[g]);
+ cap->saved_mut_lists[g] = NULL;
+ }
}
/* -----------------------------------------------------------------------------
}
/* -----------------------------------------------------------------------------
*/
if (gct->failed_to_evac) {
gct->failed_to_evac = rtsFalse;
*/
if (gct->failed_to_evac) {
gct->failed_to_evac = rtsFalse;
- recordMutableGen_GC((StgClosure *)p,oldest_gen);
+ recordMutableGen_GC((StgClosure *)p,oldest_gen->no);
p = bd->start;
if (scavenge_one(p)) {
if (ws->step->gen_no > 0) {
p = bd->start;
if (scavenge_one(p)) {
if (ws->step->gen_no > 0) {
- recordMutableGen_GC((StgClosure *)p, ws->step->gen);
+ recordMutableGen_GC((StgClosure *)p, ws->step->gen_no);
* ---------------------------------------------------------------------------*/
void scavenge_loop (void);
* ---------------------------------------------------------------------------*/
void scavenge_loop (void);
-void scavenge_mutable_list (generation *g);
+void scavenge_mutable_list (bdescr *bd, generation *gen);
+void scavenge_capability_mut_lists (Capability *cap);
#ifdef THREADED_RTS
void scavenge_loop1 (void);
#ifdef THREADED_RTS
void scavenge_loop1 (void);
-void scavenge_mutable_list1 (generation *g);
+void scavenge_mutable_list1 (bdescr *bd, generation *gen);
+void scavenge_capability_mut_Lists1 (Capability *cap);
#ifdef THREADED_RTS
initSpinLock(&gc_alloc_block_sync);
#ifdef THREADED_RTS
initSpinLock(&gc_alloc_block_sync);
- initSpinLock(&recordMutableGen_sync);
whitehole_spin = 0;
#endif
whitehole_spin = 0;
#endif
* any more and can use it as a STATIC_LINK.
*/
((StgIndStatic *)caf)->saved_info = NULL;
* any more and can use it as a STATIC_LINK.
*/
((StgIndStatic *)caf)->saved_info = NULL;
- recordMutableGen(caf, oldest_gen);
+ recordMutableGen(caf, oldest_gen->no);