X-Git-Url: http://git.megacz.com/?p=ghc-hetmet.git;a=blobdiff_plain;f=rts%2Fsm%2FGC.c;h=4d63724ba01009b744db70397be869d17d3edd98;hp=fe26cf9e531849f0acf54d9dbd46f7dd8429a0a7;hb=5d52d9b64c21dcf77849866584744722f8121389;hpb=15b6daa6d8e35e34ad077049c1f76d9f5966be9a diff --git a/rts/sm/GC.c b/rts/sm/GC.c index fe26cf9..4d63724 100644 --- a/rts/sm/GC.c +++ b/rts/sm/GC.c @@ -1,6 +1,6 @@ /* ----------------------------------------------------------------------------- * - * (c) The GHC Team 1998-2006 + * (c) The GHC Team 1998-2008 * * Generational garbage collector * @@ -13,44 +13,44 @@ #include "PosixSource.h" #include "Rts.h" -#include "RtsFlags.h" +#include "HsFFI.h" + +#include "Storage.h" #include "RtsUtils.h" #include "Apply.h" -#include "OSThreads.h" -#include "LdvProfile.h" #include "Updates.h" #include "Stats.h" #include "Schedule.h" #include "Sanity.h" #include "BlockAlloc.h" -#include "MBlock.h" #include "ProfHeap.h" -#include "SchedAPI.h" #include "Weak.h" #include "Prelude.h" -#include "ParTicky.h" // ToDo: move into Rts.h #include "RtsSignals.h" #include "STM.h" -#include "HsFFI.h" -#include "Linker.h" #if defined(RTS_GTK_FRONTPANEL) #include "FrontPanel.h" #endif #include "Trace.h" #include "RetainerProfile.h" +#include "LdvProfile.h" #include "RaiseAsync.h" -#include "Sparks.h" #include "Papi.h" +#include "Stable.h" #include "GC.h" +#include "GCThread.h" #include "Compact.h" #include "Evac.h" #include "Scav.h" #include "GCUtils.h" +#include "MarkStack.h" #include "MarkWeak.h" #include "Sparks.h" +#include "Sweep.h" #include // for memset() +#include /* ----------------------------------------------------------------------------- Global variables @@ -90,11 +90,6 @@ * We build up a static object list while collecting generations 0..N, * which is then appended to the static object list of generation N+1. */ -StgClosure* static_objects; // live static objects -StgClosure* scavenged_static_objects; // static objects scavenged so far -#ifdef THREADED_RTS -SpinLock static_objects_sync; -#endif /* N is the oldest generation being collected, where the generations * are numbered starting at 0. A major GC (indicated by the major_gc @@ -106,7 +101,7 @@ rtsBool major_gc; /* Data used for allocation area sizing. */ -static lnat g0s0_pcnt_kept = 30; // percentage of g0s0 live at last minor GC +static lnat g0_pcnt_kept = 30; // percentage of g0 live at last minor GC /* Mut-list stats */ #ifdef DEBUG @@ -118,8 +113,11 @@ nat mutlist_MUTVARS, /* Thread-local data for each GC thread */ -gc_thread *gc_threads = NULL; -// gc_thread *gct = NULL; // this thread's gct TODO: make thread-local +gc_thread **gc_threads = NULL; + +#if !defined(THREADED_RTS) +StgWord8 the_gc_thread[sizeof(gc_thread) + 64 * sizeof(gen_workspace)]; +#endif // Number of threads running in *this* GC. Affects how many // step->todos[] lists we have to look in to find work. @@ -128,50 +126,40 @@ nat n_gc_threads; // For stats: long copied; // *words* copied & scavenged during this GC -#ifdef THREADED_RTS -SpinLock recordMutableGen_sync; -#endif +rtsBool work_stealing; + +DECLARE_GCT /* ----------------------------------------------------------------------------- Static function declarations -------------------------------------------------------------------------- */ -static void mark_root (StgClosure **root); +static void mark_root (void *user, StgClosure **root); static void zero_static_object_list (StgClosure* first_static); -static void initialise_N (rtsBool force_major_gc); -static void alloc_gc_threads (void); +static nat initialise_N (rtsBool force_major_gc); static void init_collected_gen (nat g, nat threads); static void init_uncollected_gen (nat g, nat threads); static void init_gc_thread (gc_thread *t); -static void update_task_list (void); static void resize_generations (void); static void resize_nursery (void); static void start_gc_threads (void); -static void gc_thread_work (void); -static nat inc_running (void); -static nat dec_running (void); -static void wakeup_gc_threads (nat n_threads); +static void scavenge_until_all_done (void); +static StgWord inc_running (void); +static StgWord dec_running (void); +static void wakeup_gc_threads (nat n_threads, nat me); +static void shutdown_gc_threads (nat n_threads, nat me); #if 0 && defined(DEBUG) static void gcCAFs (void); #endif /* ----------------------------------------------------------------------------- - The mark bitmap & stack. + The mark stack. -------------------------------------------------------------------------- */ -#define MARK_STACK_BLOCKS 4 - -bdescr *mark_stack_bdescr; -StgPtr *mark_stack; -StgPtr *mark_sp; -StgPtr *mark_splim; - -// Flag and pointers used for falling back to a linear scan when the -// mark stack overflows. -rtsBool mark_stack_overflowed; -bdescr *oldgen_scan_bd; -StgPtr oldgen_scan; +bdescr *mark_stack_top_bd; // topmost block in the mark stack +bdescr *mark_stack_bd; // current block in the mark stack +StgPtr mark_sp; // pointer to the next unallocated mark stack entry /* ----------------------------------------------------------------------------- GarbageCollect: the main entry point to the garbage collector. @@ -180,14 +168,15 @@ StgPtr oldgen_scan; -------------------------------------------------------------------------- */ void -GarbageCollect ( rtsBool force_major_gc ) +GarbageCollect (rtsBool force_major_gc, + nat gc_type USED_IF_THREADS, + Capability *cap) { bdescr *bd; - step *stp; - lnat live, allocated; - lnat oldgen_saved_blocks = 0; + generation *gen; + lnat live, allocated, max_copied, avg_copied, slop; gc_thread *saved_gct; - nat g, s, t; + nat g, t, n; // necessary if we stole a callee-saves register for gct: saved_gct = gct; @@ -198,8 +187,6 @@ GarbageCollect ( rtsBool force_major_gc ) ACQUIRE_SM_LOCK; - debugTrace(DEBUG_gc, "starting GC"); - #if defined(RTS_USER_SIGNALS) if (RtsFlags.MiscFlags.install_signal_handlers) { // block signals @@ -207,16 +194,17 @@ GarbageCollect ( rtsBool force_major_gc ) } #endif + ASSERT(sizeof(gen_workspace) == 16 * sizeof(StgWord)); + // otherwise adjust the padding in gen_workspace. + // tell the stats department that we've started a GC stat_startGC(); // tell the STM to discard any cached closures it's hoping to re-use stmPreGCHook(); -#ifdef DEBUG - // check for memory leaks if DEBUG is on - memInventory(); -#endif + // lock the StablePtr table + stablePtrPreGC(); #ifdef DEBUG mutlist_MUTVARS = 0; @@ -237,48 +225,61 @@ GarbageCollect ( rtsBool force_major_gc ) /* Figure out which generation to collect */ - initialise_N(force_major_gc); + n = initialise_N(force_major_gc); - /* Allocate + initialise the gc_thread structures. - */ - alloc_gc_threads(); +#if defined(THREADED_RTS) + work_stealing = RtsFlags.ParFlags.parGcLoadBalancingEnabled && + N >= RtsFlags.ParFlags.parGcLoadBalancingGen; + // It's not always a good idea to do load balancing in parallel + // GC. In particular, for a parallel program we don't want to + // lose locality by moving cached data into another CPU's cache + // (this effect can be quite significant). + // + // We could have a more complex way to deterimine whether to do + // work stealing or not, e.g. it might be a good idea to do it + // if the heap is big. For now, we just turn it on or off with + // a flag. +#endif /* Start threads, so they can be spinning up while we finish initialisation. */ start_gc_threads(); +#if defined(THREADED_RTS) /* How many threads will be participating in this GC? - * We don't try to parallelise minor GC. + * We don't try to parallelise minor GCs (unless the user asks for + * it with +RTS -gn0), or mark/compact/sweep GC. */ -#if defined(THREADED_RTS) - if (N == 0) { - n_gc_threads = 1; + if (gc_type == PENDING_GC_PAR) { + n_gc_threads = RtsFlags.ParFlags.nNodes; } else { - n_gc_threads = RtsFlags.ParFlags.gcThreads; + n_gc_threads = 1; } #else n_gc_threads = 1; #endif + debugTrace(DEBUG_gc, "GC (gen %d): %d KB to collect, %ld MB in use, using %d thread(s)", + N, n * (BLOCK_SIZE / 1024), mblocks_allocated, n_gc_threads); + #ifdef RTS_GTK_FRONTPANEL if (RtsFlags.GcFlags.frontpanel) { updateFrontPanelBeforeGC(N); } #endif - // check stack sanity *before* GC (ToDo: check all threads) - IF_DEBUG(sanity, checkFreeListSanity()); +#ifdef DEBUG + // check for memory leaks if DEBUG is on + memInventory(DEBUG_gc); +#endif - /* Initialise the static object lists - */ - static_objects = END_OF_STATIC_LIST; - scavenged_static_objects = END_OF_STATIC_LIST; + // check sanity *before* GC + IF_DEBUG(sanity, checkSanity(rtsTrue)); -#ifdef THREADED_RTS - initSpinLock(&static_objects_sync); - initSpinLock(&recordMutableGen_sync); - initSpinLock(&gc_alloc_block_sync); -#endif + // Initialise all our gc_thread structures + for (t = 0; t < n_gc_threads; t++) { + init_gc_thread(gc_threads[t]); + } // Initialise all the generations/steps that we're collecting. for (g = 0; g <= N; g++) { @@ -292,61 +293,89 @@ GarbageCollect ( rtsBool force_major_gc ) /* Allocate a mark stack if we're doing a major collection. */ - if (major_gc) { - mark_stack_bdescr = allocGroup(MARK_STACK_BLOCKS); - mark_stack = (StgPtr *)mark_stack_bdescr->start; - mark_sp = mark_stack; - mark_splim = mark_stack + (MARK_STACK_BLOCKS * BLOCK_SIZE_W); + if (major_gc && oldest_gen->mark) { + mark_stack_bd = allocBlock(); + mark_stack_top_bd = mark_stack_bd; + mark_stack_bd->link = NULL; + mark_stack_bd->u.back = NULL; + mark_sp = mark_stack_bd->start; } else { - mark_stack_bdescr = NULL; + mark_stack_bd = NULL; + mark_stack_top_bd = NULL; + mark_sp = NULL; } - // Initialise all our gc_thread structures - for (t = 0; t < n_gc_threads; t++) { - init_gc_thread(&gc_threads[t]); + // this is the main thread +#ifdef THREADED_RTS + if (n_gc_threads == 1) { + SET_GCT(gc_threads[0]); + } else { + SET_GCT(gc_threads[cap->no]); } +#else +SET_GCT(gc_threads[0]); +#endif + + /* ----------------------------------------------------------------------- + * follow all the roots that we know about: + */ // the main thread is running: this prevents any other threads from // exiting prematurely, so we can start them now. + // NB. do this after the mutable lists have been saved above, otherwise + // the other GC threads will be writing into the old mutable lists. inc_running(); - wakeup_gc_threads(n_gc_threads); - - // Initialise stats - copied = 0; + wakeup_gc_threads(n_gc_threads, gct->thread_index); + + // Mutable lists from each generation > N + // we want to *scavenge* these roots, not evacuate them: they're not + // going to move in this GC. + // Also do them in reverse generation order, for the usual reason: + // namely to reduce the likelihood of spurious old->new pointers. + // + for (g = RtsFlags.GcFlags.generations-1; g > N; g--) { +#if defined(THREADED_RTS) + if (n_gc_threads > 1) { + scavenge_mutable_list(generations[g].saved_mut_list, &generations[g]); + } else { + scavenge_mutable_list1(generations[g].saved_mut_list, &generations[g]); + } +#else + scavenge_mutable_list(generations[g].saved_mut_list, &generations[g]); +#endif + freeChain_sync(generations[g].saved_mut_list); + generations[g].saved_mut_list = NULL; - // this is the main thread - gct = &gc_threads[0]; + } - /* ----------------------------------------------------------------------- - * follow all the roots that we know about: - * - mutable lists from each generation > N - * we want to *scavenge* these roots, not evacuate them: they're not - * going to move in this GC. - * Also do them in reverse generation order, for the usual reason: - * namely to reduce the likelihood of spurious old->new pointers. - */ - { - for (g = RtsFlags.GcFlags.generations-1; g > N; g--) { - generations[g].saved_mut_list = generations[g].mut_list; - generations[g].mut_list = allocBlock(); - // mut_list always has at least one block. - } - for (g = RtsFlags.GcFlags.generations-1; g > N; g--) { - scavenge_mutable_list(&generations[g]); - } + // scavenge the capability-private mutable lists. This isn't part + // of markSomeCapabilities() because markSomeCapabilities() can only + // call back into the GC via mark_root() (due to the gct register + // variable). + if (n_gc_threads == 1) { + for (n = 0; n < n_capabilities; n++) { +#if defined(THREADED_RTS) + scavenge_capability_mut_Lists1(&capabilities[n]); +#else + scavenge_capability_mut_lists(&capabilities[n]); +#endif + } + } else { + scavenge_capability_mut_lists(&capabilities[gct->thread_index]); } // follow roots from the CAF list (used by GHCi) - gct->evac_step = 0; - markCAFs(mark_root); + gct->evac_gen = 0; + markCAFs(mark_root, gct); // follow all the roots that the application knows about. - gct->evac_step = 0; - GetRoots(mark_root); + gct->evac_gen = 0; + markSomeCapabilities(mark_root, gct, gct->thread_index, n_gc_threads, + rtsTrue/*prune sparks*/); #if defined(RTS_USER_SIGNALS) // mark the signal handlers (signals should be already blocked) - markSignalHandlers(mark_root); + markSignalHandlers(mark_root, gct); #endif // Mark the weak pointer list, and prepare to detect dead weak pointers. @@ -354,7 +383,7 @@ GarbageCollect ( rtsBool force_major_gc ) initWeakForGC(); // Mark the stable pointer table. - markStablePtrTable(mark_root); + markStablePtrTable(mark_root, gct); /* ------------------------------------------------------------------------- * Repeatedly scavenge all the areas we know about until there's no @@ -362,17 +391,10 @@ GarbageCollect ( rtsBool force_major_gc ) */ for (;;) { - gc_thread_work(); + scavenge_until_all_done(); // The other threads are now stopped. We might recurse back to // here, but from now on this is the only thread. - // if any blackholes are alive, make the threads that wait on - // them alive too. - if (traverseBlackholeQueue()) { - inc_running(); - continue; - } - // must be last... invariant is that everything is fully // scavenged at this point. if (traverseWeakPtrList()) { // returns rtsTrue if evaced something @@ -384,8 +406,7 @@ GarbageCollect ( rtsBool force_major_gc ) break; } - // Update pointers from the Task list - update_task_list(); + shutdown_gc_threads(n_gc_threads, gct->thread_index); // Now see which stable names are still alive. gcStablePtrTable(); @@ -399,83 +420,127 @@ GarbageCollect ( rtsBool force_major_gc ) #endif // NO MORE EVACUATION AFTER THIS POINT! - // Finally: compaction of the oldest generation. - if (major_gc && oldest_gen->steps[0].is_compacted) { - // save number of blocks for stats - oldgen_saved_blocks = oldest_gen->steps[0].n_old_blocks; - compact(); - } - - IF_DEBUG(sanity, checkGlobalTSOList(rtsFalse)); // Two-space collector: free the old to-space. - // g0s0->old_blocks is the old nursery - // g0s0->blocks is to-space from the previous GC + // g0->old_blocks is the old nursery + // g0->blocks is to-space from the previous GC if (RtsFlags.GcFlags.generations == 1) { - if (g0s0->blocks != NULL) { - freeChain(g0s0->blocks); - g0s0->blocks = NULL; + if (g0->blocks != NULL) { + freeChain(g0->blocks); + g0->blocks = NULL; } } - // For each workspace, in each thread: - // * clear the BF_EVACUATED flag from each copied block - // * move the copied blocks to the step + // For each workspace, in each thread, move the copied blocks to the step { gc_thread *thr; - step_workspace *ws; - bdescr *prev; + gen_workspace *ws; + bdescr *prev, *next; + + for (t = 0; t < n_gc_threads; t++) { + thr = gc_threads[t]; + + for (g = 0; g < RtsFlags.GcFlags.generations; g++) { + ws = &thr->gens[g]; + + // Push the final block + if (ws->todo_bd) { + push_scanned_block(ws->todo_bd, ws); + } + + ASSERT(gct->scan_bd == NULL); + ASSERT(countBlocks(ws->scavd_list) == ws->n_scavd_blocks); + + prev = NULL; + for (bd = ws->scavd_list; bd != NULL; bd = bd->link) { + ws->gen->n_words += bd->free - bd->start; + prev = bd; + } + if (prev != NULL) { + prev->link = ws->gen->blocks; + ws->gen->blocks = ws->scavd_list; + } + ws->gen->n_blocks += ws->n_scavd_blocks; + } + } + // Add all the partial blocks *after* we've added all the full + // blocks. This is so that we can grab the partial blocks back + // again and try to fill them up in the next GC. for (t = 0; t < n_gc_threads; t++) { - thr = &gc_threads[t]; - - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - for (s = 0; s < generations[g].n_steps; s++) { - ws = &thr->steps[g][s]; - if (g==0 && s==0) continue; - - // Not true? - // ASSERT( ws->scan_bd == ws->todo_bd ); - ASSERT( ws->scan_bd ? ws->scan == ws->scan_bd->free : 1 ); - - // Push the final block - if (ws->scan_bd) { push_scan_block(ws->scan_bd, ws); } - - ASSERT(countBlocks(ws->scavd_list) == ws->n_scavd_blocks); - - prev = ws->scavd_list; - for (bd = ws->scavd_list; bd != NULL; bd = bd->link) { - bd->flags &= ~BF_EVACUATED; // now from-space - prev = bd; - } - prev->link = ws->stp->blocks; - ws->stp->blocks = ws->scavd_list; - ws->stp->n_blocks += ws->n_scavd_blocks; - ASSERT(countBlocks(ws->stp->blocks) == ws->stp->n_blocks); - } + thr = gc_threads[t]; + + for (g = 0; g < RtsFlags.GcFlags.generations; g++) { + ws = &thr->gens[g]; + + prev = NULL; + for (bd = ws->part_list; bd != NULL; bd = next) { + next = bd->link; + if (bd->free == bd->start) { + if (prev == NULL) { + ws->part_list = next; + } else { + prev->link = next; + } + freeGroup(bd); + ws->n_part_blocks--; + } else { + ws->gen->n_words += bd->free - bd->start; + prev = bd; + } + } + if (prev != NULL) { + prev->link = ws->gen->blocks; + ws->gen->blocks = ws->part_list; + } + ws->gen->n_blocks += ws->n_part_blocks; + + ASSERT(countBlocks(ws->gen->blocks) == ws->gen->n_blocks); + ASSERT(countOccupied(ws->gen->blocks) == ws->gen->n_words); } } } - // Two-space collector: swap the semi-spaces around. - // Currently: g0s0->old_blocks is the old nursery - // g0s0->blocks is to-space from this GC - // We want these the other way around. - if (RtsFlags.GcFlags.generations == 1) { - bdescr *nursery_blocks = g0s0->old_blocks; - nat n_nursery_blocks = g0s0->n_old_blocks; - g0s0->old_blocks = g0s0->blocks; - g0s0->n_old_blocks = g0s0->n_blocks; - g0s0->blocks = nursery_blocks; - g0s0->n_blocks = n_nursery_blocks; + // Finally: compact or sweep the oldest generation. + if (major_gc && oldest_gen->mark) { + if (oldest_gen->compact) + compact(gct->scavenged_static_objects); + else + sweep(oldest_gen); } /* run through all the generations/steps and tidy up */ + copied = 0; + max_copied = 0; + avg_copied = 0; + { + nat i; + for (i=0; i < n_gc_threads; i++) { + if (n_gc_threads > 1) { + debugTrace(DEBUG_gc,"thread %d:", i); + debugTrace(DEBUG_gc," copied %ld", gc_threads[i]->copied * sizeof(W_)); + debugTrace(DEBUG_gc," scanned %ld", gc_threads[i]->scanned * sizeof(W_)); + debugTrace(DEBUG_gc," any_work %ld", gc_threads[i]->any_work); + debugTrace(DEBUG_gc," no_work %ld", gc_threads[i]->no_work); + debugTrace(DEBUG_gc," scav_find_work %ld", gc_threads[i]->scav_find_work); + } + copied += gc_threads[i]->copied; + max_copied = stg_max(gc_threads[i]->copied, max_copied); + } + if (n_gc_threads == 1) { + max_copied = 0; + avg_copied = 0; + } else { + avg_copied = copied; + } + } + for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - if (g <= N) { + if (g == N) { generations[g].collections++; // for stats + if (n_gc_threads > 1) generations[g].par_collections++; } // Count the mutable list as bytes "copied" for the purposes of @@ -485,6 +550,12 @@ GarbageCollect ( rtsBool force_major_gc ) for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) { mut_list_size += bd->free - bd->start; } + for (n = 0; n < n_capabilities; n++) { + for (bd = capabilities[n].mut_lists[g]; + bd != NULL; bd = bd->link) { + mut_list_size += bd->free - bd->start; + } + } copied += mut_list_size; debugTrace(DEBUG_gc, @@ -493,125 +564,128 @@ GarbageCollect ( rtsBool force_major_gc ) mutlist_MUTVARS, mutlist_MUTARRS, mutlist_MVARS, mutlist_OTHERS); } - for (s = 0; s < generations[g].n_steps; s++) { - bdescr *next; - stp = &generations[g].steps[s]; + bdescr *next, *prev; + gen = &generations[g]; - // for generations we collected... - if (g <= N) { + // for generations we collected... + if (g <= N) { /* free old memory and shift to-space into from-space for all * the collected steps (except the allocation area). These * freed blocks will probaby be quickly recycled. */ - if (!(g == 0 && s == 0 && RtsFlags.GcFlags.generations > 1)) { - if (stp->is_compacted) - { - // for a compacted step, just shift the new to-space - // onto the front of the now-compacted existing blocks. - for (bd = stp->blocks; bd != NULL; bd = bd->link) { - bd->flags &= ~BF_EVACUATED; // now from-space - } - // tack the new blocks on the end of the existing blocks - if (stp->old_blocks != NULL) { - for (bd = stp->old_blocks; bd != NULL; bd = next) { - // NB. this step might not be compacted next - // time, so reset the BF_COMPACTED flags. - // They are set before GC if we're going to - // compact. (search for BF_COMPACTED above). - bd->flags &= ~BF_COMPACTED; - next = bd->link; - if (next == NULL) { - bd->link = stp->blocks; - } - } - stp->blocks = stp->old_blocks; - } - // add the new blocks to the block tally - stp->n_blocks += stp->n_old_blocks; - ASSERT(countBlocks(stp->blocks) == stp->n_blocks); - } - else // not copacted - { - freeChain(stp->old_blocks); - } - stp->old_blocks = NULL; - stp->n_old_blocks = 0; - } - - /* LARGE OBJECTS. The current live large objects are chained on - * scavenged_large, having been moved during garbage - * collection from large_objects. Any objects left on - * large_objects list are therefore dead, so we free them here. - */ - for (bd = stp->large_objects; bd != NULL; bd = next) { - next = bd->link; - freeGroup(bd); - bd = next; - } - - // update the count of blocks used by large objects - for (bd = stp->scavenged_large_objects; bd != NULL; bd = bd->link) { - bd->flags &= ~BF_EVACUATED; - } - stp->large_objects = stp->scavenged_large_objects; - stp->n_large_blocks = stp->n_scavenged_large_blocks; + if (gen->mark) + { + // tack the new blocks on the end of the existing blocks + if (gen->old_blocks != NULL) { + + prev = NULL; + for (bd = gen->old_blocks; bd != NULL; bd = next) { + + next = bd->link; + + if (!(bd->flags & BF_MARKED)) + { + if (prev == NULL) { + gen->old_blocks = next; + } else { + prev->link = next; + } + freeGroup(bd); + gen->n_old_blocks--; + } + else + { + gen->n_words += bd->free - bd->start; + + // NB. this step might not be compacted next + // time, so reset the BF_MARKED flags. + // They are set before GC if we're going to + // compact. (search for BF_MARKED above). + bd->flags &= ~BF_MARKED; + + // between GCs, all blocks in the heap except + // for the nursery have the BF_EVACUATED flag set. + bd->flags |= BF_EVACUATED; + + prev = bd; + } + } + + if (prev != NULL) { + prev->link = gen->blocks; + gen->blocks = gen->old_blocks; + } + } + // add the new blocks to the block tally + gen->n_blocks += gen->n_old_blocks; + ASSERT(countBlocks(gen->blocks) == gen->n_blocks); + ASSERT(countOccupied(gen->blocks) == gen->n_words); + } + else // not copacted + { + freeChain(gen->old_blocks); + } - } - else // for older generations... - { + gen->old_blocks = NULL; + gen->n_old_blocks = 0; + + /* LARGE OBJECTS. The current live large objects are chained on + * scavenged_large, having been moved during garbage + * collection from large_objects. Any objects left on the + * large_objects list are therefore dead, so we free them here. + */ + freeChain(gen->large_objects); + gen->large_objects = gen->scavenged_large_objects; + gen->n_large_blocks = gen->n_scavenged_large_blocks; + gen->n_new_large_blocks = 0; + ASSERT(countBlocks(gen->large_objects) == gen->n_large_blocks); + } + else // for generations > N + { /* For older generations, we need to append the * scavenged_large_object list (i.e. large objects that have been * promoted during this GC) to the large_object list for that step. */ - for (bd = stp->scavenged_large_objects; bd; bd = next) { - next = bd->link; - bd->flags &= ~BF_EVACUATED; - dbl_link_onto(bd, &stp->large_objects); + for (bd = gen->scavenged_large_objects; bd; bd = next) { + next = bd->link; + dbl_link_onto(bd, &gen->large_objects); } - + // add the new blocks we promoted during this GC - stp->n_large_blocks += stp->n_scavenged_large_blocks; - } + gen->n_large_blocks += gen->n_scavenged_large_blocks; + ASSERT(countBlocks(gen->large_objects) == gen->n_large_blocks); } - } + } // for all generations // update the max size of older generations after a major GC resize_generations(); - // Guess the amount of live data for stats. - live = calcLiveBlocks() * BLOCK_SIZE_W; - debugTrace(DEBUG_gc, "Slop: %ldKB", - (live - calcLiveWords()) / (1024/sizeof(W_))); + // Calculate the amount of live data for stats. + live = calcLiveWords(); // Free the small objects allocated via allocate(), since this will // all have been copied into G0S1 now. - if (RtsFlags.GcFlags.generations > 1) { - if (g0s0->blocks != NULL) { - freeChain(g0s0->blocks); - g0s0->blocks = NULL; - } - g0s0->n_blocks = 0; - } - alloc_blocks = 0; alloc_blocks_lim = RtsFlags.GcFlags.minAllocAreaSize; // Start a new pinned_object_block - pinned_object_block = NULL; + for (n = 0; n < n_capabilities; n++) { + capabilities[n].pinned_object_block = NULL; + } // Free the mark stack. - if (mark_stack_bdescr != NULL) { - freeGroup(mark_stack_bdescr); + if (mark_stack_top_bd != NULL) { + debugTrace(DEBUG_gc, "mark stack: %d blocks", + countBlocks(mark_stack_top_bd)); + freeChain(mark_stack_top_bd); } // Free any bitmaps. for (g = 0; g <= N; g++) { - for (s = 0; s < generations[g].n_steps; s++) { - stp = &generations[g].steps[s]; - if (stp->bitmap != NULL) { - freeGroup(stp->bitmap); - stp->bitmap = NULL; - } + gen = &generations[g]; + if (gen->bitmap != NULL) { + freeGroup(gen->bitmap); + gen->bitmap = NULL; } } @@ -625,12 +699,19 @@ GarbageCollect ( rtsBool force_major_gc ) #ifdef PROFILING // resetStaticObjectForRetainerProfiling() must be called before // zeroing below. - resetStaticObjectForRetainerProfiling(); + if (n_gc_threads > 1) { + barf("profiling is currently broken with multi-threaded GC"); + // ToDo: fix the gct->scavenged_static_objects below + } + resetStaticObjectForRetainerProfiling(gct->scavenged_static_objects); #endif // zero the scavenged static object list if (major_gc) { - zero_static_object_list(scavenged_static_objects); + nat i; + for (i = 0; i < n_gc_threads; i++) { + zero_static_object_list(gc_threads[i]->scavenged_static_objects); + } } // Reset the nursery @@ -638,7 +719,7 @@ GarbageCollect ( rtsBool force_major_gc ) // start any pending finalizers RELEASE_SM_LOCK; - scheduleFinalizers(last_free_capability, old_weak_ptr_list); + scheduleFinalizers(cap, old_weak_ptr_list); ACQUIRE_SM_LOCK; // send exceptions to any threads which were about to die @@ -649,8 +730,8 @@ GarbageCollect ( rtsBool force_major_gc ) // Update the stable pointer hash table. updateStablePtrTable(major_gc); - // check sanity after GC - IF_DEBUG(sanity, checkSanity()); + // check sanity after GC + IF_DEBUG(sanity, checkSanity(rtsTrue)); // extra GC trace info IF_DEBUG(gc, statDescribeGens()); @@ -667,7 +748,7 @@ GarbageCollect ( rtsBool force_major_gc ) #ifdef DEBUG // check for memory leaks if DEBUG is on - memInventory(); + memInventory(DEBUG_gc); #endif #ifdef RTS_GTK_FRONTPANEL @@ -677,7 +758,14 @@ GarbageCollect ( rtsBool force_major_gc ) #endif // ok, GC over: tell the stats department what happened. - stat_endGC(allocated, live, copied, N); + slop = calcLiveBlocks() * BLOCK_SIZE_W - live; + stat_endGC(allocated, live, copied, N, max_copied, avg_copied, slop); + + // unlock the StablePtr table + stablePtrPostGC(); + + // Guess which generation we'll collect *next* time + initialise_N(force_major_gc); #if defined(RTS_USER_SIGNALS) if (RtsFlags.MiscFlags.install_signal_handlers) { @@ -688,256 +776,72 @@ GarbageCollect ( rtsBool force_major_gc ) RELEASE_SM_LOCK; - gct = saved_gct; + SET_GCT(saved_gct); } /* ----------------------------------------------------------------------------- - * Mark all nodes pointed to by sparks in the spark queues (for GC) Does an - * implicit slide i.e. after marking all sparks are at the beginning of the - * spark pool and the spark pool only contains sparkable closures - * -------------------------------------------------------------------------- */ - -#ifdef THREADED_RTS -static void -markSparkQueue (evac_fn evac, Capability *cap) -{ - StgClosure **sparkp, **to_sparkp; - nat n, pruned_sparks; // stats only - StgSparkPool *pool; - - PAR_TICKY_MARK_SPARK_QUEUE_START(); - - n = 0; - pruned_sparks = 0; - - pool = &(cap->r.rSparks); - - ASSERT_SPARK_POOL_INVARIANTS(pool); - -#if defined(PARALLEL_HASKELL) - // stats only - n = 0; - pruned_sparks = 0; -#endif - - sparkp = pool->hd; - to_sparkp = pool->hd; - while (sparkp != pool->tl) { - ASSERT(*sparkp!=NULL); - ASSERT(LOOKS_LIKE_CLOSURE_PTR(((StgClosure *)*sparkp))); - // ToDo?: statistics gathering here (also for GUM!) - if (closure_SHOULD_SPARK(*sparkp)) { - evac(sparkp); - *to_sparkp++ = *sparkp; - if (to_sparkp == pool->lim) { - to_sparkp = pool->base; - } - n++; - } else { - pruned_sparks++; - } - sparkp++; - if (sparkp == pool->lim) { - sparkp = pool->base; - } - } - pool->tl = to_sparkp; - - PAR_TICKY_MARK_SPARK_QUEUE_END(n); - -#if defined(PARALLEL_HASKELL) - debugTrace(DEBUG_sched, - "marked %d sparks and pruned %d sparks on [%x]", - n, pruned_sparks, mytid); -#else - debugTrace(DEBUG_sched, - "marked %d sparks and pruned %d sparks", - n, pruned_sparks); -#endif - - debugTrace(DEBUG_sched, - "new spark queue len=%d; (hd=%p; tl=%p)\n", - sparkPoolSize(pool), pool->hd, pool->tl); -} -#endif - -/* --------------------------------------------------------------------------- - Where are the roots that we know about? - - - all the threads on the runnable queue - - all the threads on the blocked queue - - all the threads on the sleeping queue - - all the thread currently executing a _ccall_GC - - all the "main threads" - - ------------------------------------------------------------------------ */ - -void -GetRoots( evac_fn evac ) -{ - nat i; - Capability *cap; - Task *task; - - // Each GC thread is responsible for following roots from the - // Capability of the same number. There will usually be the same - // or fewer Capabilities as GC threads, but just in case there - // are more, we mark every Capability whose number is the GC - // thread's index plus a multiple of the number of GC threads. - for (i = gct->thread_index; i < n_capabilities; i += n_gc_threads) { - cap = &capabilities[i]; - evac((StgClosure **)(void *)&cap->run_queue_hd); - evac((StgClosure **)(void *)&cap->run_queue_tl); -#if defined(THREADED_RTS) - evac((StgClosure **)(void *)&cap->wakeup_queue_hd); - evac((StgClosure **)(void *)&cap->wakeup_queue_tl); -#endif - for (task = cap->suspended_ccalling_tasks; task != NULL; - task=task->next) { - debugTrace(DEBUG_sched, - "evac'ing suspended TSO %lu", (unsigned long)task->suspended_tso->id); - evac((StgClosure **)(void *)&task->suspended_tso); - } - -#if defined(THREADED_RTS) - markSparkQueue(evac,cap); -#endif - } - -#if !defined(THREADED_RTS) - evac((StgClosure **)(void *)&blocked_queue_hd); - evac((StgClosure **)(void *)&blocked_queue_tl); - evac((StgClosure **)(void *)&sleeping_queue); -#endif -} - -/* ----------------------------------------------------------------------------- - isAlive determines whether the given closure is still alive (after - a garbage collection) or not. It returns the new address of the - closure if it is alive, or NULL otherwise. + Figure out which generation to collect, initialise N and major_gc. - NOTE: Use it before compaction only! - It untags and (if needed) retags pointers to closures. + Also returns the total number of blocks in generations that will be + collected. -------------------------------------------------------------------------- */ - -StgClosure * -isAlive(StgClosure *p) +static nat +initialise_N (rtsBool force_major_gc) { - const StgInfoTable *info; - bdescr *bd; - StgWord tag; - StgClosure *q; - - while (1) { - /* The tag and the pointer are split, to be merged later when needed. */ - tag = GET_CLOSURE_TAG(p); - q = UNTAG_CLOSURE(p); - - ASSERT(LOOKS_LIKE_CLOSURE_PTR(q)); - info = get_itbl(q); - - // ignore static closures - // - // ToDo: for static closures, check the static link field. - // Problem here is that we sometimes don't set the link field, eg. - // for static closures with an empty SRT or CONSTR_STATIC_NOCAFs. - // - if (!HEAP_ALLOCED(q)) { - return p; - } + int g; + nat blocks, blocks_total; - // ignore closures in generations that we're not collecting. - bd = Bdescr((P_)q); - if (bd->gen_no > N) { - return p; - } + blocks = 0; + blocks_total = 0; - // if it's a pointer into to-space, then we're done - if (bd->flags & BF_EVACUATED) { - return p; + if (force_major_gc) { + N = RtsFlags.GcFlags.generations - 1; + } else { + N = 0; } - // large objects use the evacuated flag - if (bd->flags & BF_LARGE) { - return NULL; - } + for (g = RtsFlags.GcFlags.generations - 1; g >= 0; g--) { - // check the mark bit for compacted steps - if ((bd->flags & BF_COMPACTED) && is_marked((P_)q,bd)) { - return p; - } + blocks = generations[g].n_words / BLOCK_SIZE_W + + generations[g].n_large_blocks; - switch (info->type) { - - case IND: - case IND_STATIC: - case IND_PERM: - case IND_OLDGEN: // rely on compatible layout with StgInd - case IND_OLDGEN_PERM: - // follow indirections - p = ((StgInd *)q)->indirectee; - continue; - - case EVACUATED: - // alive! - return ((StgEvacuated *)q)->evacuee; - - case TSO: - if (((StgTSO *)q)->what_next == ThreadRelocated) { - p = (StgClosure *)((StgTSO *)q)->link; - continue; - } - return NULL; - - default: - // dead. - return NULL; + if (blocks >= generations[g].max_blocks) { + N = stg_max(N,g); + } + if ((nat)g <= N) { + blocks_total += blocks; + } } - } -} - -/* ----------------------------------------------------------------------------- - Figure out which generation to collect, initialise N and major_gc. - -------------------------------------------------------------------------- */ -static void -initialise_N (rtsBool force_major_gc) -{ - nat g; + blocks_total += countNurseryBlocks(); - if (force_major_gc) { - N = RtsFlags.GcFlags.generations - 1; - major_gc = rtsTrue; - } else { - N = 0; - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - if (generations[g].steps[0].n_blocks + - generations[g].steps[0].n_large_blocks - >= generations[g].max_blocks) { - N = g; - } - } - major_gc = (N == RtsFlags.GcFlags.generations-1); - } + major_gc = (N == RtsFlags.GcFlags.generations-1); + return blocks_total; } /* ----------------------------------------------------------------------------- Initialise the gc_thread structures. -------------------------------------------------------------------------- */ +#define GC_THREAD_INACTIVE 0 +#define GC_THREAD_STANDING_BY 1 +#define GC_THREAD_RUNNING 2 +#define GC_THREAD_WAITING_TO_CONTINUE 3 + static void -alloc_gc_thread (gc_thread *t, int n) +new_gc_thread (nat n, gc_thread *t) { - nat g, s; - step_workspace *ws; + nat g; + gen_workspace *ws; #ifdef THREADED_RTS t->id = 0; - initCondition(&t->wake_cond); - initMutex(&t->wake_mutex); - t->wakeup = rtsFalse; - t->exit = rtsFalse; + initSpinLock(&t->gc_spin); + initSpinLock(&t->mut_spin); + ACQUIRE_SPIN_LOCK(&t->gc_spin); + t->wakeup = GC_THREAD_INACTIVE; // starts true, so we can wait for the + // thread to start up, see wakeup_gc_threads #endif t->thread_index = n; @@ -950,54 +854,76 @@ alloc_gc_thread (gc_thread *t, int n) t->papi_events = -1; #endif - t->steps = stgMallocBytes(RtsFlags.GcFlags.generations * - sizeof(step_workspace *), - "initialise_gc_thread"); - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - t->steps[g] = stgMallocBytes(generations[g].n_steps * - sizeof(step_workspace), - "initialise_gc_thread/2"); - - for (s = 0; s < generations[g].n_steps; s++) - { - ws = &t->steps[g][s]; - ws->stp = &generations[g].steps[s]; - ws->gct = t; - - ws->scan_bd = NULL; - ws->scan = NULL; - - ws->todo_bd = NULL; - ws->buffer_todo_bd = NULL; - - ws->scavd_list = NULL; - ws->n_scavd_blocks = 0; - } + ws = &t->gens[g]; + ws->gen = &generations[g]; + ASSERT(g == ws->gen->no); + ws->my_gct = t; + + ws->todo_bd = NULL; + ws->todo_q = newWSDeque(128); + ws->todo_overflow = NULL; + ws->n_todo_overflow = 0; + + ws->part_list = NULL; + ws->n_part_blocks = 0; + + ws->scavd_list = NULL; + ws->n_scavd_blocks = 0; } } -static void -alloc_gc_threads (void) +void +initGcThreads (void) { if (gc_threads == NULL) { #if defined(THREADED_RTS) nat i; - gc_threads = stgMallocBytes (RtsFlags.ParFlags.gcThreads * - sizeof(gc_thread), + gc_threads = stgMallocBytes (RtsFlags.ParFlags.nNodes * + sizeof(gc_thread*), "alloc_gc_threads"); - for (i = 0; i < RtsFlags.ParFlags.gcThreads; i++) { - alloc_gc_thread(&gc_threads[i], i); + for (i = 0; i < RtsFlags.ParFlags.nNodes; i++) { + gc_threads[i] = + stgMallocBytes(sizeof(gc_thread) + + RtsFlags.GcFlags.generations * sizeof(gen_workspace), + "alloc_gc_threads"); + + new_gc_thread(i, gc_threads[i]); } #else - gc_threads = stgMallocBytes (sizeof(gc_thread), - "alloc_gc_threads"); + gc_threads = stgMallocBytes (sizeof(gc_thread*),"alloc_gc_threads"); + gc_threads[0] = gct; + new_gc_thread(0,gc_threads[0]); +#endif + } +} - alloc_gc_thread(gc_threads, 0); +void +freeGcThreads (void) +{ + nat g; + if (gc_threads != NULL) { +#if defined(THREADED_RTS) + nat i; + for (i = 0; i < n_capabilities; i++) { + for (g = 0; g < RtsFlags.GcFlags.generations; g++) + { + freeWSDeque(gc_threads[i]->gens[g].todo_q); + } + stgFree (gc_threads[i]); + } + stgFree (gc_threads); +#else + for (g = 0; g < RtsFlags.GcFlags.generations; g++) + { + freeWSDeque(gc_threads[0]->gens[g].todo_q); + } + stgFree (gc_threads); #endif + gc_threads = NULL; } } @@ -1005,159 +931,260 @@ alloc_gc_threads (void) Start GC threads ------------------------------------------------------------------------- */ -static nat gc_running_threads; +static volatile StgWord gc_running_threads; -#if defined(THREADED_RTS) -static Mutex gc_running_mutex; -#endif - -static nat +static StgWord inc_running (void) { - nat n_running; - ACQUIRE_LOCK(&gc_running_mutex); - n_running = ++gc_running_threads; - RELEASE_LOCK(&gc_running_mutex); - return n_running; + StgWord new; + new = atomic_inc(&gc_running_threads); + ASSERT(new <= n_gc_threads); + return new; } -static nat +static StgWord dec_running (void) { - nat n_running; - ACQUIRE_LOCK(&gc_running_mutex); - n_running = --gc_running_threads; - RELEASE_LOCK(&gc_running_mutex); - return n_running; + ASSERT(gc_running_threads != 0); + return atomic_dec(&gc_running_threads); } -// -// gc_thread_work(): Scavenge until there's no work left to do and all -// the running threads are idle. -// +static rtsBool +any_work (void) +{ + int g; + gen_workspace *ws; + + gct->any_work++; + + write_barrier(); + + // scavenge objects in compacted generation + if (mark_stack_bd != NULL && !mark_stack_empty()) { + return rtsTrue; + } + + // Check for global work in any step. We don't need to check for + // local work, because we have already exited scavenge_loop(), + // which means there is no local work for this thread. + for (g = 0; g < (int)RtsFlags.GcFlags.generations; g++) { + ws = &gct->gens[g]; + if (ws->todo_large_objects) return rtsTrue; + if (!looksEmptyWSDeque(ws->todo_q)) return rtsTrue; + if (ws->todo_overflow) return rtsTrue; + } + +#if defined(THREADED_RTS) + if (work_stealing) { + nat n; + // look for work to steal + for (n = 0; n < n_gc_threads; n++) { + if (n == gct->thread_index) continue; + for (g = RtsFlags.GcFlags.generations-1; g >= 0; g--) { + ws = &gc_threads[n]->gens[g]; + if (!looksEmptyWSDeque(ws->todo_q)) return rtsTrue; + } + } + } +#endif + + gct->no_work++; +#if defined(THREADED_RTS) + yieldThread(); +#endif + + return rtsFalse; +} + static void -gc_thread_work (void) +scavenge_until_all_done (void) { nat r; - debugTrace(DEBUG_gc, "GC thread %d working", gct->thread_index); - - // gc_running_threads has already been incremented for us; either - // this is the main thread and we incremented it inside - // GarbageCollect(), or this is a worker thread and the main - // thread bumped gc_running_threads before waking us up. - - // Every thread evacuates some roots. - gct->evac_step = 0; - GetRoots(mark_root); loop: + traceEventGcWork(&capabilities[gct->thread_index]); + +#if defined(THREADED_RTS) + if (n_gc_threads > 1) { + scavenge_loop(); + } else { + scavenge_loop1(); + } +#else scavenge_loop(); +#endif + // scavenge_loop() only exits when there's no work to do r = dec_running(); - debugTrace(DEBUG_gc, "GC thread %d idle (%d still running)", - gct->thread_index, r); + traceEventGcIdle(&capabilities[gct->thread_index]); + debugTrace(DEBUG_gc, "%d GC threads still running", r); + while (gc_running_threads != 0) { - if (any_work()) { - inc_running(); - goto loop; - } - // any_work() does not remove the work from the queue, it - // just checks for the presence of work. If we find any, - // then we increment gc_running_threads and go back to - // scavenge_loop() to perform any pending work. + // usleep(1); + if (any_work()) { + inc_running(); + goto loop; + } + // any_work() does not remove the work from the queue, it + // just checks for the presence of work. If we find any, + // then we increment gc_running_threads and go back to + // scavenge_loop() to perform any pending work. } - // All threads are now stopped - debugTrace(DEBUG_gc, "GC thread %d finished.", gct->thread_index); + traceEventGcDone(&capabilities[gct->thread_index]); } - #if defined(THREADED_RTS) -static void -gc_thread_mainloop (void) + +void +gcWorkerThread (Capability *cap) { - while (!gct->exit) { - - // Wait until we're told to wake up - ACQUIRE_LOCK(&gct->wake_mutex); - while (!gct->wakeup) { - debugTrace(DEBUG_gc, "GC thread %d standing by...", - gct->thread_index); - waitCondition(&gct->wake_cond, &gct->wake_mutex); - } - RELEASE_LOCK(&gct->wake_mutex); - gct->wakeup = rtsFalse; - if (gct->exit) break; + gc_thread *saved_gct; + + // necessary if we stole a callee-saves register for gct: + saved_gct = gct; + + gct = gc_threads[cap->no]; + gct->id = osThreadId(); + // Wait until we're told to wake up + RELEASE_SPIN_LOCK(&gct->mut_spin); + gct->wakeup = GC_THREAD_STANDING_BY; + debugTrace(DEBUG_gc, "GC thread %d standing by...", gct->thread_index); + ACQUIRE_SPIN_LOCK(&gct->gc_spin); + #ifdef USE_PAPI - // start performance counters in this thread... - if (gct->papi_events == -1) { - papi_init_eventset(&gct->papi_events); - } - papi_thread_start_gc1_count(gct->papi_events); + // start performance counters in this thread... + if (gct->papi_events == -1) { + papi_init_eventset(&gct->papi_events); + } + papi_thread_start_gc1_count(gct->papi_events); #endif + + // Every thread evacuates some roots. + gct->evac_gen = 0; + markSomeCapabilities(mark_root, gct, gct->thread_index, n_gc_threads, + rtsTrue/*prune sparks*/); + scavenge_capability_mut_lists(&capabilities[gct->thread_index]); - gc_thread_work(); - + scavenge_until_all_done(); + #ifdef USE_PAPI - // count events in this thread towards the GC totals - papi_thread_stop_gc1_count(gct->papi_events); + // count events in this thread towards the GC totals + papi_thread_stop_gc1_count(gct->papi_events); #endif - } -} + + // Wait until we're told to continue + RELEASE_SPIN_LOCK(&gct->gc_spin); + gct->wakeup = GC_THREAD_WAITING_TO_CONTINUE; + debugTrace(DEBUG_gc, "GC thread %d waiting to continue...", + gct->thread_index); + ACQUIRE_SPIN_LOCK(&gct->mut_spin); + debugTrace(DEBUG_gc, "GC thread %d on my way...", gct->thread_index); + + SET_GCT(saved_gct); +} + #endif #if defined(THREADED_RTS) -static void -gc_thread_entry (gc_thread *my_gct) + +void +waitForGcThreads (Capability *cap USED_IF_THREADS) { - gct = my_gct; - debugTrace(DEBUG_gc, "GC thread %d starting...", gct->thread_index); - gct->id = osThreadId(); - gc_thread_mainloop(); + nat n_threads = RtsFlags.ParFlags.nNodes; + nat me = cap->no; + nat i, j; + rtsBool retry = rtsTrue; + + while(retry) { + for (i=0; i < n_threads; i++) { + if (i == me) continue; + if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) { + prodCapability(&capabilities[i], cap->running_task); + } + } + for (j=0; j < 10; j++) { + retry = rtsFalse; + for (i=0; i < n_threads; i++) { + if (i == me) continue; + write_barrier(); + setContextSwitches(); + if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) { + retry = rtsTrue; + } + } + if (!retry) break; + yieldThread(); + } + } } -#endif + +#endif // THREADED_RTS static void start_gc_threads (void) { #if defined(THREADED_RTS) - nat i; - OSThreadId id; - static rtsBool done = rtsFalse; - gc_running_threads = 0; - initMutex(&gc_running_mutex); +#endif +} - if (!done) { - // Start from 1: the main thread is 0 - for (i = 1; i < RtsFlags.ParFlags.gcThreads; i++) { - createOSThread(&id, (OSThreadProc*)&gc_thread_entry, - &gc_threads[i]); - } - done = rtsTrue; +static void +wakeup_gc_threads (nat n_threads USED_IF_THREADS, nat me USED_IF_THREADS) +{ +#if defined(THREADED_RTS) + nat i; + for (i=0; i < n_threads; i++) { + if (i == me) continue; + inc_running(); + debugTrace(DEBUG_gc, "waking up gc thread %d", i); + if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) barf("wakeup_gc_threads"); + + gc_threads[i]->wakeup = GC_THREAD_RUNNING; + ACQUIRE_SPIN_LOCK(&gc_threads[i]->mut_spin); + RELEASE_SPIN_LOCK(&gc_threads[i]->gc_spin); } #endif } +// After GC is complete, we must wait for all GC threads to enter the +// standby state, otherwise they may still be executing inside +// any_work(), and may even remain awake until the next GC starts. static void -wakeup_gc_threads (nat n_threads USED_IF_THREADS) +shutdown_gc_threads (nat n_threads USED_IF_THREADS, nat me USED_IF_THREADS) { #if defined(THREADED_RTS) nat i; - for (i=1; i < n_threads; i++) { - inc_running(); - ACQUIRE_LOCK(&gc_threads[i].wake_mutex); - gc_threads[i].wakeup = rtsTrue; - signalCondition(&gc_threads[i].wake_cond); - RELEASE_LOCK(&gc_threads[i].wake_mutex); + for (i=0; i < n_threads; i++) { + if (i == me) continue; + while (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE) { write_barrier(); } } #endif } +#if defined(THREADED_RTS) +void +releaseGCThreads (Capability *cap USED_IF_THREADS) +{ + nat n_threads = RtsFlags.ParFlags.nNodes; + nat me = cap->no; + nat i; + for (i=0; i < n_threads; i++) { + if (i == me) continue; + if (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE) + barf("releaseGCThreads"); + + gc_threads[i]->wakeup = GC_THREAD_INACTIVE; + ACQUIRE_SPIN_LOCK(&gc_threads[i]->gc_spin); + RELEASE_SPIN_LOCK(&gc_threads[i]->mut_spin); + } +} +#endif + /* ---------------------------------------------------------------------------- Initialise a generation that is to be collected ------------------------------------------------------------------------- */ @@ -1165,9 +1192,9 @@ wakeup_gc_threads (nat n_threads USED_IF_THREADS) static void init_collected_gen (nat g, nat n_threads) { - nat s, t, i; - step_workspace *ws; - step *stp; + nat t, i; + gen_workspace *ws; + generation *gen; bdescr *bd; // Throw away the current mutable list. Invariant: the mutable @@ -1182,97 +1209,96 @@ init_collected_gen (nat g, nat n_threads) } } - for (s = 0; s < generations[g].n_steps; s++) { + gen = &generations[g]; + ASSERT(gen->no == g); + + // we'll construct a new list of threads in this step + // during GC, throw away the current list. + gen->old_threads = gen->threads; + gen->threads = END_TSO_QUEUE; + + // deprecate the existing blocks + gen->old_blocks = gen->blocks; + gen->n_old_blocks = gen->n_blocks; + gen->blocks = NULL; + gen->n_blocks = 0; + gen->n_words = 0; + gen->live_estimate = 0; + + // initialise the large object queues. + gen->scavenged_large_objects = NULL; + gen->n_scavenged_large_blocks = 0; + + // mark the small objects as from-space + for (bd = gen->old_blocks; bd; bd = bd->link) { + bd->flags &= ~BF_EVACUATED; + } + + // mark the large objects as from-space + for (bd = gen->large_objects; bd; bd = bd->link) { + bd->flags &= ~BF_EVACUATED; + } - // generation 0, step 0 doesn't need to-space - if (g == 0 && s == 0 && RtsFlags.GcFlags.generations > 1) { - continue; - } + // for a compacted generation, we need to allocate the bitmap + if (gen->mark) { + nat bitmap_size; // in bytes + bdescr *bitmap_bdescr; + StgWord *bitmap; - stp = &generations[g].steps[s]; - ASSERT(stp->gen_no == g); - - // deprecate the existing blocks - stp->old_blocks = stp->blocks; - stp->n_old_blocks = stp->n_blocks; - stp->blocks = NULL; - stp->n_blocks = 0; - - // we don't have any to-be-scavenged blocks yet - stp->todos = NULL; - stp->n_todos = 0; - - // initialise the large object queues. - stp->scavenged_large_objects = NULL; - stp->n_scavenged_large_blocks = 0; - - // mark the large objects as not evacuated yet - for (bd = stp->large_objects; bd; bd = bd->link) { - bd->flags &= ~BF_EVACUATED; - } - - // for a compacted step, we need to allocate the bitmap - if (stp->is_compacted) { - nat bitmap_size; // in bytes - bdescr *bitmap_bdescr; - StgWord *bitmap; - - bitmap_size = stp->n_old_blocks * BLOCK_SIZE / (sizeof(W_)*BITS_PER_BYTE); - - if (bitmap_size > 0) { - bitmap_bdescr = allocGroup((lnat)BLOCK_ROUND_UP(bitmap_size) - / BLOCK_SIZE); - stp->bitmap = bitmap_bdescr; - bitmap = bitmap_bdescr->start; - - debugTrace(DEBUG_gc, "bitmap_size: %d, bitmap: %p", - bitmap_size, bitmap); - - // don't forget to fill it with zeros! - memset(bitmap, 0, bitmap_size); + bitmap_size = gen->n_old_blocks * BLOCK_SIZE / (sizeof(W_)*BITS_PER_BYTE); + + if (bitmap_size > 0) { + bitmap_bdescr = allocGroup((lnat)BLOCK_ROUND_UP(bitmap_size) + / BLOCK_SIZE); + gen->bitmap = bitmap_bdescr; + bitmap = bitmap_bdescr->start; + + debugTrace(DEBUG_gc, "bitmap_size: %d, bitmap: %p", + bitmap_size, bitmap); + + // don't forget to fill it with zeros! + memset(bitmap, 0, bitmap_size); + + // For each block in this step, point to its bitmap from the + // block descriptor. + for (bd=gen->old_blocks; bd != NULL; bd = bd->link) { + bd->u.bitmap = bitmap; + bitmap += BLOCK_SIZE_W / (sizeof(W_)*BITS_PER_BYTE); - // For each block in this step, point to its bitmap from the - // block descriptor. - for (bd=stp->old_blocks; bd != NULL; bd = bd->link) { - bd->u.bitmap = bitmap; - bitmap += BLOCK_SIZE_W / (sizeof(W_)*BITS_PER_BYTE); - - // Also at this point we set the BF_COMPACTED flag - // for this block. The invariant is that - // BF_COMPACTED is always unset, except during GC - // when it is set on those blocks which will be - // compacted. - bd->flags |= BF_COMPACTED; - } - } - } + // Also at this point we set the BF_MARKED flag + // for this block. The invariant is that + // BF_MARKED is always unset, except during GC + // when it is set on those blocks which will be + // compacted. + if (!(bd->flags & BF_FRAGMENTED)) { + bd->flags |= BF_MARKED; + } + } + } } // For each GC thread, for each step, allocate a "todo" block to // store evacuated objects to be scavenged, and a block to store // evacuated objects that do not need to be scavenged. for (t = 0; t < n_threads; t++) { - for (s = 0; s < generations[g].n_steps; s++) { - - // we don't copy objects into g0s0, unless -G0 - if (g==0 && s==0 && RtsFlags.GcFlags.generations > 1) continue; - - ws = &gc_threads[t].steps[g][s]; - - ws->scan_bd = NULL; - ws->scan = NULL; - - ws->todo_large_objects = NULL; - - // allocate the first to-space block; extra blocks will be - // chained on as necessary. - ws->todo_bd = NULL; - ws->buffer_todo_bd = NULL; - gc_alloc_todo_block(ws); - - ws->scavd_list = NULL; - ws->n_scavd_blocks = 0; - } + ws = &gc_threads[t]->gens[g]; + + ws->todo_large_objects = NULL; + + ws->part_list = NULL; + ws->n_part_blocks = 0; + + // allocate the first to-space block; extra blocks will be + // chained on as necessary. + ws->todo_bd = NULL; + ASSERT(looksEmptyWSDeque(ws->todo_q)); + alloc_todo_block(ws,0); + + ws->todo_overflow = NULL; + ws->n_todo_overflow = 0; + + ws->scavd_list = NULL; + ws->n_scavd_blocks = 0; } } @@ -1284,69 +1310,74 @@ init_collected_gen (nat g, nat n_threads) static void init_uncollected_gen (nat g, nat threads) { - nat s, t, i; - step_workspace *ws; - step *stp; + nat t, n; + gen_workspace *ws; + generation *gen; bdescr *bd; - for (s = 0; s < generations[g].n_steps; s++) { - stp = &generations[g].steps[s]; - stp->scavenged_large_objects = NULL; - stp->n_scavenged_large_blocks = 0; + // save the current mutable lists for this generation, and + // allocate a fresh block for each one. We'll traverse these + // mutable lists as roots early on in the GC. + generations[g].saved_mut_list = generations[g].mut_list; + generations[g].mut_list = allocBlock(); + for (n = 0; n < n_capabilities; n++) { + capabilities[n].saved_mut_lists[g] = capabilities[n].mut_lists[g]; + capabilities[n].mut_lists[g] = allocBlock(); } - - for (t = 0; t < threads; t++) { - for (s = 0; s < generations[g].n_steps; s++) { - - ws = &gc_threads[t].steps[g][s]; - stp = ws->stp; - - ws->buffer_todo_bd = NULL; - ws->todo_large_objects = NULL; - ws->scavd_list = NULL; - ws->n_scavd_blocks = 0; + gen = &generations[g]; - // If the block at the head of the list in this generation - // is less than 3/4 full, then use it as a todo block. - if (stp->blocks && isPartiallyFull(stp->blocks)) - { - ws->todo_bd = stp->blocks; - ws->todo_free = ws->todo_bd->free; - ws->todo_lim = ws->todo_bd->start + BLOCK_SIZE_W; - stp->blocks = stp->blocks->link; - stp->n_blocks -= 1; - ws->todo_bd->link = NULL; - - // this block is also the scan block; we must scan - // from the current end point. - ws->scan_bd = ws->todo_bd; - ws->scan = ws->scan_bd->free; - - // subtract the contents of this block from the stats, - // because we'll count the whole block later. - copied -= ws->scan_bd->free - ws->scan_bd->start; - } - else - { - ws->scan_bd = NULL; - ws->scan = NULL; - ws->todo_bd = NULL; - gc_alloc_todo_block(ws); - } - } + gen->scavenged_large_objects = NULL; + gen->n_scavenged_large_blocks = 0; + + for (t = 0; t < threads; t++) { + ws = &gc_threads[t]->gens[g]; + + ASSERT(looksEmptyWSDeque(ws->todo_q)); + ws->todo_large_objects = NULL; + + ws->part_list = NULL; + ws->n_part_blocks = 0; + + ws->scavd_list = NULL; + ws->n_scavd_blocks = 0; + + // If the block at the head of the list in this generation + // is less than 3/4 full, then use it as a todo block. + if (gen->blocks && isPartiallyFull(gen->blocks)) + { + ws->todo_bd = gen->blocks; + ws->todo_free = ws->todo_bd->free; + ws->todo_lim = ws->todo_bd->start + BLOCK_SIZE_W; + gen->blocks = gen->blocks->link; + gen->n_blocks -= 1; + gen->n_words -= ws->todo_bd->free - ws->todo_bd->start; + ws->todo_bd->link = NULL; + // we must scan from the current end point. + ws->todo_bd->u.scan = ws->todo_bd->free; + } + else + { + ws->todo_bd = NULL; + alloc_todo_block(ws,0); + } } - // Move the private mutable lists from each capability onto the - // main mutable list for the generation. - for (i = 0; i < n_capabilities; i++) { - for (bd = capabilities[i].mut_lists[g]; - bd->link != NULL; bd = bd->link) { - /* nothing */ - } - bd->link = generations[g].mut_list; - generations[g].mut_list = capabilities[i].mut_lists[g]; - capabilities[i].mut_lists[g] = allocBlock(); + // deal out any more partial blocks to the threads' part_lists + t = 0; + while (gen->blocks && isPartiallyFull(gen->blocks)) + { + bd = gen->blocks; + gen->blocks = bd->link; + ws = &gc_threads[t]->gens[g]; + bd->link = ws->part_list; + ws->part_list = bd; + ws->n_part_blocks += 1; + bd->u.scan = bd->free; + gen->n_blocks -= 1; + gen->n_words -= bd->free - bd->start; + t++; + if (t == n_gc_threads) t = 0; } } @@ -1357,20 +1388,40 @@ init_uncollected_gen (nat g, nat threads) static void init_gc_thread (gc_thread *t) { - t->evac_step = 0; + t->static_objects = END_OF_STATIC_LIST; + t->scavenged_static_objects = END_OF_STATIC_LIST; + t->scan_bd = NULL; + t->mut_lists = capabilities[t->thread_index].mut_lists; + t->evac_gen = 0; t->failed_to_evac = rtsFalse; t->eager_promotion = rtsTrue; t->thunk_selector_depth = 0; + t->copied = 0; + t->scanned = 0; + t->any_work = 0; + t->no_work = 0; + t->scav_find_work = 0; } /* ----------------------------------------------------------------------------- - Function we pass to GetRoots to evacuate roots. + Function we pass to evacuate roots. -------------------------------------------------------------------------- */ static void -mark_root(StgClosure **root) +mark_root(void *user USED_IF_THREADS, StgClosure **root) { - evacuate(root); + // we stole a register for gct, but this function is called from + // *outside* the GC where the register variable is not in effect, + // so we need to save and restore it here. NB. only call + // mark_root() from the main GC thread, otherwise gct will be + // incorrect. + gc_thread *saved_gct; + saved_gct = gct; + SET_GCT(user); + + evacuate(root); + + SET_GCT(saved_gct); } /* ----------------------------------------------------------------------------- @@ -1391,75 +1442,6 @@ zero_static_object_list(StgClosure* first_static) } } -/* ----------------------------------------------------------------------------- - Reverting CAFs - -------------------------------------------------------------------------- */ - -void -revertCAFs( void ) -{ - StgIndStatic *c; - - for (c = (StgIndStatic *)revertible_caf_list; c != NULL; - c = (StgIndStatic *)c->static_link) - { - SET_INFO(c, c->saved_info); - c->saved_info = NULL; - // could, but not necessary: c->static_link = NULL; - } - revertible_caf_list = NULL; -} - -void -markCAFs( evac_fn evac ) -{ - StgIndStatic *c; - - for (c = (StgIndStatic *)caf_list; c != NULL; - c = (StgIndStatic *)c->static_link) - { - evac(&c->indirectee); - } - for (c = (StgIndStatic *)revertible_caf_list; c != NULL; - c = (StgIndStatic *)c->static_link) - { - evac(&c->indirectee); - } -} - -/* ---------------------------------------------------------------------------- - Update the pointers from the task list - - These are treated as weak pointers because we want to allow a main - thread to get a BlockedOnDeadMVar exception in the same way as any - other thread. Note that the threads should all have been retained - by GC by virtue of being on the all_threads list, we're just - updating pointers here. - ------------------------------------------------------------------------- */ - -static void -update_task_list (void) -{ - Task *task; - StgTSO *tso; - for (task = all_tasks; task != NULL; task = task->all_link) { - if (!task->stopped && task->tso) { - ASSERT(task->tso->bound == task); - tso = (StgTSO *) isAlive((StgClosure *)task->tso); - if (tso == NULL) { - barf("task %p: main thread %d has been GC'd", -#ifdef THREADED_RTS - (void *)task->id, -#else - (void *)task, -#endif - task->tso->id); - } - task->tso = tso; - } - } -} - /* ---------------------------------------------------------------------------- Reset the sizes of the older generations when we do a major collection. @@ -1475,18 +1457,27 @@ resize_generations (void) nat g; if (major_gc && RtsFlags.GcFlags.generations > 1) { - nat live, size, min_alloc; + nat live, size, min_alloc, words; nat max = RtsFlags.GcFlags.maxHeapSize; nat gens = RtsFlags.GcFlags.generations; // live in the oldest generations - live = oldest_gen->steps[0].n_blocks + - oldest_gen->steps[0].n_large_blocks; + if (oldest_gen->live_estimate != 0) { + words = oldest_gen->live_estimate; + } else { + words = oldest_gen->n_words; + } + live = (words + BLOCK_SIZE_W - 1) / BLOCK_SIZE_W + + oldest_gen->n_large_blocks; // default max size for all generations except zero size = stg_max(live * RtsFlags.GcFlags.oldGenFactor, RtsFlags.GcFlags.minOldGenSize); + if (RtsFlags.GcFlags.heapSizeSuggestionAuto) { + RtsFlags.GcFlags.heapSizeSuggestion = size; + } + // minimum size for generation zero min_alloc = stg_max((RtsFlags.GcFlags.pcFreeHeap * max) / 200, RtsFlags.GcFlags.minAllocAreaSize); @@ -1496,15 +1487,21 @@ resize_generations (void) if (RtsFlags.GcFlags.generations > 1 && (RtsFlags.GcFlags.compact || (max > 0 && - oldest_gen->steps[0].n_blocks > + oldest_gen->n_blocks > (RtsFlags.GcFlags.compactThreshold * max) / 100))) { - oldest_gen->steps[0].is_compacted = 1; + oldest_gen->mark = 1; + oldest_gen->compact = 1; // debugBelch("compaction: on\n", live); } else { - oldest_gen->steps[0].is_compacted = 0; + oldest_gen->mark = 0; + oldest_gen->compact = 0; // debugBelch("compaction: off\n", live); } + if (RtsFlags.GcFlags.sweep) { + oldest_gen->mark = 1; + } + // if we're going to go over the maximum heap size, reduce the // size of the generations accordingly. The calculation is // different if compaction is turned on, because we don't need @@ -1518,7 +1515,7 @@ resize_generations (void) heapOverflow(); } - if (oldest_gen->steps[0].is_compacted) { + if (oldest_gen->compact) { if ( (size + (size - 1) * (gens - 2) * 2) + min_alloc > max ) { size = (max - min_alloc) / ((gens - 1) * 2 - 1); } @@ -1551,6 +1548,8 @@ resize_generations (void) static void resize_nursery (void) { + lnat min_nursery = RtsFlags.GcFlags.minAllocAreaSize * n_capabilities; + if (RtsFlags.GcFlags.generations == 1) { // Two-space collector: nat blocks; @@ -1569,7 +1568,7 @@ resize_nursery (void) * performance we get from 3L bytes, reducing to the same * performance at 2L bytes. */ - blocks = g0s0->n_old_blocks; + blocks = generations[0].n_blocks; if ( RtsFlags.GcFlags.maxHeapSize != 0 && blocks * RtsFlags.GcFlags.oldGenFactor * 2 > @@ -1593,9 +1592,9 @@ resize_nursery (void) else { blocks *= RtsFlags.GcFlags.oldGenFactor; - if (blocks < RtsFlags.GcFlags.minAllocAreaSize) + if (blocks < min_nursery) { - blocks = RtsFlags.GcFlags.minAllocAreaSize; + blocks = min_nursery; } } resizeNurseries(blocks); @@ -1613,7 +1612,7 @@ resize_nursery (void) /* Guess how much will be live in generation 0 step 0 next time. * A good approximation is obtained by finding the - * percentage of g0s0 that was live at the last minor GC. + * percentage of g0 that was live at the last minor GC. * * We have an accurate figure for the amount of copied data in * 'copied', but we must convert this to a number of blocks, with @@ -1622,7 +1621,7 @@ resize_nursery (void) */ if (N == 0) { - g0s0_pcnt_kept = ((copied / (BLOCK_SIZE_W - 10)) * 100) + g0_pcnt_kept = ((copied / (BLOCK_SIZE_W - 10)) * 100) / countNurseryBlocks(); } @@ -1633,17 +1632,17 @@ resize_nursery (void) * * Formula: suggested - needed * ---------------------------- - * 1 + g0s0_pcnt_kept/100 + * 1 + g0_pcnt_kept/100 * * where 'needed' is the amount of memory needed at the next - * collection for collecting all steps except g0s0. + * collection for collecting all gens except g0. */ blocks = (((long)RtsFlags.GcFlags.heapSizeSuggestion - (long)needed) * 100) / - (100 + (long)g0s0_pcnt_kept); + (100 + (long)g0_pcnt_kept); - if (blocks < (long)RtsFlags.GcFlags.minAllocAreaSize) { - blocks = RtsFlags.GcFlags.minAllocAreaSize; + if (blocks < (long)min_nursery) { + blocks = min_nursery; } resizeNurseries((nat)blocks);