X-Git-Url: http://git.megacz.com/?p=ghc-hetmet.git;a=blobdiff_plain;f=rts%2Fsm%2FGC.c;h=02fd6d91610b363f3c09018c13b2ef0972b7c73f;hp=b71079b4ab0e3b08976959f08126a785e009d95a;hb=a2a67cd520b9841114d69a87a423dabcb3b4368e;hpb=b339c8b1d0f239031802555b454062e9430ec8bb diff --git a/rts/sm/GC.c b/rts/sm/GC.c index b71079b..02fd6d9 100644 --- a/rts/sm/GC.c +++ b/rts/sm/GC.c @@ -11,28 +11,23 @@ * * ---------------------------------------------------------------------------*/ -// #include "PosixSource.h" +#include "PosixSource.h" #include "Rts.h" -#include "RtsFlags.h" +#include "HsFFI.h" + +#include "Storage.h" #include "RtsUtils.h" #include "Apply.h" -#include "OSThreads.h" -#include "LdvProfile.h" #include "Updates.h" #include "Stats.h" #include "Schedule.h" #include "Sanity.h" #include "BlockAlloc.h" -#include "MBlock.h" #include "ProfHeap.h" -#include "SchedAPI.h" #include "Weak.h" #include "Prelude.h" -#include "ParTicky.h" // ToDo: move into Rts.h #include "RtsSignals.h" #include "STM.h" -#include "HsFFI.h" -#include "Linker.h" #if defined(RTS_GTK_FRONTPANEL) #include "FrontPanel.h" #endif @@ -40,6 +35,7 @@ #include "RetainerProfile.h" #include "RaiseAsync.h" #include "Papi.h" +#include "Stable.h" #include "GC.h" #include "GCThread.h" @@ -49,6 +45,7 @@ #include "GCUtils.h" #include "MarkWeak.h" #include "Sparks.h" +#include "Sweep.h" #include // for memset() #include @@ -115,7 +112,10 @@ nat mutlist_MUTVARS, /* Thread-local data for each GC thread */ gc_thread **gc_threads = NULL; -// gc_thread *gct = NULL; // this thread's gct TODO: make thread-local + +#if !defined(THREADED_RTS) +StgWord8 the_gc_thread[sizeof(gc_thread) + 64 * sizeof(step_workspace)]; +#endif // Number of threads running in *this* GC. Affects how many // step->todos[] lists we have to look in to find work. @@ -124,9 +124,7 @@ nat n_gc_threads; // For stats: long copied; // *words* copied & scavenged during this GC -#ifdef THREADED_RTS -SpinLock recordMutableGen_sync; -#endif +rtsBool work_stealing; DECLARE_GCT @@ -137,7 +135,6 @@ DECLARE_GCT static void mark_root (void *user, StgClosure **root); static void zero_static_object_list (StgClosure* first_static); static nat initialise_N (rtsBool force_major_gc); -static void alloc_gc_threads (void); static void init_collected_gen (nat g, nat threads); static void init_uncollected_gen (nat g, nat threads); static void init_gc_thread (gc_thread *t); @@ -146,10 +143,10 @@ static void resize_generations (void); static void resize_nursery (void); static void start_gc_threads (void); static void scavenge_until_all_done (void); -static nat inc_running (void); -static nat dec_running (void); -static void wakeup_gc_threads (nat n_threads); -static void shutdown_gc_threads (nat n_threads); +static StgWord inc_running (void); +static StgWord dec_running (void); +static void wakeup_gc_threads (nat n_threads, nat me); +static void shutdown_gc_threads (nat n_threads, nat me); #if 0 && defined(DEBUG) static void gcCAFs (void); @@ -179,7 +176,9 @@ StgPtr oldgen_scan; -------------------------------------------------------------------------- */ void -GarbageCollect ( rtsBool force_major_gc ) +GarbageCollect (rtsBool force_major_gc, + nat gc_type USED_IF_THREADS, + Capability *cap) { bdescr *bd; step *stp; @@ -212,6 +211,9 @@ GarbageCollect ( rtsBool force_major_gc ) // tell the STM to discard any cached closures it's hoping to re-use stmPreGCHook(); + // lock the StablePtr table + stablePtrPreGC(); + #ifdef DEBUG mutlist_MUTVARS = 0; mutlist_MUTARRS = 0; @@ -233,27 +235,38 @@ GarbageCollect ( rtsBool force_major_gc ) */ n = initialise_N(force_major_gc); - /* Allocate + initialise the gc_thread structures. - */ - alloc_gc_threads(); +#if defined(THREADED_RTS) + work_stealing = RtsFlags.ParFlags.parGcLoadBalancing; + // It's not always a good idea to do load balancing in parallel + // GC. In particular, for a parallel program we don't want to + // lose locality by moving cached data into another CPU's cache + // (this effect can be quite significant). + // + // We could have a more complex way to deterimine whether to do + // work stealing or not, e.g. it might be a good idea to do it + // if the heap is big. For now, we just turn it on or off with + // a flag. +#endif /* Start threads, so they can be spinning up while we finish initialisation. */ start_gc_threads(); +#if defined(THREADED_RTS) /* How many threads will be participating in this GC? - * We don't try to parallelise minor GC. + * We don't try to parallelise minor GCs (unless the user asks for + * it with +RTS -gn0), or mark/compact/sweep GC. */ -#if defined(THREADED_RTS) - if (n < (4*1024*1024 / BLOCK_SIZE)) { - n_gc_threads = 1; + if (gc_type == PENDING_GC_PAR) { + n_gc_threads = RtsFlags.ParFlags.nNodes; } else { - n_gc_threads = RtsFlags.ParFlags.gcThreads; + n_gc_threads = 1; } #else n_gc_threads = 1; #endif - trace(TRACE_gc|DEBUG_gc, "GC (gen %d): %d KB to collect, %ld MB in use, using %d thread(s)", + + debugTrace(DEBUG_gc, "GC (gen %d): %d KB to collect, %ld MB in use, using %d thread(s)", N, n * (BLOCK_SIZE / 1024), mblocks_allocated, n_gc_threads); #ifdef RTS_GTK_FRONTPANEL @@ -267,8 +280,9 @@ GarbageCollect ( rtsBool force_major_gc ) memInventory(traceClass(DEBUG_gc)); #endif - // check stack sanity *before* GC (ToDo: check all threads) + // check stack sanity *before* GC IF_DEBUG(sanity, checkFreeListSanity()); + IF_DEBUG(sanity, checkMutableLists(rtsTrue)); // Initialise all our gc_thread structures for (t = 0; t < n_gc_threads; t++) { @@ -287,41 +301,63 @@ GarbageCollect ( rtsBool force_major_gc ) /* Allocate a mark stack if we're doing a major collection. */ - if (major_gc) { - mark_stack_bdescr = allocGroup(MARK_STACK_BLOCKS); + if (major_gc && oldest_gen->steps[0].mark) { + nat mark_stack_blocks; + mark_stack_blocks = stg_max(MARK_STACK_BLOCKS, + oldest_gen->steps[0].n_old_blocks / 100); + mark_stack_bdescr = allocGroup(mark_stack_blocks); mark_stack = (StgPtr *)mark_stack_bdescr->start; mark_sp = mark_stack; - mark_splim = mark_stack + (MARK_STACK_BLOCKS * BLOCK_SIZE_W); + mark_splim = mark_stack + (mark_stack_blocks * BLOCK_SIZE_W); } else { mark_stack_bdescr = NULL; } // this is the main thread - gct = gc_threads[0]; +#ifdef THREADED_RTS + if (n_gc_threads == 1) { + SET_GCT(gc_threads[0]); + } else { + SET_GCT(gc_threads[cap->no]); + } +#else +SET_GCT(gc_threads[0]); +#endif /* ----------------------------------------------------------------------- * follow all the roots that we know about: - * - mutable lists from each generation > N - * we want to *scavenge* these roots, not evacuate them: they're not - * going to move in this GC. - * Also do them in reverse generation order, for the usual reason: - * namely to reduce the likelihood of spurious old->new pointers. */ - for (g = RtsFlags.GcFlags.generations-1; g > N; g--) { - generations[g].saved_mut_list = generations[g].mut_list; - generations[g].mut_list = allocBlock(); - // mut_list always has at least one block. - } // the main thread is running: this prevents any other threads from // exiting prematurely, so we can start them now. // NB. do this after the mutable lists have been saved above, otherwise // the other GC threads will be writing into the old mutable lists. inc_running(); - wakeup_gc_threads(n_gc_threads); - + wakeup_gc_threads(n_gc_threads, gct->thread_index); + + // Mutable lists from each generation > N + // we want to *scavenge* these roots, not evacuate them: they're not + // going to move in this GC. + // Also do them in reverse generation order, for the usual reason: + // namely to reduce the likelihood of spurious old->new pointers. + // for (g = RtsFlags.GcFlags.generations-1; g > N; g--) { - scavenge_mutable_list(&generations[g]); + scavenge_mutable_list(generations[g].saved_mut_list, &generations[g]); + freeChain_sync(generations[g].saved_mut_list); + generations[g].saved_mut_list = NULL; + + } + + // scavenge the capability-private mutable lists. This isn't part + // of markSomeCapabilities() because markSomeCapabilities() can only + // call back into the GC via mark_root() (due to the gct register + // variable). + if (n_gc_threads == 1) { + for (n = 0; n < n_capabilities; n++) { + scavenge_capability_mut_lists(&capabilities[n]); + } + } else { + scavenge_capability_mut_lists(&capabilities[gct->thread_index]); } // follow roots from the CAF list (used by GHCi) @@ -330,7 +366,8 @@ GarbageCollect ( rtsBool force_major_gc ) // follow all the roots that the application knows about. gct->evac_step = 0; - markSomeCapabilities(mark_root, gct, gct->thread_index, n_gc_threads); + markSomeCapabilities(mark_root, gct, gct->thread_index, n_gc_threads, + rtsTrue/*prune sparks*/); #if defined(RTS_USER_SIGNALS) // mark the signal handlers (signals should be already blocked) @@ -372,14 +409,11 @@ GarbageCollect ( rtsBool force_major_gc ) break; } - shutdown_gc_threads(n_gc_threads); + shutdown_gc_threads(n_gc_threads, gct->thread_index); // Update pointers from the Task list update_task_list(); - // Update pointers from capabilities (probably just the spark queues) - updateCapabilitiesPostGC(); - // Now see which stable names are still alive. gcStablePtrTable(); @@ -485,13 +519,14 @@ GarbageCollect ( rtsBool force_major_gc ) } } - // Finally: compaction of the oldest generation. - if (major_gc && oldest_gen->steps[0].is_compacted) { - compact(gct->scavenged_static_objects); + // Finally: compact or sweep the oldest generation. + if (major_gc && oldest_gen->steps[0].mark) { + if (oldest_gen->steps[0].compact) + compact(gct->scavenged_static_objects); + else + sweep(&oldest_gen->steps[0]); } - IF_DEBUG(sanity, checkGlobalTSOList(rtsFalse)); - /* run through all the generations/steps and tidy up */ copied = 0; @@ -501,12 +536,12 @@ GarbageCollect ( rtsBool force_major_gc ) nat i; for (i=0; i < n_gc_threads; i++) { if (n_gc_threads > 1) { - trace(TRACE_gc,"thread %d:", i); - trace(TRACE_gc," copied %ld", gc_threads[i]->copied * sizeof(W_)); - trace(TRACE_gc," scanned %ld", gc_threads[i]->scanned * sizeof(W_)); - trace(TRACE_gc," any_work %ld", gc_threads[i]->any_work); - trace(TRACE_gc," no_work %ld", gc_threads[i]->no_work); - trace(TRACE_gc," scav_find_work %ld", gc_threads[i]->scav_find_work); + debugTrace(DEBUG_gc,"thread %d:", i); + debugTrace(DEBUG_gc," copied %ld", gc_threads[i]->copied * sizeof(W_)); + debugTrace(DEBUG_gc," scanned %ld", gc_threads[i]->scanned * sizeof(W_)); + debugTrace(DEBUG_gc," any_work %ld", gc_threads[i]->any_work); + debugTrace(DEBUG_gc," no_work %ld", gc_threads[i]->no_work); + debugTrace(DEBUG_gc," scav_find_work %ld", gc_threads[i]->scav_find_work); } copied += gc_threads[i]->copied; max_copied = stg_max(gc_threads[i]->copied, max_copied); @@ -533,6 +568,12 @@ GarbageCollect ( rtsBool force_major_gc ) for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) { mut_list_size += bd->free - bd->start; } + for (n = 0; n < n_capabilities; n++) { + for (bd = capabilities[n].mut_lists[g]; + bd != NULL; bd = bd->link) { + mut_list_size += bd->free - bd->start; + } + } copied += mut_list_size; debugTrace(DEBUG_gc, @@ -542,7 +583,7 @@ GarbageCollect ( rtsBool force_major_gc ) } for (s = 0; s < generations[g].n_steps; s++) { - bdescr *next; + bdescr *next, *prev; stp = &generations[g].steps[s]; // for generations we collected... @@ -553,29 +594,48 @@ GarbageCollect ( rtsBool force_major_gc ) * freed blocks will probaby be quickly recycled. */ if (!(g == 0 && s == 0 && RtsFlags.GcFlags.generations > 1)) { - if (stp->is_compacted) + if (stp->mark) { // tack the new blocks on the end of the existing blocks if (stp->old_blocks != NULL) { + + prev = NULL; for (bd = stp->old_blocks; bd != NULL; bd = next) { - stp->n_words += bd->free - bd->start; - - // NB. this step might not be compacted next - // time, so reset the BF_COMPACTED flags. - // They are set before GC if we're going to - // compact. (search for BF_COMPACTED above). - bd->flags &= ~BF_COMPACTED; - - // between GCs, all blocks in the heap except - // for the nursery have the BF_EVACUATED flag set. - bd->flags |= BF_EVACUATED; - - next = bd->link; - if (next == NULL) { - bd->link = stp->blocks; - } + + next = bd->link; + + if (!(bd->flags & BF_MARKED)) + { + if (prev == NULL) { + stp->old_blocks = next; + } else { + prev->link = next; + } + freeGroup(bd); + stp->n_old_blocks--; + } + else + { + stp->n_words += bd->free - bd->start; + + // NB. this step might not be compacted next + // time, so reset the BF_MARKED flags. + // They are set before GC if we're going to + // compact. (search for BF_MARKED above). + bd->flags &= ~BF_MARKED; + + // between GCs, all blocks in the heap except + // for the nursery have the BF_EVACUATED flag set. + bd->flags |= BF_EVACUATED; + + prev = bd; + } } - stp->blocks = stp->old_blocks; + + if (prev != NULL) { + prev->link = stp->blocks; + stp->blocks = stp->old_blocks; + } } // add the new blocks to the block tally stp->n_blocks += stp->n_old_blocks; @@ -690,7 +750,7 @@ GarbageCollect ( rtsBool force_major_gc ) // start any pending finalizers RELEASE_SM_LOCK; - scheduleFinalizers(last_free_capability, old_weak_ptr_list); + scheduleFinalizers(cap, old_weak_ptr_list); ACQUIRE_SM_LOCK; // send exceptions to any threads which were about to die @@ -706,7 +766,7 @@ GarbageCollect ( rtsBool force_major_gc ) IF_DEBUG(sanity, checkSanity()); // extra GC trace info - if (traceClass(TRACE_gc|DEBUG_gc)) statDescribeGens(); + IF_DEBUG(gc, statDescribeGens()); #ifdef DEBUG // symbol-table based profiling @@ -733,6 +793,12 @@ GarbageCollect ( rtsBool force_major_gc ) slop = calcLiveBlocks() * BLOCK_SIZE_W - live; stat_endGC(allocated, live, copied, N, max_copied, avg_copied, slop); + // unlock the StablePtr table + stablePtrPostGC(); + + // Guess which generation we'll collect *next* time + initialise_N(force_major_gc); + #if defined(RTS_USER_SIGNALS) if (RtsFlags.MiscFlags.install_signal_handlers) { // unblock signals again @@ -742,7 +808,7 @@ GarbageCollect ( rtsBool force_major_gc ) RELEASE_SM_LOCK; - gct = saved_gct; + SET_GCT(saved_gct); } /* ----------------------------------------------------------------------------- @@ -791,23 +857,24 @@ initialise_N (rtsBool force_major_gc) Initialise the gc_thread structures. -------------------------------------------------------------------------- */ -static gc_thread * -alloc_gc_thread (int n) +#define GC_THREAD_INACTIVE 0 +#define GC_THREAD_STANDING_BY 1 +#define GC_THREAD_RUNNING 2 +#define GC_THREAD_WAITING_TO_CONTINUE 3 + +static void +new_gc_thread (nat n, gc_thread *t) { nat s; step_workspace *ws; - gc_thread *t; - - t = stgMallocBytes(sizeof(gc_thread) + total_steps * sizeof(step_workspace), - "alloc_gc_thread"); #ifdef THREADED_RTS t->id = 0; - initCondition(&t->wake_cond); - initMutex(&t->wake_mutex); - t->wakeup = rtsTrue; // starts true, so we can wait for the + initSpinLock(&t->gc_spin); + initSpinLock(&t->mut_spin); + ACQUIRE_SPIN_LOCK(&t->gc_spin); + t->wakeup = GC_THREAD_INACTIVE; // starts true, so we can wait for the // thread to start up, see wakeup_gc_threads - t->exit = rtsFalse; #endif t->thread_index = n; @@ -825,10 +892,12 @@ alloc_gc_thread (int n) ws = &t->steps[s]; ws->step = &all_steps[s]; ASSERT(s == ws->step->abs_no); - ws->gct = t; + ws->my_gct = t; ws->todo_bd = NULL; - ws->buffer_todo_bd = NULL; + ws->todo_q = newWSDeque(128); + ws->todo_overflow = NULL; + ws->n_todo_overflow = 0; ws->part_list = NULL; ws->n_part_blocks = 0; @@ -836,30 +905,48 @@ alloc_gc_thread (int n) ws->scavd_list = NULL; ws->n_scavd_blocks = 0; } - - return t; } -static void -alloc_gc_threads (void) +void +initGcThreads (void) { if (gc_threads == NULL) { #if defined(THREADED_RTS) nat i; - gc_threads = stgMallocBytes (RtsFlags.ParFlags.gcThreads * + gc_threads = stgMallocBytes (RtsFlags.ParFlags.nNodes * sizeof(gc_thread*), "alloc_gc_threads"); - for (i = 0; i < RtsFlags.ParFlags.gcThreads; i++) { - gc_threads[i] = alloc_gc_thread(i); + for (i = 0; i < RtsFlags.ParFlags.nNodes; i++) { + gc_threads[i] = + stgMallocBytes(sizeof(gc_thread) + total_steps * sizeof(step_workspace), + "alloc_gc_threads"); + + new_gc_thread(i, gc_threads[i]); } #else - gc_threads = stgMallocBytes (sizeof(gc_thread*), - "alloc_gc_threads"); + gc_threads = stgMallocBytes (sizeof(gc_thread*),"alloc_gc_threads"); + gc_threads[0] = gct; + new_gc_thread(0,gc_threads[0]); +#endif + } +} - gc_threads[0] = alloc_gc_thread(0); +void +freeGcThreads (void) +{ + if (gc_threads != NULL) { +#if defined(THREADED_RTS) + nat i; + for (i = 0; i < RtsFlags.ParFlags.nNodes; i++) { + stgFree (gc_threads[i]); + } + stgFree (gc_threads); +#else + stgFree (gc_threads); #endif + gc_threads = NULL; } } @@ -867,32 +954,22 @@ alloc_gc_threads (void) Start GC threads ------------------------------------------------------------------------- */ -static nat gc_running_threads; - -#if defined(THREADED_RTS) -static Mutex gc_running_mutex; -#endif +static volatile StgWord gc_running_threads; -static nat +static StgWord inc_running (void) { - nat n_running; - ACQUIRE_LOCK(&gc_running_mutex); - n_running = ++gc_running_threads; - RELEASE_LOCK(&gc_running_mutex); - ASSERT(n_running <= n_gc_threads); - return n_running; + StgWord new; + new = atomic_inc(&gc_running_threads); + ASSERT(new <= n_gc_threads); + return new; } -static nat +static StgWord dec_running (void) { - nat n_running; - ACQUIRE_LOCK(&gc_running_mutex); - ASSERT(n_gc_threads != 0); - n_running = --gc_running_threads; - RELEASE_LOCK(&gc_running_mutex); - return n_running; + ASSERT(gc_running_threads != 0); + return atomic_dec(&gc_running_threads); } static rtsBool @@ -920,8 +997,23 @@ any_work (void) } ws = &gct->steps[s]; if (ws->todo_large_objects) return rtsTrue; - if (ws->step->todos) return rtsTrue; + if (!looksEmptyWSDeque(ws->todo_q)) return rtsTrue; + if (ws->todo_overflow) return rtsTrue; + } + +#if defined(THREADED_RTS) + if (work_stealing) { + nat n; + // look for work to steal + for (n = 0; n < n_gc_threads; n++) { + if (n == gct->thread_index) continue; + for (s = total_steps-1; s >= 0; s--) { + ws = &gc_threads[n]->steps[s]; + if (!looksEmptyWSDeque(ws->todo_q)) return rtsTrue; + } + } } +#endif gct->no_work++; @@ -950,18 +1042,18 @@ loop: r = dec_running(); debugTrace(DEBUG_gc, "GC thread %d idle (%d still running)", - gct->thread_index, r); - + gct->thread_index, r); + while (gc_running_threads != 0) { // usleep(1); - if (any_work()) { - inc_running(); - goto loop; - } - // any_work() does not remove the work from the queue, it - // just checks for the presence of work. If we find any, - // then we increment gc_running_threads and go back to - // scavenge_loop() to perform any pending work. + if (any_work()) { + inc_running(); + goto loop; + } + // any_work() does not remove the work from the queue, it + // just checks for the presence of work. If we find any, + // then we increment gc_running_threads and go back to + // scavenge_loop() to perform any pending work. } // All threads are now stopped @@ -969,112 +1061,109 @@ loop: } #if defined(THREADED_RTS) -// -// gc_thread_work(): Scavenge until there's no work left to do and all -// the running threads are idle. -// -static void -gc_thread_work (void) + +void +gcWorkerThread (Capability *cap) { - // gc_running_threads has already been incremented for us; this is - // a worker thread and the main thread bumped gc_running_threads - // before waking us up. + cap->in_gc = rtsTrue; + + gct = gc_threads[cap->no]; + gct->id = osThreadId(); + // Wait until we're told to wake up + RELEASE_SPIN_LOCK(&gct->mut_spin); + gct->wakeup = GC_THREAD_STANDING_BY; + debugTrace(DEBUG_gc, "GC thread %d standing by...", gct->thread_index); + ACQUIRE_SPIN_LOCK(&gct->gc_spin); + +#ifdef USE_PAPI + // start performance counters in this thread... + if (gct->papi_events == -1) { + papi_init_eventset(&gct->papi_events); + } + papi_thread_start_gc1_count(gct->papi_events); +#endif + // Every thread evacuates some roots. gct->evac_step = 0; - markSomeCapabilities(mark_root, gct, gct->thread_index, n_gc_threads); + markSomeCapabilities(mark_root, gct, gct->thread_index, n_gc_threads, + rtsTrue/*prune sparks*/); + scavenge_capability_mut_lists(&capabilities[gct->thread_index]); scavenge_until_all_done(); -} - - -static void -gc_thread_mainloop (void) -{ - while (!gct->exit) { - - // Wait until we're told to wake up - ACQUIRE_LOCK(&gct->wake_mutex); - gct->wakeup = rtsFalse; - while (!gct->wakeup) { - debugTrace(DEBUG_gc, "GC thread %d standing by...", - gct->thread_index); - waitCondition(&gct->wake_cond, &gct->wake_mutex); - } - RELEASE_LOCK(&gct->wake_mutex); - if (gct->exit) break; - + #ifdef USE_PAPI - // start performance counters in this thread... - if (gct->papi_events == -1) { - papi_init_eventset(&gct->papi_events); - } - papi_thread_start_gc1_count(gct->papi_events); + // count events in this thread towards the GC totals + papi_thread_stop_gc1_count(gct->papi_events); #endif - gc_thread_work(); + // Wait until we're told to continue + RELEASE_SPIN_LOCK(&gct->gc_spin); + gct->wakeup = GC_THREAD_WAITING_TO_CONTINUE; + debugTrace(DEBUG_gc, "GC thread %d waiting to continue...", + gct->thread_index); + ACQUIRE_SPIN_LOCK(&gct->mut_spin); + debugTrace(DEBUG_gc, "GC thread %d on my way...", gct->thread_index); +} -#ifdef USE_PAPI - // count events in this thread towards the GC totals - papi_thread_stop_gc1_count(gct->papi_events); -#endif - } -} #endif #if defined(THREADED_RTS) -static void -gc_thread_entry (gc_thread *my_gct) + +void +waitForGcThreads (Capability *cap USED_IF_THREADS) { - gct = my_gct; - debugTrace(DEBUG_gc, "GC thread %d starting...", gct->thread_index); - gct->id = osThreadId(); - gc_thread_mainloop(); + nat n_threads = RtsFlags.ParFlags.nNodes; + nat me = cap->no; + nat i, j; + rtsBool retry = rtsTrue; + + while(retry) { + for (i=0; i < n_threads; i++) { + if (i == me) continue; + if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) { + prodCapability(&capabilities[i], cap->running_task); + } + } + for (j=0; j < 10000000; j++) { + retry = rtsFalse; + for (i=0; i < n_threads; i++) { + if (i == me) continue; + write_barrier(); + setContextSwitches(); + if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) { + retry = rtsTrue; + } + } + if (!retry) break; + } + } } -#endif + +#endif // THREADED_RTS static void start_gc_threads (void) { #if defined(THREADED_RTS) - nat i; - OSThreadId id; - static rtsBool done = rtsFalse; - gc_running_threads = 0; - initMutex(&gc_running_mutex); - - if (!done) { - // Start from 1: the main thread is 0 - for (i = 1; i < RtsFlags.ParFlags.gcThreads; i++) { - createOSThread(&id, (OSThreadProc*)&gc_thread_entry, - gc_threads[i]); - } - done = rtsTrue; - } #endif } static void -wakeup_gc_threads (nat n_threads USED_IF_THREADS) +wakeup_gc_threads (nat n_threads USED_IF_THREADS, nat me USED_IF_THREADS) { #if defined(THREADED_RTS) nat i; - for (i=1; i < n_threads; i++) { + for (i=0; i < n_threads; i++) { + if (i == me) continue; inc_running(); debugTrace(DEBUG_gc, "waking up gc thread %d", i); - do { - ACQUIRE_LOCK(&gc_threads[i]->wake_mutex); - if (gc_threads[i]->wakeup) { - RELEASE_LOCK(&gc_threads[i]->wake_mutex); - continue; - } else { - break; - } - } while (1); - gc_threads[i]->wakeup = rtsTrue; - signalCondition(&gc_threads[i]->wake_cond); - RELEASE_LOCK(&gc_threads[i]->wake_mutex); + if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) barf("wakeup_gc_threads"); + + gc_threads[i]->wakeup = GC_THREAD_RUNNING; + ACQUIRE_SPIN_LOCK(&gc_threads[i]->mut_spin); + RELEASE_SPIN_LOCK(&gc_threads[i]->gc_spin); } #endif } @@ -1083,22 +1172,36 @@ wakeup_gc_threads (nat n_threads USED_IF_THREADS) // standby state, otherwise they may still be executing inside // any_work(), and may even remain awake until the next GC starts. static void -shutdown_gc_threads (nat n_threads USED_IF_THREADS) +shutdown_gc_threads (nat n_threads USED_IF_THREADS, nat me USED_IF_THREADS) { #if defined(THREADED_RTS) nat i; - rtsBool wakeup; - for (i=1; i < n_threads; i++) { - do { - ACQUIRE_LOCK(&gc_threads[i]->wake_mutex); - wakeup = gc_threads[i]->wakeup; - // wakeup is false while the thread is waiting - RELEASE_LOCK(&gc_threads[i]->wake_mutex); - } while (wakeup); + for (i=0; i < n_threads; i++) { + if (i == me) continue; + while (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE) { write_barrier(); } } #endif } +#if defined(THREADED_RTS) +void +releaseGCThreads (Capability *cap USED_IF_THREADS) +{ + nat n_threads = RtsFlags.ParFlags.nNodes; + nat me = cap->no; + nat i; + for (i=0; i < n_threads; i++) { + if (i == me) continue; + if (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE) + barf("releaseGCThreads"); + + gc_threads[i]->wakeup = GC_THREAD_INACTIVE; + ACQUIRE_SPIN_LOCK(&gc_threads[i]->gc_spin); + RELEASE_SPIN_LOCK(&gc_threads[i]->mut_spin); + } +} +#endif + /* ---------------------------------------------------------------------------- Initialise a generation that is to be collected ------------------------------------------------------------------------- */ @@ -1144,11 +1247,7 @@ init_collected_gen (nat g, nat n_threads) stp->blocks = NULL; stp->n_blocks = 0; stp->n_words = 0; - - // we don't have any to-be-scavenged blocks yet - stp->todos = NULL; - stp->todos_last = NULL; - stp->n_todos = 0; + stp->live_estimate = 0; // initialise the large object queues. stp->scavenged_large_objects = NULL; @@ -1165,7 +1264,7 @@ init_collected_gen (nat g, nat n_threads) } // for a compacted step, we need to allocate the bitmap - if (stp->is_compacted) { + if (stp->mark) { nat bitmap_size; // in bytes bdescr *bitmap_bdescr; StgWord *bitmap; @@ -1190,12 +1289,14 @@ init_collected_gen (nat g, nat n_threads) bd->u.bitmap = bitmap; bitmap += BLOCK_SIZE_W / (sizeof(W_)*BITS_PER_BYTE); - // Also at this point we set the BF_COMPACTED flag + // Also at this point we set the BF_MARKED flag // for this block. The invariant is that - // BF_COMPACTED is always unset, except during GC + // BF_MARKED is always unset, except during GC // when it is set on those blocks which will be // compacted. - bd->flags |= BF_COMPACTED; + if (!(bd->flags & BF_FRAGMENTED)) { + bd->flags |= BF_MARKED; + } } } } @@ -1220,9 +1321,12 @@ init_collected_gen (nat g, nat n_threads) // allocate the first to-space block; extra blocks will be // chained on as necessary. ws->todo_bd = NULL; - ws->buffer_todo_bd = NULL; + ASSERT(looksEmptyWSDeque(ws->todo_q)); alloc_todo_block(ws,0); + ws->todo_overflow = NULL; + ws->n_todo_overflow = 0; + ws->scavd_list = NULL; ws->n_scavd_blocks = 0; } @@ -1237,11 +1341,21 @@ init_collected_gen (nat g, nat n_threads) static void init_uncollected_gen (nat g, nat threads) { - nat s, t, i; + nat s, t, n; step_workspace *ws; step *stp; bdescr *bd; + // save the current mutable lists for this generation, and + // allocate a fresh block for each one. We'll traverse these + // mutable lists as roots early on in the GC. + generations[g].saved_mut_list = generations[g].mut_list; + generations[g].mut_list = allocBlock(); + for (n = 0; n < n_capabilities; n++) { + capabilities[n].saved_mut_lists[g] = capabilities[n].mut_lists[g]; + capabilities[n].mut_lists[g] = allocBlock(); + } + for (s = 0; s < generations[g].n_steps; s++) { stp = &generations[g].steps[s]; stp->scavenged_large_objects = NULL; @@ -1255,7 +1369,7 @@ init_uncollected_gen (nat g, nat threads) for (t = 0; t < threads; t++) { ws = &gc_threads[t]->steps[g * RtsFlags.GcFlags.steps + s]; - ws->buffer_todo_bd = NULL; + ASSERT(looksEmptyWSDeque(ws->todo_q)); ws->todo_large_objects = NULL; ws->part_list = NULL; @@ -1302,19 +1416,6 @@ init_uncollected_gen (nat g, nat threads) if (t == n_gc_threads) t = 0; } } - - - // Move the private mutable lists from each capability onto the - // main mutable list for the generation. - for (i = 0; i < n_capabilities; i++) { - for (bd = capabilities[i].mut_lists[g]; - bd->link != NULL; bd = bd->link) { - /* nothing */ - } - bd->link = generations[g].mut_list; - generations[g].mut_list = capabilities[i].mut_lists[g]; - capabilities[i].mut_lists[g] = allocBlock(); - } } /* ----------------------------------------------------------------------------- @@ -1327,6 +1428,7 @@ init_gc_thread (gc_thread *t) t->static_objects = END_OF_STATIC_LIST; t->scavenged_static_objects = END_OF_STATIC_LIST; t->scan_bd = NULL; + t->mut_lists = capabilities[t->thread_index].mut_lists; t->evac_step = 0; t->failed_to_evac = rtsFalse; t->eager_promotion = rtsTrue; @@ -1343,7 +1445,7 @@ init_gc_thread (gc_thread *t) -------------------------------------------------------------------------- */ static void -mark_root(void *user, StgClosure **root) +mark_root(void *user USED_IF_THREADS, StgClosure **root) { // we stole a register for gct, but this function is called from // *outside* the GC where the register variable is not in effect, @@ -1352,11 +1454,11 @@ mark_root(void *user, StgClosure **root) // incorrect. gc_thread *saved_gct; saved_gct = gct; - gct = user; + SET_GCT(user); evacuate(root); - gct = saved_gct; + SET_GCT(saved_gct); } /* ----------------------------------------------------------------------------- @@ -1425,13 +1527,18 @@ resize_generations (void) nat g; if (major_gc && RtsFlags.GcFlags.generations > 1) { - nat live, size, min_alloc; + nat live, size, min_alloc, words; nat max = RtsFlags.GcFlags.maxHeapSize; nat gens = RtsFlags.GcFlags.generations; // live in the oldest generations - live = (oldest_gen->steps[0].n_words + BLOCK_SIZE_W - 1) / BLOCK_SIZE_W+ - oldest_gen->steps[0].n_large_blocks; + if (oldest_gen->steps[0].live_estimate != 0) { + words = oldest_gen->steps[0].live_estimate; + } else { + words = oldest_gen->steps[0].n_words; + } + live = (words + BLOCK_SIZE_W - 1) / BLOCK_SIZE_W + + oldest_gen->steps[0].n_large_blocks; // default max size for all generations except zero size = stg_max(live * RtsFlags.GcFlags.oldGenFactor, @@ -1448,13 +1555,19 @@ resize_generations (void) (max > 0 && oldest_gen->steps[0].n_blocks > (RtsFlags.GcFlags.compactThreshold * max) / 100))) { - oldest_gen->steps[0].is_compacted = 1; + oldest_gen->steps[0].mark = 1; + oldest_gen->steps[0].compact = 1; // debugBelch("compaction: on\n", live); } else { - oldest_gen->steps[0].is_compacted = 0; + oldest_gen->steps[0].mark = 0; + oldest_gen->steps[0].compact = 0; // debugBelch("compaction: off\n", live); } + if (RtsFlags.GcFlags.sweep) { + oldest_gen->steps[0].mark = 1; + } + // if we're going to go over the maximum heap size, reduce the // size of the generations accordingly. The calculation is // different if compaction is turned on, because we don't need @@ -1468,7 +1581,7 @@ resize_generations (void) heapOverflow(); } - if (oldest_gen->steps[0].is_compacted) { + if (oldest_gen->steps[0].compact) { if ( (size + (size - 1) * (gens - 2) * 2) + min_alloc > max ) { size = (max - min_alloc) / ((gens - 1) * 2 - 1); }