From 1fb38442d3a55ac92795aa6c5ed4df82011df724 Mon Sep 17 00:00:00 2001 From: Simon Marlow Date: Mon, 11 Apr 2011 14:48:49 +0100 Subject: [PATCH] Refactoring and tidy up This is a port of some of the changes from my private local-GC branch (which is still in darcs, I haven't converted it to git yet). There are a couple of small functional differences in the GC stats: first, per-thread GC timings should now be more accurate, and secondly we now report average and maximum pause times. e.g. from minimax +RTS -N8 -s: Tot time (elapsed) Avg pause Max pause Gen 0 2755 colls, 2754 par 13.16s 0.93s 0.0003s 0.0150s Gen 1 769 colls, 769 par 3.71s 0.26s 0.0003s 0.0059s --- rts/Capability.c | 46 +++--- rts/Capability.h | 6 +- rts/STM.c | 13 +- rts/STM.h | 2 +- rts/Schedule.c | 10 ++ rts/Schedule.h | 1 + rts/Stats.c | 460 ++++++++++++++++++++++++++--------------------------- rts/Stats.h | 13 +- rts/Task.c | 19 ++- rts/Task.h | 3 + rts/sm/Compact.c | 2 + rts/sm/Evac.c | 1 + rts/sm/GC.c | 100 +++++++----- rts/sm/GCAux.c | 2 +- rts/sm/GCTDecl.h | 98 ++++++++++++ rts/sm/GCThread.h | 96 ++--------- rts/sm/GCUtils.c | 1 + rts/sm/GCUtils.h | 2 + rts/sm/MarkWeak.c | 1 + 19 files changed, 465 insertions(+), 411 deletions(-) create mode 100644 rts/sm/GCTDecl.h diff --git a/rts/Capability.c b/rts/Capability.c index bffb735..9091fdd 100644 --- a/rts/Capability.c +++ b/rts/Capability.c @@ -842,11 +842,9 @@ freeCapabilities (void) ------------------------------------------------------------------------ */ void -markSomeCapabilities (evac_fn evac, void *user, nat i0, nat delta, - rtsBool no_mark_sparks USED_IF_THREADS) +markCapability (evac_fn evac, void *user, Capability *cap, + rtsBool no_mark_sparks USED_IF_THREADS) { - nat i; - Capability *cap; InCall *incall; // Each GC thread is responsible for following roots from the @@ -854,39 +852,31 @@ markSomeCapabilities (evac_fn evac, void *user, nat i0, nat delta, // or fewer Capabilities as GC threads, but just in case there // are more, we 
mark every Capability whose number is the GC // thread's index plus a multiple of the number of GC threads. - for (i = i0; i < n_capabilities; i += delta) { - cap = &capabilities[i]; - evac(user, (StgClosure **)(void *)&cap->run_queue_hd); - evac(user, (StgClosure **)(void *)&cap->run_queue_tl); + evac(user, (StgClosure **)(void *)&cap->run_queue_hd); + evac(user, (StgClosure **)(void *)&cap->run_queue_tl); #if defined(THREADED_RTS) - evac(user, (StgClosure **)(void *)&cap->inbox); + evac(user, (StgClosure **)(void *)&cap->inbox); #endif - for (incall = cap->suspended_ccalls; incall != NULL; - incall=incall->next) { - evac(user, (StgClosure **)(void *)&incall->suspended_tso); - } + for (incall = cap->suspended_ccalls; incall != NULL; + incall=incall->next) { + evac(user, (StgClosure **)(void *)&incall->suspended_tso); + } #if defined(THREADED_RTS) - if (!no_mark_sparks) { - traverseSparkQueue (evac, user, cap); - } -#endif + if (!no_mark_sparks) { + traverseSparkQueue (evac, user, cap); } +#endif -#if !defined(THREADED_RTS) - evac(user, (StgClosure **)(void *)&blocked_queue_hd); - evac(user, (StgClosure **)(void *)&blocked_queue_tl); - evac(user, (StgClosure **)(void *)&sleeping_queue); -#endif + // Free STM structures for this Capability + stmPreGCHook(cap); } void markCapabilities (evac_fn evac, void *user) { - markSomeCapabilities(evac, user, 0, 1, rtsFalse); + nat n; + for (n = 0; n < n_capabilities; n++) { + markCapability(evac, user, &capabilities[n], rtsFalse); + } } - -/* ----------------------------------------------------------------------------- - Messages - -------------------------------------------------------------------------- */ - diff --git a/rts/Capability.h b/rts/Capability.h index 2daade8..d580a83 100644 --- a/rts/Capability.h +++ b/rts/Capability.h @@ -278,9 +278,11 @@ INLINE_HEADER void contextSwitchCapability(Capability *cap); void freeCapabilities (void); // For the GC: -void markSomeCapabilities (evac_fn evac, void *user, nat i0, nat 
delta, - rtsBool no_mark_sparks); +void markCapability (evac_fn evac, void *user, Capability *cap, + rtsBool no_mark_sparks USED_IF_THREADS); + void markCapabilities (evac_fn evac, void *user); + void traverseSparkQueues (evac_fn evac, void *user); /* ----------------------------------------------------------------------------- diff --git a/rts/STM.c b/rts/STM.c index 3de42e2..e8d3fc0 100644 --- a/rts/STM.c +++ b/rts/STM.c @@ -879,17 +879,12 @@ static StgBool check_read_only(StgTRecHeader *trec STG_UNUSED) { /************************************************************************/ -void stmPreGCHook() { - nat i; - +void stmPreGCHook (Capability *cap) { lock_stm(NO_TREC); TRACE("stmPreGCHook"); - for (i = 0; i < n_capabilities; i ++) { - Capability *cap = &capabilities[i]; - cap -> free_tvar_watch_queues = END_STM_WATCH_QUEUE; - cap -> free_trec_chunks = END_STM_CHUNK_LIST; - cap -> free_trec_headers = NO_TREC; - } + cap->free_tvar_watch_queues = END_STM_WATCH_QUEUE; + cap->free_trec_chunks = END_STM_CHUNK_LIST; + cap->free_trec_headers = NO_TREC; unlock_stm(NO_TREC); } diff --git a/rts/STM.h b/rts/STM.h index f15a681..dd11bb8 100644 --- a/rts/STM.h +++ b/rts/STM.h @@ -48,7 +48,7 @@ -------------- */ -void stmPreGCHook(void); +void stmPreGCHook(Capability *cap); /*---------------------------------------------------------------------- diff --git a/rts/Schedule.c b/rts/Schedule.c index 382ba97..f5cb568 100644 --- a/rts/Schedule.c +++ b/rts/Schedule.c @@ -2069,6 +2069,16 @@ freeScheduler( void ) #endif } +void markScheduler (evac_fn evac USED_IF_NOT_THREADS, + void *user USED_IF_NOT_THREADS) +{ +#if !defined(THREADED_RTS) + evac(user, (StgClosure **)(void *)&blocked_queue_hd); + evac(user, (StgClosure **)(void *)&blocked_queue_tl); + evac(user, (StgClosure **)(void *)&sleeping_queue); +#endif +} + /* ----------------------------------------------------------------------------- performGC diff --git a/rts/Schedule.h b/rts/Schedule.h index edba8f5..549f555 100644 --- 
a/rts/Schedule.h +++ b/rts/Schedule.h @@ -23,6 +23,7 @@ void initScheduler (void); void exitScheduler (rtsBool wait_foreign); void freeScheduler (void); +void markScheduler (evac_fn evac, void *user); // Place a new thread on the run queue of the current Capability void scheduleThread (Capability *cap, StgTSO *tso); diff --git a/rts/Stats.c b/rts/Stats.c index 4b9f6d8..159a909 100644 --- a/rts/Stats.c +++ b/rts/Stats.c @@ -16,6 +16,8 @@ #include "GetTime.h" #include "sm/Storage.h" #include "sm/GC.h" // gc_alloc_block_sync, whitehole_spin +#include "sm/GCThread.h" +#include "sm/BlockAlloc.h" #if USE_PAPI #include "Papi.h" @@ -26,31 +28,23 @@ #define TICK_TO_DBL(t) ((double)(t) / TICKS_PER_SECOND) -static Ticks ElapsedTimeStart = 0; +static Ticks + start_init_cpu, start_init_elapsed, + end_init_cpu, end_init_elapsed, + start_exit_cpu, start_exit_elapsed, + end_exit_cpu, end_exit_elapsed; -static Ticks InitUserTime = 0; -static Ticks InitElapsedTime = 0; -static Ticks InitElapsedStamp = 0; +static Ticks GC_tot_cpu = 0; -static Ticks MutUserTime = 0; -static Ticks MutElapsedTime = 0; -static Ticks MutElapsedStamp = 0; - -static Ticks ExitUserTime = 0; -static Ticks ExitElapsedTime = 0; - -static StgWord64 GC_tot_alloc = 0; -static StgWord64 GC_tot_copied = 0; +static StgWord64 GC_tot_alloc = 0; +static StgWord64 GC_tot_copied = 0; static StgWord64 GC_par_max_copied = 0; static StgWord64 GC_par_avg_copied = 0; -static Ticks GC_start_time = 0, GC_tot_time = 0; /* User GC Time */ -static Ticks GCe_start_time = 0, GCe_tot_time = 0; /* Elapsed GC time */ - #ifdef PROFILING -static Ticks RP_start_time = 0, RP_tot_time = 0; /* retainer prof user time */ -static Ticks RPe_start_time = 0, RPe_tot_time = 0; /* retainer prof elap time */ +static Ticks RP_start_time = 0, RP_tot_time = 0; // retainer prof user time +static Ticks RPe_start_time = 0, RPe_tot_time = 0; // retainer prof elap time static Ticks HC_start_time, HC_tot_time = 0; // heap census prof user time static Ticks 
HCe_start_time, HCe_tot_time = 0; // heap census prof elap time @@ -62,99 +56,81 @@ static Ticks HCe_start_time, HCe_tot_time = 0; // heap census prof elap time #define PROF_VAL(x) 0 #endif -static lnat MaxResidency = 0; // in words; for stats only -static lnat AvgResidency = 0; -static lnat ResidencySamples = 0; // for stats only -static lnat MaxSlop = 0; +static lnat max_residency = 0; // in words; for stats only +static lnat avg_residency = 0; +static lnat residency_samples = 0; // for stats only +static lnat max_slop = 0; -static lnat GC_start_faults = 0, GC_end_faults = 0; +static lnat GC_end_faults = 0; -static Ticks *GC_coll_times = NULL; -static Ticks *GC_coll_etimes = NULL; +static Ticks *GC_coll_cpu = NULL; +static Ticks *GC_coll_elapsed = NULL; +static Ticks *GC_coll_max_pause = NULL; static void statsFlush( void ); static void statsClose( void ); -Ticks stat_getElapsedGCTime(void) -{ - return GCe_tot_time; -} +/* ----------------------------------------------------------------------------- + Current elapsed time + ------------------------------------------------------------------------- */ Ticks stat_getElapsedTime(void) { - return getProcessElapsedTime() - ElapsedTimeStart; + return getProcessElapsedTime() - start_init_elapsed; } -/* mut_user_time_during_GC() and mut_user_time() - * - * The former function can be used to get the current mutator time - * *during* a GC, i.e. between stat_startGC and stat_endGC. This is - * used in the heap profiler for accurately time stamping the heap - * sample. 
- * - * ATTENTION: mut_user_time_during_GC() relies on GC_start_time being - * defined in stat_startGC() - to minimise system calls, - * GC_start_time is, however, only defined when really needed (check - * stat_startGC() for details) - */ -double -mut_user_time_during_GC( void ) -{ - return TICK_TO_DBL(GC_start_time - GC_tot_time - PROF_VAL(RP_tot_time + HC_tot_time)); -} +/* --------------------------------------------------------------------------- + Measure the current MUT time, for profiling + ------------------------------------------------------------------------ */ double mut_user_time( void ) { - Ticks user; - user = getProcessCPUTime(); - return TICK_TO_DBL(user - GC_tot_time - PROF_VAL(RP_tot_time + HC_tot_time)); + Ticks cpu; + cpu = getProcessCPUTime(); + return TICK_TO_DBL(cpu - GC_tot_cpu - PROF_VAL(RP_tot_time + HC_tot_time)); } #ifdef PROFILING /* - mut_user_time_during_RP() is similar to mut_user_time_during_GC(); - it returns the MUT time during retainer profiling. + mut_user_time_during_RP() returns the MUT time during retainer profiling. 
The same is for mut_user_time_during_HC(); */ double mut_user_time_during_RP( void ) { - return TICK_TO_DBL(RP_start_time - GC_tot_time - RP_tot_time - HC_tot_time); + return TICK_TO_DBL(RP_start_time - GC_tot_cpu - RP_tot_time - HC_tot_time); } double mut_user_time_during_heap_census( void ) { - return TICK_TO_DBL(HC_start_time - GC_tot_time - RP_tot_time - HC_tot_time); + return TICK_TO_DBL(HC_start_time - GC_tot_cpu - RP_tot_time - HC_tot_time); } #endif /* PROFILING */ -// initStats0() has no dependencies, it can be called right at the beginning +/* --------------------------------------------------------------------------- + initStats0() has no dependencies, it can be called right at the beginning + ------------------------------------------------------------------------ */ + void initStats0(void) { - ElapsedTimeStart = 0; - - InitUserTime = 0; - InitElapsedTime = 0; - InitElapsedStamp = 0; + start_init_cpu = 0; + start_init_elapsed = 0; + end_init_cpu = 0; + end_init_elapsed = 0; - MutUserTime = 0; - MutElapsedTime = 0; - MutElapsedStamp = 0; - - ExitUserTime = 0; - ExitElapsedTime = 0; + start_exit_cpu = 0; + start_exit_elapsed = 0; + end_exit_cpu = 0; + end_exit_elapsed = 0; GC_tot_alloc = 0; GC_tot_copied = 0; GC_par_max_copied = 0; GC_par_avg_copied = 0; - GC_start_time = 0; - GC_tot_time = 0; - GCe_start_time = 0; - GCe_tot_time = 0; + GC_tot_cpu = 0; #ifdef PROFILING RP_start_time = 0; @@ -168,16 +144,18 @@ initStats0(void) HCe_tot_time = 0; #endif - MaxResidency = 0; - AvgResidency = 0; - ResidencySamples = 0; - MaxSlop = 0; + max_residency = 0; + avg_residency = 0; + residency_samples = 0; + max_slop = 0; - GC_start_faults = 0; GC_end_faults = 0; } -// initStats1() can be called after setupRtsFlags() +/* --------------------------------------------------------------------------- + initStats1() can be called after setupRtsFlags() + ------------------------------------------------------------------------ */ + void initStats1 (void) { @@ -187,17 +165,22 
@@ initStats1 (void) statsPrintf(" Alloc Copied Live GC GC TOT TOT Page Flts\n"); statsPrintf(" bytes bytes bytes user elap user elap\n"); } - GC_coll_times = + GC_coll_cpu = + (Ticks *)stgMallocBytes( + sizeof(Ticks)*RtsFlags.GcFlags.generations, + "initStats"); + GC_coll_elapsed = (Ticks *)stgMallocBytes( sizeof(Ticks)*RtsFlags.GcFlags.generations, "initStats"); - GC_coll_etimes = + GC_coll_max_pause = (Ticks *)stgMallocBytes( sizeof(Ticks)*RtsFlags.GcFlags.generations, "initStats"); for (i = 0; i < RtsFlags.GcFlags.generations; i++) { - GC_coll_times[i] = 0; - GC_coll_etimes[i] = 0; + GC_coll_cpu[i] = 0; + GC_coll_elapsed[i] = 0; + GC_coll_max_pause[i] = 0; } } @@ -208,26 +191,14 @@ initStats1 (void) void stat_startInit(void) { - Ticks elapsed; - - elapsed = getProcessElapsedTime(); - ElapsedTimeStart = elapsed; + getProcessTimes(&start_init_cpu, &start_init_elapsed); } void stat_endInit(void) { - Ticks user, elapsed; - - getProcessTimes(&user, &elapsed); + getProcessTimes(&end_init_cpu, &end_init_elapsed); - InitUserTime = user; - InitElapsedStamp = elapsed; - if (ElapsedTimeStart > elapsed) { - InitElapsedTime = 0; - } else { - InitElapsedTime = elapsed - ElapsedTimeStart; - } #if USE_PAPI /* We start counting events for the mutator * when garbage collection starts @@ -249,18 +220,7 @@ stat_endInit(void) void stat_startExit(void) { - Ticks user, elapsed; - - getProcessTimes(&user, &elapsed); - - MutElapsedStamp = elapsed; - MutElapsedTime = elapsed - GCe_tot_time - - PROF_VAL(RPe_tot_time + HCe_tot_time) - InitElapsedStamp; - if (MutElapsedTime < 0) { MutElapsedTime = 0; } /* sometimes -0.00 */ - - MutUserTime = user - GC_tot_time - - PROF_VAL(RP_tot_time + HC_tot_time) - InitUserTime; - if (MutUserTime < 0) { MutUserTime = 0; } + getProcessTimes(&start_exit_cpu, &start_exit_elapsed); #if USE_PAPI /* We stop counting mutator events @@ -269,25 +229,13 @@ stat_startExit(void) /* This flag is needed, because GC is run once more after this function */ 
papi_is_reporting = 0; - #endif } void stat_endExit(void) { - Ticks user, elapsed; - - getProcessTimes(&user, &elapsed); - - ExitUserTime = user - MutUserTime - GC_tot_time - PROF_VAL(RP_tot_time + HC_tot_time) - InitUserTime; - ExitElapsedTime = elapsed - MutElapsedStamp; - if (ExitUserTime < 0) { - ExitUserTime = 0; - } - if (ExitElapsedTime < 0) { - ExitElapsedTime = 0; - } + getProcessTimes(&end_exit_cpu, &end_exit_elapsed); } /* ----------------------------------------------------------------------------- @@ -296,13 +244,8 @@ stat_endExit(void) static nat rub_bell = 0; -/* initialise global variables needed during GC - * - * * GC_start_time is read in mut_user_time_during_GC(), which in turn is - * needed if either PROFILING or DEBUGing is enabled - */ void -stat_startGC(void) +stat_startGC (gc_thread *gct) { nat bell = RtsFlags.GcFlags.ringBell; @@ -315,16 +258,6 @@ stat_startGC(void) } } - if (RtsFlags.GcFlags.giveStats != NO_GC_STATS - || RtsFlags.ProfFlags.doHeapProfile) - // heap profiling needs GC_tot_time - { - getProcessTimes(&GC_start_time, &GCe_start_time); - if (RtsFlags.GcFlags.giveStats) { - GC_start_faults = getPageFaults(); - } - } - #if USE_PAPI if(papi_is_reporting) { /* Switch to counting GC events */ @@ -333,6 +266,40 @@ stat_startGC(void) } #endif + getProcessTimes(&gct->gc_start_cpu, &gct->gc_start_elapsed); + gct->gc_start_thread_cpu = getThreadCPUTime(); + + if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) + { + gct->gc_start_faults = getPageFaults(); + } +} + +void +stat_gcWorkerThreadStart (gc_thread *gct) +{ + if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) + { + getProcessTimes(&gct->gc_start_cpu, &gct->gc_start_elapsed); + gct->gc_start_thread_cpu = getThreadCPUTime(); + } +} + +void +stat_gcWorkerThreadDone (gc_thread *gct) +{ + Ticks thread_cpu, elapsed, gc_cpu, gc_elapsed; + + if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) + { + elapsed = getProcessElapsedTime(); + thread_cpu = getThreadCPUTime(); + + gc_cpu = thread_cpu - 
gct->gc_start_thread_cpu; + gc_elapsed = elapsed - gct->gc_start_elapsed; + + taskDoneGC(gct->cap->running_task, gc_cpu, gc_elapsed); + } } /* ----------------------------------------------------------------------------- @@ -340,67 +307,65 @@ stat_startGC(void) -------------------------------------------------------------------------- */ void -stat_endGC (lnat alloc, lnat live, lnat copied, lnat gen, +stat_endGC (gc_thread *gct, + lnat alloc, lnat live, lnat copied, nat gen, lnat max_copied, lnat avg_copied, lnat slop) { if (RtsFlags.GcFlags.giveStats != NO_GC_STATS || RtsFlags.ProfFlags.doHeapProfile) // heap profiling needs GC_tot_time { - Ticks time, etime, gc_time, gc_etime; + Ticks cpu, elapsed, thread_gc_cpu, gc_cpu, gc_elapsed; - getProcessTimes(&time, &etime); - gc_time = time - GC_start_time; - gc_etime = etime - GCe_start_time; - - if (RtsFlags.GcFlags.giveStats == VERBOSE_GC_STATS) { + getProcessTimes(&cpu, &elapsed); + gc_elapsed = elapsed - gct->gc_start_elapsed; + + thread_gc_cpu = getThreadCPUTime() - gct->gc_start_thread_cpu; + + gc_cpu = cpu - gct->gc_start_cpu; + + taskDoneGC(gct->cap->running_task, thread_gc_cpu, gc_elapsed); + + if (RtsFlags.GcFlags.giveStats == VERBOSE_GC_STATS) { nat faults = getPageFaults(); statsPrintf("%9ld %9ld %9ld", alloc*sizeof(W_), copied*sizeof(W_), live*sizeof(W_)); - statsPrintf(" %5.2f %5.2f %7.2f %7.2f %4ld %4ld (Gen: %2ld)\n", - TICK_TO_DBL(gc_time), - TICK_TO_DBL(gc_etime), - TICK_TO_DBL(time), - TICK_TO_DBL(etime - ElapsedTimeStart), - faults - GC_start_faults, - GC_start_faults - GC_end_faults, - gen); - - GC_end_faults = faults; + statsPrintf(" %5.2f %5.2f %7.2f %7.2f %4ld %4ld (Gen: %2d)\n", + TICK_TO_DBL(gc_cpu), + TICK_TO_DBL(gc_elapsed), + TICK_TO_DBL(cpu), + TICK_TO_DBL(elapsed - start_init_elapsed), + faults - gct->gc_start_faults, + gct->gc_start_faults - GC_end_faults, + gen); + + GC_end_faults = faults; statsFlush(); } - GC_coll_times[gen] += gc_time; - GC_coll_etimes[gen] += gc_etime; + 
GC_coll_cpu[gen] += gc_cpu; + GC_coll_elapsed[gen] += gc_elapsed; + if (GC_coll_max_pause[gen] < gc_elapsed) { + GC_coll_max_pause[gen] = gc_elapsed; + } GC_tot_copied += (StgWord64) copied; GC_tot_alloc += (StgWord64) alloc; GC_par_max_copied += (StgWord64) max_copied; GC_par_avg_copied += (StgWord64) avg_copied; - GC_tot_time += gc_time; - GCe_tot_time += gc_etime; - -#if defined(THREADED_RTS) - { - Task *task; - if ((task = myTask()) != NULL) { - task->gc_time += gc_time; - task->gc_etime += gc_etime; - } - } -#endif + GC_tot_cpu += gc_cpu; if (gen == RtsFlags.GcFlags.generations-1) { /* major GC? */ - if (live > MaxResidency) { - MaxResidency = live; + if (live > max_residency) { + max_residency = live; } - ResidencySamples++; - AvgResidency += live; + residency_samples++; + avg_residency += live; } - if (slop > MaxSlop) MaxSlop = slop; + if (slop > max_slop) max_slop = slop; } if (rub_bell) { @@ -539,20 +504,28 @@ StgInt TOTAL_CALLS=1; statsPrintf(" (SLOW_CALLS_" #arity ") %% of (TOTAL_CALLS) : %.1f%%\n", \ SLOW_CALLS_##arity * 100.0/TOTAL_CALLS) -extern lnat hw_alloc_blocks; - void stat_exit(int alloc) { + generation *gen; + Ticks gc_cpu = 0; + Ticks gc_elapsed = 0; + Ticks init_cpu = 0; + Ticks init_elapsed = 0; + Ticks mut_cpu = 0; + Ticks mut_elapsed = 0; + Ticks exit_cpu = 0; + Ticks exit_elapsed = 0; + if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) { char temp[BIG_STRING_LEN]; - Ticks time; - Ticks etime; - nat g, total_collections = 0; + Ticks tot_cpu; + Ticks tot_elapsed; + nat i, g, total_collections = 0; - getProcessTimes( &time, &etime ); - etime -= ElapsedTimeStart; + getProcessTimes( &tot_cpu, &tot_elapsed ); + tot_elapsed -= start_init_elapsed; GC_tot_alloc += alloc; @@ -560,15 +533,20 @@ stat_exit(int alloc) for (g = 0; g < RtsFlags.GcFlags.generations; g++) total_collections += generations[g].collections; - /* avoid divide by zero if time is measured as 0.00 seconds -- SDM */ - if (time == 0.0) time = 1; - if (etime == 0.0) etime = 1; + /* 
avoid divide by zero if tot_cpu is measured as 0.00 seconds -- SDM */ + if (tot_cpu == 0.0) tot_cpu = 1; + if (tot_elapsed == 0.0) tot_elapsed = 1; if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) { statsPrintf("%9ld %9.9s %9.9s", (lnat)alloc*sizeof(W_), "", ""); statsPrintf(" %5.2f %5.2f\n\n", 0.0, 0.0); } + for (i = 0; i < RtsFlags.GcFlags.generations; i++) { + gc_cpu += GC_coll_cpu[i]; + gc_elapsed += GC_coll_elapsed[i]; + } + if (RtsFlags.GcFlags.giveStats >= SUMMARY_GC_STATS) { showStgWord64(GC_tot_alloc*sizeof(W_), temp, rtsTrue/*commas*/); @@ -578,14 +556,14 @@ stat_exit(int alloc) temp, rtsTrue/*commas*/); statsPrintf("%16s bytes copied during GC\n", temp); - if ( ResidencySamples > 0 ) { - showStgWord64(MaxResidency*sizeof(W_), + if ( residency_samples > 0 ) { + showStgWord64(max_residency*sizeof(W_), temp, rtsTrue/*commas*/); statsPrintf("%16s bytes maximum residency (%ld sample(s))\n", - temp, ResidencySamples); + temp, residency_samples); } - showStgWord64(MaxSlop*sizeof(W_), temp, rtsTrue/*commas*/); + showStgWord64(max_slop*sizeof(W_), temp, rtsTrue/*commas*/); statsPrintf("%16s bytes maximum slop\n", temp); statsPrintf("%16ld MB total memory in use (%ld MB lost due to fragmentation)\n\n", @@ -593,13 +571,18 @@ stat_exit(int alloc) (peak_mblocks_allocated * BLOCKS_PER_MBLOCK * BLOCK_SIZE_W - hw_alloc_blocks * BLOCK_SIZE_W) / (1024 * 1024 / sizeof(W_))); /* Print garbage collections in each gen */ - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - statsPrintf(" Generation %d: %5d collections, %5d parallel, %5.2fs, %5.2fs elapsed\n", - g, generations[g].collections, - generations[g].par_collections, - TICK_TO_DBL(GC_coll_times[g]), - TICK_TO_DBL(GC_coll_etimes[g])); - } + statsPrintf(" Tot time (elapsed) Avg pause Max pause\n"); + for (g = 0; g < RtsFlags.GcFlags.generations; g++) { + gen = &generations[g]; + statsPrintf(" Gen %2d %5d colls, %5d par %5.2fs %5.2fs %3.4fs %3.4fs\n", + gen->no, + gen->collections, + gen->par_collections, + 
TICK_TO_DBL(GC_coll_cpu[g]), + TICK_TO_DBL(GC_coll_elapsed[g]), + gen->collections == 0 ? 0 : TICK_TO_DBL(GC_coll_elapsed[g] / gen->collections), + TICK_TO_DBL(GC_coll_max_pause[g])); + } #if defined(THREADED_RTS) if (RtsFlags.ParFlags.parGcEnabled) { @@ -610,8 +593,7 @@ stat_exit(int alloc) ); } #endif - - statsPrintf("\n"); + statsPrintf("\n"); #if defined(THREADED_RTS) { @@ -653,44 +635,60 @@ stat_exit(int alloc) } #endif - statsPrintf(" INIT time %6.2fs (%6.2fs elapsed)\n", - TICK_TO_DBL(InitUserTime), TICK_TO_DBL(InitElapsedTime)); - statsPrintf(" MUT time %6.2fs (%6.2fs elapsed)\n", - TICK_TO_DBL(MutUserTime), TICK_TO_DBL(MutElapsedTime)); - statsPrintf(" GC time %6.2fs (%6.2fs elapsed)\n", - TICK_TO_DBL(GC_tot_time), TICK_TO_DBL(GCe_tot_time)); + init_cpu = end_init_cpu - start_init_cpu; + init_elapsed = end_init_elapsed - start_init_elapsed; + + exit_cpu = end_exit_cpu - start_exit_cpu; + exit_elapsed = end_exit_elapsed - start_exit_elapsed; + + statsPrintf(" INIT time %6.2fs (%6.2fs elapsed)\n", + TICK_TO_DBL(init_cpu), TICK_TO_DBL(init_elapsed)); + + mut_elapsed = start_exit_elapsed - end_init_elapsed - gc_elapsed; + + mut_cpu = start_exit_cpu - end_init_cpu - gc_cpu + - PROF_VAL(RP_tot_time + HC_tot_time); + if (mut_cpu < 0) { mut_cpu = 0; } + + statsPrintf(" MUT time %6.2fs (%6.2fs elapsed)\n", + TICK_TO_DBL(mut_cpu), TICK_TO_DBL(mut_elapsed)); + statsPrintf(" GC time %6.2fs (%6.2fs elapsed)\n", + TICK_TO_DBL(gc_cpu), TICK_TO_DBL(gc_elapsed)); + #ifdef PROFILING - statsPrintf(" RP time %6.2fs (%6.2fs elapsed)\n", + statsPrintf(" RP time %6.2fs (%6.2fs elapsed)\n", TICK_TO_DBL(RP_tot_time), TICK_TO_DBL(RPe_tot_time)); - statsPrintf(" PROF time %6.2fs (%6.2fs elapsed)\n", + statsPrintf(" PROF time %6.2fs (%6.2fs elapsed)\n", TICK_TO_DBL(HC_tot_time), TICK_TO_DBL(HCe_tot_time)); #endif - statsPrintf(" EXIT time %6.2fs (%6.2fs elapsed)\n", - TICK_TO_DBL(ExitUserTime), TICK_TO_DBL(ExitElapsedTime)); - statsPrintf(" Total time %6.2fs (%6.2fs elapsed)\n\n", - 
TICK_TO_DBL(time), TICK_TO_DBL(etime)); - statsPrintf(" %%GC time %5.1f%% (%.1f%% elapsed)\n\n", - TICK_TO_DBL(GC_tot_time)*100/TICK_TO_DBL(time), - TICK_TO_DBL(GCe_tot_time)*100/TICK_TO_DBL(etime)); - - if (time - GC_tot_time - PROF_VAL(RP_tot_time + HC_tot_time) == 0) + statsPrintf(" EXIT time %6.2fs (%6.2fs elapsed)\n", + TICK_TO_DBL(exit_cpu), TICK_TO_DBL(exit_elapsed)); + statsPrintf(" Total time %6.2fs (%6.2fs elapsed)\n\n", + TICK_TO_DBL(tot_cpu), TICK_TO_DBL(tot_elapsed)); +#ifndef THREADED_RTS + statsPrintf(" %%GC time %5.1f%% (%.1f%% elapsed)\n\n", + TICK_TO_DBL(gc_cpu)*100/TICK_TO_DBL(tot_cpu), + TICK_TO_DBL(gc_elapsed)*100/TICK_TO_DBL(tot_elapsed)); +#endif + + if (tot_cpu - GC_tot_cpu - PROF_VAL(RP_tot_time + HC_tot_time) == 0) showStgWord64(0, temp, rtsTrue/*commas*/); else showStgWord64( (StgWord64)((GC_tot_alloc*sizeof(W_))/ - TICK_TO_DBL(time - GC_tot_time - + TICK_TO_DBL(tot_cpu - GC_tot_cpu - PROF_VAL(RP_tot_time + HC_tot_time))), temp, rtsTrue/*commas*/); statsPrintf(" Alloc rate %s bytes per MUT second\n\n", temp); statsPrintf(" Productivity %5.1f%% of total user, %.1f%% of total elapsed\n\n", - TICK_TO_DBL(time - GC_tot_time - - PROF_VAL(RP_tot_time + HC_tot_time) - InitUserTime) * 100 - / TICK_TO_DBL(time), - TICK_TO_DBL(time - GC_tot_time - - PROF_VAL(RP_tot_time + HC_tot_time) - InitUserTime) * 100 - / TICK_TO_DBL(etime)); + TICK_TO_DBL(tot_cpu - GC_tot_cpu - + PROF_VAL(RP_tot_time + HC_tot_time) - init_cpu) * 100 + / TICK_TO_DBL(tot_cpu), + TICK_TO_DBL(tot_cpu - GC_tot_cpu - + PROF_VAL(RP_tot_time + HC_tot_time) - init_cpu) * 100 + / TICK_TO_DBL(tot_elapsed)); /* TICK_PRINT(1); @@ -741,26 +739,26 @@ stat_exit(int alloc) statsPrintf(fmt1, GC_tot_alloc*(StgWord64)sizeof(W_)); statsPrintf(fmt2, total_collections, - ResidencySamples == 0 ? 0 : - AvgResidency*sizeof(W_)/ResidencySamples, - MaxResidency*sizeof(W_), - ResidencySamples, + residency_samples == 0 ? 
0 : + avg_residency*sizeof(W_)/residency_samples, + max_residency*sizeof(W_), + residency_samples, (unsigned long)(peak_mblocks_allocated * MBLOCK_SIZE / (1024L * 1024L)), - TICK_TO_DBL(InitUserTime), TICK_TO_DBL(InitElapsedTime), - TICK_TO_DBL(MutUserTime), TICK_TO_DBL(MutElapsedTime), - TICK_TO_DBL(GC_tot_time), TICK_TO_DBL(GCe_tot_time)); + TICK_TO_DBL(init_cpu), TICK_TO_DBL(init_elapsed), + TICK_TO_DBL(mut_cpu), TICK_TO_DBL(mut_elapsed), + TICK_TO_DBL(gc_cpu), TICK_TO_DBL(gc_elapsed)); } statsFlush(); statsClose(); } - if (GC_coll_times) - stgFree(GC_coll_times); - GC_coll_times = NULL; - if (GC_coll_etimes) - stgFree(GC_coll_etimes); - GC_coll_etimes = NULL; + if (GC_coll_cpu) + stgFree(GC_coll_cpu); + GC_coll_cpu = NULL; + if (GC_coll_elapsed) + stgFree(GC_coll_elapsed); + GC_coll_elapsed = NULL; } /* ----------------------------------------------------------------------------- diff --git a/rts/Stats.h b/rts/Stats.h index f3a20ae..0c51787 100644 --- a/rts/Stats.h +++ b/rts/Stats.h @@ -13,13 +13,18 @@ #include "BeginPrivate.h" +struct gc_thread_; + void stat_startInit(void); void stat_endInit(void); -void stat_startGC(void); -void stat_endGC (lnat alloc, lnat live, - lnat copied, lnat gen, - lnat max_copied, lnat avg_copied, lnat slop); +void stat_startGC(struct gc_thread_ *gct); +void stat_endGC (struct gc_thread_ *gct, lnat alloc, lnat live, + lnat copied, nat gen, + lnat max_copied, lnat avg_copied, lnat slop); + +void stat_gcWorkerThreadStart (struct gc_thread_ *gct); +void stat_gcWorkerThreadDone (struct gc_thread_ *gct); #ifdef PROFILING void stat_startRP(void); diff --git a/rts/Task.c b/rts/Task.c index a5de804..e77a030 100644 --- a/rts/Task.c +++ b/rts/Task.c @@ -318,25 +318,30 @@ void taskTimeStamp (Task *task USED_IF_THREADS) { #if defined(THREADED_RTS) - Ticks currentElapsedTime, currentUserTime, elapsedGCTime; + Ticks currentElapsedTime, currentUserTime; currentUserTime = getThreadCPUTime(); currentElapsedTime = getProcessElapsedTime(); - // XXX 
this is wrong; we want elapsed GC time since the - // Task started. - elapsedGCTime = stat_getElapsedGCTime(); - - task->mut_time = + task->mut_time = currentUserTime - task->muttimestart - task->gc_time; task->mut_etime = - currentElapsedTime - task->elapsedtimestart - elapsedGCTime; + currentElapsedTime - task->elapsedtimestart - task->gc_etime; + if (task->gc_time < 0) { task->gc_time = 0; } + if (task->gc_etime < 0) { task->gc_etime = 0; } if (task->mut_time < 0) { task->mut_time = 0; } if (task->mut_etime < 0) { task->mut_etime = 0; } #endif } +void +taskDoneGC (Task *task, Ticks cpu_time, Ticks elapsed_time) +{ + task->gc_time += cpu_time; + task->gc_etime += elapsed_time; +} + #if defined(THREADED_RTS) void diff --git a/rts/Task.h b/rts/Task.h index 38e4763..424af60 100644 --- a/rts/Task.h +++ b/rts/Task.h @@ -207,6 +207,9 @@ void workerTaskStop (Task *task); // void taskTimeStamp (Task *task); +// The current Task has finished a GC, record the amount of time spent. +void taskDoneGC (Task *task, Ticks cpu_time, Ticks elapsed_time); + // Put the task back on the free list, mark it stopped. Used by // forkProcess(). // diff --git a/rts/sm/Compact.c b/rts/sm/Compact.c index ff7480c..1b57c53 100644 --- a/rts/sm/Compact.c +++ b/rts/sm/Compact.c @@ -942,6 +942,8 @@ compact(StgClosure *static_objects) // 1. thread the roots markCapabilities((evac_fn)thread_root, NULL); + markScheduler((evac_fn)thread_root, NULL); + // the weak pointer lists... 
if (weak_ptr_list != NULL) { thread((void *)&weak_ptr_list); diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c index d049f98..fdb5477 100644 --- a/rts/sm/Evac.c +++ b/rts/sm/Evac.c @@ -18,6 +18,7 @@ #include "Storage.h" #include "GC.h" #include "GCThread.h" +#include "GCTDecl.h" #include "GCUtils.h" #include "Compact.h" #include "MarkStack.h" diff --git a/rts/sm/GC.c b/rts/sm/GC.c index 4ba05bf..d0dd44d 100644 --- a/rts/sm/GC.c +++ b/rts/sm/GC.c @@ -40,6 +40,7 @@ #include "GC.h" #include "GCThread.h" +#include "GCTDecl.h" #include "Compact.h" #include "Evac.h" #include "Scav.h" @@ -146,8 +147,8 @@ static void start_gc_threads (void); static void scavenge_until_all_done (void); static StgWord inc_running (void); static StgWord dec_running (void); -static void wakeup_gc_threads (nat n_threads, nat me); -static void shutdown_gc_threads (nat n_threads, nat me); +static void wakeup_gc_threads (nat me); +static void shutdown_gc_threads (nat me); static void collect_gct_blocks (void); #if 0 && defined(DEBUG) @@ -177,7 +178,7 @@ GarbageCollect (rtsBool force_major_gc, generation *gen; lnat live_blocks, live_words, allocated, max_copied, avg_copied; gc_thread *saved_gct; - nat g, t, n; + nat g, n; // necessary if we stole a callee-saves register for gct: saved_gct = gct; @@ -198,11 +199,11 @@ GarbageCollect (rtsBool force_major_gc, ASSERT(sizeof(gen_workspace) == 16 * sizeof(StgWord)); // otherwise adjust the padding in gen_workspace. 
- // tell the stats department that we've started a GC - stat_startGC(); + // this is the main thread + SET_GCT(gc_threads[cap->no]); - // tell the STM to discard any cached closures it's hoping to re-use - stmPreGCHook(); + // tell the stats department that we've started a GC + stat_startGC(gct); // lock the StablePtr table stablePtrPreGC(); @@ -277,11 +278,6 @@ GarbageCollect (rtsBool force_major_gc, // check sanity *before* GC IF_DEBUG(sanity, checkSanity(rtsFalse /* before GC */, major_gc)); - // Initialise all our gc_thread structures - for (t = 0; t < n_gc_threads; t++) { - init_gc_thread(gc_threads[t]); - } - // Initialise all the generations/steps that we're collecting. for (g = 0; g <= N; g++) { prepare_collected_gen(&generations[g]); @@ -291,6 +287,9 @@ GarbageCollect (rtsBool force_major_gc, prepare_uncollected_gen(&generations[g]); } + // Prepare this gc_thread + init_gc_thread(gct); + /* Allocate a mark stack if we're doing a major collection. */ if (major_gc && oldest_gen->mark) { @@ -305,17 +304,6 @@ GarbageCollect (rtsBool force_major_gc, mark_sp = NULL; } - // this is the main thread -#ifdef THREADED_RTS - if (n_gc_threads == 1) { - SET_GCT(gc_threads[0]); - } else { - SET_GCT(gc_threads[cap->no]); - } -#else -SET_GCT(gc_threads[0]); -#endif - /* ----------------------------------------------------------------------- * follow all the roots that we know about: */ @@ -325,7 +313,9 @@ SET_GCT(gc_threads[0]); // NB. do this after the mutable lists have been saved above, otherwise // the other GC threads will be writing into the old mutable lists. inc_running(); - wakeup_gc_threads(n_gc_threads, gct->thread_index); + wakeup_gc_threads(gct->thread_index); + + traceEventGcWork(gct->cap); // scavenge the capability-private mutable lists. 
This isn't part // of markSomeCapabilities() because markSomeCapabilities() can only @@ -340,7 +330,7 @@ SET_GCT(gc_threads[0]); #endif } } else { - scavenge_capability_mut_lists(&capabilities[gct->thread_index]); + scavenge_capability_mut_lists(gct->cap); } // follow roots from the CAF list (used by GHCi) @@ -349,8 +339,16 @@ SET_GCT(gc_threads[0]); // follow all the roots that the application knows about. gct->evac_gen_no = 0; - markSomeCapabilities(mark_root, gct, gct->thread_index, n_gc_threads, - rtsTrue/*prune sparks*/); + if (n_gc_threads == 1) { + for (n = 0; n < n_capabilities; n++) { + markCapability(mark_root, gct, &capabilities[n], + rtsTrue/*don't mark sparks*/); + } + } else { + markCapability(mark_root, gct, cap, rtsTrue/*don't mark sparks*/); + } + + markScheduler(mark_root, gct); #if defined(RTS_USER_SIGNALS) // mark the signal handlers (signals should be already blocked) @@ -385,7 +383,7 @@ SET_GCT(gc_threads[0]); break; } - shutdown_gc_threads(n_gc_threads, gct->thread_index); + shutdown_gc_threads(gct->thread_index); // Now see which stable names are still alive. gcStablePtrTable(); @@ -396,7 +394,7 @@ SET_GCT(gc_threads[0]); pruneSparkQueue(&capabilities[n]); } } else { - pruneSparkQueue(&capabilities[gct->thread_index]); + pruneSparkQueue(gct->cap); } #endif @@ -713,7 +711,8 @@ SET_GCT(gc_threads[0]); #endif // ok, GC over: tell the stats department what happened. 
- stat_endGC(allocated, live_words, copied, N, max_copied, avg_copied, + stat_endGC(gct, allocated, live_words, + copied, N, max_copied, avg_copied, live_blocks * BLOCK_SIZE_W - live_words /* slop */); // Guess which generation we'll collect *next* time @@ -787,6 +786,8 @@ new_gc_thread (nat n, gc_thread *t) nat g; gen_workspace *ws; + t->cap = &capabilities[n]; + #ifdef THREADED_RTS t->id = 0; initSpinLock(&t->gc_spin); @@ -970,8 +971,6 @@ scavenge_until_all_done (void) loop: - traceEventGcWork(&capabilities[gct->thread_index]); - #if defined(THREADED_RTS) if (n_gc_threads > 1) { scavenge_loop(); @@ -987,7 +986,7 @@ loop: // scavenge_loop() only exits when there's no work to do r = dec_running(); - traceEventGcIdle(&capabilities[gct->thread_index]); + traceEventGcIdle(gct->cap); debugTrace(DEBUG_gc, "%d GC threads still running", r); @@ -995,6 +994,7 @@ loop: // usleep(1); if (any_work()) { inc_running(); + traceEventGcWork(gct->cap); goto loop; } // any_work() does not remove the work from the queue, it @@ -1003,7 +1003,7 @@ loop: // scavenge_loop() to perform any pending work. } - traceEventGcDone(&capabilities[gct->thread_index]); + traceEventGcDone(gct->cap); } #if defined(THREADED_RTS) @@ -1019,6 +1019,8 @@ gcWorkerThread (Capability *cap) gct = gc_threads[cap->no]; gct->id = osThreadId(); + stat_gcWorkerThreadStart(gct); + // Wait until we're told to wake up RELEASE_SPIN_LOCK(&gct->mut_spin); gct->wakeup = GC_THREAD_STANDING_BY; @@ -1032,12 +1034,15 @@ gcWorkerThread (Capability *cap) } papi_thread_start_gc1_count(gct->papi_events); #endif - + + init_gc_thread(gct); + + traceEventGcWork(gct->cap); + // Every thread evacuates some roots. 
gct->evac_gen_no = 0; - markSomeCapabilities(mark_root, gct, gct->thread_index, n_gc_threads, - rtsTrue/*prune sparks*/); - scavenge_capability_mut_lists(&capabilities[gct->thread_index]); + markCapability(mark_root, gct, cap, rtsTrue/*don't mark sparks*/); + scavenge_capability_mut_lists(cap); scavenge_until_all_done(); @@ -1064,6 +1069,9 @@ gcWorkerThread (Capability *cap) ACQUIRE_SPIN_LOCK(&gct->mut_spin); debugTrace(DEBUG_gc, "GC thread %d on my way...", gct->thread_index); + // record the time spent doing GC in the Task structure + stat_gcWorkerThreadDone(gct); + SET_GCT(saved_gct); } @@ -1113,11 +1121,14 @@ start_gc_threads (void) } static void -wakeup_gc_threads (nat n_threads USED_IF_THREADS, nat me USED_IF_THREADS) +wakeup_gc_threads (nat me USED_IF_THREADS) { #if defined(THREADED_RTS) nat i; - for (i=0; i < n_threads; i++) { + + if (n_gc_threads == 1) return; + + for (i=0; i < n_gc_threads; i++) { if (i == me) continue; inc_running(); debugTrace(DEBUG_gc, "waking up gc thread %d", i); @@ -1134,11 +1145,14 @@ wakeup_gc_threads (nat n_threads USED_IF_THREADS, nat me USED_IF_THREADS) // standby state, otherwise they may still be executing inside // any_work(), and may even remain awake until the next GC starts. 
static void -shutdown_gc_threads (nat n_threads USED_IF_THREADS, nat me USED_IF_THREADS) +shutdown_gc_threads (nat me USED_IF_THREADS) { #if defined(THREADED_RTS) nat i; - for (i=0; i < n_threads; i++) { + + if (n_gc_threads == 1) return; + + for (i=0; i < n_gc_threads; i++) { if (i == me) continue; while (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE) { write_barrier(); } } @@ -1373,7 +1387,7 @@ init_gc_thread (gc_thread *t) t->static_objects = END_OF_STATIC_LIST; t->scavenged_static_objects = END_OF_STATIC_LIST; t->scan_bd = NULL; - t->mut_lists = capabilities[t->thread_index].mut_lists; + t->mut_lists = t->cap->mut_lists; t->evac_gen_no = 0; t->failed_to_evac = rtsFalse; t->eager_promotion = rtsTrue; diff --git a/rts/sm/GCAux.c b/rts/sm/GCAux.c index 97af17a..7f3968f 100644 --- a/rts/sm/GCAux.c +++ b/rts/sm/GCAux.c @@ -17,7 +17,7 @@ #include "Capability.h" #include "Trace.h" #include "Schedule.h" -// DO NOT include "GCThread.h", we don't want the register variable +// DO NOT include "GCTDecl.h", we don't want the register variable /* ----------------------------------------------------------------------------- isAlive determines whether the given closure is still alive (after diff --git a/rts/sm/GCTDecl.h b/rts/sm/GCTDecl.h new file mode 100644 index 0000000..11795ca --- /dev/null +++ b/rts/sm/GCTDecl.h @@ -0,0 +1,98 @@ +/* ----------------------------------------------------------------------------- + * + * (c) The GHC Team 1998-2009 + * + * Documentation on the architecture of the Garbage Collector can be + * found in the online commentary: + * + * http://hackage.haskell.org/trac/ghc/wiki/Commentary/Rts/Storage/GC + * + * ---------------------------------------------------------------------------*/ + +#ifndef SM_GCTDECL_H +#define SM_GCTDECL_H + +#include "BeginPrivate.h" + +/* ----------------------------------------------------------------------------- + The gct variable is thread-local and points to the current thread's + gc_thread structure. 
It is heavily accessed, so we try to put gct + into a global register variable if possible; if we don't have a + register then use gcc's __thread extension to create a thread-local + variable. + -------------------------------------------------------------------------- */ + +#if defined(THREADED_RTS) + +#define GLOBAL_REG_DECL(type,name,reg) register type name REG(reg); + +#define SET_GCT(to) gct = (to) + + + +#if (defined(i386_HOST_ARCH) && defined(linux_HOST_OS)) +// Using __thread is better than stealing a register on x86/Linux, because +// we have too few registers available. In my tests it was worth +// about 5% in GC performance, but of course that might change as gcc +// improves. -- SDM 2009/04/03 +// +// We ought to do the same on MacOS X, but __thread is not +// supported there yet (gcc 4.0.1). + +extern __thread gc_thread* gct; +#define DECLARE_GCT __thread gc_thread* gct; + + +#elif defined(sparc_HOST_ARCH) +// On SPARC we can't pin gct to a register. Names like %l1 are just offsets +// into the register window, which change on each function call. +// +// There are eight global (non-window) registers, but they're used for other purposes. 
+// %g0 -- always zero +// %g1 -- volatile over function calls, used by the linker +// %g2-%g3 -- used as scratch regs by the C compiler (caller saves) +// %g4 -- volatile over function calls, used by the linker +// %g5-%g7 -- reserved by the OS + +extern __thread gc_thread* gct; +#define DECLARE_GCT __thread gc_thread* gct; + + +#elif defined(REG_Base) && !defined(i386_HOST_ARCH) +// on i386, REG_Base is %ebx which is also used for PIC, so we don't +// want to steal it + +GLOBAL_REG_DECL(gc_thread*, gct, REG_Base) +#define DECLARE_GCT /* nothing */ + + +#elif defined(REG_R1) + +GLOBAL_REG_DECL(gc_thread*, gct, REG_R1) +#define DECLARE_GCT /* nothing */ + + +#elif defined(__GNUC__) + +extern __thread gc_thread* gct; +#define DECLARE_GCT __thread gc_thread* gct; + +#else + +#error Cannot find a way to declare the thread-local gct + +#endif + +#else // not the threaded RTS + +extern StgWord8 the_gc_thread[]; + +#define gct ((gc_thread*)&the_gc_thread) +#define SET_GCT(to) /*nothing*/ +#define DECLARE_GCT /*nothing*/ + +#endif // THREADED_RTS + +#include "EndPrivate.h" + +#endif // SM_GCTDECL_H diff --git a/rts/sm/GCThread.h b/rts/sm/GCThread.h index 62dd1fb..e42a3a1 100644 --- a/rts/sm/GCThread.h +++ b/rts/sm/GCThread.h @@ -15,6 +15,7 @@ #define SM_GCTHREAD_H #include "WSDeque.h" +#include "GetTime.h" // for Ticks #include "BeginPrivate.h" @@ -115,6 +116,8 @@ typedef struct gen_workspace_ { ------------------------------------------------------------------------- */ typedef struct gc_thread_ { + Capability *cap; + #ifdef THREADED_RTS OSThreadId id; // The OS thread that this struct belongs to SpinLock gc_spin; @@ -162,7 +165,8 @@ typedef struct gc_thread_ { // instead of the to-space // corresponding to the object - lnat thunk_selector_depth; // ummm.... 
not used as of now + lnat thunk_selector_depth; // used to avoid unbounded recursion in + // evacuate() for THUNK_SELECTOR #ifdef USE_PAPI int papi_events; @@ -177,10 +181,15 @@ typedef struct gc_thread_ { lnat no_work; lnat scav_find_work; + Ticks gc_start_cpu; // process CPU time + Ticks gc_start_elapsed; // process elapsed time + Ticks gc_start_thread_cpu; // thread CPU time + lnat gc_start_faults; + // ------------------- // workspaces - // array of workspaces, indexed by stp->abs_no. This is placed + // array of workspaces, indexed by gen->abs_no. This is placed // directly at the end of the gc_thread structure so that we can get from // the gc_thread pointer to a workspace using only pointer // arithmetic, no memory access. This happens in the inner loop @@ -191,91 +200,8 @@ typedef struct gc_thread_ { extern nat n_gc_threads; -/* ----------------------------------------------------------------------------- - The gct variable is thread-local and points to the current thread's - gc_thread structure. It is heavily accessed, so we try to put gct - into a global register variable if possible; if we don't have a - register then use gcc's __thread extension to create a thread-local - variable. - - Even on x86 where registers are scarce, it is worthwhile using a - register variable here: I measured about a 2-5% slowdown with the - __thread version. - -------------------------------------------------------------------------- */ - extern gc_thread **gc_threads; -#if defined(THREADED_RTS) - -#define GLOBAL_REG_DECL(type,name,reg) register type name REG(reg); - -#define SET_GCT(to) gct = (to) - - - -#if (defined(i386_HOST_ARCH) && defined(linux_HOST_OS)) -// Using __thread is better than stealing a register on x86/Linux, because -// we have too few registers available. In my tests it was worth -// about 5% in GC performance, but of course that might change as gcc -// improves. 
-- SDM 2009/04/03 -// -// We ought to do the same on MacOS X, but __thread is not -// supported there yet (gcc 4.0.1). - -extern __thread gc_thread* gct; -#define DECLARE_GCT __thread gc_thread* gct; - - -#elif defined(sparc_HOST_ARCH) -// On SPARC we can't pin gct to a register. Names like %l1 are just offsets -// into the register window, which change on each function call. -// -// There are eight global (non-window) registers, but they're used for other purposes. -// %g0 -- always zero -// %g1 -- volatile over function calls, used by the linker -// %g2-%g3 -- used as scratch regs by the C compiler (caller saves) -// %g4 -- volatile over function calls, used by the linker -// %g5-%g7 -- reserved by the OS - -extern __thread gc_thread* gct; -#define DECLARE_GCT __thread gc_thread* gct; - - -#elif defined(REG_Base) && !defined(i386_HOST_ARCH) -// on i386, REG_Base is %ebx which is also used for PIC, so we don't -// want to steal it - -GLOBAL_REG_DECL(gc_thread*, gct, REG_Base) -#define DECLARE_GCT /* nothing */ - - -#elif defined(REG_R1) - -GLOBAL_REG_DECL(gc_thread*, gct, REG_R1) -#define DECLARE_GCT /* nothing */ - - -#elif defined(__GNUC__) - -extern __thread gc_thread* gct; -#define DECLARE_GCT __thread gc_thread* gct; - -#else - -#error Cannot find a way to declare the thread-local gct - -#endif - -#else // not the threaded RTS - -extern StgWord8 the_gc_thread[]; - -#define gct ((gc_thread*)&the_gc_thread) -#define SET_GCT(to) /*nothing*/ -#define DECLARE_GCT /*nothing*/ - -#endif - #include "EndPrivate.h" #endif // SM_GCTHREAD_H diff --git a/rts/sm/GCUtils.c b/rts/sm/GCUtils.c index 8b63674..ef8d0bd 100644 --- a/rts/sm/GCUtils.c +++ b/rts/sm/GCUtils.c @@ -18,6 +18,7 @@ #include "Storage.h" #include "GC.h" #include "GCThread.h" +#include "GCTDecl.h" #include "GCUtils.h" #include "Printer.h" #include "Trace.h" diff --git a/rts/sm/GCUtils.h b/rts/sm/GCUtils.h index 3fe78a3..d47375d 100644 --- a/rts/sm/GCUtils.h +++ b/rts/sm/GCUtils.h @@ -16,6 +16,8 @@ #include 
"BeginPrivate.h" +#include "GCTDecl.h" + bdescr *allocBlock_sync(void); void freeChain_sync(bdescr *bd); diff --git a/rts/sm/MarkWeak.c b/rts/sm/MarkWeak.c index f4b576a..f9275ec 100644 --- a/rts/sm/MarkWeak.c +++ b/rts/sm/MarkWeak.c @@ -17,6 +17,7 @@ #include "MarkWeak.h" #include "GC.h" #include "GCThread.h" +#include "GCTDecl.h" #include "Evac.h" #include "Trace.h" #include "Schedule.h" -- 1.7.10.4