From 53628e913632cac29d54da914040e39add334784 Mon Sep 17 00:00:00 2001 From: Simon Marlow Date: Tue, 15 Sep 2009 08:40:00 +0000 Subject: [PATCH] Improve the default parallel GC settings, and sanitise the flags (#3340) Flags (from +RTS -?): -qg[] Use parallel GC only for generations >= (default: 0, -qg alone turns off parallel GC) -qb[] Use load-balancing in the parallel GC only for generations >= (default: 1, -qb alone turns off load-balancing) these are good defaults for most parallel programs. Single-threaded programs that want to make use of parallel GC will probably want +RTS -qg1 (this is documented). I've also updated the docs. --- docs/users_guide/runtime_control.xml | 71 +++++++++++++++++++--------------- docs/users_guide/using.xml | 28 ++++++++++++-- includes/rts/Flags.h | 9 ++++- rts/RtsFlags.c | 36 ++++++++++------- rts/sm/GC.c | 3 +- 5 files changed, 94 insertions(+), 53 deletions(-) diff --git a/docs/users_guide/runtime_control.xml b/docs/users_guide/runtime_control.xml index 69e26bc..2783daf 100644 --- a/docs/users_guide/runtime_control.xml +++ b/docs/users_guide/runtime_control.xml @@ -298,51 +298,58 @@ - - RTS + + RTS option - [New in GHC 6.12.1] Disable the parallel GC. - The parallel GC is turned on automatically when parallel - execution is enabled with the option; - this option is available to turn it off if - necessary. + [New in GHC 6.12.1] [Default: 0] + Use parallel GC in + generation gen and higher. + Omitting gen turns off the + parallel GC completely, reverting to sequential GC. - Experiments have shown that parallel GC usually - results in a performance improvement given 3 cores or - more; with 2 cores it may or may not be beneficial, - depending on the workload. Bigger heaps work better with - parallel GC, so set your value high (3 - or more times the maximum residency). Look at the timing - stats with to see whether you're - getting any benefit from parallel GC or not. 
If you find - parallel GC is significantly slower - (in elapsed time) than sequential GC, please report it as - a bug. - - In GHC 6.10.1 it was possible to use a different - number of threads for GC than for execution, because the GC - used its own pool of threads. Now, the GC uses the same - threads as the mutator (for executing the program). + The default parallel GC settings are usually suitable + for parallel programs (i.e. those + using par, Strategies, or with multiple + threads). However, it is sometimes beneficial to enable + the parallel GC for a single-threaded sequential program + too, especially if the program has a large amount of heap + data and GC is a significant fraction of runtime. To use + the parallel GC in a sequential program, enable the + parallel runtime with a suitable -N + option, and additionally it might be beneficial to + restrict parallel GC to the old generation + with -qg1. - - RTS + + RTS option - [Default: 1] [New in GHC 6.12.1] - Enable the parallel GC only in - generation n and greater. - Parallel GC is often not worthwhile for collections in - generation 0 (the young generation), so it is enabled by - default only for collections in generation 1 (and higher, - if applicable). + [New in GHC 6.12.1] [Default: 1] Use + load-balancing in the parallel GC in + generation gen and higher. + Omitting gen disables + load-balancing entirely. + + + Load-balancing shares out the work of GC between the + available cores. This is a good idea when the heap is + large and we need to parallelise the GC work, however it + is also pessimal for the short young-generation + collections in a parallel program, because it can harm + locality by moving data from the cache of the CPU where it + is being used to the cache of another CPU. Hence the + default is to do load-balancing only in the + old-generation. In fact, for a parallel program it is + sometimes beneficial to disable load-balancing entirely + with -qb. 
diff --git a/docs/users_guide/using.xml b/docs/users_guide/using.xml index af7950c..5ad34fa 100644 --- a/docs/users_guide/using.xml +++ b/docs/users_guide/using.xml @@ -1951,6 +1951,10 @@ f "2" = 2 There is no means (currently) by which this value may vary after the program has started. + + The current value of the option + is available to the Haskell program + via GHC.Conc.numCapabilities. @@ -1960,6 +1964,17 @@ f "2" = 2 + + RTS + option + + Use the OS's affinity facilities to try to pin OS + threads to CPU cores. This is an experimental feature, + and may or may not be useful. Please let us know + whether it helps for you! + + + RTS option @@ -1967,9 +1982,16 @@ f "2" = 2 Disable automatic migration for load balancing. Normally the runtime will automatically try to schedule threads across the available CPUs to make use of idle - CPUs; this option disables that behaviour. It is probably - only of use if you are explicitly scheduling threads onto - CPUs with GHC.Conc.forkOnIO. + CPUs; this option disables that behaviour. Note that + migration only applies to threads; sparks created + by par are load-balanced separately + by work-stealing. + + + This option is probably only of use for concurrent + programs that explicitly schedule threads onto CPUs + with GHC.Conc.forkOnIO. 
+ diff --git a/includes/rts/Flags.h b/includes/rts/Flags.h index 3d0230a..733318a 100644 --- a/includes/rts/Flags.h +++ b/includes/rts/Flags.h @@ -144,9 +144,14 @@ struct PAR_FLAGS { rtsBool wakeupMigrate; /* migrate a thread on wakeup */ unsigned int maxLocalSparks; rtsBool parGcEnabled; /* enable parallel GC */ - rtsBool parGcGen; /* do parallel GC in this generation + unsigned int parGcGen; /* do parallel GC in this generation * and higher only */ - rtsBool parGcLoadBalancing; /* do load-balancing in parallel GC */ + rtsBool parGcLoadBalancingEnabled; + /* enable load-balancing in the + * parallel GC */ + unsigned int parGcLoadBalancingGen; + /* do load-balancing in this + * generation and higher only */ rtsBool setAffinity; /* force thread affinity with CPUs */ }; #endif /* THREADED_RTS */ diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c index d200bf2..397ea8b 100644 --- a/rts/RtsFlags.c +++ b/rts/RtsFlags.c @@ -152,8 +152,9 @@ void initRtsFlagsDefaults(void) RtsFlags.ParFlags.migrate = rtsTrue; RtsFlags.ParFlags.wakeupMigrate = rtsFalse; RtsFlags.ParFlags.parGcEnabled = 1; - RtsFlags.ParFlags.parGcGen = 1; - RtsFlags.ParFlags.parGcLoadBalancing = 1; + RtsFlags.ParFlags.parGcGen = 0; + RtsFlags.ParFlags.parGcLoadBalancingEnabled = rtsTrue; + RtsFlags.ParFlags.parGcLoadBalancingGen = 1; RtsFlags.ParFlags.setAffinity = 0; #endif @@ -307,10 +308,11 @@ usage_text[] = { #if defined(THREADED_RTS) && !defined(NOSMP) " -N Use processors (default: 1)", " -N Determine the number of processors to use automatically", -" -q1 Use one OS thread for GC (turns off parallel GC)", -" -qg Use parallel GC only for generations >= (default: 1)", -" -qb Disable load-balancing in the parallel GC", -" -qa Use the OS to set thread affinity", +" -qg[] Use parallel GC only for generations >= ", +" (default: 0, -qg alone turns off parallel GC)", +" -qb[] Use load-balancing in the parallel GC only for generations >= ", +" (default: 1, -qb alone turns off load-balancing)", +" -qa Use the OS to 
set thread affinity (experimental)", " -qm Don't automatically migrate threads between CPUs", " -qw Migrate a thread to the current CPU when it is woken up", #endif @@ -1008,21 +1010,25 @@ error = rtsTrue; errorBelch("incomplete RTS option: %s",rts_argv[arg]); error = rtsTrue; break; - case '1': - RtsFlags.ParFlags.parGcEnabled = rtsFalse; - break; case 'g': - if (rts_argv[arg][3] != '\0') { + if (rts_argv[arg][3] == '\0') { + RtsFlags.ParFlags.parGcEnabled = rtsFalse; + } else { + RtsFlags.ParFlags.parGcEnabled = rtsTrue; RtsFlags.ParFlags.parGcGen = strtol(rts_argv[arg]+3, (char **) NULL, 10); - } else { - errorBelch("bad value for -qg"); - error = rtsTrue; } break; case 'b': - RtsFlags.ParFlags.parGcLoadBalancing = rtsFalse; - break; + if (rts_argv[arg][3] == '\0') { + RtsFlags.ParFlags.parGcLoadBalancingEnabled = rtsFalse; + } + else { + RtsFlags.ParFlags.parGcLoadBalancingEnabled = rtsTrue; + RtsFlags.ParFlags.parGcLoadBalancingGen + = strtol(rts_argv[arg]+3, (char **) NULL, 10); + } + break; case 'a': RtsFlags.ParFlags.setAffinity = rtsTrue; break; diff --git a/rts/sm/GC.c b/rts/sm/GC.c index febede6..6f15a47 100644 --- a/rts/sm/GC.c +++ b/rts/sm/GC.c @@ -237,7 +237,8 @@ GarbageCollect (rtsBool force_major_gc, n = initialise_N(force_major_gc); #if defined(THREADED_RTS) - work_stealing = RtsFlags.ParFlags.parGcLoadBalancing; + work_stealing = RtsFlags.ParFlags.parGcLoadBalancingEnabled && + N >= RtsFlags.ParFlags.parGcLoadBalancingGen; // It's not always a good idea to do load balancing in parallel // GC. In particular, for a parallel program we don't want to // lose locality by moving cached data into another CPU's cache -- 1.7.10.4