Improve the default parallel GC settings, and sanitise the flags (#3340)
author Simon Marlow <marlowsd@gmail.com>
Tue, 15 Sep 2009 08:40:00 +0000 (08:40 +0000)
committer Simon Marlow <marlowsd@gmail.com>
Tue, 15 Sep 2009 08:40:00 +0000 (08:40 +0000)
Flags (from +RTS -?):

  -qg[<n>]  Use parallel GC only for generations >= <n>
            (default: 0, -qg alone turns off parallel GC)
  -qb[<n>]  Use load-balancing in the parallel GC only for generations >= <n>
            (default: 1, -qb alone turns off load-balancing)

These are good defaults for most parallel programs.  Single-threaded
programs that want to make use of parallel GC will probably want +RTS
-qg1 (this is documented).

I've also updated the docs.

docs/users_guide/runtime_control.xml
docs/users_guide/using.xml
includes/rts/Flags.h
rts/RtsFlags.c
rts/sm/GC.c

index 69e26bc..2783daf 100644 (file)
 
       <varlistentry>
         <term>
-          <option>-q1</option>
-          <indexterm><primary><option>-q1</option><secondary>RTS
+          <option>-qg<optional><replaceable>gen</replaceable></optional></option>
+          <indexterm><primary><option>-qg</option><secondary>RTS
           option</secondary></primary></indexterm>
         </term>
         <listitem>
-          <para>&lsqb;New in GHC 6.12.1&rsqb; Disable the parallel GC.
-            The parallel GC is turned on automatically when parallel
-            execution is enabled with the <option>-N</option> option;
-            this option is available to turn it off if
-            necessary.</para>
+          <para>&lsqb;New in GHC 6.12.1&rsqb; &lsqb;Default: 0&rsqb;
+            Use parallel GC in
+            generation <replaceable>gen</replaceable> and higher.
+            Omitting <replaceable>gen</replaceable> turns off the
+            parallel GC completely, reverting to sequential GC.</para>
           
-          <para>Experiments have shown that parallel GC usually
-            results in a performance improvement given 3 cores or
-            more; with 2 cores it may or may not be beneficial,
-            depending on the workload.  Bigger heaps work better with
-            parallel GC, so set your <option>-H</option> value high (3
-            or more times the maximum residency).  Look at the timing
-            stats with <option>+RTS -s</option> to see whether you're
-            getting any benefit from parallel GC or not.  If you find
-            parallel GC is significantly <emphasis>slower</emphasis>
-            (in elapsed time) than sequential GC, please report it as
-            a bug.</para>
-
-          <para>In GHC 6.10.1 it was possible to use a different
-            number of threads for GC than for execution, because the GC
-            used its own pool of threads.  Now, the GC uses the same
-            threads as the mutator (for executing the program).</para>
+          <para>The default parallel GC settings are usually suitable
+            for parallel programs (i.e. those
+            using <literal>par</literal>, Strategies, or with multiple
+            threads).  However, it is sometimes beneficial to enable
+            the parallel GC for a single-threaded sequential program
+            too, especially if the program has a large amount of heap
+            data and GC is a significant fraction of runtime.  To use
+            the parallel GC in a sequential program, enable the
+            parallel runtime with a suitable <literal>-N</literal>
+            option, and additionally it might be beneficial to
+            restrict parallel GC to the old generation
+            with <literal>-qg1</literal>.</para>
         </listitem>
       </varlistentry>        
 
       <varlistentry>
         <term>
-          <option>-qg<replaceable>n</replaceable></option>
-          <indexterm><primary><option>-qg</option><secondary>RTS
+          <option>-qb<optional><replaceable>gen</replaceable></optional></option>
+          <indexterm><primary><option>-qb</option><secondary>RTS
           option</secondary></primary></indexterm>
         </term>
         <listitem>
           <para>
-            &lsqb;Default: 1&rsqb; &lsqb;New in GHC 6.12.1&rsqb;
-            Enable the parallel GC only in
-            generation <replaceable>n</replaceable> and greater.
-            Parallel GC is often not worthwhile for collections in
-            generation 0 (the young generation), so it is enabled by
-            default only for collections in generation 1 (and higher,
-            if applicable).
+            &lsqb;New in GHC 6.12.1&rsqb; &lsqb;Default: 1&rsqb; Use
+            load-balancing in the parallel GC in
+            generation <replaceable>gen</replaceable> and higher.
+            Omitting <replaceable>gen</replaceable> disables
+            load-balancing entirely.</para>
+          
+          <para>
+            Load-balancing shares out the work of GC between the
+            available cores.  This is a good idea when the heap is
+            large and we need to parallelise the GC work; however, it
+            is also pessimal for the short young-generation
+            collections in a parallel program, because it can harm
+            locality by moving data from the cache of the CPU where it
+            is being used to the cache of another CPU.  Hence the
+            default is to do load-balancing only in the
+            old-generation.  In fact, for a parallel program it is
+            sometimes beneficial to disable load-balancing entirely
+            with <literal>-qb</literal>.
           </para>
         </listitem>
       </varlistentry>
index af7950c..5ad34fa 100644 (file)
@@ -1951,6 +1951,10 @@ f "2"    = 2
 
             <para>There is no means (currently) by which this value
              may vary after the program has started.</para>
+
+            <para>The current value of the <option>-N</option> option
+              is available to the Haskell program
+              via <literal>GHC.Conc.numCapabilities</literal>.</para>
          </listitem>
        </varlistentry>
       </variablelist>
@@ -1960,6 +1964,17 @@ f "2"    = 2
 
       <variablelist>
        <varlistentry>
+         <term><option>-qa</option></term>
+          <indexterm><primary><option>-qa</option></primary><secondary>RTS
+          option</secondary></indexterm>
+         <listitem>
+            <para>Use the OS's affinity facilities to try to pin OS
+              threads to CPU cores.  This is an experimental feature,
+              and may or may not be useful.  Please let us know
+              whether it helps for you!</para>
+          </listitem>
+        </varlistentry>
+       <varlistentry>
          <term><option>-qm</option></term>
           <indexterm><primary><option>-qm</option></primary><secondary>RTS
           option</secondary></indexterm>
@@ -1967,9 +1982,16 @@ f "2"    = 2
             <para>Disable automatic migration for load balancing.
             Normally the runtime will automatically try to schedule
             threads across the available CPUs to make use of idle
-            CPUs; this option disables that behaviour.  It is probably
-            only of use if you are explicitly scheduling threads onto
-            CPUs with <literal>GHC.Conc.forkOnIO</literal>.</para>
+            CPUs; this option disables that behaviour.  Note that
+              migration only applies to threads; sparks created
+              by <literal>par</literal> are load-balanced separately
+              by work-stealing.</para>
+
+            <para>
+              This option is probably only of use for concurrent
+              programs that explicitly schedule threads onto CPUs
+              with <literal>GHC.Conc.forkOnIO</literal>.
+            </para>
           </listitem>
         </varlistentry>
        <varlistentry>
index 3d0230a..733318a 100644 (file)
@@ -144,9 +144,14 @@ struct PAR_FLAGS {
   rtsBool        wakeupMigrate;  /* migrate a thread on wakeup */
   unsigned int  maxLocalSparks;
   rtsBool        parGcEnabled;   /* enable parallel GC */
-  rtsBool        parGcGen;       /* do parallel GC in this generation
+  unsigned int   parGcGen;       /* do parallel GC in this generation
                                   * and higher only */
-  rtsBool        parGcLoadBalancing; /* do load-balancing in parallel GC */
+  rtsBool        parGcLoadBalancingEnabled; 
+                                 /* enable load-balancing in the
+                                  * parallel GC */
+  unsigned int   parGcLoadBalancingGen;
+                                 /* do load-balancing in this
+                                  * generation and higher only */
   rtsBool        setAffinity;    /* force thread affinity with CPUs */
 };
 #endif /* THREADED_RTS */
index d200bf2..397ea8b 100644 (file)
@@ -152,8 +152,9 @@ void initRtsFlagsDefaults(void)
     RtsFlags.ParFlags.migrate           = rtsTrue;
     RtsFlags.ParFlags.wakeupMigrate     = rtsFalse;
     RtsFlags.ParFlags.parGcEnabled      = 1;
-    RtsFlags.ParFlags.parGcGen          = 1;
-    RtsFlags.ParFlags.parGcLoadBalancing = 1;
+    RtsFlags.ParFlags.parGcGen          = 0;
+    RtsFlags.ParFlags.parGcLoadBalancingEnabled = rtsTrue;
+    RtsFlags.ParFlags.parGcLoadBalancingGen = 1;
     RtsFlags.ParFlags.setAffinity       = 0;
 #endif
 
@@ -307,10 +308,11 @@ usage_text[] = {
 #if defined(THREADED_RTS) && !defined(NOSMP)
 "  -N<n>     Use <n> processors (default: 1)",
 "  -N        Determine the number of processors to use automatically",
-"  -q1       Use one OS thread for GC (turns off parallel GC)",
-"  -qg<n>    Use parallel GC only for generations >= <n> (default: 1)",
-"  -qb       Disable load-balancing in the parallel GC",
-"  -qa       Use the OS to set thread affinity",
+"  -qg[<n>]  Use parallel GC only for generations >= <n>",
+"            (default: 0, -qg alone turns off parallel GC)",
+"  -qb[<n>]  Use load-balancing in the parallel GC only for generations >= <n>",
+"            (default: 1, -qb alone turns off load-balancing)",
+"  -qa       Use the OS to set thread affinity (experimental)",
 "  -qm       Don't automatically migrate threads between CPUs",
 "  -qw       Migrate a thread to the current CPU when it is woken up",
 #endif
@@ -1008,21 +1010,25 @@ error = rtsTrue;
                        errorBelch("incomplete RTS option: %s",rts_argv[arg]);
                        error = rtsTrue;
                        break;
-                    case '1':
-                        RtsFlags.ParFlags.parGcEnabled = rtsFalse;
-                        break;
                     case 'g':
-                        if (rts_argv[arg][3] != '\0') {
+                        if (rts_argv[arg][3] == '\0') {
+                            RtsFlags.ParFlags.parGcEnabled = rtsFalse;
+                        } else {
+                            RtsFlags.ParFlags.parGcEnabled = rtsTrue;
                             RtsFlags.ParFlags.parGcGen
                                 = strtol(rts_argv[arg]+3, (char **) NULL, 10);
-                        } else {
-                            errorBelch("bad value for -qg");
-                            error = rtsTrue;
                         }
                         break;
                    case 'b':
-                       RtsFlags.ParFlags.parGcLoadBalancing = rtsFalse;
-                       break;
+                        if (rts_argv[arg][3] == '\0') {
+                            RtsFlags.ParFlags.parGcLoadBalancingEnabled = rtsFalse;
+                        }
+                        else {
+                            RtsFlags.ParFlags.parGcLoadBalancingEnabled = rtsTrue;
+                            RtsFlags.ParFlags.parGcLoadBalancingGen
+                                = strtol(rts_argv[arg]+3, (char **) NULL, 10);
+                        }
+                        break;
                    case 'a':
                        RtsFlags.ParFlags.setAffinity = rtsTrue;
                        break;
index febede6..6f15a47 100644 (file)
@@ -237,7 +237,8 @@ GarbageCollect (rtsBool force_major_gc,
   n = initialise_N(force_major_gc);
 
 #if defined(THREADED_RTS)
-  work_stealing = RtsFlags.ParFlags.parGcLoadBalancing;
+  work_stealing = RtsFlags.ParFlags.parGcLoadBalancingEnabled &&
+                  N >= RtsFlags.ParFlags.parGcLoadBalancingGen;
       // It's not always a good idea to do load balancing in parallel
       // GC.  In particular, for a parallel program we don't want to
       // lose locality by moving cached data into another CPU's cache