rts/sm/GCThread.h

   1 /* -----------------------------------------------------------------------------
   2  *
   3  * (c) The GHC Team 1998-2008
   4  *
   5  * Generational garbage collector
   6  *
   7  * Documentation on the architecture of the Garbage Collector can be
   8  * found in the online commentary:
   9  *
  10  *   http://hackage.haskell.org/trac/ghc/wiki/Commentary/Rts/Storage/GC
  11  *
  12  * ---------------------------------------------------------------------------*/
  13
  14 #ifndef SM_GCTHREAD_H
  15 #define SM_GCTHREAD_H
  16
  17 #include "WSDeque.h"
  18
  19 BEGIN_RTS_PRIVATE
  20
  21 /* -----------------------------------------------------------------------------
  22    General scheme
  23
  24    ToDo: move this to the wiki when the implementation is done.
  25
  26    We're only going to try to parallelise the copying GC for now.  The
  27    Plan is as follows.
  28
  29    Each thread has a gc_thread structure (see below) which holds its
  30    thread-local data.  We'll keep a pointer to this in a thread-local
  31    variable, or possibly in a register.
  32
  33    In the gc_thread structure is a step_workspace for each step.  The
  34    primary purpose of the step_workspace is to hold evacuated objects;
  35    when an object is evacuated, it is copied to the "todo" block in
  36    the thread's workspace for the appropriate step.  When the todo
  37    block is full, it is pushed to the global step->todos list, which
  38    is protected by a lock.  (in fact we intervene a one-place buffer
  39    here to reduce contention).
  40
  41    A thread repeatedly grabs a block of work from one of the
  42    step->todos lists, scavenges it, and keeps the scavenged block on
  43    its own ws->scavd_list (this is to avoid unnecessary contention
  44    returning the completed buffers back to the step: we can just
  45    collect them all later).
  46
  47    When there is no global work to do, we start scavenging the todo
  48    blocks in the workspaces.  This is where the scan_bd field comes
  49    in: we can scan the contents of the todo block, when we have
  50    scavenged the contents of the todo block (up to todo_bd->free), we
  51    don't want to move this block immediately to the scavd_list,
  52    because it is probably only partially full.  So we remember that we
  53    have scanned up to this point by saving the block in ws->scan_bd,
  54    with the current scan pointer in ws->scan.  Later, when more
  55    objects have been copied to this block, we can come back and scan
  56    the rest.  When we visit this workspace again in the future,
  57    scan_bd may still be the same as todo_bd, or it might be different:
  58    if enough objects were copied into this block that it filled up,
  59    then we will have allocated a new todo block, but *not* pushed the
  60    old one to the step, because it is partially scanned.
  61
  62    The reason to leave scanning the todo blocks until last is that we
  63    want to deal with full blocks as far as possible.
  64    ------------------------------------------------------------------------- */
  65
  66
  67 /* -----------------------------------------------------------------------------
  68    Step Workspace
  69
  70    A step workspace exists for each step for each GC thread. The GC
  71    thread takes a block from the todos list of the step into the
  72    scanbd and then scans it.  Objects referred to by those in the scan
  73    block are copied into the todo or scavd blocks of the relevant step.
  74
  75    ------------------------------------------------------------------------- */
  76
  77 typedef struct step_workspace_ {
  78     step * step;                // the step for this workspace
  79     struct gc_thread_ * my_gct; // the gc_thread that contains this workspace
  80
  81     // where objects to be scavenged go
  82     bdescr *     todo_bd;
  83     StgPtr       todo_free;            // free ptr for todo_bd
  84     StgPtr       todo_lim;             // lim for todo_bd
  85
  86     WSDeque *    todo_q;
  87     bdescr *     todo_overflow;
  88     nat          n_todo_overflow;
  89
  90     // where large objects to be scavenged go
  91     bdescr *     todo_large_objects;
  92
  93     // Objects that have already been scavenged.
  94     bdescr *     scavd_list;
  95     nat          n_scavd_blocks;     // count of blocks in this list
  96
  97     // Partially-full, scavenged, blocks
  98     bdescr *     part_list;
  99     unsigned int n_part_blocks;      // count of above
 100
 101     StgWord pad[3];
 102
 103 } step_workspace ATTRIBUTE_ALIGNED(64);
 104 // align so that computing gct->steps[n] is a shift, not a multiply
 105 // fails if the size is <64, which is why we need the pad above
 106
 107 /* ----------------------------------------------------------------------------
 108    GC thread object
 109
 110    Every GC thread has one of these. It contains all the step specific
 111    workspaces and other GC thread local information. At some later
 112    point it maybe useful to move this other into the TLS store of the
 113    GC threads
 114    ------------------------------------------------------------------------- */
 115
 116 typedef struct gc_thread_ {
 117 #ifdef THREADED_RTS
 118     OSThreadId id;                 // The OS thread that this struct belongs to
 119     SpinLock   gc_spin;
 120     SpinLock   mut_spin;
 121     volatile rtsBool wakeup;
 122 #endif
 123     nat thread_index;              // a zero based index identifying the thread
 124
 125     bdescr * free_blocks;          // a buffer of free blocks for this thread
 126                                    //  during GC without accessing the block
 127                                    //   allocators spin lock.
 128
 129     StgClosure* static_objects;      // live static objects
 130     StgClosure* scavenged_static_objects;   // static objects scavenged so far
 131
 132     lnat gc_count;                 // number of GCs this thread has done
 133
 134     // block that is currently being scanned
 135     bdescr *     scan_bd;
 136
 137     // Remembered sets on this CPU.  Each GC thread has its own
 138     // private per-generation remembered sets, so it can add an item
 139     // to the remembered set without taking a lock.  The mut_lists
 140     // array on a gc_thread is the same as the one on the
 141     // corresponding Capability; we stash it here too for easy access
 142     // during GC; see recordMutableGen_GC().
 143     bdescr **    mut_lists;
 144
 145     // --------------------
 146     // evacuate flags
 147
 148     step *evac_step;               // Youngest generation that objects
 149                                    // should be evacuated to in
 150                                    // evacuate().  (Logically an
 151                                    // argument to evacuate, but it's
 152                                    // static a lot of the time so we
 153                                    // optimise it into a per-thread
 154                                    // variable).
 155
 156     rtsBool failed_to_evac;        // failure to evacuate an object typically
 157                                    // Causes it to be recorded in the mutable
 158                                    // object list
 159
 160     rtsBool eager_promotion;       // forces promotion to the evac gen
 161                                    // instead of the to-space
 162                                    // corresponding to the object
 163
 164     lnat thunk_selector_depth;     // ummm.... not used as of now
 165
 166 #ifdef USE_PAPI
 167     int papi_events;
 168 #endif
 169
 170     // -------------------
 171     // stats
 172
 173     lnat copied;
 174     lnat scanned;
 175     lnat any_work;
 176     lnat no_work;
 177     lnat scav_find_work;
 178
 179     // -------------------
 180     // workspaces
 181
 182     // array of workspaces, indexed by stp->abs_no.  This is placed
 183     // directly at the end of the gc_thread structure so that we can get from
 184     // the gc_thread pointer to a workspace using only pointer
 185     // arithmetic, no memory access.  This happens in the inner loop
 186     // of the GC, see Evac.c:alloc_for_copy().
 187     step_workspace steps[];
 188 } gc_thread;
 189
 190
 191 extern nat n_gc_threads;
 192
 193 /* -----------------------------------------------------------------------------
 194    The gct variable is thread-local and points to the current thread's
 195    gc_thread structure.  It is heavily accessed, so we try to put gct
 196    into a global register variable if possible; if we don't have a
 197    register then use gcc's __thread extension to create a thread-local
 198    variable.
 199
 200    Even on x86 where registers are scarce, it is worthwhile using a
 201    register variable here: I measured about a 2-5% slowdown with the
 202    __thread version.
 203    -------------------------------------------------------------------------- */
 204
 205 extern gc_thread **gc_threads;
 206
 207 #if defined(THREADED_RTS)
 208
 209 #define GLOBAL_REG_DECL(type,name,reg) register type name REG(reg);
 210
 211 #define SET_GCT(to) gct = (to)
 212
 213
 214
 215 #if (defined(i386_HOST_ARCH) && defined(linux_HOST_OS))
 216 // Using __thread is better than stealing a register on x86/Linux, because
 217 // we have too few registers available.  In my tests it was worth
 218 // about 5% in GC performance, but of course that might change as gcc
 219 // improves. -- SDM 2009/04/03
 220 //
 221 // We ought to do the same on MacOS X, but __thread is not
 222 // supported there yet (gcc 4.0.1).
 223
 224 extern __thread gc_thread* gct;
 225 #define DECLARE_GCT __thread gc_thread* gct;
 226
 227
 228 #elif defined(sparc_TARGET_ARCH)
 229 // On SPARC we can't pin gct to a register. Names like %l1 are just offsets
 230 //      into the register window, which change on each function call.
 231 //
 232 //      There are eight global (non-window) registers, but they're used for other purposes.
 233 //      %g0     -- always zero
 234 //      %g1     -- volatile over function calls, used by the linker
 235 //      %g2-%g3 -- used as scratch regs by the C compiler (caller saves)
 236 //      %g4     -- volatile over function calls, used by the linker
 237 //      %g5-%g7 -- reserved by the OS
 238
 239 extern __thread gc_thread* gct;
 240 #define DECLARE_GCT __thread gc_thread* gct;
 241
 242
 243 #elif defined(REG_Base) && !defined(i386_HOST_ARCH)
 244 // on i386, REG_Base is %ebx which is also used for PIC, so we don't
 245 // want to steal it
 246
 247 GLOBAL_REG_DECL(gc_thread*, gct, REG_Base)
 248 #define DECLARE_GCT /* nothing */
 249
 250
 251 #elif defined(REG_R1)
 252
 253 GLOBAL_REG_DECL(gc_thread*, gct, REG_R1)
 254 #define DECLARE_GCT /* nothing */
 255
 256
 257 #elif defined(__GNUC__)
 258
 259 extern __thread gc_thread* gct;
 260 #define DECLARE_GCT __thread gc_thread* gct;
 261
 262 #else
 263
 264 #error Cannot find a way to declare the thread-local gct
 265
 266 #endif
 267
 268 #else  // not the threaded RTS
 269
 270 extern StgWord8 the_gc_thread[];
 271
 272 #define gct ((gc_thread*)&the_gc_thread)
 273 #define SET_GCT(to) /*nothing*/
 274 #define DECLARE_GCT /*nothing*/
 275
 276 #endif
 277
 278 END_RTS_PRIVATE
 279
 280 #endif // SM_GCTHREAD_H
 281