rts/sm/GCThread.h

   1 /* -----------------------------------------------------------------------------
   2  *
   3  * (c) The GHC Team 1998-2008
   4  *
   5  * Generational garbage collector
   6  *
   7  * Documentation on the architecture of the Garbage Collector can be
   8  * found in the online commentary:
   9  *
  10  *   http://hackage.haskell.org/trac/ghc/wiki/Commentary/Rts/Storage/GC
  11  *
  12  * ---------------------------------------------------------------------------*/
  13
  14 #ifndef SM_GCTHREAD_H
  15 #define SM_GCTHREAD_H
  16
  17 #include "WSDeque.h"
  18
  19 #include "BeginPrivate.h"
  20
  21 /* -----------------------------------------------------------------------------
  22    General scheme
  23
  24    ToDo: move this to the wiki when the implementation is done.
  25
  26    We're only going to try to parallelise the copying GC for now.  The
  27    Plan is as follows.
  28
  29    Each thread has a gc_thread structure (see below) which holds its
  30    thread-local data.  We'll keep a pointer to this in a thread-local
  31    variable, or possibly in a register.
  32
  33    In the gc_thread structure is a gen_workspace for each generation.  The
  34    primary purpose of the gen_workspace is to hold evacuated objects;
  35    when an object is evacuated, it is copied to the "todo" block in
  36    the thread's workspace for the appropriate generation.  When the todo
  37    block is full, it is pushed to the global gen->todos list, which
  38    is protected by a lock.  (in fact we intervene a one-place buffer
  39    here to reduce contention).
  40
  41    A thread repeatedly grabs a block of work from one of the
  42    gen->todos lists, scavenges it, and keeps the scavenged block on
  43    its own ws->scavd_list (this is to avoid unnecessary contention
  44    returning the completed buffers back to the generation: we can just
  45    collect them all later).
  46
  47    When there is no global work to do, we start scavenging the todo
  48    blocks in the workspaces.  This is where the scan_bd field comes
  49    in: we can scan the contents of the todo block, when we have
  50    scavenged the contents of the todo block (up to todo_bd->free), we
  51    don't want to move this block immediately to the scavd_list,
  52    because it is probably only partially full.  So we remember that we
  53    have scanned up to this point by saving the block in ws->scan_bd,
  54    with the current scan pointer in ws->scan.  Later, when more
  55    objects have been copied to this block, we can come back and scan
  56    the rest.  When we visit this workspace again in the future,
  57    scan_bd may still be the same as todo_bd, or it might be different:
  58    if enough objects were copied into this block that it filled up,
  59    then we will have allocated a new todo block, but *not* pushed the
  60    old one to the generation, because it is partially scanned.
  61
  62    The reason to leave scanning the todo blocks until last is that we
  63    want to deal with full blocks as far as possible.
  64    ------------------------------------------------------------------------- */
  65
  66
  67 /* -----------------------------------------------------------------------------
  68    Generation Workspace
  69
  70    A generation workspace exists for each generation for each GC
  71    thread. The GC thread takes a block from the todos list of the
  72    generation into the scanbd and then scans it.  Objects referred to
  73    by those in the scan block are copied into the todo or scavd blocks
  74    of the relevant generation.
  75
  76    ------------------------------------------------------------------------- */
  77
  78 typedef struct gen_workspace_ {
  79     generation * gen;           // the gen for this workspace
  80     struct gc_thread_ * my_gct; // the gc_thread that contains this workspace
  81
  82     // where objects to be scavenged go
  83     bdescr *     todo_bd;
  84     StgPtr       todo_free;            // free ptr for todo_bd
  85     StgPtr       todo_lim;             // lim for todo_bd
  86
  87     WSDeque *    todo_q;
  88     bdescr *     todo_overflow;
  89     nat          n_todo_overflow;
  90
  91     // where large objects to be scavenged go
  92     bdescr *     todo_large_objects;
  93
  94     // Objects that have already been scavenged.
  95     bdescr *     scavd_list;
  96     nat          n_scavd_blocks;     // count of blocks in this list
  97
  98     // Partially-full, scavenged, blocks
  99     bdescr *     part_list;
 100     unsigned int n_part_blocks;      // count of above
 101
 102     StgWord pad[3];
 103
 104 } gen_workspace ATTRIBUTE_ALIGNED(64);
 105 // align so that computing gct->gens[n] is a shift, not a multiply
 106 // fails if the size is <64, which is why we need the pad above
 107
 108 /* ----------------------------------------------------------------------------
 109    GC thread object
 110
 111    Every GC thread has one of these. It contains all the generation
 112    specific workspaces and other GC thread local information. At some
 113    later point it maybe useful to move this other into the TLS store
 114    of the GC threads
 115    ------------------------------------------------------------------------- */
 116
 117 typedef struct gc_thread_ {
 118 #ifdef THREADED_RTS
 119     OSThreadId id;                 // The OS thread that this struct belongs to
 120     SpinLock   gc_spin;
 121     SpinLock   mut_spin;
 122     volatile rtsBool wakeup;
 123 #endif
 124     nat thread_index;              // a zero based index identifying the thread
 125
 126     bdescr * free_blocks;          // a buffer of free blocks for this thread
 127                                    //  during GC without accessing the block
 128                                    //   allocators spin lock.
 129
 130     StgClosure* static_objects;      // live static objects
 131     StgClosure* scavenged_static_objects;   // static objects scavenged so far
 132
 133     lnat gc_count;                 // number of GCs this thread has done
 134
 135     // block that is currently being scanned
 136     bdescr *     scan_bd;
 137
 138     // Remembered sets on this CPU.  Each GC thread has its own
 139     // private per-generation remembered sets, so it can add an item
 140     // to the remembered set without taking a lock.  The mut_lists
 141     // array on a gc_thread is the same as the one on the
 142     // corresponding Capability; we stash it here too for easy access
 143     // during GC; see recordMutableGen_GC().
 144     bdescr **    mut_lists;
 145
 146     // --------------------
 147     // evacuate flags
 148
 149     nat evac_gen_no;               // Youngest generation that objects
 150                                    // should be evacuated to in
 151                                    // evacuate().  (Logically an
 152                                    // argument to evacuate, but it's
 153                                    // static a lot of the time so we
 154                                    // optimise it into a per-thread
 155                                    // variable).
 156
 157     rtsBool failed_to_evac;        // failure to evacuate an object typically
 158                                    // Causes it to be recorded in the mutable
 159                                    // object list
 160
 161     rtsBool eager_promotion;       // forces promotion to the evac gen
 162                                    // instead of the to-space
 163                                    // corresponding to the object
 164
 165     lnat thunk_selector_depth;     // ummm.... not used as of now
 166
 167 #ifdef USE_PAPI
 168     int papi_events;
 169 #endif
 170
 171     // -------------------
 172     // stats
 173
 174     lnat copied;
 175     lnat scanned;
 176     lnat any_work;
 177     lnat no_work;
 178     lnat scav_find_work;
 179
 180     // -------------------
 181     // workspaces
 182
 183     // array of workspaces, indexed by stp->abs_no.  This is placed
 184     // directly at the end of the gc_thread structure so that we can get from
 185     // the gc_thread pointer to a workspace using only pointer
 186     // arithmetic, no memory access.  This happens in the inner loop
 187     // of the GC, see Evac.c:alloc_for_copy().
 188     gen_workspace gens[];
 189 } gc_thread;
 190
 191
 192 extern nat n_gc_threads;
 193
 194 /* -----------------------------------------------------------------------------
 195    The gct variable is thread-local and points to the current thread's
 196    gc_thread structure.  It is heavily accessed, so we try to put gct
 197    into a global register variable if possible; if we don't have a
 198    register then use gcc's __thread extension to create a thread-local
 199    variable.
 200
 201    Even on x86 where registers are scarce, it is worthwhile using a
 202    register variable here: I measured about a 2-5% slowdown with the
 203    __thread version.
 204    -------------------------------------------------------------------------- */
 205
 206 extern gc_thread **gc_threads;
 207
 208 #if defined(THREADED_RTS)
 209
 210 #define GLOBAL_REG_DECL(type,name,reg) register type name REG(reg);
 211
 212 #define SET_GCT(to) gct = (to)
 213
 214
 215
 216 #if (defined(i386_HOST_ARCH) && defined(linux_HOST_OS))
 217 // Using __thread is better than stealing a register on x86/Linux, because
 218 // we have too few registers available.  In my tests it was worth
 219 // about 5% in GC performance, but of course that might change as gcc
 220 // improves. -- SDM 2009/04/03
 221 //
 222 // We ought to do the same on MacOS X, but __thread is not
 223 // supported there yet (gcc 4.0.1).
 224
 225 extern __thread gc_thread* gct;
 226 #define DECLARE_GCT __thread gc_thread* gct;
 227
 228
 229 #elif defined(sparc_HOST_ARCH)
 230 // On SPARC we can't pin gct to a register. Names like %l1 are just offsets
 231 //      into the register window, which change on each function call.
 232 //
 233 //      There are eight global (non-window) registers, but they're used for other purposes.
 234 //      %g0     -- always zero
 235 //      %g1     -- volatile over function calls, used by the linker
 236 //      %g2-%g3 -- used as scratch regs by the C compiler (caller saves)
 237 //      %g4     -- volatile over function calls, used by the linker
 238 //      %g5-%g7 -- reserved by the OS
 239
 240 extern __thread gc_thread* gct;
 241 #define DECLARE_GCT __thread gc_thread* gct;
 242
 243
 244 #elif defined(REG_Base) && !defined(i386_HOST_ARCH)
 245 // on i386, REG_Base is %ebx which is also used for PIC, so we don't
 246 // want to steal it
 247
 248 GLOBAL_REG_DECL(gc_thread*, gct, REG_Base)
 249 #define DECLARE_GCT /* nothing */
 250
 251
 252 #elif defined(REG_R1)
 253
 254 GLOBAL_REG_DECL(gc_thread*, gct, REG_R1)
 255 #define DECLARE_GCT /* nothing */
 256
 257
 258 #elif defined(__GNUC__)
 259
 260 extern __thread gc_thread* gct;
 261 #define DECLARE_GCT __thread gc_thread* gct;
 262
 263 #else
 264
 265 #error Cannot find a way to declare the thread-local gct
 266
 267 #endif
 268
 269 #else  // not the threaded RTS
 270
 271 extern StgWord8 the_gc_thread[];
 272
 273 #define gct ((gc_thread*)&the_gc_thread)
 274 #define SET_GCT(to) /*nothing*/
 275 #define DECLARE_GCT /*nothing*/
 276
 277 #endif
 278
 279 #include "EndPrivate.h"
 280
 281 #endif // SM_GCTHREAD_H
 282