rts/sm/GCThread.h

   1 /* -----------------------------------------------------------------------------
   2  *
   3  * (c) The GHC Team 1998-2008
   4  *
   5  * Generational garbage collector
   6  *
   7  * Documentation on the architecture of the Garbage Collector can be
   8  * found in the online commentary:
   9  *
  10  *   http://hackage.haskell.org/trac/ghc/wiki/Commentary/Rts/Storage/GC
  11  *
  12  * ---------------------------------------------------------------------------*/
  13
  14 #ifndef GCTHREAD_H
  15 #define GCTHREAD_H
  16
  17 #include "OSThreads.h"
  18 #include "WSDeque.h"
  19
  20 /* -----------------------------------------------------------------------------
  21    General scheme
  22
  23    ToDo: move this to the wiki when the implementation is done.
  24
  25    We're only going to try to parallelise the copying GC for now.  The
  26    Plan is as follows.
  27
  28    Each thread has a gc_thread structure (see below) which holds its
  29    thread-local data.  We'll keep a pointer to this in a thread-local
  30    variable, or possibly in a register.
  31
  32    In the gc_thread structure is a step_workspace for each step.  The
  33    primary purpose of the step_workspace is to hold evacuated objects;
  34    when an object is evacuated, it is copied to the "todo" block in
  35    the thread's workspace for the appropriate step.  When the todo
  36    block is full, it is pushed to the global step->todos list, which
  37    is protected by a lock.  (in fact we intervene a one-place buffer
  38    here to reduce contention).
  39
  40    A thread repeatedly grabs a block of work from one of the
  41    step->todos lists, scavenges it, and keeps the scavenged block on
  42    its own ws->scavd_list (this is to avoid unnecessary contention
  43    returning the completed buffers back to the step: we can just
  44    collect them all later).
  45
  46    When there is no global work to do, we start scavenging the todo
  47    blocks in the workspaces.  This is where the scan_bd field comes
  48    in: we can scan the contents of the todo block, when we have
  49    scavenged the contents of the todo block (up to todo_bd->free), we
  50    don't want to move this block immediately to the scavd_list,
  51    because it is probably only partially full.  So we remember that we
  52    have scanned up to this point by saving the block in ws->scan_bd,
  53    with the current scan pointer in ws->scan.  Later, when more
  54    objects have been copied to this block, we can come back and scan
  55    the rest.  When we visit this workspace again in the future,
  56    scan_bd may still be the same as todo_bd, or it might be different:
  57    if enough objects were copied into this block that it filled up,
  58    then we will have allocated a new todo block, but *not* pushed the
  59    old one to the step, because it is partially scanned.
  60
  61    The reason to leave scanning the todo blocks until last is that we
  62    want to deal with full blocks as far as possible.
  63    ------------------------------------------------------------------------- */
  64
  65
  66 /* -----------------------------------------------------------------------------
  67    Step Workspace
  68
  69    A step workspace exists for each step for each GC thread. The GC
  70    thread takes a block from the todos list of the step into the
  71    scanbd and then scans it.  Objects referred to by those in the scan
  72    block are copied into the todo or scavd blocks of the relevant step.
  73
  74    ------------------------------------------------------------------------- */
  75
  76 typedef struct step_workspace_ {
  77     step * step;                // the step for this workspace
  78     struct gc_thread_ * my_gct; // the gc_thread that contains this workspace
  79
  80     // where objects to be scavenged go
  81     bdescr *     todo_bd;
  82     StgPtr       todo_free;            // free ptr for todo_bd
  83     StgPtr       todo_lim;             // lim for todo_bd
  84
  85     WSDeque *    todo_q;
  86     bdescr *     todo_overflow;
  87     nat          n_todo_overflow;
  88
  89     // where large objects to be scavenged go
  90     bdescr *     todo_large_objects;
  91
  92     // Objects that have already been scavenged.
  93     bdescr *     scavd_list;
  94     nat          n_scavd_blocks;     // count of blocks in this list
  95
  96     // Partially-full, scavenged, blocks
  97     bdescr *     part_list;
  98     unsigned int n_part_blocks;      // count of above
  99
 100     StgWord pad[3];
 101
 102 } step_workspace ATTRIBUTE_ALIGNED(64);
 103 // align so that computing gct->steps[n] is a shift, not a multiply
 104 // fails if the size is <64, which is why we need the pad above
 105
 106 /* ----------------------------------------------------------------------------
 107    GC thread object
 108
 109    Every GC thread has one of these. It contains all the step specific
 110    workspaces and other GC thread local information. At some later
 111    point it maybe useful to move this other into the TLS store of the
 112    GC threads
 113    ------------------------------------------------------------------------- */
 114
 115 typedef struct gc_thread_ {
 116 #ifdef THREADED_RTS
 117     OSThreadId id;                 // The OS thread that this struct belongs to
 118     SpinLock   gc_spin;
 119     SpinLock   mut_spin;
 120     volatile rtsBool wakeup;
 121 #endif
 122     nat thread_index;              // a zero based index identifying the thread
 123
 124     bdescr * free_blocks;          // a buffer of free blocks for this thread
 125                                    //  during GC without accessing the block
 126                                    //   allocators spin lock.
 127
 128     StgClosure* static_objects;      // live static objects
 129     StgClosure* scavenged_static_objects;   // static objects scavenged so far
 130
 131     lnat gc_count;                 // number of GCs this thread has done
 132
 133     // block that is currently being scanned
 134     bdescr *     scan_bd;
 135
 136     // Remembered sets on this CPU.  Each GC thread has its own
 137     // private per-generation remembered sets, so it can add an item
 138     // to the remembered set without taking a lock.  The mut_lists
 139     // array on a gc_thread is the same as the one on the
 140     // corresponding Capability; we stash it here too for easy access
 141     // during GC; see recordMutableGen_GC().
 142     bdescr **    mut_lists;
 143
 144     // --------------------
 145     // evacuate flags
 146
 147     step *evac_step;               // Youngest generation that objects
 148                                    // should be evacuated to in
 149                                    // evacuate().  (Logically an
 150                                    // argument to evacuate, but it's
 151                                    // static a lot of the time so we
 152                                    // optimise it into a per-thread
 153                                    // variable).
 154
 155     rtsBool failed_to_evac;        // failure to evacuate an object typically
 156                                    // Causes it to be recorded in the mutable
 157                                    // object list
 158
 159     rtsBool eager_promotion;       // forces promotion to the evac gen
 160                                    // instead of the to-space
 161                                    // corresponding to the object
 162
 163     lnat thunk_selector_depth;     // ummm.... not used as of now
 164
 165 #ifdef USE_PAPI
 166     int papi_events;
 167 #endif
 168
 169     // -------------------
 170     // stats
 171
 172     lnat copied;
 173     lnat scanned;
 174     lnat any_work;
 175     lnat no_work;
 176     lnat scav_find_work;
 177
 178     // -------------------
 179     // workspaces
 180
 181     // array of workspaces, indexed by stp->abs_no.  This is placed
 182     // directly at the end of the gc_thread structure so that we can get from
 183     // the gc_thread pointer to a workspace using only pointer
 184     // arithmetic, no memory access.  This happens in the inner loop
 185     // of the GC, see Evac.c:alloc_for_copy().
 186     step_workspace steps[];
 187 } gc_thread;
 188
 189
 190 extern nat n_gc_threads;
 191
 192 /* -----------------------------------------------------------------------------
 193    The gct variable is thread-local and points to the current thread's
 194    gc_thread structure.  It is heavily accessed, so we try to put gct
 195    into a global register variable if possible; if we don't have a
 196    register then use gcc's __thread extension to create a thread-local
 197    variable.
 198
 199    Even on x86 where registers are scarce, it is worthwhile using a
 200    register variable here: I measured about a 2-5% slowdown with the
 201    __thread version.
 202    -------------------------------------------------------------------------- */
 203
 204 extern gc_thread **gc_threads;
 205
 206 #if defined(THREADED_RTS)
 207
 208 #define GLOBAL_REG_DECL(type,name,reg) register type name REG(reg);
 209
 210 #define SET_GCT(to) gct = (to)
 211
 212
 213
 214 #if (defined(i386_HOST_ARCH) && defined(linux_HOST_OS))
 215 // Using __thread is better than stealing a register on x86/Linux, because
 216 // we have too few registers available.  In my tests it was worth
 217 // about 5% in GC performance, but of course that might change as gcc
 218 // improves. -- SDM 2009/04/03
 219
 220 extern __thread gc_thread* gct;
 221 #define DECLARE_GCT __thread gc_thread* gct;
 222
 223
 224 #elif defined(sparc_TARGET_ARCH)
 225 // On SPARC we can't pin gct to a register. Names like %l1 are just offsets
 226 //      into the register window, which change on each function call.
 227 //
 228 //      There are eight global (non-window) registers, but they're used for other purposes.
 229 //      %g0     -- always zero
 230 //      %g1     -- volatile over function calls, used by the linker
 231 //      %g2-%g3 -- used as scratch regs by the C compiler (caller saves)
 232 //      %g4     -- volatile over function calls, used by the linker
 233 //      %g5-%g7 -- reserved by the OS
 234
 235 extern __thread gc_thread* gct;
 236 #define DECLARE_GCT __thread gc_thread* gct;
 237
 238
 239 #elif defined(REG_Base) && !defined(i386_HOST_ARCH)
 240 // on i386, REG_Base is %ebx which is also used for PIC, so we don't
 241 // want to steal it
 242
 243 GLOBAL_REG_DECL(gc_thread*, gct, REG_Base)
 244 #define DECLARE_GCT /* nothing */
 245
 246
 247 #elif defined(REG_R1)
 248
 249 GLOBAL_REG_DECL(gc_thread*, gct, REG_R1)
 250 #define DECLARE_GCT /* nothing */
 251
 252
 253 #elif defined(__GNUC__)
 254
 255 extern __thread gc_thread* gct;
 256 #define DECLARE_GCT __thread gc_thread* gct;
 257
 258 #else
 259
 260 #error Cannot find a way to declare the thread-local gct
 261
 262 #endif
 263
 264 #else  // not the threaded RTS
 265
 266 extern StgWord8 the_gc_thread[];
 267
 268 #define gct ((gc_thread*)&the_gc_thread)
 269 #define SET_GCT(to) /*nothing*/
 270 #define DECLARE_GCT /*nothing*/
 271
 272 #endif
 273
 274 #endif // GCTHREAD_H
 275