From: Simon Marlow
Date: Wed, 16 Apr 2008 21:34:36 +0000 (+0000)
Subject: GC: rearrange storage to reduce memory accesses in the inner loop
X-Git-Tag: Before_cabalised-GHC~235
X-Git-Url: http://git.megacz.com/?p=ghc-hetmet.git;a=commitdiff_plain;h=2aa877f8588da099351ef51efca3605fd87ea768

GC: rearrange storage to reduce memory accesses in the inner loop
---

diff --git a/includes/Storage.h b/includes/Storage.h
index a830b44..28225d7 100644
--- a/includes/Storage.h
+++ b/includes/Storage.h
@@ -53,7 +53,8 @@
  * ------------------------------------------------------------------------- */

 typedef struct step_ {
-    unsigned int         no;            // step number
+    unsigned int         no;            // step number in this generation
+    unsigned int         abs_no;        // absolute step number
     int                  is_compacted;  // compact this step? (old gen only)

     struct generation_ * gen;           // generation this step belongs to
@@ -67,28 +68,34 @@ typedef struct step_ {
     bdescr *             large_objects;  // large objects (doubly linked)
     unsigned int         n_large_blocks; // no. of blocks used by large objs

+    // ------------------------------------
     // Fields below are used during GC only

     // During GC, if we are collecting this step, blocks and n_blocks
     // are copied into the following two fields.  After GC, these blocks
     // are freed.
-    bdescr *     old_blocks;            // bdescr of first from-space block
-    unsigned int n_old_blocks;          // number of blocks in from-space
-
-    bdescr *     todos;                 // blocks waiting to be scavenged
-    unsigned int n_todos;               // count of above

 #if defined(THREADED_RTS)
+    char pad[128];                      // make sure the following is
+                                        // on a separate cache line.
     SpinLock     sync_todo;             // lock for todos
     SpinLock     sync_large_objects;    // lock for large_objects
                                         // and scavenged_large_objects
 #endif

+    bdescr *     old_blocks;            // bdescr of first from-space block
+    unsigned int n_old_blocks;          // number of blocks in from-space
+
+    bdescr *     todos;                 // blocks waiting to be scavenged
+    unsigned int n_todos;               // count of above
+
     bdescr *     scavenged_large_objects;  // live large objs after GC (d-link)
     unsigned int n_scavenged_large_blocks; // size (not count) of above

     bdescr *     bitmap;                // bitmap for compacting collection
+
+
 } step;
@@ -112,6 +119,8 @@ extern generation * RTS_VAR(generations);
 extern generation * RTS_VAR(g0);
 extern step * RTS_VAR(g0s0);
 extern generation * RTS_VAR(oldest_gen);
+extern step * RTS_VAR(all_steps);
+extern nat total_steps;

 /* -----------------------------------------------------------------------------
    Initialisation / De-initialisation
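The new pad[128] deserves a note: under THREADED_RTS the two spinlocks are contended across GC threads, and if they shared a cache line with the read-mostly fields around them, every lock operation would bounce that line between cores (false sharing). The following standalone sketch shows the shape of the trick; the field types are hypothetical stand-ins, not the real RTS declarations:

/* Illustrative sketch, not part of the patch.  128 bytes is a
   conservative bound: one 64-byte cache line plus the adjacent line
   that hardware prefetchers often pull in alongside it. */
typedef struct {
    void         *blocks;            /* hot, read-mostly fields */
    unsigned int  n_blocks;

    char pad[128];                   /* push the locks onto their own line */

    volatile int sync_todo;          /* contended; stand-in for SpinLock */
    volatile int sync_large_objects;
} padded_step;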
diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c
index 0a47c3b..4c386f7 100644
--- a/rts/sm/Evac.c
+++ b/rts/sm/Evac.c
@@ -55,13 +55,12 @@ alloc_for_copy (nat size, step *stp)
         }
     }

-    ws = &gct->steps[stp->gen_no][stp->no];
+    ws = &gct->steps[stp->abs_no];
+    // this compiles to a single mem access to stp->abs_no only

     /* chain a new block onto the to-space for the destination step if
      * necessary.
      */
-
-    ASSERT(ws->todo_free >= ws->todo_bd->free && ws->todo_free <= ws->todo_lim);
     to = ws->todo_free;
     if (to + size > ws->todo_lim) {
         to = gc_alloc_todo_block(ws);
     }
@@ -141,7 +140,7 @@ evacuate_large(StgPtr p)
         }
     }

-    ws = &gct->steps[new_stp->gen_no][new_stp->no];
+    ws = &gct->steps[new_stp->abs_no];

     bd->flags |= BF_EVACUATED;
     bd->step = new_stp;
     bd->gen_no = new_stp->gen_no;
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 09e2b2c..80ec6f2 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -118,7 +118,7 @@ nat mutlist_MUTVARS,

 /* Thread-local data for each GC thread
  */
-gc_thread *gc_threads = NULL;
+gc_thread **gc_threads = NULL;
 // gc_thread *gct = NULL;  // this thread's gct TODO: make thread-local

 // Number of threads running in *this* GC.  Affects how many
@@ -297,7 +297,7 @@ GarbageCollect ( rtsBool force_major_gc )

   // Initialise all our gc_thread structures
   for (t = 0; t < n_gc_threads; t++) {
-      init_gc_thread(&gc_threads[t]);
+      init_gc_thread(gc_threads[t]);
   }

   // the main thread is running: this prevents any other threads from
@@ -309,7 +309,7 @@ GarbageCollect ( rtsBool force_major_gc )
   copied = 0;

   // this is the main thread
-  gct = &gc_threads[0];
+  gct = gc_threads[0];

   /* -----------------------------------------------------------------------
    * follow all the roots that we know about:
@@ -421,32 +421,29 @@ GarbageCollect ( rtsBool force_major_gc )
       bdescr *prev;

       for (t = 0; t < n_gc_threads; t++) {
-          thr = &gc_threads[t];
-
-          for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
-              for (s = 0; s < generations[g].n_steps; s++) {
-                  ws = &thr->steps[g][s];
-                  if (g==0 && s==0) continue;
-
-                  // Not true?
-                  // ASSERT( ws->scan_bd == ws->todo_bd );
-                  ASSERT( ws->scan_bd ? ws->scan == ws->scan_bd->free : 1 );
-
-                  // Push the final block
-                  if (ws->scan_bd) { push_scan_block(ws->scan_bd, ws); }
-
-                  ASSERT(countBlocks(ws->scavd_list) == ws->n_scavd_blocks);
-
-                  prev = ws->scavd_list;
-                  for (bd = ws->scavd_list; bd != NULL; bd = bd->link) {
-                      bd->flags &= ~BF_EVACUATED;    // now from-space
-                      prev = bd;
-                  }
-                  prev->link = ws->stp->blocks;
-                  ws->stp->blocks = ws->scavd_list;
-                  ws->stp->n_blocks += ws->n_scavd_blocks;
-                  ASSERT(countBlocks(ws->stp->blocks) == ws->stp->n_blocks);
-              }
+          thr = gc_threads[t];
+
+          // not step 0
+          for (s = 1; s < total_steps; s++) {
+              ws = &thr->steps[s];
+              // Not true?
+              // ASSERT( ws->scan_bd == ws->todo_bd );
+              ASSERT( ws->scan_bd ? ws->scan == ws->scan_bd->free : 1 );
+
+              // Push the final block
+              if (ws->scan_bd) { push_scan_block(ws->scan_bd, ws); }
+
+              ASSERT(countBlocks(ws->scavd_list) == ws->n_scavd_blocks);
+
+              prev = ws->scavd_list;
+              for (bd = ws->scavd_list; bd != NULL; bd = bd->link) {
+                  bd->flags &= ~BF_EVACUATED;    // now from-space
+                  prev = bd;
+              }
+              prev->link = ws->stp->blocks;
+              ws->stp->blocks = ws->scavd_list;
+              ws->stp->n_blocks += ws->n_scavd_blocks;
+              ASSERT(countBlocks(ws->stp->blocks) == ws->stp->n_blocks);
          }
       }
   }
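The comment added in alloc_for_copy is the point of the whole patch: with the old two-level table, reaching a workspace cost two dependent loads, while the flat array embedded in the thread record makes the only load stp->abs_no itself. A minimal sketch contrasting the two layouts, with hypothetical stand-in types rather than the real gc_thread/step_workspace:

/* Illustrative sketch, not part of the patch. */
typedef struct { long todo_free; } workspace;

typedef struct {
    workspace **steps2d;   /* old layout: one row pointer per generation */
    workspace   steps[];   /* new layout: flat array indexed by abs_no   */
} thread;

workspace *lookup_old(thread *t, int gen_no, int step_no) {
    /* two loads: t->steps2d, then steps2d[gen_no] */
    return &t->steps2d[gen_no][step_no];
}

workspace *lookup_new(thread *t, int abs_no) {
    /* no loads: the address is t + offsetof(steps) + abs_no * sizeof */
    return &t->steps[abs_no];
}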
@@ -920,11 +917,15 @@ initialise_N (rtsBool force_major_gc)
    Initialise the gc_thread structures.
    -------------------------------------------------------------------------- */

-static void
-alloc_gc_thread (gc_thread *t, int n)
+static gc_thread *
+alloc_gc_thread (int n)
 {
-    nat g, s;
+    nat s;
     step_workspace *ws;
+    gc_thread *t;
+
+    t = stgMallocBytes(sizeof(gc_thread) + total_steps * sizeof(step_workspace),
+                       "alloc_gc_thread");

 #ifdef THREADED_RTS
     t->id = 0;
@@ -944,32 +945,24 @@ alloc_gc_thread (gc_thread *t, int n)
     t->papi_events = -1;
 #endif

-    t->steps = stgMallocBytes(RtsFlags.GcFlags.generations *
-                              sizeof(step_workspace *),
-                              "initialise_gc_thread");
-
-    for (g = 0; g < RtsFlags.GcFlags.generations; g++)
+    for (s = 0; s < total_steps; s++)
     {
-        t->steps[g] = stgMallocBytes(generations[g].n_steps *
-                                     sizeof(step_workspace),
-                                     "initialise_gc_thread/2");
-
-        for (s = 0; s < generations[g].n_steps; s++)
-        {
-            ws = &t->steps[g][s];
-            ws->stp = &generations[g].steps[s];
-            ws->gct = t;
-
-            ws->scan_bd = NULL;
-            ws->scan = NULL;
-
-            ws->todo_bd = NULL;
-            ws->buffer_todo_bd = NULL;
-
-            ws->scavd_list = NULL;
-            ws->n_scavd_blocks = 0;
-        }
+        ws = &t->steps[s];
+        ws->stp = &all_steps[s];
+        ASSERT(s == ws->stp->abs_no);
+        ws->gct = t;
+
+        ws->scan_bd = NULL;
+        ws->scan = NULL;
+
+        ws->todo_bd = NULL;
+        ws->buffer_todo_bd = NULL;
+
+        ws->scavd_list = NULL;
+        ws->n_scavd_blocks = 0;
     }
+
+    return t;
 }


@@ -980,17 +973,17 @@ alloc_gc_threads (void)
 #if defined(THREADED_RTS)
         nat i;
         gc_threads = stgMallocBytes (RtsFlags.ParFlags.gcThreads *
-                                     sizeof(gc_thread),
+                                     sizeof(gc_thread*),
                                      "alloc_gc_threads");
         for (i = 0; i < RtsFlags.ParFlags.gcThreads; i++) {
-            alloc_gc_thread(&gc_threads[i], i);
+            gc_threads[i] = alloc_gc_thread(i);
         }
 #else
-        gc_threads = stgMallocBytes (sizeof(gc_thread),
+        gc_threads = stgMallocBytes (sizeof(gc_thread*),
                                      "alloc_gc_threads");
-        alloc_gc_thread(gc_threads, 0);
+        gc_threads[0] = alloc_gc_thread(0);
 #endif
     }
 }
@@ -1130,7 +1123,7 @@ start_gc_threads (void)
         // Start from 1: the main thread is 0
         for (i = 1; i < RtsFlags.ParFlags.gcThreads; i++) {
             createOSThread(&id, (OSThreadProc*)&gc_thread_entry,
-                           &gc_threads[i]);
+                           gc_threads[i]);
         }
         done = rtsTrue;
     }
@@ -1144,10 +1137,10 @@ wakeup_gc_threads (nat n_threads USED_IF_THREADS)
     nat i;
     for (i=1; i < n_threads; i++) {
         inc_running();
-        ACQUIRE_LOCK(&gc_threads[i].wake_mutex);
-        gc_threads[i].wakeup = rtsTrue;
-        signalCondition(&gc_threads[i].wake_cond);
-        RELEASE_LOCK(&gc_threads[i].wake_mutex);
+        ACQUIRE_LOCK(&gc_threads[i]->wake_mutex);
+        gc_threads[i]->wakeup = rtsTrue;
+        signalCondition(&gc_threads[i]->wake_cond);
+        RELEASE_LOCK(&gc_threads[i]->wake_mutex);
     }
 #endif
 }
@@ -1251,7 +1244,7 @@ init_collected_gen (nat g, nat n_threads)
             // we don't copy objects into g0s0, unless -G0
             if (g==0 && s==0 && RtsFlags.GcFlags.generations > 1) continue;

-            ws = &gc_threads[t].steps[g][s];
+            ws = &gc_threads[t]->steps[g * RtsFlags.GcFlags.steps + s];

             ws->scan_bd = NULL;
             ws->scan = NULL;
@@ -1292,7 +1285,7 @@ init_uncollected_gen (nat g, nat threads)

     for (t = 0; t < threads; t++) {
         for (s = 0; s < generations[g].n_steps; s++) {

-            ws = &gc_threads[t].steps[g][s];
+            ws = &gc_threads[t]->steps[g * RtsFlags.GcFlags.steps + s];
             stp = ws->stp;

             ws->buffer_todo_bd = NULL;
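alloc_gc_thread now gets the thread record and its whole workspace array from a single allocation, using a C99 flexible array member. A sketch of the pattern, with hypothetical stand-in types:

/* Illustrative sketch, not part of the patch. */
#include <stdlib.h>

typedef struct { int n_scavd_blocks; } step_ws;

typedef struct gc_thr_ {
    int     thread_index;
    step_ws steps[];            /* flexible array member: must come last */
} gc_thr;

gc_thr *alloc_thr(size_t n_steps, int index) {
    /* header and all workspaces come from one malloc, so they are
       contiguous and &t->steps[i] is computed, never loaded */
    gc_thr *t = malloc(sizeof(gc_thr) + n_steps * sizeof(step_ws));
    if (t != NULL) {
        t->thread_index = index;
        for (size_t s = 0; s < n_steps; s++)
            t->steps[s].n_scavd_blocks = 0;
    }
    return t;
}

Because sizeof(gc_thr) no longer covers the trailing array, these records cannot sit in a plain array of structs, which is why gc_threads becomes an array of pointers (gc_thread **) throughout the patch.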
diff --git a/rts/sm/GC.h b/rts/sm/GC.h
index 03f527d..f98e4a1 100644
--- a/rts/sm/GC.h
+++ b/rts/sm/GC.h
@@ -116,8 +116,6 @@ typedef struct gc_thread_ {
 #endif
     nat thread_index;           // a zero based index identifying the thread

-    step_workspace ** steps;    // 2-d array (gen,step) of workspaces
-
     bdescr * free_blocks;       // a buffer of free blocks for this thread
                                 //  during GC without accessing the block
                                 //  allocators spin lock.
@@ -148,13 +146,22 @@ typedef struct gc_thread_ {
 #ifdef USE_PAPI
     int papi_events;
 #endif
-
+
+    // -------------------
+    // workspaces
+
+    // array of workspaces, indexed by stp->abs_no.  This is placed
+    // directly at the end of the gc_thread structure so that we can get from
+    // the gc_thread pointer to a workspace using only pointer
+    // arithmetic, no memory access.  This happens in the inner loop
+    // of the GC, see Evac.c:alloc_for_copy().
+    step_workspace steps[];
 } gc_thread;


 extern nat N;
 extern rtsBool major_gc;

-extern gc_thread *gc_threads;
+extern gc_thread **gc_threads;
 register gc_thread *gct __asm__("%rbx");
 // extern gc_thread *gct;  // this thread's gct TODO: make thread-local

diff --git a/rts/sm/Scav.c b/rts/sm/Scav.c
index 9361fb7..b22f244 100644
--- a/rts/sm/Scav.c
+++ b/rts/sm/Scav.c
@@ -1401,40 +1401,39 @@ static rtsBool
 scavenge_find_global_work (void)
 {
     bdescr *bd;
-    int g, s;
+    int s;
     rtsBool flag;
     step_workspace *ws;

     flag = rtsFalse;
-    for (g = RtsFlags.GcFlags.generations-1; g >= 0; g--) {
-        for (s = generations[g].n_steps-1; s >= 0; s--) {
-            if (g == 0 && s == 0 && RtsFlags.GcFlags.generations > 1) {
-                continue;
-            }
-            ws = &gct->steps[g][s];
+    for (s = total_steps-1; s>=0; s--)
+    {
+        if (s == 0 && RtsFlags.GcFlags.generations > 1) {
+            continue;
+        }
+        ws = &gct->steps[s];

-            // If we have any large objects to scavenge, do them now.
-            if (ws->todo_large_objects) {
-                scavenge_large(ws);
-                flag = rtsTrue;
-            }
+        // If we have any large objects to scavenge, do them now.
+        if (ws->todo_large_objects) {
+            scavenge_large(ws);
+            flag = rtsTrue;
+        }

-            if ((bd = grab_todo_block(ws)) != NULL) {
-                // no need to assign this to ws->scan_bd, we're going
-                // to scavenge the whole thing and then push it on
-                // our scavd list.  This saves pushing out the
-                // scan_bd block, which might be partial.
-                if (N == 0) {
-                    scavenge_block0(bd, bd->start);
-                } else {
-                    scavenge_block(bd, bd->start);
-                }
-                push_scan_block(bd, ws);
-                return rtsTrue;
-            }
+        if ((bd = grab_todo_block(ws)) != NULL) {
+            // no need to assign this to ws->scan_bd, we're going
+            // to scavenge the whole thing and then push it on
+            // our scavd list.  This saves pushing out the
+            // scan_bd block, which might be partial.
+            if (N == 0) {
+                scavenge_block0(bd, bd->start);
+            } else {
+                scavenge_block(bd, bd->start);
+            }
+            push_scan_block(bd, ws);
+            return rtsTrue;
+        }

-            if (flag) return rtsTrue;
-        }
+        if (flag) return rtsTrue;
     }
     return rtsFalse;
 }
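Two details of GC.h work together here: the workspace array sits at a fixed offset inside gc_thread, and gct itself is pinned in a register via GCC's global register variable extension (x86-64 only, as written). Together they make the address of a workspace pure register-relative arithmetic. A sketch, with stand-in types, of what the declaration means:

/* Illustrative sketch, not part of the patch.  GCC-specific. */
struct ws   { long todo_free; };
struct gcst { int idx; struct ws steps[]; };

/* reserve %rbx for the current thread's GC state, like gct in GC.h */
register struct gcst *cur __asm__("%rbx");

long *todo_free_of(int abs_no) {
    /* one lea/mov off %rbx: cur itself is never loaded from memory */
    return &cur->steps[abs_no].todo_free;
}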
@@ -1454,60 +1453,58 @@ scavenge_find_global_work (void)
 static rtsBool
 scavenge_find_local_work (void)
 {
-    int g, s;
+    int s;
     step_workspace *ws;
     rtsBool flag;

     flag = rtsFalse;
-    for (g = RtsFlags.GcFlags.generations; --g >= 0; ) {
-        for (s = generations[g].n_steps; --s >= 0; ) {
-            if (g == 0 && s == 0 && RtsFlags.GcFlags.generations > 1) {
-                continue;
-            }
-            ws = &gct->steps[g][s];
+    for (s = total_steps-1; s >= 0; s--) {
+        if (s == 0 && RtsFlags.GcFlags.generations > 1) {
+            continue;
+        }
+        ws = &gct->steps[s];

-            if (ws->todo_bd != NULL)
-            {
-                ws->todo_bd->free = ws->todo_free;
+        if (ws->todo_bd != NULL)
+        {
+            ws->todo_bd->free = ws->todo_free;
+        }
+
+        // If we have a todo block and no scan block, start
+        // scanning the todo block.
+        if (ws->scan_bd == NULL && ws->todo_bd != NULL)
+        {
+            ws->scan_bd = ws->todo_bd;
+            ws->scan = ws->scan_bd->start;
+        }
+
+        // If we have a scan block with some work to do,
+        // scavenge everything up to the free pointer.
+        if (ws->scan != NULL && ws->scan < ws->scan_bd->free)
+        {
+            if (N == 0) {
+                scavenge_block0(ws->scan_bd, ws->scan);
+            } else {
+                scavenge_block(ws->scan_bd, ws->scan);
             }
-
-            // If we have a todo block and no scan block, start
-            // scanning the todo block.
-            if (ws->scan_bd == NULL && ws->todo_bd != NULL)
-            {
-                ws->scan_bd = ws->todo_bd;
-                ws->scan = ws->scan_bd->start;
-            }
-
-            // If we have a scan block with some work to do,
-            // scavenge everything up to the free pointer.
-            if (ws->scan != NULL && ws->scan < ws->scan_bd->free)
-            {
-                if (N == 0) {
-                    scavenge_block0(ws->scan_bd, ws->scan);
-                } else {
-                    scavenge_block(ws->scan_bd, ws->scan);
-                }
-                ws->scan = ws->scan_bd->free;
-                flag = rtsTrue;
-            }
-
-            if (ws->scan_bd != NULL && ws->scan == ws->scan_bd->free
-                && ws->scan_bd != ws->todo_bd)
-            {
-                // we're not going to evac any more objects into
-                // this block, so push it now.
-                push_scan_block(ws->scan_bd, ws);
-                ws->scan_bd = NULL;
-                ws->scan = NULL;
-                // we might be able to scan the todo block now.  But
-                // don't do it right away: there might be full blocks
-                // waiting to be scanned as a result of scavenge_block above.
-                flag = rtsTrue;
-            }
-
-            if (flag) return rtsTrue;
-        }
+            ws->scan = ws->scan_bd->free;
+            flag = rtsTrue;
+        }
+
+        if (ws->scan_bd != NULL && ws->scan == ws->scan_bd->free
+            && ws->scan_bd != ws->todo_bd)
+        {
+            // we're not going to evac any more objects into
+            // this block, so push it now.
+            push_scan_block(ws->scan_bd, ws);
+            ws->scan_bd = NULL;
+            ws->scan = NULL;
+            // we might be able to scan the todo block now.  But
+            // don't do it right away: there might be full blocks
+            // waiting to be scanned as a result of scavenge_block above.
+            flag = rtsTrue;
+        }
+
+        if (flag) return rtsTrue;
     }
     return rtsFalse;
 }
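All of the rewritten loops in Scav.c rely on the same equivalence: counting abs_no downwards visits workspaces in the old nested-loop order (oldest generation first, then descending step), and the old g0s0 test collapses to s == 0. A sketch of the collapsed loop shape, using a stand-in function rather than RTS types:

/* Illustrative sketch, not part of the patch. */
static void visit_all(int total_steps, int n_generations) {
    for (int s = total_steps - 1; s >= 0; s--) {
        if (s == 0 && n_generations > 1)
            continue;               /* abs_no 0 is always g0s0, which is
                                       never copied into unless -G1 */
        /* ... examine workspace s, oldest step first ... */
    }
}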
@@ -1551,7 +1548,7 @@ loop:
 rtsBool
 any_work (void)
 {
-    int g, s;
+    int s;
     step_workspace *ws;

     write_barrier();
@@ -1570,15 +1567,13 @@ any_work (void)

     // Check for global work in any step.  We don't need to check for
     // local work, because we have already exited scavenge_loop(),
     // which means there is no local work for this thread.
-    for (g = RtsFlags.GcFlags.generations-1; g >= 0; g--) {
-        for (s = generations[g].n_steps-1; s >= 0; s--) {
-            if (g == 0 && s == 0 && RtsFlags.GcFlags.generations > 1) {
-                continue;
-            }
-            ws = &gct->steps[g][s];
-            if (ws->todo_large_objects) return rtsTrue;
-            if (ws->stp->todos) return rtsTrue;
-        }
+    for (s = total_steps-1; s >= 0; s--) {
+        if (s == 0 && RtsFlags.GcFlags.generations > 1) {
+            continue;
+        }
+        ws = &gct->steps[s];
+        if (ws->todo_large_objects) return rtsTrue;
+        if (ws->stp->todos) return rtsTrue;
     }

     return rtsFalse;
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index fef882a..7f4c723 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -52,6 +52,9 @@ generation *g0 = NULL; /* generation 0, for convenience */
 generation *oldest_gen = NULL; /* oldest generation, for convenience */
 step *g0s0 = NULL; /* generation 0, step 0, for convenience */

+nat total_steps = 0;
+step *all_steps = NULL; /* single array of steps */
+
 ullong total_allocated = 0; /* total memory allocated during run */

 nat n_nurseries = 0; /* == RtsFlags.ParFlags.nNodes, convenience */
@@ -81,6 +84,7 @@ static void
 initStep (step *stp, int g, int s)
 {
     stp->no = s;
+    stp->abs_no = RtsFlags.GcFlags.steps * g + s;
     stp->blocks = NULL;
     stp->n_blocks = 0;
     stp->old_blocks = NULL;
@@ -104,7 +108,6 @@ initStorage( void )
 {
   nat g, s;
   generation *gen;
-  step *step_arr;

   if (generations != NULL) {
       // multi-init protection
@@ -152,10 +155,9 @@ initStorage( void )
      it this way, because we need the invariant that two step pointers
      can be directly compared to see which is the oldest.
      Remember that the last generation has only one step. */
-  step_arr = stgMallocBytes(sizeof(struct step_)
-                            * (1 + ((RtsFlags.GcFlags.generations - 1)
-                                    * RtsFlags.GcFlags.steps)),
-                            "initStorage: steps");
+  total_steps = 1 + (RtsFlags.GcFlags.generations - 1) * RtsFlags.GcFlags.steps;
+  all_steps   = stgMallocBytes(total_steps * sizeof(struct step_),
+                               "initStorage: steps");

   /* Initialise all generations */
   for(g = 0; g < RtsFlags.GcFlags.generations; g++) {
@@ -177,19 +179,19 @@ initStorage( void )

       /* Oldest generation: one step */
       oldest_gen->n_steps = 1;
-      oldest_gen->steps   = step_arr + (RtsFlags.GcFlags.generations - 1)
+      oldest_gen->steps   = all_steps + (RtsFlags.GcFlags.generations - 1)
                                       * RtsFlags.GcFlags.steps;

       /* set up all except the oldest generation with 2 steps */
       for(g = 0; g < RtsFlags.GcFlags.generations-1; g++) {
          generations[g].n_steps = RtsFlags.GcFlags.steps;
-         generations[g].steps = step_arr + g * RtsFlags.GcFlags.steps;
+         generations[g].steps = all_steps + g * RtsFlags.GcFlags.steps;
       }

   } else {
       /* single generation, i.e. a two-space collector */
       g0->n_steps = 1;
-      g0->steps   = step_arr;
+      g0->steps   = all_steps;
   }

 #ifdef THREADED_RTS
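The arithmetic in initStep and initStorage has to agree: abs_no = steps * g + s is dense in 0 .. total_steps-1 precisely because every generation except the single-step oldest one has exactly RtsFlags.GcFlags.steps steps. A throwaway check of that invariant, in plain C with hypothetical values:

/* Illustrative sketch, not part of the patch. */
#include <assert.h>
#include <stdio.h>

int main(void) {
    int gens = 3, steps = 2;                       /* hypothetical config */
    int total_steps = 1 + (gens - 1) * steps;
    int next = 0;
    for (int g = 0; g < gens; g++) {
        int n_steps = (g == gens - 1) ? 1 : steps; /* oldest gen: one step */
        for (int s = 0; s < n_steps; s++) {
            int abs_no = steps * g + s;
            assert(abs_no == next++);              /* dense, in address order */
        }
    }
    assert(next == total_steps);
    printf("total_steps = %d\n", total_steps);     /* prints 5 here */
    return 0;
}

The same density is what lets the dense array double as the address-ordered step array that the "two step pointers can be directly compared" invariant in initStorage depends on.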