From: Simon Marlow
Date: Mon, 9 Mar 2009 12:13:00 +0000 (+0000)
Subject: Redesign 64-bit HEAP_ALLOCED (FIX #2934 at the same time)
X-Git-Tag: 2009-03-13~9
X-Git-Url: http://git.megacz.com/?p=ghc-hetmet.git;a=commitdiff_plain;h=9fe7b8ea2136a4a07752b2851840c9366706f832

Redesign 64-bit HEAP_ALLOCED (FIX #2934 at the same time)

After much experimentation, I've found a formulation for HEAP_ALLOCED
that (a) improves performance, and (b) doesn't have any race conditions
when used concurrently.  GC performance on x86_64 should be improved
slightly.  See extensive comments in MBlock.h for the details.
---

diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c
index 8d37f27..062b73f 100644
--- a/rts/sm/Evac.c
+++ b/rts/sm/Evac.c
@@ -29,6 +29,7 @@ StgWord64 whitehole_spin = 0;
 
 #if defined(THREADED_RTS) && !defined(PARALLEL_GC)
 #define evacuate(p) evacuate1(p)
+#define HEAP_ALLOCED_GC(p) HEAP_ALLOCED(p)
 #endif
 
 #if !defined(PARALLEL_GC)
@@ -364,7 +365,7 @@ loop:
 
   ASSERT(LOOKS_LIKE_CLOSURE_PTR(q));
 
-  if (!HEAP_ALLOCED(q)) {
+  if (!HEAP_ALLOCED_GC(q)) {
 
       if (!major_gc) return;
 
@@ -780,7 +781,7 @@ unchain_thunk_selectors(StgSelector *p, StgClosure *val)
         // invoke eval_thunk_selector(), the recursive calls will not
         // evacuate the value (because we want to select on the value,
         // not evacuate it), so in this case val is in from-space.
-        // ASSERT(!HEAP_ALLOCED(val) || Bdescr((P_)val)->gen_no > N || (Bdescr((P_)val)->flags & BF_EVACUATED));
+        // ASSERT(!HEAP_ALLOCED_GC(val) || Bdescr((P_)val)->gen_no > N || (Bdescr((P_)val)->flags & BF_EVACUATED));
 
         prev = (StgSelector*)((StgClosure *)p)->payload[0];
 
@@ -834,7 +835,7 @@ eval_thunk_selector (StgClosure **q, StgSelector * p, rtsBool evac)
 selector_chain:
 
     bd = Bdescr((StgPtr)p);
-    if (HEAP_ALLOCED(p)) {
+    if (HEAP_ALLOCED_GC(p)) {
         // If the THUNK_SELECTOR is in to-space or in a generation that we
         // are not collecting, then bale out early. We won't be able to
         // save any space in any case, and updating with an indirection is
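
The core of the redesign is the tagged, direct-mapped cache described in
the MBlock.h comments further down: each 32-bit entry holds
((mblock << 1) | value), so a single XOR tells us both whether we have a
hit and what the answer is.  Here is a minimal standalone sketch of that
encoding, assuming MBLOCK_SHIFT = 20 and an 8k-entry cache as in the
commit; the demo_* names are invented for illustration and are not RTS
code.

    #include <stdio.h>
    #include <stdint.h>

    #define DEMO_MBLOCK_SHIFT 20          /* 1MB mblocks, as in the RTS   */
    #define DEMO_MBC_ENTRIES  (1 << 13)   /* 8k entries, as in the commit */

    static uint32_t demo_cache[DEMO_MBC_ENTRIES];

    static int demo_lookup(uintptr_t p)
    {
        uintptr_t mblock = p >> DEMO_MBLOCK_SHIFT;
        uint32_t  entry  = demo_cache[mblock & (DEMO_MBC_ENTRIES - 1)];
        uint32_t  value  = entry ^ (uint32_t)(mblock << 1);

        if (value == 1) return 1;   /* hit: mblock is heap-allocated   */
        if (value == 0) return 0;   /* hit: mblock is not heap memory  */
        return -1;                  /* miss: tag mismatch, consult map */
    }

    int main(void)
    {
        uintptr_t p = (uintptr_t)0x2aab12345678ULL;
        uintptr_t mblock = p >> DEMO_MBLOCK_SHIFT;

        printf("%d\n", demo_lookup(p));                 /* -1: cold cache */
        demo_cache[mblock & (DEMO_MBC_ENTRIES - 1)] =
            ((uint32_t)(mblock << 1)) | 1;              /* fill: heap     */
        printf("%d\n", demo_lookup(p));                 /* 1: cache hit   */
        return 0;
    }

A stale or empty slot XORs to something other than 0 or 1, which is why
initMBlocks() below fills the cache with 0xff bytes: an all-ones entry
can never alias a valid tagged mblock number, so every slot starts out
as a guaranteed miss.
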
diff --git a/rts/sm/GCAux.c b/rts/sm/GCAux.c
index 66806f4..c1ff541 100644
--- a/rts/sm/GCAux.c
+++ b/rts/sm/GCAux.c
@@ -48,7 +48,7 @@ isAlive(StgClosure *p)
     // Problem here is that we sometimes don't set the link field, eg.
     // for static closures with an empty SRT or CONSTR_STATIC_NOCAFs.
     //
-    if (!HEAP_ALLOCED(q)) {
+    if (!HEAP_ALLOCED_GC(q)) {
 	return p;
     }
 
diff --git a/rts/sm/MBlock.c b/rts/sm/MBlock.c
index 28aa7b0..b3fa13b 100644
--- a/rts/sm/MBlock.c
+++ b/rts/sm/MBlock.c
@@ -17,13 +17,10 @@
 #include "Trace.h"
 #include "OSMem.h"
 
-lnat mblocks_allocated = 0;
+#include <string.h>
 
-void
-initMBlocks(void)
-{
-    osMemInit();
-}
+lnat mblocks_allocated = 0;
+lnat mpc_misses = 0;
 
 /* -----------------------------------------------------------------------------
    The MBlock Map: provides our implementation of HEAP_ALLOCED()
@@ -31,12 +28,21 @@ initMBlocks(void)
 
 #if SIZEOF_VOID_P == 4
 StgWord8 mblock_map[MBLOCK_MAP_SIZE]; // initially all zeros
+
+static void
+markHeapAlloced(void *p)
+{
+    mblock_map[MBLOCK_MAP_ENTRY(p)] = 1;
+}
+
 #elif SIZEOF_VOID_P == 8
-static MBlockMap dummy_mblock_map;
-MBlockMap *mblock_cache = &dummy_mblock_map;
-nat mblock_map_count = 0;
+
 MBlockMap **mblock_maps = NULL;
+
+nat mblock_map_count = 0;
+
+MbcCacheLine mblock_cache[MBC_ENTRIES];
+
 static MBlockMap *
 findMBlockMap(void *p)
 {
@@ -52,39 +58,56 @@ findMBlockMap(void *p)
     return NULL;
 }
 
-StgBool
-slowIsHeapAlloced(void *p)
+StgBool HEAP_ALLOCED_miss(StgWord mblock, void *p)
 {
-    MBlockMap *map = findMBlockMap(p);
-    if(map)
+    MBlockMap *map;
+    MBlockMapLine value;
+    nat entry_no;
+
+    entry_no = mblock & (MBC_ENTRIES-1);
+
+    map = findMBlockMap(p);
+    if (map)
     {
-        mblock_cache = map;
-        return map->mblocks[MBLOCK_MAP_ENTRY(p)];
+        mpc_misses++;
+        value = map->lines[MBLOCK_MAP_LINE(p)];
+        mblock_cache[entry_no] = (mblock<<1) | value;
+        return value;
     }
     else
-        return 0;
+    {
+        mblock_cache[entry_no] = (mblock<<1);
+        return 0;
+    }
 }
-#endif
 
 static void
 markHeapAlloced(void *p)
 {
-#if SIZEOF_VOID_P == 4
-    mblock_map[MBLOCK_MAP_ENTRY(p)] = 1;
-#elif SIZEOF_VOID_P == 8
     MBlockMap *map = findMBlockMap(p);
     if(map == NULL)
     {
 	mblock_map_count++;
 	mblock_maps = realloc(mblock_maps,
			      sizeof(MBlockMap*) * mblock_map_count);
-	map = mblock_maps[mblock_map_count-1] = calloc(1,sizeof(MBlockMap));
+	map = mblock_maps[mblock_map_count-1] =
+	    stgMallocBytes(sizeof(MBlockMap),"markHeapAlloced");
+	memset(map,0,sizeof(MBlockMap));
 	map->addrHigh32 = (StgWord32) (((StgWord)p) >> 32);
     }
-    map->mblocks[MBLOCK_MAP_ENTRY(p)] = 1;
-    mblock_cache = map;
-#endif
+
+    map->lines[MBLOCK_MAP_LINE(p)] = 1;
+
+    {
+	StgWord mblock;
+	nat entry_no;
+
+	mblock   = (StgWord)p >> MBLOCK_SHIFT;
+	entry_no = mblock & (MBC_ENTRIES-1);
+	mblock_cache[entry_no] = (mblock << 1) + 1;
+    }
 }
+#endif
 
 /* ----------------------------------------------------------------------------
    Debugging code for traversing the allocated MBlocks
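
On a cache miss, HEAP_ALLOCED_miss() above falls back to the two-level
mblock map: a list of per-4GB MBlockMaps selected by the top 32 address
bits, each holding one MBlockMapLine per mblock.  A rough standalone
sketch of that lookup, under the same MBLOCK_SHIFT = 20 assumption (the
demo_* names are hypothetical, not RTS identifiers):

    #include <stdint.h>
    #include <stddef.h>

    #define DEMO_MBLOCK_SHIFT 20

    typedef struct {
        uint32_t addrHigh32;                           /* top 32 address bits */
        uint8_t  lines[1 << (32 - DEMO_MBLOCK_SHIFT)]; /* 1 byte per mblock   */
    } DemoMap;

    static DemoMap **demo_maps      = NULL;
    static unsigned  demo_map_count = 0;

    /* Returns 1 if p lies inside a heap mblock, 0 otherwise. */
    static int demo_map_lookup(void *p)
    {
        uintptr_t w = (uintptr_t)p;
        unsigned  j;

        for (j = 0; j < demo_map_count; j++) {
            if (demo_maps[j]->addrHigh32 == (uint32_t)(w >> 32)) {
                return demo_maps[j]->lines[(w & 0xffffffff) >> DEMO_MBLOCK_SHIFT];
            }
        }
        return 0;   /* no map covers this 4GB region: not heap */
    }

The linear scan over maps is cheap in practice because a heap rarely
spans more than a handful of 4GB regions, and the cache in front of it
absorbs almost all lookups anyway (mpc_misses counts the exceptions).
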
@@ -125,47 +148,59 @@ void * getNextMBlock(void *mblock)
 
 #elif SIZEOF_VOID_P == 8
 
-STATIC_INLINE
-void * mapEntryToMBlock(MBlockMap *map, nat i)
-{
-    return (void *)(((StgWord)map->addrHigh32) << 32) +
-	   ((StgWord)i << MBLOCK_SHIFT);
-}
-
-void * getFirstMBlock(void)
-{
-    MBlockMap *map;
-    nat i, j;
-
-    for (j = 0; j < mblock_map_count; j++)  {
-	map = mblock_maps[j];
-	for (i = 0; i < MBLOCK_MAP_SIZE; i++) {
-	    if (map->mblocks[i]) return mapEntryToMBlock(map,i);
-	}
-    }
-    return NULL;
-}
-
-void * getNextMBlock(void *mblock)
+void * getNextMBlock(void *p)
 {
     MBlockMap *map;
-    nat i, j;
+    nat off, j;
+    nat line_no;
+    MBlockMapLine line;
 
     for (j = 0; j < mblock_map_count; j++)  {
	map = mblock_maps[j];
-	if (map->addrHigh32 == (StgWord)mblock >> 32) break;
+	if (map->addrHigh32 == (StgWord)p >> 32) break;
     }
     if (j == mblock_map_count) return NULL;
 
     for (; j < mblock_map_count; j++) {
	map = mblock_maps[j];
-	if (map->addrHigh32 == (StgWord)mblock >> 32) {
-	    i = MBLOCK_MAP_ENTRY(mblock) + 1;
+	if (map->addrHigh32 == (StgWord)p >> 32) {
+	    line_no = MBLOCK_MAP_LINE(p);
+	    off = (((StgWord)p >> MBLOCK_SHIFT) & (MBC_LINE_SIZE-1)) + 1;
+	    // + 1 because we want the *next* mblock
	} else {
-	    i = 0;
+	    line_no = 0; off = 0;
	}
-	for (; i < MBLOCK_MAP_SIZE; i++) {
-	    if (map->mblocks[i]) return mapEntryToMBlock(map,i);
+	for (; line_no < MBLOCK_MAP_ENTRIES; line_no++) {
+	    line = map->lines[line_no];
+	    for (; off < MBC_LINE_SIZE; off++) {
+		if (line & (1<<off)) {
+		    return (void*)(((StgWord)map->addrHigh32 << 32) +
+				   line_no * MBC_LINE_SIZE * MBLOCK_SIZE +
+				   off * MBLOCK_SIZE);
+		}
+	    }
+	    off = 0;
	}
     }
     return NULL;
 }
+
+void * getFirstMBlock(void)
+{
+    MBlockMap *map = mblock_maps[0];
+    nat line_no, off;
+    MbcCacheLine line;
+
+    for (line_no = 0; line_no < MBLOCK_MAP_ENTRIES; line_no++) {
+	line = map->lines[line_no];
+	if (line) {
+	    for (off = 0; off < MBC_LINE_SIZE; off++) {
+		if (line & (1<<off)) {
+		    return (void*)(((StgWord)map->addrHigh32 << 32) +
+				   line_no * MBC_LINE_SIZE * MBLOCK_SIZE +
+				   off * MBLOCK_SIZE);
+		}
+	    }
+	}
+    }
+    return NULL;
+}
@@ -213,3 +248,12 @@ freeAllMBlocks(void)
 {
     osFreeAllMBlocks();
 }
+
+void
+initMBlocks(void)
+{
+    osMemInit();
+#if SIZEOF_VOID_P == 8
+    memset(mblock_cache,0xff,sizeof(mblock_cache));
+#endif
+}
diff --git a/rts/sm/MBlock.h b/rts/sm/MBlock.h
index ef6f8de..f9dddc3 100644
--- a/rts/sm/MBlock.h
+++ b/rts/sm/MBlock.h
@@ -12,6 +12,8 @@
 #ifndef MBLOCK_H
 #define MBLOCK_H
 
+#include "GC.h"
+
 extern lnat RTS_VAR(mblocks_allocated);
 
 extern void initMBlocks(void);
@@ -59,25 +61,145 @@ extern StgWord8 mblock_map[];
 # define MBLOCK_MAP_SIZE 4096
 # define MBLOCK_MAP_ENTRY(p) ((StgWord)(p) >> MBLOCK_SHIFT)
 # define HEAP_ALLOCED(p) mblock_map[MBLOCK_MAP_ENTRY(p)]
+# define HEAP_ALLOCED_GC(p) HEAP_ALLOCED(p)
+
+/* -----------------------------------------------------------------------------
+   HEAP_ALLOCED for 64-bit machines.
+
+   Here are some cache layout options:
+
+   [1]
+   16KB cache of 16-bit entries, 1MB lines (capacity 8GB)
+     mblock size =          20 bits
+     entries     =     8192 13 bits
+     line size   =           0 bits (1 bit of value)
+     tag size    =          15 bits
+                           = 48 bits
+
+   [2]
+   32KB cache of 16-bit entries, 4MB lines (capacity 32GB)
+     mblock size =          20 bits
+     entries     =    16384 14 bits
+     line size   =           2 bits (4 bits of value)
+     tag size    =          12 bits
+                           = 48 bits
+
+   [3]
+   16KB cache of 16-bit entries, 2MB lines (capacity 16GB)
+     mblock size =          20 bits
+     entries     =     8192 13 bits
+     line size   =           1 bits (2 bits of value)
+     tag size    =          14 bits
+                           = 48 bits
+
+   [4]
+   4KB cache of 32-bit entries, 16MB lines (capacity 16GB)
+     mblock size =          20 bits
+     entries     =     1024 10 bits
+     line size   =           4 bits (16 bits of value)
+     tag size    =          14 bits
+                           = 48 bits
+
+   [5]
+   4KB cache of 64-bit entries, 32MB lines (capacity 16GB)
+     mblock size =          20 bits
+     entries     =      512  9 bits
+     line size   =           5 bits (32 bits of value)
+     tag size    =          14 bits
+                           = 48 bits
+
+   We actually use none of the above.  After much experimentation it was
+   found that optimising the lookup is the most important factor,
+   followed by reducing the number of misses.  To that end, we use a
+   variant of [1] in which each cache entry is ((mblock << 1) + value)
+   where value is 0 for non-heap and 1 for heap.  The cache entries can
+   be 32 bits, since the mblock number is 48-20 = 28 bits, and we need
+   1 bit for the value.  The cache can be as big as we like, but
+   currently we use 8k entries, giving us 8GB capacity.
+
+   ---------------------------------------------------------------------------- */
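
The layout options above all split a 48-bit address into mblock offset,
cache index, line offset, and tag.  For layout [1] the arithmetic can be
checked mechanically; here is a small C11 sketch with invented DEMO_*
constants mirroring the numbers in the comment:

    #include <assert.h>

    #define DEMO_MBLOCK_BITS 20   /* 1MB mblocks               */
    #define DEMO_ENTRY_BITS  13   /* 8192 cache entries        */
    #define DEMO_LINE_BITS    0   /* 1 mblock per line         */
    #define DEMO_TAG_BITS    15   /* tag bits stored per entry */

    /* The four fields partition a 48-bit address. */
    static_assert(DEMO_MBLOCK_BITS + DEMO_ENTRY_BITS
                  + DEMO_LINE_BITS + DEMO_TAG_BITS == 48,
                  "fields cover 48 bits");

    /* capacity = entries * mblocks-per-line * mblock size
     *          = 8192 * 1 * 1MB = 8GB                       */
    static_assert((1ULL << (DEMO_ENTRY_BITS + DEMO_LINE_BITS + DEMO_MBLOCK_BITS))
                  == (8ULL << 30),
                  "8GB capacity");

The same two identities explain the MBC_SHIFT and MBC_ENTRIES macros in
the code that follows: the entry count is whatever is left of the 48-bit
address once the mblock offset, line offset, and tag bits are spoken for.
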
 
 #elif SIZEOF_VOID_P == 8
 
-# define MBLOCK_MAP_SIZE 4096
-# define MBLOCK_MAP_ENTRY(p) (((StgWord)(p) & 0xffffffff) >> MBLOCK_SHIFT)
+#define MBC_LINE_BITS 0
+#define MBC_TAG_BITS 15
 
+typedef StgWord32 MbcCacheLine; // could use 16, but 32 was faster
+typedef StgWord8  MBlockMapLine;
 
-typedef struct {
-    StgWord32 addrHigh32;
-    StgWord8 mblocks[MBLOCK_MAP_SIZE];
-} MBlockMap;
+#define MBLOCK_MAP_LINE(p) (((StgWord)p & 0xffffffff) >> (MBLOCK_SHIFT + MBC_LINE_BITS))
 
-extern MBlockMap *mblock_cache;
+#define MBC_LINE_SIZE (1<<MBC_LINE_BITS)
+#define MBC_SHIFT (48 - MBLOCK_SHIFT - MBC_LINE_BITS - MBC_TAG_BITS)
+#define MBC_ENTRIES (1<<MBC_SHIFT)
+
+extern MbcCacheLine mblock_cache[];
+
+#define MBC_LINE(p) ((StgWord)p >> (MBLOCK_SHIFT + MBC_LINE_BITS))
+
+#define MBLOCK_MAP_ENTRIES (1 << (32 - MBLOCK_SHIFT - MBC_LINE_BITS))
+
+typedef struct {
+    StgWord32 addrHigh32;
+    MBlockMapLine lines[MBLOCK_MAP_ENTRIES];
+} MBlockMap;
 
-# define HEAP_ALLOCED(p) \
-  ( ((((StgWord)(p)) >> 32) == mblock_cache->addrHigh32) \
-  ? mblock_cache->mblocks[MBLOCK_MAP_ENTRY(p)] \
-  : slowIsHeapAlloced(p) )
+extern lnat mpc_misses;
+
+StgBool HEAP_ALLOCED_miss(StgWord mblock, void *p);
+
+INLINE_HEADER
+StgBool HEAP_ALLOCED(void *p)
+{
+    StgWord mblock;
+    nat entry_no;
+    MbcCacheLine entry, value;
+
+    mblock   = (StgWord)p >> MBLOCK_SHIFT;
+    entry_no = mblock & (MBC_ENTRIES-1);
+    entry    = mblock_cache[entry_no];
+    value    = entry ^ (mblock << 1);
+    // this formulation coaxes gcc into prioritising the value==1
+    // case, which we expect to be the most common.
+    // __builtin_expect() didn't have any useful effect (gcc-4.3.0).
+    if (value == 1) {
+        return 1;
+    } else if (value == 0) {
+        return 0;
+    } else {
+        // putting the rest out of line turned out to be a slight
+        // performance improvement:
+        return HEAP_ALLOCED_miss(mblock,p);
+    }
+}
+
+// In the parallel GC, the cache itself is safe to *read*, and can be
+// updated atomically, but we need to place a lock around operations
+// that touch the MBlock map.
+INLINE_HEADER
+StgBool HEAP_ALLOCED_GC(void *p)
+{
+    StgWord mblock;
+    nat entry_no;
+    MbcCacheLine entry, value;
+    StgBool b;
+
+    mblock   = (StgWord)p >> MBLOCK_SHIFT;
+    entry_no = mblock & (MBC_ENTRIES-1);
+    entry    = mblock_cache[entry_no];
+    value    = entry ^ (mblock << 1);
+    if (value == 1) {
+        return 1;
+    } else if (value == 0) {
+        return 0;
+    } else {
+        // putting the rest out of line turned out to be a slight
+        // performance improvement:
+        ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
+        b = HEAP_ALLOCED_miss(mblock,p);
+        RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
+        return b;
+    }
+}
 
 #else
 # error HEAP_ALLOCED not defined
diff --git a/rts/sm/Scav.c b/rts/sm/Scav.c
index 732ad15..34096d4 100644
--- a/rts/sm/Scav.c
+++ b/rts/sm/Scav.c
@@ -1372,7 +1372,7 @@ scavenge_one(StgPtr p)
 	 * evacuated, so we perform that check here.
 	 */
 	StgClosure *q = ((StgInd *)p)->indirectee;
-	if (HEAP_ALLOCED(q) && Bdescr((StgPtr)q)->flags & BF_EVACUATED) {
+	if (HEAP_ALLOCED_GC(q) && Bdescr((StgPtr)q)->flags & BF_EVACUATED) {
 	    break;
 	}
 	evacuate(&((StgInd *)p)->indirectee);
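
Putting the two earlier sketches together, the complete lookup is: probe
the cache, and on a tag mismatch consult the map and refill the slot.  A
hedged sketch reusing the demo_* helpers from above (note that the real
HEAP_ALLOCED deliberately tests value == 1 and value == 0 as separate
branches to get better code out of gcc; the value <= 1 form here is only
for brevity):

    static int demo_heap_alloced(void *p)
    {
        uintptr_t mblock = (uintptr_t)p >> DEMO_MBLOCK_SHIFT;
        unsigned  slot   = (unsigned)(mblock & (DEMO_MBC_ENTRIES - 1));
        uint32_t  value  = demo_cache[slot] ^ (uint32_t)(mblock << 1);

        if (value <= 1)
            return (int)value;              /* cache hit: 0 or 1 */

        /* Slow path: walk the per-4GB maps, then refill the cache slot
         * with the tagged entry so the next probe for this mblock hits. */
        value = (uint32_t)demo_map_lookup(p);
        demo_cache[slot] = ((uint32_t)(mblock << 1)) | value;
        return (int)value;
    }

Because each entry is a single aligned 32-bit word carrying both tag and
value, a racing update can never be observed half-written: a reader sees
either the old word or the new word, and a stale word simply XORs to a
miss.  That is the property the commit message refers to when it says
the formulation has no race conditions when used concurrently.
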