X-Git-Url: http://git.megacz.com/?a=blobdiff_plain;f=rts%2Fsm%2FMBlock.h;h=f9dddc3138d190461f146bd9619b20263da99e22;hb=5d379cbe65e406d5c3a848fe7fcd090cafbfeb78;hp=ef6f8de33a98436c75d2f3311a9fc4d44bff19dd;hpb=3a5ac6160f9237a49c2dbc669a8dbb26555ce3d4;p=ghc-hetmet.git

diff --git a/rts/sm/MBlock.h b/rts/sm/MBlock.h
index ef6f8de..f9dddc3 100644
--- a/rts/sm/MBlock.h
+++ b/rts/sm/MBlock.h
@@ -12,6 +12,8 @@
 #ifndef MBLOCK_H
 #define MBLOCK_H
 
+#include "GC.h"
+
 extern lnat RTS_VAR(mblocks_allocated);
 
 extern void initMBlocks(void);
@@ -59,25 +61,145 @@ extern StgWord8 mblock_map[];
 # define MBLOCK_MAP_SIZE 4096
 # define MBLOCK_MAP_ENTRY(p) ((StgWord)(p) >> MBLOCK_SHIFT)
 # define HEAP_ALLOCED(p) mblock_map[MBLOCK_MAP_ENTRY(p)]
+# define HEAP_ALLOCED_GC(p) HEAP_ALLOCED(p)
+
+/* -----------------------------------------------------------------------------
+   HEAP_ALLOCED for 64-bit machines.
+
+ Here are some cache layout options:
+
+ [1]
+ 16KB cache of 16-bit entries, 1MB lines (capacity 8GB)
+    mblock size =        20 bits
+    entries = 8192       13 bits
+    line size =           0 bits (1 bit of value)
+    tag size =           15 bits
+                       = 48 bits
+
+ [2]
+ 32KB cache of 16-bit entries, 4MB lines (capacity 32GB)
+    mblock size =        20 bits
+    entries = 16384      14 bits
+    line size =           2 bits (4 bits of value)
+    tag size =           12 bits
+                       = 48 bits
+
+ [3]
+ 16KB cache of 16-bit entries, 2MB lines (capacity 16GB)
+    mblock size =        20 bits
+    entries = 8192       13 bits
+    line size =           1 bits (2 bits of value)
+    tag size =           14 bits
+                       = 48 bits
+
+ [4]
+ 4KB cache of 32-bit entries, 16MB lines (capacity 16GB)
+    mblock size =        20 bits
+    entries = 1024       10 bits
+    line size =           4 bits (16 bits of value)
+    tag size =           14 bits
+                       = 48 bits
+
+ [5]
+ 4KB cache of 64-bit entries, 32MB lines (capacity 16GB)
+    mblock size =        20 bits
+    entries = 512         9 bits
+    line size =           5 bits (32 bits of value)
+    tag size =           14 bits
+                       = 48 bits
+
+ We actually use none of the above.  After much experimentation it was
+ found that optimising the lookup is the most important factor,
+ followed by reducing the number of misses.  To that end, we use a
+ variant of [1] in which each cache entry is ((mblock << 1) + value),
+ where value is 0 for non-heap and 1 for heap.  The cache entries can
+ be 32 bits, since the mblock number is 48-20 = 28 bits, and we need
+ 1 bit for the value.  The cache can be as big as we like, but
+ currently we use 8k entries, giving us 8GB capacity.
+
+ ---------------------------------------------------------------------------- */
 
 #elif SIZEOF_VOID_P == 8
 
-# define MBLOCK_MAP_SIZE 4096
-# define MBLOCK_MAP_ENTRY(p) (((StgWord)(p) & 0xffffffff) >> MBLOCK_SHIFT)
+#define MBC_LINE_BITS 0
+#define MBC_TAG_BITS 15
+typedef StgWord32 MbcCacheLine;  // could use 16, but 32 was faster
+typedef StgWord8  MBlockMapLine;
 
-typedef struct {
-    StgWord32 addrHigh32;
-    StgWord8  mblocks[MBLOCK_MAP_SIZE];
-} MBlockMap;
+#define MBLOCK_MAP_LINE(p)  (((StgWord)p & 0xffffffff) >> (MBLOCK_SHIFT + MBC_LINE_BITS))
 
-extern MBlockMap *mblock_cache;
+#define MBC_LINE_SIZE  (1<<MBC_LINE_BITS)
+#define MBC_SHIFT      (48 - MBLOCK_SHIFT - MBC_LINE_BITS - MBC_TAG_BITS)
+#define MBC_ENTRIES    (1<<MBC_SHIFT)
+
+extern MbcCacheLine mblock_cache[];
+
+#define MBC_LINE(p)    ((StgWord)p >> (MBLOCK_SHIFT + MBC_LINE_BITS))
+
+#define MBLOCK_MAP_ENTRIES  (1 << (32 - MBLOCK_SHIFT - MBC_LINE_BITS))
+
+typedef struct {
+    StgWord32     addrHigh32;
+    MBlockMapLine lines[MBLOCK_MAP_ENTRIES];
+} MBlockMap;
 
-# define HEAP_ALLOCED(p) \
-    ( ((((StgWord)(p)) >> 32) == mblock_cache->addrHigh32)        \
-     ? mblock_cache->mblocks[MBLOCK_MAP_ENTRY(p)]                 \
-     : slowIsHeapAlloced(p) )
+extern lnat mpc_misses;
+
+StgBool HEAP_ALLOCED_miss(StgWord mblock, void *p);
+
+INLINE_HEADER
+StgBool HEAP_ALLOCED(void *p)
+{
+    StgWord mblock;
+    nat entry_no;
+    MbcCacheLine entry, value;
+
+    mblock   = (StgWord)p >> MBLOCK_SHIFT;
+    entry_no = mblock & (MBC_ENTRIES-1);
+    entry    = mblock_cache[entry_no];
+    value    = entry ^ (mblock << 1);
+    // this formulation coaxes gcc into prioritising the value==1
+    // case, which we expect to be the most common.
+    // __builtin_expect() didn't have any useful effect (gcc-4.3.0).
+    if (value == 1) {
+        return 1;
+    } else if (value == 0) {
+        return 0;
+    } else {
+        // putting the rest out of line turned out to be a slight
+        // performance improvement:
+        return HEAP_ALLOCED_miss(mblock,p);
+    }
+}
+
+// In the parallel GC, the cache itself is safe to *read*, and can be
+// updated atomically, but we need to place a lock around operations
+// that touch the MBlock map.
+INLINE_HEADER
+StgBool HEAP_ALLOCED_GC(void *p)
+{
+    StgWord mblock;
+    nat entry_no;
+    MbcCacheLine entry, value;
+    StgBool b;
+
+    mblock   = (StgWord)p >> MBLOCK_SHIFT;
+    entry_no = mblock & (MBC_ENTRIES-1);
+    entry    = mblock_cache[entry_no];
+    value    = entry ^ (mblock << 1);
+    if (value == 1) {
+        return 1;
+    } else if (value == 0) {
+        return 0;
+    } else {
+        // putting the rest out of line turned out to be a slight
+        // performance improvement:
+        ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
+        b = HEAP_ALLOCED_miss(mblock,p);
+        RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
+        return b;
+    }
+}
 
 #else
 # error HEAP_ALLOCED not defined
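
For illustration, here is a minimal standalone sketch of the ((mblock << 1) + value)
cache-entry encoding that the new HEAP_ALLOCED relies on. It is a sketch under
stated assumptions, not the RTS implementation: the *_demo names are hypothetical
stand-ins for MBLOCK_SHIFT, MBC_ENTRIES and mblock_cache, and it assumes the
48-bit address space and 1MB (20-bit) megablocks described in the comment above.

    #include <stdio.h>
    #include <stdint.h>

    #define MBLOCK_SHIFT_DEMO 20           /* 1MB megablocks, as in the RTS */
    #define MBC_ENTRIES_DEMO  (1 << 13)    /* 8k entries => 8GB capacity */

    static uint32_t cache_demo[MBC_ENTRIES_DEMO];

    /* Record whether an mblock is part of the heap (is_heap = 1) or not (0).
       The entry packs the 28-bit mblock number as a tag in the high bits,
       with the value in bit 0. */
    static void cache_insert_demo(uint64_t addr, uint32_t is_heap)
    {
        uint64_t mblock = addr >> MBLOCK_SHIFT_DEMO;
        cache_demo[mblock & (MBC_ENTRIES_DEMO - 1)] =
            (uint32_t)(mblock << 1) | is_heap;
    }

    /* Returns 1 (heap), 0 (not heap), or -1 (miss: the tag bits differ,
       where the real code falls through to HEAP_ALLOCED_miss()). */
    static int cache_lookup_demo(uint64_t addr)
    {
        uint64_t mblock = addr >> MBLOCK_SHIFT_DEMO;
        uint32_t entry  = cache_demo[mblock & (MBC_ENTRIES_DEMO - 1)];
        /* XOR cancels the tag on a hit, leaving only the value bit, so a
           single comparison distinguishes hit-1, hit-0 and miss. */
        uint32_t value  = entry ^ (uint32_t)(mblock << 1);
        return (value <= 1) ? (int)value : -1;
    }

    int main(void)
    {
        cache_insert_demo(0x2aaaab00000ull, 1);               /* mark one mblock as heap */
        printf("%d\n", cache_lookup_demo(0x2aaaab01234ull));  /* same mblock: 1 */
        printf("%d\n", cache_lookup_demo(0x7fffff00000ull));  /* untouched slot: -1 */
        return 0;
    }

The XOR trick is the point of the design: on a hit the tag bits cancel and only
the low value bit survives, so the common case costs one load, one XOR and a
compare. That matches the remark in the comment that optimising the lookup
mattered more than reducing the miss rate.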