includes/rts/storage/MBlock.h

   1 /* -----------------------------------------------------------------------------
   2  *
   3  * (c) The GHC Team, 1998-2008
   4  *
   5  * MegaBlock Allocator interface.
   6  *
   7  * See wiki commentary at
   8  *  http://hackage.haskell.org/trac/ghc/wiki/Commentary/HeapAlloced
   9  *
  10  * ---------------------------------------------------------------------------*/
  11
  12 #ifndef RTS_STORAGE_MBLOCK_H
  13 #define RTS_STORAGE_MBLOCK_H
  14
  15 extern lnat mblocks_allocated;
  16
  17 extern void initMBlocks(void);
  18 extern void * getMBlock(void);
  19 extern void * getMBlocks(nat n);
  20 extern void freeAllMBlocks(void);
  21
  22 #ifdef DEBUG
  23 extern void *getFirstMBlock(void);
  24 extern void *getNextMBlock(void *mblock);
  25 #endif
  26
  27 #ifdef THREADED_RTS
  28 // needed for HEAP_ALLOCED below
  29 extern SpinLock gc_alloc_block_sync;
  30 #endif
  31
  32 /* -----------------------------------------------------------------------------
  33    The HEAP_ALLOCED() test.
  34
  35    HEAP_ALLOCED is called FOR EVERY SINGLE CLOSURE during GC.
  36    It needs to be FAST.
  37
  38    See wiki commentary at
  39      http://hackage.haskell.org/trac/ghc/wiki/Commentary/HeapAlloced
  40
  41    Implementation of HEAP_ALLOCED
  42    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  43
  44    Since heap is allocated in chunks of megablocks (MBLOCK_SIZE), we
  45    can just use a table to record which megablocks in the address
  46    space belong to the heap.  On a 32-bit machine, with 1Mb
  47    megablocks, using 8 bits for each entry in the table, the table
  48    requires 4k.  Lookups during GC will be fast, because the table
  49    will be quickly cached (indeed, performance measurements showed no
  50    measurable difference between doing the table lookup and using a
  51    constant comparison).
  52
  53    On 64-bit machines, we cache one 12-bit block map that describes
  54    4096 megablocks or 4GB of memory. If HEAP_ALLOCED is called for
  55    an address that is not in the cache, it calls slowIsHeapAlloced
  56    (see MBlock.c) which will find the block map for the 4GB block in
  57    question.
  58    -------------------------------------------------------------------------- */
  59
  60 #if SIZEOF_VOID_P == 4
  61 extern StgWord8 mblock_map[];
  62
  63 /* On a 32-bit machine a 4KB table is always sufficient */
  64 # define MBLOCK_MAP_SIZE        4096
  65 # define MBLOCK_MAP_ENTRY(p)    ((StgWord)(p) >> MBLOCK_SHIFT)
  66 # define HEAP_ALLOCED(p)        mblock_map[MBLOCK_MAP_ENTRY(p)]
  67 # define HEAP_ALLOCED_GC(p)     HEAP_ALLOCED(p)
  68
  69 /* -----------------------------------------------------------------------------
  70    HEAP_ALLOCED for 64-bit machines.
  71
  72  Here are some cache layout options:
  73
  74  [1]
  75  16KB cache of 16-bit entries, 1MB lines (capacity 8GB)
  76   mblock size =          20 bits
  77   entries   =     8192   13 bits
  78   line size =             0 bits (1 bit of value)
  79   tag size  =            15 bits
  80                        = 48 bits
  81
  82  [2]
  83  32KB cache of 16-bit entries, 4MB lines (capacity 32GB)
  84   mblock size =          20 bits
  85   entries   =    16384   14 bits
  86   line size =             2 bits (4 bits of value)
  87   tag size  =            12 bits
  88                        = 48 bits
  89
  90  [3]
  91  16KB cache of 16-bit entries, 2MB lines (capacity 16GB)
  92   mblock size =          20 bits
  93   entries   =    8192    13 bits
  94   line size =             1 bits (2 bits of value)
  95   tag size  =            14 bits
  96                        = 48 bits
  97
  98  [4]
  99  4KB cache of 32-bit entries, 16MB lines (capacity 16GB)
 100   mblock size =          20 bits
 101   entries   =     1024   10 bits
 102   line size =             4 bits (16 bits of value)
 103   tag size  =            14 bits
 104                        = 48 bits
 105
 106  [5]
 107  4KB cache of 64-bit entries, 32MB lines (capacity 16GB)
 108   mblock size =          20 bits
 109   entries   =     512     9 bits
 110   line size =             5 bits (32 bits of value)
 111   tag size  =            14 bits
 112                        = 48 bits
 113
 114  We actually use none of the above.  After much experimentation it was
 115  found that optimising the lookup is the most important factor,
 116  followed by reducing the number of misses.  To that end, we use a
 117  variant of [1] in which each cache entry is ((mblock << 1) + value)
 118  where value is 0 for non-heap and 1 for heap.  The cache entries can
 119  be 32 bits, since the mblock number is 48-20 = 28 bits, and we need
 120  1 bit for the value.  The cache can be as big as we like, but
 121  currently we use 8k entries, giving us 8GB capacity.
 122
 123  ---------------------------------------------------------------------------- */
 124
 125 #elif SIZEOF_VOID_P == 8
 126
 127 #define MBC_LINE_BITS 0
 128 #define MBC_TAG_BITS 15
 129
 130 #if x86_64_HOST_ARCH
 131 // 32bits are enough for 'entry' as modern amd64 boxes have
 132 // only 48bit sized virtual addres.
 133 typedef StgWord32 MbcCacheLine;
 134 #else
 135 // 32bits is not enough here as some arches (like ia64) use
 136 // upper address bits to distinct memory areas.
 137 typedef StgWord64 MbcCacheLine;
 138 #endif
 139
 140 typedef StgWord8  MBlockMapLine;
 141
 142 #define MBLOCK_MAP_LINE(p)  (((StgWord)p & 0xffffffff) >> (MBLOCK_SHIFT + MBC_LINE_BITS))
 143
 144 #define MBC_LINE_SIZE  (1<<MBC_LINE_BITS)
 145 #define MBC_SHIFT      (48 - MBLOCK_SHIFT - MBC_LINE_BITS - MBC_TAG_BITS)
 146 #define MBC_ENTRIES    (1<<MBC_SHIFT)
 147
 148 extern MbcCacheLine mblock_cache[];
 149
 150 #define MBC_LINE(p) ((StgWord)p >> (MBLOCK_SHIFT + MBC_LINE_BITS))
 151
 152 #define MBLOCK_MAP_ENTRIES  (1 << (32 - MBLOCK_SHIFT - MBC_LINE_BITS))
 153
 154 typedef struct {
 155     StgWord32    addrHigh32;
 156     MBlockMapLine lines[MBLOCK_MAP_ENTRIES];
 157 } MBlockMap;
 158
 159 extern lnat mpc_misses;
 160
 161 StgBool HEAP_ALLOCED_miss(StgWord mblock, void *p);
 162
 163 INLINE_HEADER
 164 StgBool HEAP_ALLOCED(void *p)
 165 {
 166     StgWord mblock;
 167     nat entry_no;
 168     MbcCacheLine entry, value;
 169
 170     mblock   = (StgWord)p >> MBLOCK_SHIFT;
 171     entry_no = mblock & (MBC_ENTRIES-1);
 172     entry    = mblock_cache[entry_no];
 173     value    = entry ^ (mblock << 1);
 174     // this formulation coaxes gcc into prioritising the value==1
 175     // case, which we expect to be the most common.
 176     // __builtin_expect() didn't have any useful effect (gcc-4.3.0).
 177     if (value == 1) {
 178         return 1;
 179     } else if (value == 0) {
 180         return 0;
 181     } else {
 182         // putting the rest out of line turned out to be a slight
 183         // performance improvement:
 184         return HEAP_ALLOCED_miss(mblock,p);
 185     }
 186 }
 187
 188 // In the parallel GC, the cache itself is safe to *read*, and can be
 189 // updated atomically, but we need to place a lock around operations
 190 // that touch the MBlock map.
 191 INLINE_HEADER
 192 StgBool HEAP_ALLOCED_GC(void *p)
 193 {
 194     StgWord mblock;
 195     nat entry_no;
 196     MbcCacheLine entry, value;
 197     StgBool b;
 198
 199     mblock   = (StgWord)p >> MBLOCK_SHIFT;
 200     entry_no = mblock & (MBC_ENTRIES-1);
 201     entry    = mblock_cache[entry_no];
 202     value    = entry ^ (mblock << 1);
 203     if (value == 1) {
 204         return 1;
 205     } else if (value == 0) {
 206         return 0;
 207     } else {
 208         // putting the rest out of line turned out to be a slight
 209         // performance improvement:
 210         ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
 211         b = HEAP_ALLOCED_miss(mblock,p);
 212         RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
 213         return b;
 214     }
 215 }
 216
 217 #else
 218 # error HEAP_ALLOCED not defined
 219 #endif
 220
 221 #endif /* RTS_STORAGE_MBLOCK_H */