rts/sm/MBlock.h

   1 /* -----------------------------------------------------------------------------
   2  *
   3  * (c) The GHC Team, 1998-2008
   4  *
   5  * MegaBlock Allocator interface.
   6  *
   7  * See wiki commentary at
   8  *  http://hackage.haskell.org/trac/ghc/wiki/Commentary/HeapAlloced
   9  *
  10  * ---------------------------------------------------------------------------*/
  11
  12 #ifndef MBLOCK_H
  13 #define MBLOCK_H
  14
  15 #include "GC.h"
  16
  17 extern lnat RTS_VAR(mblocks_allocated);
  18
  19 extern void initMBlocks(void);
  20 extern void * getMBlock(void);
  21 extern void * getMBlocks(nat n);
  22 extern void freeAllMBlocks(void);
  23
  24 #ifdef DEBUG
  25 extern void *getFirstMBlock(void);
  26 extern void *getNextMBlock(void *mblock);
  27 #endif
  28
  29 /* -----------------------------------------------------------------------------
  30    The HEAP_ALLOCED() test.
  31
  32    HEAP_ALLOCED is called FOR EVERY SINGLE CLOSURE during GC.
  33    It needs to be FAST.
  34
  35    See wiki commentary at
  36      http://hackage.haskell.org/trac/ghc/wiki/Commentary/HeapAlloced
  37
  38    Implementation of HEAP_ALLOCED
  39    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  40
  41    Since heap is allocated in chunks of megablocks (MBLOCK_SIZE), we
  42    can just use a table to record which megablocks in the address
  43    space belong to the heap.  On a 32-bit machine, with 1Mb
  44    megablocks, using 8 bits for each entry in the table, the table
  45    requires 4k.  Lookups during GC will be fast, because the table
  46    will be quickly cached (indeed, performance measurements showed no
  47    measurable difference between doing the table lookup and using a
  48    constant comparison).
  49
  50    On 64-bit machines, we cache one 12-bit block map that describes
  51    4096 megablocks or 4GB of memory. If HEAP_ALLOCED is called for
  52    an address that is not in the cache, it calls slowIsHeapAlloced
  53    (see MBlock.c) which will find the block map for the 4GB block in
  54    question.
  55    -------------------------------------------------------------------------- */
  56
  57 #if SIZEOF_VOID_P == 4
  58 extern StgWord8 mblock_map[];
  59
  60 /* On a 32-bit machine a 4KB table is always sufficient */
  61 # define MBLOCK_MAP_SIZE        4096
  62 # define MBLOCK_MAP_ENTRY(p)    ((StgWord)(p) >> MBLOCK_SHIFT)
  63 # define HEAP_ALLOCED(p)        mblock_map[MBLOCK_MAP_ENTRY(p)]
  64 # define HEAP_ALLOCED_GC(p)     HEAP_ALLOCED(p)
  65
  66 /* -----------------------------------------------------------------------------
  67    HEAP_ALLOCED for 64-bit machines.
  68
  69  Here are some cache layout options:
  70
  71  [1]
  72  16KB cache of 16-bit entries, 1MB lines (capacity 8GB)
  73   mblock size =          20 bits
  74   entries   =     8192   13 bits
  75   line size =             0 bits (1 bit of value)
  76   tag size  =            15 bits
  77                        = 48 bits
  78
  79  [2]
  80  32KB cache of 16-bit entries, 4MB lines (capacity 32GB)
  81   mblock size =          20 bits
  82   entries   =    16384   14 bits
  83   line size =             2 bits (4 bits of value)
  84   tag size  =            12 bits
  85                        = 48 bits
  86
  87  [3]
  88  16KB cache of 16-bit entries, 2MB lines (capacity 16GB)
  89   mblock size =          20 bits
  90   entries   =    8192    13 bits
  91   line size =             1 bits (2 bits of value)
  92   tag size  =            14 bits
  93                        = 48 bits
  94
  95  [4]
  96  4KB cache of 32-bit entries, 16MB lines (capacity 16GB)
  97   mblock size =          20 bits
  98   entries   =     1024   10 bits
  99   line size =             4 bits (16 bits of value)
 100   tag size  =            14 bits
 101                        = 48 bits
 102
 103  [5]
 104  4KB cache of 64-bit entries, 32MB lines (capacity 16GB)
 105   mblock size =          20 bits
 106   entries   =     512     9 bits
 107   line size =             5 bits (32 bits of value)
 108   tag size  =            14 bits
 109                        = 48 bits
 110
 111  We actually use none of the above.  After much experimentation it was
 112  found that optimising the lookup is the most important factor,
 113  followed by reducing the number of misses.  To that end, we use a
 114  variant of [1] in which each cache entry is ((mblock << 1) + value)
 115  where value is 0 for non-heap and 1 for heap.  The cache entries can
 116  be 32 bits, since the mblock number is 48-20 = 28 bits, and we need
 117  1 bit for the value.  The cache can be as big as we like, but
 118  currently we use 8k entries, giving us 8GB capacity.
 119
 120  ---------------------------------------------------------------------------- */
 121
 122 #elif SIZEOF_VOID_P == 8
 123
 124 #define MBC_LINE_BITS 0
 125 #define MBC_TAG_BITS 15
 126 typedef StgWord32 MbcCacheLine;  // could use 16, but 32 was faster
 127 typedef StgWord8  MBlockMapLine;
 128
 129 #define MBLOCK_MAP_LINE(p)  (((StgWord)p & 0xffffffff) >> (MBLOCK_SHIFT + MBC_LINE_BITS))
 130
 131 #define MBC_LINE_SIZE  (1<<MBC_LINE_BITS)
 132 #define MBC_SHIFT      (48 - MBLOCK_SHIFT - MBC_LINE_BITS - MBC_TAG_BITS)
 133 #define MBC_ENTRIES    (1<<MBC_SHIFT)
 134
 135 extern MbcCacheLine mblock_cache[];
 136
 137 #define MBC_LINE(p) ((StgWord)p >> (MBLOCK_SHIFT + MBC_LINE_BITS))
 138
 139 #define MBLOCK_MAP_ENTRIES  (1 << (32 - MBLOCK_SHIFT - MBC_LINE_BITS))
 140
 141 typedef struct {
 142     StgWord32    addrHigh32;
 143     MBlockMapLine lines[MBLOCK_MAP_ENTRIES];
 144 } MBlockMap;
 145
 146 extern lnat mpc_misses;
 147
 148 StgBool HEAP_ALLOCED_miss(StgWord mblock, void *p);
 149
 150 INLINE_HEADER
 151 StgBool HEAP_ALLOCED(void *p)
 152 {
 153     StgWord mblock;
 154     nat entry_no;
 155     MbcCacheLine entry, value;
 156
 157     mblock   = (StgWord)p >> MBLOCK_SHIFT;
 158     entry_no = mblock & (MBC_ENTRIES-1);
 159     entry    = mblock_cache[entry_no];
 160     value    = entry ^ (mblock << 1);
 161     // this formulation coaxes gcc into prioritising the value==1
 162     // case, which we expect to be the most common.
 163     // __builtin_expect() didn't have any useful effect (gcc-4.3.0).
 164     if (value == 1) {
 165         return 1;
 166     } else if (value == 0) {
 167         return 0;
 168     } else {
 169         // putting the rest out of line turned out to be a slight
 170         // performance improvement:
 171         return HEAP_ALLOCED_miss(mblock,p);
 172     }
 173 }
 174
 175 // In the parallel GC, the cache itself is safe to *read*, and can be
 176 // updated atomically, but we need to place a lock around operations
 177 // that touch the MBlock map.
 178 INLINE_HEADER
 179 StgBool HEAP_ALLOCED_GC(void *p)
 180 {
 181     StgWord mblock;
 182     nat entry_no;
 183     MbcCacheLine entry, value;
 184     StgBool b;
 185
 186     mblock   = (StgWord)p >> MBLOCK_SHIFT;
 187     entry_no = mblock & (MBC_ENTRIES-1);
 188     entry    = mblock_cache[entry_no];
 189     value    = entry ^ (mblock << 1);
 190     if (value == 1) {
 191         return 1;
 192     } else if (value == 0) {
 193         return 0;
 194     } else {
 195         // putting the rest out of line turned out to be a slight
 196         // performance improvement:
 197         ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
 198         b = HEAP_ALLOCED_miss(mblock,p);
 199         RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
 200         return b;
 201     }
 202 }
 203
 204 #else
 205 # error HEAP_ALLOCED not defined
 206 #endif
 207
 208 #endif /* MBLOCK_H */