includes/rts/storage/MBlock.h

   1 /* -----------------------------------------------------------------------------
   2  *
   3  * (c) The GHC Team, 1998-2008
   4  *
   5  * MegaBlock Allocator interface.
   6  *
   7  * See wiki commentary at
   8  *  http://hackage.haskell.org/trac/ghc/wiki/Commentary/HeapAlloced
   9  *
  10  * ---------------------------------------------------------------------------*/
  11
  12 #ifndef RTS_STORAGE_MBLOCK_H
  13 #define RTS_STORAGE_MBLOCK_H
  14
  15 extern lnat mblocks_allocated;
  16
  17 extern void initMBlocks(void);
  18 extern void * getMBlock(void);
  19 extern void * getMBlocks(nat n);
  20 extern void freeAllMBlocks(void);
  21
  22 #ifdef DEBUG
  23 extern void *getFirstMBlock(void);
  24 extern void *getNextMBlock(void *mblock);
  25 #endif
  26
  27 /* -----------------------------------------------------------------------------
  28    The HEAP_ALLOCED() test.
  29
  30    HEAP_ALLOCED is called FOR EVERY SINGLE CLOSURE during GC.
  31    It needs to be FAST.
  32
  33    See wiki commentary at
  34      http://hackage.haskell.org/trac/ghc/wiki/Commentary/HeapAlloced
  35
  36    Implementation of HEAP_ALLOCED
  37    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  38
  39    Since heap is allocated in chunks of megablocks (MBLOCK_SIZE), we
  40    can just use a table to record which megablocks in the address
  41    space belong to the heap.  On a 32-bit machine, with 1Mb
  42    megablocks, using 8 bits for each entry in the table, the table
  43    requires 4k.  Lookups during GC will be fast, because the table
  44    will be quickly cached (indeed, performance measurements showed no
  45    measurable difference between doing the table lookup and using a
  46    constant comparison).
  47
  48    On 64-bit machines, we cache one 12-bit block map that describes
  49    4096 megablocks or 4GB of memory. If HEAP_ALLOCED is called for
  50    an address that is not in the cache, it calls slowIsHeapAlloced
  51    (see MBlock.c) which will find the block map for the 4GB block in
  52    question.
  53    -------------------------------------------------------------------------- */
  54
  55 #if SIZEOF_VOID_P == 4
  56 extern StgWord8 mblock_map[];
  57
  58 /* On a 32-bit machine a 4KB table is always sufficient */
  59 # define MBLOCK_MAP_SIZE        4096
  60 # define MBLOCK_MAP_ENTRY(p)    ((StgWord)(p) >> MBLOCK_SHIFT)
  61 # define HEAP_ALLOCED(p)        mblock_map[MBLOCK_MAP_ENTRY(p)]
  62 # define HEAP_ALLOCED_GC(p)     HEAP_ALLOCED(p)
  63
  64 /* -----------------------------------------------------------------------------
  65    HEAP_ALLOCED for 64-bit machines.
  66
  67  Here are some cache layout options:
  68
  69  [1]
  70  16KB cache of 16-bit entries, 1MB lines (capacity 8GB)
  71   mblock size =          20 bits
  72   entries   =     8192   13 bits
  73   line size =             0 bits (1 bit of value)
  74   tag size  =            15 bits
  75                        = 48 bits
  76
  77  [2]
  78  32KB cache of 16-bit entries, 4MB lines (capacity 32GB)
  79   mblock size =          20 bits
  80   entries   =    16384   14 bits
  81   line size =             2 bits (4 bits of value)
  82   tag size  =            12 bits
  83                        = 48 bits
  84
  85  [3]
  86  16KB cache of 16-bit entries, 2MB lines (capacity 16GB)
  87   mblock size =          20 bits
  88   entries   =    8192    13 bits
  89   line size =             1 bits (2 bits of value)
  90   tag size  =            14 bits
  91                        = 48 bits
  92
  93  [4]
  94  4KB cache of 32-bit entries, 16MB lines (capacity 16GB)
  95   mblock size =          20 bits
  96   entries   =     1024   10 bits
  97   line size =             4 bits (16 bits of value)
  98   tag size  =            14 bits
  99                        = 48 bits
 100
 101  [5]
 102  4KB cache of 64-bit entries, 32MB lines (capacity 16GB)
 103   mblock size =          20 bits
 104   entries   =     512     9 bits
 105   line size =             5 bits (32 bits of value)
 106   tag size  =            14 bits
 107                        = 48 bits
 108
 109  We actually use none of the above.  After much experimentation it was
 110  found that optimising the lookup is the most important factor,
 111  followed by reducing the number of misses.  To that end, we use a
 112  variant of [1] in which each cache entry is ((mblock << 1) + value)
 113  where value is 0 for non-heap and 1 for heap.  The cache entries can
 114  be 32 bits, since the mblock number is 48-20 = 28 bits, and we need
 115  1 bit for the value.  The cache can be as big as we like, but
 116  currently we use 8k entries, giving us 8GB capacity.
 117
 118  ---------------------------------------------------------------------------- */
 119
 120 #elif SIZEOF_VOID_P == 8
 121
 122 #define MBC_LINE_BITS 0
 123 #define MBC_TAG_BITS 15
 124 typedef StgWord32 MbcCacheLine;  // could use 16, but 32 was faster
 125 typedef StgWord8  MBlockMapLine;
 126
 127 #define MBLOCK_MAP_LINE(p)  (((StgWord)p & 0xffffffff) >> (MBLOCK_SHIFT + MBC_LINE_BITS))
 128
 129 #define MBC_LINE_SIZE  (1<<MBC_LINE_BITS)
 130 #define MBC_SHIFT      (48 - MBLOCK_SHIFT - MBC_LINE_BITS - MBC_TAG_BITS)
 131 #define MBC_ENTRIES    (1<<MBC_SHIFT)
 132
 133 extern MbcCacheLine mblock_cache[];
 134
 135 #define MBC_LINE(p) ((StgWord)p >> (MBLOCK_SHIFT + MBC_LINE_BITS))
 136
 137 #define MBLOCK_MAP_ENTRIES  (1 << (32 - MBLOCK_SHIFT - MBC_LINE_BITS))
 138
 139 typedef struct {
 140     StgWord32    addrHigh32;
 141     MBlockMapLine lines[MBLOCK_MAP_ENTRIES];
 142 } MBlockMap;
 143
 144 extern lnat mpc_misses;
 145
 146 #ifdef THREADED_RTS
 147 extern SpinLock gc_alloc_block_sync;
 148 #endif
 149
 150 StgBool HEAP_ALLOCED_miss(StgWord mblock, void *p);
 151
 152 INLINE_HEADER
 153 StgBool HEAP_ALLOCED(void *p)
 154 {
 155     StgWord mblock;
 156     nat entry_no;
 157     MbcCacheLine entry, value;
 158
 159     mblock   = (StgWord)p >> MBLOCK_SHIFT;
 160     entry_no = mblock & (MBC_ENTRIES-1);
 161     entry    = mblock_cache[entry_no];
 162     value    = entry ^ (mblock << 1);
 163     // this formulation coaxes gcc into prioritising the value==1
 164     // case, which we expect to be the most common.
 165     // __builtin_expect() didn't have any useful effect (gcc-4.3.0).
 166     if (value == 1) {
 167         return 1;
 168     } else if (value == 0) {
 169         return 0;
 170     } else {
 171         // putting the rest out of line turned out to be a slight
 172         // performance improvement:
 173         return HEAP_ALLOCED_miss(mblock,p);
 174     }
 175 }
 176
 177 // In the parallel GC, the cache itself is safe to *read*, and can be
 178 // updated atomically, but we need to place a lock around operations
 179 // that touch the MBlock map.
 180 INLINE_HEADER
 181 StgBool HEAP_ALLOCED_GC(void *p)
 182 {
 183     StgWord mblock;
 184     nat entry_no;
 185     MbcCacheLine entry, value;
 186     StgBool b;
 187
 188     mblock   = (StgWord)p >> MBLOCK_SHIFT;
 189     entry_no = mblock & (MBC_ENTRIES-1);
 190     entry    = mblock_cache[entry_no];
 191     value    = entry ^ (mblock << 1);
 192     if (value == 1) {
 193         return 1;
 194     } else if (value == 0) {
 195         return 0;
 196     } else {
 197         // putting the rest out of line turned out to be a slight
 198         // performance improvement:
 199         ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
 200         b = HEAP_ALLOCED_miss(mblock,p);
 201         RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
 202         return b;
 203     }
 204 }
 205
 206 #else
 207 # error HEAP_ALLOCED not defined
 208 #endif
 209
 210 #endif /* RTS_STORAGE_MBLOCK_H */