1 /* -----------------------------------------------------------------------------
3 * (c) The GHC Team, 1998-2008
5 * MegaBlock Allocator interface.
7 * See wiki commentary at
8 * http://hackage.haskell.org/trac/ghc/wiki/Commentary/HeapAlloced
10 * ---------------------------------------------------------------------------*/
12 #ifndef RTS_STORAGE_MBLOCK_H
13 #define RTS_STORAGE_MBLOCK_H
/* Count of megablocks currently allocated (maintained in MBlock.c). */
extern lnat mblocks_allocated;

extern void initMBlocks(void);    /* one-time allocator initialisation */
extern void * getMBlock(void);    /* allocate a single megablock */
extern void * getMBlocks(nat n);  /* allocate n contiguous megablocks */
extern void freeAllMBlocks(void); /* release every allocated megablock */

/* Traversal over the allocated megablocks: getFirstMBlock() yields the
   first, getNextMBlock() the successor of `mblock` (presumably in
   address order -- see MBlock.c). */
extern void *getFirstMBlock(void);
extern void *getNextMBlock(void *mblock);

// needed for HEAP_ALLOCED below
extern SpinLock gc_alloc_block_sync;
32 /* -----------------------------------------------------------------------------
33 The HEAP_ALLOCED() test.
35 HEAP_ALLOCED is called FOR EVERY SINGLE CLOSURE during GC.
38 See wiki commentary at
39 http://hackage.haskell.org/trac/ghc/wiki/Commentary/HeapAlloced
41 Implementation of HEAP_ALLOCED
42 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
44 Since heap is allocated in chunks of megablocks (MBLOCK_SIZE), we
45 can just use a table to record which megablocks in the address
46 space belong to the heap. On a 32-bit machine, with 1Mb
47 megablocks, using 8 bits for each entry in the table, the table
48 requires 4k. Lookups during GC will be fast, because the table
49 will be quickly cached (indeed, performance measurements showed no
   measurable difference between doing the table lookup and using a
   constant comparison).
53 On 64-bit machines, we cache one 12-bit block map that describes
54 4096 megablocks or 4GB of memory. If HEAP_ALLOCED is called for
55 an address that is not in the cache, it calls slowIsHeapAlloced
   (see MBlock.c) which will find the block map for the 4GB block in
   question.
58 -------------------------------------------------------------------------- */
#if SIZEOF_VOID_P == 4
/* One byte per megablock; non-zero when that megablock is part of the
   heap (see the commentary above). */
extern StgWord8 mblock_map[];

/* On a 32-bit machine a 4KB table is always sufficient */
# define MBLOCK_MAP_SIZE 4096
/* Table index of the megablock containing address p. */
# define MBLOCK_MAP_ENTRY(p) ((StgWord)(p) >> MBLOCK_SHIFT)
# define HEAP_ALLOCED(p) mblock_map[MBLOCK_MAP_ENTRY(p)]
/* On 32-bit the GC variant is the same plain table lookup. */
# define HEAP_ALLOCED_GC(p) HEAP_ALLOCED(p)
69 /* -----------------------------------------------------------------------------
70 HEAP_ALLOCED for 64-bit machines.
72 Here are some cache layout options:
   [1] 16KB cache of 16-bit entries, 1MB lines (capacity 8GB)
77 entries = 8192 13 bits
78 line size = 0 bits (1 bit of value)
83 32KB cache of 16-bit entries, 4MB lines (capacity 32GB)
85 entries = 16384 14 bits
86 line size = 2 bits (4 bits of value)
91 16KB cache of 16-bit entries, 2MB lines (capacity 16GB)
93 entries = 8192 13 bits
94 line size = 1 bits (2 bits of value)
99 4KB cache of 32-bit entries, 16MB lines (capacity 16GB)
100 mblock size = 20 bits
101 entries = 1024 10 bits
102 line size = 4 bits (16 bits of value)
107 4KB cache of 64-bit entries, 32MB lines (capacity 16GB)
108 mblock size = 20 bits
110 line size = 5 bits (32 bits of value)
114 We actually use none of the above. After much experimentation it was
115 found that optimising the lookup is the most important factor,
116 followed by reducing the number of misses. To that end, we use a
117 variant of [1] in which each cache entry is ((mblock << 1) + value)
118 where value is 0 for non-heap and 1 for heap. The cache entries can
119 be 32 bits, since the mblock number is 48-20 = 28 bits, and we need
120 1 bit for the value. The cache can be as big as we like, but
121 currently we use 8k entries, giving us 8GB capacity.
123 ---------------------------------------------------------------------------- */
#elif SIZEOF_VOID_P == 8

/* Cache geometry: one cache line covers 2^MBC_LINE_BITS megablocks,
   and MBC_TAG_BITS of the megablock number serve as the tag (see the
   layout discussion above). */
#define MBC_LINE_BITS 0
#define MBC_TAG_BITS 15
typedef StgWord32 MbcCacheLine; // could use 16, but 32 was faster
typedef StgWord8 MBlockMapLine;

/* Line index within one 4GB block map (low 32 bits of the address). */
#define MBLOCK_MAP_LINE(p) (((StgWord)p & 0xffffffff) >> (MBLOCK_SHIFT + MBC_LINE_BITS))

#define MBC_LINE_SIZE (1<<MBC_LINE_BITS) /* megablocks per cache line */
#define MBC_SHIFT (48 - MBLOCK_SHIFT - MBC_LINE_BITS - MBC_TAG_BITS)
#define MBC_ENTRIES (1<<MBC_SHIFT)       /* number of cache slots */

/* The HEAP_ALLOCED cache itself; defined in MBlock.c. */
extern MbcCacheLine mblock_cache[];

/* Global line number of the cache line containing address p. */
#define MBC_LINE(p) ((StgWord)p >> (MBLOCK_SHIFT + MBC_LINE_BITS))

/* Number of lines in one 4GB block map. */
#define MBLOCK_MAP_ENTRIES (1 << (32 - MBLOCK_SHIFT - MBC_LINE_BITS))
145 StgWord32 addrHigh32;
146 MBlockMapLine lines[MBLOCK_MAP_ENTRIES];
/* Statistics: number of HEAP_ALLOCED cache misses (see MBlock.c). */
extern lnat mpc_misses;

/* Slow path taken on a cache miss: looks `p` up in the full MBlock
   map (see MBlock.c). */
StgBool HEAP_ALLOCED_miss(StgWord mblock, void *p);
154 StgBool HEAP_ALLOCED(void *p)
158 MbcCacheLine entry, value;
160 mblock = (StgWord)p >> MBLOCK_SHIFT;
161 entry_no = mblock & (MBC_ENTRIES-1);
162 entry = mblock_cache[entry_no];
163 value = entry ^ (mblock << 1);
164 // this formulation coaxes gcc into prioritising the value==1
165 // case, which we expect to be the most common.
166 // __builtin_expect() didn't have any useful effect (gcc-4.3.0).
169 } else if (value == 0) {
172 // putting the rest out of line turned out to be a slight
173 // performance improvement:
174 return HEAP_ALLOCED_miss(mblock,p);
178 // In the parallel GC, the cache itself is safe to *read*, and can be
179 // updated atomically, but we need to place a lock around operations
180 // that touch the MBlock map.
182 StgBool HEAP_ALLOCED_GC(void *p)
186 MbcCacheLine entry, value;
189 mblock = (StgWord)p >> MBLOCK_SHIFT;
190 entry_no = mblock & (MBC_ENTRIES-1);
191 entry = mblock_cache[entry_no];
192 value = entry ^ (mblock << 1);
195 } else if (value == 0) {
198 // putting the rest out of line turned out to be a slight
199 // performance improvement:
200 ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
201 b = HEAP_ALLOCED_miss(mblock,p);
202 RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
208 # error HEAP_ALLOCED not defined
211 #endif /* RTS_STORAGE_MBLOCK_H */