rts/sm/Storage.c

   1 /* -----------------------------------------------------------------------------
   2  *
   3  * (c) The GHC Team, 1998-2004
   4  *
   5  * Storage manager front end
   6  *
   7  * ---------------------------------------------------------------------------*/
   8
   9 #include "PosixSource.h"
  10 #include "Rts.h"
  11 #include "RtsUtils.h"
  12 #include "RtsFlags.h"
  13 #include "Stats.h"
  14 #include "Hooks.h"
  15 #include "BlockAlloc.h"
  16 #include "MBlock.h"
  17 #include "Weak.h"
  18 #include "Sanity.h"
  19 #include "Arena.h"
  20 #include "OSThreads.h"
  21 #include "Capability.h"
  22 #include "Storage.h"
  23 #include "Schedule.h"
  24 #include "RetainerProfile.h"    // for counting memory blocks (memInventory)
  25 #include "OSMem.h"
  26 #include "Trace.h"
  27
  28 #include <stdlib.h>
  29 #include <string.h>
  30
  31 /*
  32  * All these globals require sm_mutex to access in THREADED_RTS mode.
  33  */
  34 StgClosure    *caf_list         = NULL;
  35 StgClosure    *revertible_caf_list = NULL;
  36 rtsBool       keepCAFs;
  37
  38 bdescr *small_alloc_list;       /* allocate()d small objects */
  39 bdescr *pinned_object_block;    /* allocate pinned objects into this block */
  40 nat alloc_blocks;               /* number of allocate()d blocks since GC */
  41 nat alloc_blocks_lim;           /* approximate limit on alloc_blocks */
  42
  43 StgPtr alloc_Hp    = NULL;      /* next free byte in small_alloc_list */
  44 StgPtr alloc_HpLim = NULL;      /* end of block at small_alloc_list   */
  45
  46 generation *generations = NULL; /* all the generations */
  47 generation *g0          = NULL; /* generation 0, for convenience */
  48 generation *oldest_gen  = NULL; /* oldest generation, for convenience */
  49 step *g0s0              = NULL; /* generation 0, step 0, for convenience */
  50
  51 ullong total_allocated = 0;     /* total memory allocated during run */
  52
  53 nat n_nurseries         = 0;    /* == RtsFlags.ParFlags.nNodes, convenience */
  54 step *nurseries         = NULL; /* array of nurseries, >1 only if THREADED_RTS */
  55
  56 #ifdef THREADED_RTS
  57 /*
  58  * Storage manager mutex:  protects all the above state from
  59  * simultaneous access by two STG threads.
  60  */
  61 Mutex sm_mutex;
  62 /*
  63  * This mutex is used by atomicModifyMutVar# only
  64  */
  65 Mutex atomic_modify_mutvar_mutex;
  66 #endif
  67
  68
  69 /*
  70  * Forward references
  71  */
  72 static void *stgAllocForGMP   (size_t size_in_bytes);
  73 static void *stgReallocForGMP (void *ptr, size_t old_size, size_t new_size);
  74 static void  stgDeallocForGMP (void *ptr, size_t size);
  75
  76 static void
  77 initStep (step *stp, int g, int s)
  78 {
  79     stp->no = s;
  80     stp->blocks = NULL;
  81     stp->n_blocks = 0;
  82     stp->old_blocks = NULL;
  83     stp->n_old_blocks = 0;
  84     stp->gen = &generations[g];
  85     stp->gen_no = g;
  86     stp->hp = NULL;
  87     stp->hpLim = NULL;
  88     stp->hp_bd = NULL;
  89     stp->scavd_hp = NULL;
  90     stp->scavd_hpLim = NULL;
  91     stp->scan = NULL;
  92     stp->scan_bd = NULL;
  93     stp->large_objects = NULL;
  94     stp->n_large_blocks = 0;
  95     stp->new_large_objects = NULL;
  96     stp->scavenged_large_objects = NULL;
  97     stp->n_scavenged_large_blocks = 0;
  98     stp->is_compacted = 0;
  99     stp->bitmap = NULL;
 100 }
 101
 102 void
 103 initStorage( void )
 104 {
 105   nat g, s;
 106   generation *gen;
 107
 108   if (generations != NULL) {
 109       // multi-init protection
 110       return;
 111   }
 112
 113   /* Sanity check to make sure the LOOKS_LIKE_ macros appear to be
 114    * doing something reasonable.
 115    */
 116   ASSERT(LOOKS_LIKE_INFO_PTR(&stg_BLACKHOLE_info));
 117   ASSERT(LOOKS_LIKE_CLOSURE_PTR(&stg_dummy_ret_closure));
 118   ASSERT(!HEAP_ALLOCED(&stg_dummy_ret_closure));
 119
 120   if (RtsFlags.GcFlags.maxHeapSize != 0 &&
 121       RtsFlags.GcFlags.heapSizeSuggestion >
 122       RtsFlags.GcFlags.maxHeapSize) {
 123     RtsFlags.GcFlags.maxHeapSize = RtsFlags.GcFlags.heapSizeSuggestion;
 124   }
 125
 126   if (RtsFlags.GcFlags.maxHeapSize != 0 &&
 127       RtsFlags.GcFlags.minAllocAreaSize >
 128       RtsFlags.GcFlags.maxHeapSize) {
 129       errorBelch("maximum heap size (-M) is smaller than minimum alloc area size (-A)");
 130       RtsFlags.GcFlags.minAllocAreaSize = RtsFlags.GcFlags.maxHeapSize;
 131   }
 132
 133   initBlockAllocator();
 134
 135 #if defined(THREADED_RTS)
 136   initMutex(&sm_mutex);
 137   initMutex(&atomic_modify_mutvar_mutex);
 138 #endif
 139
 140   ACQUIRE_SM_LOCK;
 141
 142   /* allocate generation info array */
 143   generations = (generation *)stgMallocBytes(RtsFlags.GcFlags.generations
 144                                              * sizeof(struct generation_),
 145                                              "initStorage: gens");
 146
 147   /* Initialise all generations */
 148   for(g = 0; g < RtsFlags.GcFlags.generations; g++) {
 149     gen = &generations[g];
 150     gen->no = g;
 151     gen->mut_list = allocBlock();
 152     gen->collections = 0;
 153     gen->failed_promotions = 0;
 154     gen->max_blocks = 0;
 155   }
 156
 157   /* A couple of convenience pointers */
 158   g0 = &generations[0];
 159   oldest_gen = &generations[RtsFlags.GcFlags.generations-1];
 160
 161   /* Allocate step structures in each generation */
 162   if (RtsFlags.GcFlags.generations > 1) {
 163     /* Only for multiple-generations */
 164
 165     /* Oldest generation: one step */
 166     oldest_gen->n_steps = 1;
 167     oldest_gen->steps =
 168       stgMallocBytes(1 * sizeof(struct step_), "initStorage: last step");
 169
 170     /* set up all except the oldest generation with 2 steps */
 171     for(g = 0; g < RtsFlags.GcFlags.generations-1; g++) {
 172       generations[g].n_steps = RtsFlags.GcFlags.steps;
 173       generations[g].steps  =
 174         stgMallocBytes (RtsFlags.GcFlags.steps * sizeof(struct step_),
 175                         "initStorage: steps");
 176     }
 177
 178   } else {
 179     /* single generation, i.e. a two-space collector */
 180     g0->n_steps = 1;
 181     g0->steps = stgMallocBytes (sizeof(struct step_), "initStorage: steps");
 182   }
 183
 184 #ifdef THREADED_RTS
 185   n_nurseries = n_capabilities;
 186   nurseries = stgMallocBytes (n_nurseries * sizeof(struct step_),
 187                               "initStorage: nurseries");
 188 #else
 189   n_nurseries = 1;
 190   nurseries = g0->steps; // just share nurseries[0] with g0s0
 191 #endif
 192
 193   /* Initialise all steps */
 194   for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
 195     for (s = 0; s < generations[g].n_steps; s++) {
 196         initStep(&generations[g].steps[s], g, s);
 197     }
 198   }
 199
 200 #ifdef THREADED_RTS
 201   for (s = 0; s < n_nurseries; s++) {
 202       initStep(&nurseries[s], 0, s);
 203   }
 204 #endif
 205
 206   /* Set up the destination pointers in each younger gen. step */
 207   for (g = 0; g < RtsFlags.GcFlags.generations-1; g++) {
 208     for (s = 0; s < generations[g].n_steps-1; s++) {
 209       generations[g].steps[s].to = &generations[g].steps[s+1];
 210     }
 211     generations[g].steps[s].to = &generations[g+1].steps[0];
 212   }
 213   oldest_gen->steps[0].to = &oldest_gen->steps[0];
 214
 215 #ifdef THREADED_RTS
 216   for (s = 0; s < n_nurseries; s++) {
 217       nurseries[s].to = generations[0].steps[0].to;
 218   }
 219 #endif
 220
 221   /* The oldest generation has one step. */
 222   if (RtsFlags.GcFlags.compact) {
 223       if (RtsFlags.GcFlags.generations == 1) {
 224           errorBelch("WARNING: compaction is incompatible with -G1; disabled");
 225       } else {
 226           oldest_gen->steps[0].is_compacted = 1;
 227       }
 228   }
 229
 230 #ifdef THREADED_RTS
 231   if (RtsFlags.GcFlags.generations == 1) {
 232       errorBelch("-G1 is incompatible with -threaded");
 233       stg_exit(EXIT_FAILURE);
 234   }
 235 #endif
 236
 237   /* generation 0 is special: that's the nursery */
 238   generations[0].max_blocks = 0;
 239
 240   /* G0S0: the allocation area.  Policy: keep the allocation area
 241    * small to begin with, even if we have a large suggested heap
 242    * size.  Reason: we're going to do a major collection first, and we
 243    * don't want it to be a big one.  This vague idea is borne out by
 244    * rigorous experimental evidence.
 245    */
 246   g0s0 = &generations[0].steps[0];
 247
 248   allocNurseries();
 249
 250   weak_ptr_list = NULL;
 251   caf_list = NULL;
 252   revertible_caf_list = NULL;
 253
 254   /* initialise the allocate() interface */
 255   small_alloc_list = NULL;
 256   alloc_blocks = 0;
 257   alloc_blocks_lim = RtsFlags.GcFlags.minAllocAreaSize;
 258
 259   /* Tell GNU multi-precision pkg about our custom alloc functions */
 260   mp_set_memory_functions(stgAllocForGMP, stgReallocForGMP, stgDeallocForGMP);
 261
 262   IF_DEBUG(gc, statDescribeGens());
 263
 264   RELEASE_SM_LOCK;
 265 }
 266
 267 void
 268 exitStorage (void)
 269 {
 270     stat_exit(calcAllocated());
 271 }
 272
 273 void
 274 freeStorage (void)
 275 {
 276     nat g;
 277
 278     for(g = 0; g < RtsFlags.GcFlags.generations; g++)
 279       stgFree(generations[g].steps);
 280     stgFree(generations);
 281     freeAllMBlocks();
 282 #if defined(THREADED_RTS)
 283     closeMutex(&sm_mutex);
 284     closeMutex(&atomic_modify_mutvar_mutex);
 285 #endif
 286 }
 287
 288 /* -----------------------------------------------------------------------------
 289    CAF management.
 290
 291    The entry code for every CAF does the following:
 292
 293       - builds a CAF_BLACKHOLE in the heap
 294       - pushes an update frame pointing to the CAF_BLACKHOLE
 295       - invokes UPD_CAF(), which:
 296           - calls newCaf, below
 297           - updates the CAF with a static indirection to the CAF_BLACKHOLE
 298
 299    Why do we build a BLACKHOLE in the heap rather than just updating
 300    the thunk directly?  It's so that we only need one kind of update
 301    frame - otherwise we'd need a static version of the update frame too.
 302
 303    newCaf() does the following:
 304
 305       - it puts the CAF on the oldest generation's mut-once list.
 306         This is so that we can treat the CAF as a root when collecting
 307         younger generations.
 308
 309    For GHCI, we have additional requirements when dealing with CAFs:
 310
 311       - we must *retain* all dynamically-loaded CAFs ever entered,
 312         just in case we need them again.
 313       - we must be able to *revert* CAFs that have been evaluated, to
 314         their pre-evaluated form.
 315
 316       To do this, we use an additional CAF list.  When newCaf() is
 317       called on a dynamically-loaded CAF, we add it to the CAF list
 318       instead of the old-generation mutable list, and save away its
 319       old info pointer (in caf->saved_info) for later reversion.
 320
 321       To revert all the CAFs, we traverse the CAF list and reset the
 322       info pointer to caf->saved_info, then throw away the CAF list.
 323       (see GC.c:revertCAFs()).
 324
 325       -- SDM 29/1/01
 326
 327    -------------------------------------------------------------------------- */
 328
 329 void
 330 newCAF(StgClosure* caf)
 331 {
 332   ACQUIRE_SM_LOCK;
 333
 334   if(keepCAFs)
 335   {
 336     // HACK:
 337     // If we are in GHCi _and_ we are using dynamic libraries,
 338     // then we can't redirect newCAF calls to newDynCAF (see below),
 339     // so we make newCAF behave almost like newDynCAF.
 340     // The dynamic libraries might be used by both the interpreted
 341     // program and GHCi itself, so they must not be reverted.
 342     // This also means that in GHCi with dynamic libraries, CAFs are not
 343     // garbage collected. If this turns out to be a problem, we could
 344     // do another hack here and do an address range test on caf to figure
 345     // out whether it is from a dynamic library.
 346     ((StgIndStatic *)caf)->saved_info  = (StgInfoTable *)caf->header.info;
 347     ((StgIndStatic *)caf)->static_link = caf_list;
 348     caf_list = caf;
 349   }
 350   else
 351   {
 352     /* Put this CAF on the mutable list for the old generation.
 353     * This is a HACK - the IND_STATIC closure doesn't really have
 354     * a mut_link field, but we pretend it has - in fact we re-use
 355     * the STATIC_LINK field for the time being, because when we
 356     * come to do a major GC we won't need the mut_link field
 357     * any more and can use it as a STATIC_LINK.
 358     */
 359     ((StgIndStatic *)caf)->saved_info = NULL;
 360     recordMutableGen(caf, oldest_gen);
 361   }
 362
 363   RELEASE_SM_LOCK;
 364
 365 #ifdef PAR
 366   /* If we are PAR or DIST then  we never forget a CAF */
 367   { globalAddr *newGA;
 368     //debugBelch("<##> Globalising CAF %08x %s",caf,info_type(caf));
 369     newGA=makeGlobal(caf,rtsTrue); /*given full weight*/
 370     ASSERT(newGA);
 371   }
 372 #endif /* PAR */
 373 }
 374
 375 // An alternate version of newCaf which is used for dynamically loaded
 376 // object code in GHCi.  In this case we want to retain *all* CAFs in
 377 // the object code, because they might be demanded at any time from an
 378 // expression evaluated on the command line.
 379 // Also, GHCi might want to revert CAFs, so we add these to the
 380 // revertible_caf_list.
 381 //
 382 // The linker hackily arranges that references to newCaf from dynamic
 383 // code end up pointing to newDynCAF.
 384 void
 385 newDynCAF(StgClosure *caf)
 386 {
 387     ACQUIRE_SM_LOCK;
 388
 389     ((StgIndStatic *)caf)->saved_info  = (StgInfoTable *)caf->header.info;
 390     ((StgIndStatic *)caf)->static_link = revertible_caf_list;
 391     revertible_caf_list = caf;
 392
 393     RELEASE_SM_LOCK;
 394 }
 395
 396 /* -----------------------------------------------------------------------------
 397    Nursery management.
 398    -------------------------------------------------------------------------- */
 399
 400 static bdescr *
 401 allocNursery (step *stp, bdescr *tail, nat blocks)
 402 {
 403     bdescr *bd;
 404     nat i;
 405
 406     // Allocate a nursery: we allocate fresh blocks one at a time and
 407     // cons them on to the front of the list, not forgetting to update
 408     // the back pointer on the tail of the list to point to the new block.
 409     for (i=0; i < blocks; i++) {
 410         // @LDV profiling
 411         /*
 412           processNursery() in LdvProfile.c assumes that every block group in
 413           the nursery contains only a single block. So, if a block group is
 414           given multiple blocks, change processNursery() accordingly.
 415         */
 416         bd = allocBlock();
 417         bd->link = tail;
 418         // double-link the nursery: we might need to insert blocks
 419         if (tail != NULL) {
 420             tail->u.back = bd;
 421         }
 422         bd->step = stp;
 423         bd->gen_no = 0;
 424         bd->flags = 0;
 425         bd->free = bd->start;
 426         tail = bd;
 427     }
 428     tail->u.back = NULL;
 429     return tail;
 430 }
 431
 432 static void
 433 assignNurseriesToCapabilities (void)
 434 {
 435 #ifdef THREADED_RTS
 436     nat i;
 437
 438     for (i = 0; i < n_nurseries; i++) {
 439         capabilities[i].r.rNursery        = &nurseries[i];
 440         capabilities[i].r.rCurrentNursery = nurseries[i].blocks;
 441         capabilities[i].r.rCurrentAlloc   = NULL;
 442     }
 443 #else /* THREADED_RTS */
 444     MainCapability.r.rNursery        = &nurseries[0];
 445     MainCapability.r.rCurrentNursery = nurseries[0].blocks;
 446     MainCapability.r.rCurrentAlloc   = NULL;
 447 #endif
 448 }
 449
 450 void
 451 allocNurseries( void )
 452 {
 453     nat i;
 454
 455     for (i = 0; i < n_nurseries; i++) {
 456         nurseries[i].blocks =
 457             allocNursery(&nurseries[i], NULL,
 458                          RtsFlags.GcFlags.minAllocAreaSize);
 459         nurseries[i].n_blocks    = RtsFlags.GcFlags.minAllocAreaSize;
 460         nurseries[i].old_blocks   = NULL;
 461         nurseries[i].n_old_blocks = 0;
 462         /* hp, hpLim, hp_bd, to_space etc. aren't used in the nursery */
 463     }
 464     assignNurseriesToCapabilities();
 465 }
 466
 467 void
 468 resetNurseries( void )
 469 {
 470     nat i;
 471     bdescr *bd;
 472     step *stp;
 473
 474     for (i = 0; i < n_nurseries; i++) {
 475         stp = &nurseries[i];
 476         for (bd = stp->blocks; bd; bd = bd->link) {
 477             bd->free = bd->start;
 478             ASSERT(bd->gen_no == 0);
 479             ASSERT(bd->step == stp);
 480             IF_DEBUG(sanity,memset(bd->start, 0xaa, BLOCK_SIZE));
 481         }
 482     }
 483     assignNurseriesToCapabilities();
 484 }
 485
 486 lnat
 487 countNurseryBlocks (void)
 488 {
 489     nat i;
 490     lnat blocks = 0;
 491
 492     for (i = 0; i < n_nurseries; i++) {
 493         blocks += nurseries[i].n_blocks;
 494     }
 495     return blocks;
 496 }
 497
 498 static void
 499 resizeNursery ( step *stp, nat blocks )
 500 {
 501   bdescr *bd;
 502   nat nursery_blocks;
 503
 504   nursery_blocks = stp->n_blocks;
 505   if (nursery_blocks == blocks) return;
 506
 507   if (nursery_blocks < blocks) {
 508       debugTrace(DEBUG_gc, "increasing size of nursery to %d blocks",
 509                  blocks);
 510     stp->blocks = allocNursery(stp, stp->blocks, blocks-nursery_blocks);
 511   }
 512   else {
 513     bdescr *next_bd;
 514
 515     debugTrace(DEBUG_gc, "decreasing size of nursery to %d blocks",
 516                blocks);
 517
 518     bd = stp->blocks;
 519     while (nursery_blocks > blocks) {
 520         next_bd = bd->link;
 521         next_bd->u.back = NULL;
 522         nursery_blocks -= bd->blocks; // might be a large block
 523         freeGroup(bd);
 524         bd = next_bd;
 525     }
 526     stp->blocks = bd;
 527     // might have gone just under, by freeing a large block, so make
 528     // up the difference.
 529     if (nursery_blocks < blocks) {
 530         stp->blocks = allocNursery(stp, stp->blocks, blocks-nursery_blocks);
 531     }
 532   }
 533
 534   stp->n_blocks = blocks;
 535   ASSERT(countBlocks(stp->blocks) == stp->n_blocks);
 536 }
 537
 538 //
 539 // Resize each of the nurseries to the specified size.
 540 //
 541 void
 542 resizeNurseriesFixed (nat blocks)
 543 {
 544     nat i;
 545     for (i = 0; i < n_nurseries; i++) {
 546         resizeNursery(&nurseries[i], blocks);
 547     }
 548 }
 549
 550 //
 551 // Resize the nurseries to the total specified size.
 552 //
 553 void
 554 resizeNurseries (nat blocks)
 555 {
 556     // If there are multiple nurseries, then we just divide the number
 557     // of available blocks between them.
 558     resizeNurseriesFixed(blocks / n_nurseries);
 559 }
 560
 561 /* -----------------------------------------------------------------------------
 562    The allocate() interface
 563
 564    allocate(n) always succeeds, and returns a chunk of memory n words
 565    long.  n can be larger than the size of a block if necessary, in
 566    which case a contiguous block group will be allocated.
 567    -------------------------------------------------------------------------- */
 568
 569 StgPtr
 570 allocate( nat n )
 571 {
 572     bdescr *bd;
 573     StgPtr p;
 574
 575     ACQUIRE_SM_LOCK;
 576
 577     TICK_ALLOC_HEAP_NOCTR(n);
 578     CCS_ALLOC(CCCS,n);
 579
 580     /* big allocation (>LARGE_OBJECT_THRESHOLD) */
 581     /* ToDo: allocate directly into generation 1 */
 582     if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) {
 583         nat req_blocks =  (lnat)BLOCK_ROUND_UP(n*sizeof(W_)) / BLOCK_SIZE;
 584         bd = allocGroup(req_blocks);
 585         dbl_link_onto(bd, &g0s0->large_objects);
 586         g0s0->n_large_blocks += req_blocks;
 587         bd->gen_no  = 0;
 588         bd->step = g0s0;
 589         bd->flags = BF_LARGE;
 590         bd->free = bd->start + n;
 591         alloc_blocks += req_blocks;
 592         RELEASE_SM_LOCK;
 593         return bd->start;
 594
 595         /* small allocation (<LARGE_OBJECT_THRESHOLD) */
 596     } else if (small_alloc_list == NULL || alloc_Hp + n > alloc_HpLim) {
 597         if (small_alloc_list) {
 598             small_alloc_list->free = alloc_Hp;
 599         }
 600         bd = allocBlock();
 601         bd->link = small_alloc_list;
 602         small_alloc_list = bd;
 603         bd->gen_no = 0;
 604         bd->step = g0s0;
 605         bd->flags = 0;
 606         alloc_Hp = bd->start;
 607         alloc_HpLim = bd->start + BLOCK_SIZE_W;
 608         alloc_blocks++;
 609     }
 610
 611     p = alloc_Hp;
 612     alloc_Hp += n;
 613     RELEASE_SM_LOCK;
 614     return p;
 615 }
 616
 617 lnat
 618 allocatedBytes( void )
 619 {
 620     lnat allocated;
 621
 622     allocated = alloc_blocks * BLOCK_SIZE_W - (alloc_HpLim - alloc_Hp);
 623     if (pinned_object_block != NULL) {
 624         allocated -= (pinned_object_block->start + BLOCK_SIZE_W) -
 625             pinned_object_block->free;
 626     }
 627
 628     return allocated;
 629 }
 630
 631 void
 632 tidyAllocateLists (void)
 633 {
 634     if (small_alloc_list != NULL) {
 635         ASSERT(alloc_Hp >= small_alloc_list->start &&
 636                alloc_Hp <= small_alloc_list->start + BLOCK_SIZE);
 637         small_alloc_list->free = alloc_Hp;
 638     }
 639 }
 640
 641 /* -----------------------------------------------------------------------------
 642    allocateLocal()
 643
 644    This allocates memory in the current thread - it is intended for
 645    use primarily from STG-land where we have a Capability.  It is
 646    better than allocate() because it doesn't require taking the
 647    sm_mutex lock in the common case.
 648
 649    Memory is allocated directly from the nursery if possible (but not
 650    from the current nursery block, so as not to interfere with
 651    Hp/HpLim).
 652    -------------------------------------------------------------------------- */
 653
 654 StgPtr
 655 allocateLocal (Capability *cap, nat n)
 656 {
 657     bdescr *bd;
 658     StgPtr p;
 659
 660     TICK_ALLOC_HEAP_NOCTR(n);
 661     CCS_ALLOC(CCCS,n);
 662
 663     /* big allocation (>LARGE_OBJECT_THRESHOLD) */
 664     /* ToDo: allocate directly into generation 1 */
 665     if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) {
 666         nat req_blocks =  (lnat)BLOCK_ROUND_UP(n*sizeof(W_)) / BLOCK_SIZE;
 667         ACQUIRE_SM_LOCK;
 668         bd = allocGroup(req_blocks);
 669         dbl_link_onto(bd, &g0s0->large_objects);
 670         g0s0->n_large_blocks += req_blocks;
 671         bd->gen_no  = 0;
 672         bd->step = g0s0;
 673         bd->flags = BF_LARGE;
 674         bd->free = bd->start + n;
 675         alloc_blocks += req_blocks;
 676         RELEASE_SM_LOCK;
 677         return bd->start;
 678
 679         /* small allocation (<LARGE_OBJECT_THRESHOLD) */
 680     } else {
 681
 682         bd = cap->r.rCurrentAlloc;
 683         if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) {
 684
 685             // The CurrentAlloc block is full, we need to find another
 686             // one.  First, we try taking the next block from the
 687             // nursery:
 688             bd = cap->r.rCurrentNursery->link;
 689
 690             if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) {
 691                 // The nursery is empty, or the next block is already
 692                 // full: allocate a fresh block (we can't fail here).
 693                 ACQUIRE_SM_LOCK;
 694                 bd = allocBlock();
 695                 cap->r.rNursery->n_blocks++;
 696                 RELEASE_SM_LOCK;
 697                 bd->gen_no = 0;
 698                 bd->step = cap->r.rNursery;
 699                 bd->flags = 0;
 700             } else {
 701                 // we have a block in the nursery: take it and put
 702                 // it at the *front* of the nursery list, and use it
 703                 // to allocate() from.
 704                 cap->r.rCurrentNursery->link = bd->link;
 705                 if (bd->link != NULL) {
 706                     bd->link->u.back = cap->r.rCurrentNursery;
 707                 }
 708             }
 709             dbl_link_onto(bd, &cap->r.rNursery->blocks);
 710             cap->r.rCurrentAlloc = bd;
 711             IF_DEBUG(sanity, checkNurserySanity(cap->r.rNursery));
 712         }
 713     }
 714     p = bd->free;
 715     bd->free += n;
 716     return p;
 717 }
 718
 719 /* ---------------------------------------------------------------------------
 720    Allocate a fixed/pinned object.
 721
 722    We allocate small pinned objects into a single block, allocating a
 723    new block when the current one overflows.  The block is chained
 724    onto the large_object_list of generation 0 step 0.
 725
 726    NOTE: The GC can't in general handle pinned objects.  This
 727    interface is only safe to use for ByteArrays, which have no
 728    pointers and don't require scavenging.  It works because the
 729    block's descriptor has the BF_LARGE flag set, so the block is
 730    treated as a large object and chained onto various lists, rather
 731    than the individual objects being copied.  However, when it comes
 732    to scavenge the block, the GC will only scavenge the first object.
 733    The reason is that the GC can't linearly scan a block of pinned
 734    objects at the moment (doing so would require using the
 735    mostly-copying techniques).  But since we're restricting ourselves
 736    to pinned ByteArrays, not scavenging is ok.
 737
 738    This function is called by newPinnedByteArray# which immediately
 739    fills the allocated memory with a MutableByteArray#.
 740    ------------------------------------------------------------------------- */
 741
 742 StgPtr
 743 allocatePinned( nat n )
 744 {
 745     StgPtr p;
 746     bdescr *bd = pinned_object_block;
 747
 748     // If the request is for a large object, then allocate()
 749     // will give us a pinned object anyway.
 750     if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) {
 751         return allocate(n);
 752     }
 753
 754     ACQUIRE_SM_LOCK;
 755
 756     TICK_ALLOC_HEAP_NOCTR(n);
 757     CCS_ALLOC(CCCS,n);
 758
 759     // we always return 8-byte aligned memory.  bd->free must be
 760     // 8-byte aligned to begin with, so we just round up n to
 761     // the nearest multiple of 8 bytes.
 762     if (sizeof(StgWord) == 4) {
 763         n = (n+1) & ~1;
 764     }
 765
 766     // If we don't have a block of pinned objects yet, or the current
 767     // one isn't large enough to hold the new object, allocate a new one.
 768     if (bd == NULL || (bd->free + n) > (bd->start + BLOCK_SIZE_W)) {
 769         pinned_object_block = bd = allocBlock();
 770         dbl_link_onto(bd, &g0s0->large_objects);
 771         bd->gen_no = 0;
 772         bd->step   = g0s0;
 773         bd->flags  = BF_PINNED | BF_LARGE;
 774         bd->free   = bd->start;
 775         alloc_blocks++;
 776     }
 777
 778     p = bd->free;
 779     bd->free += n;
 780     RELEASE_SM_LOCK;
 781     return p;
 782 }
 783
 784 /* -----------------------------------------------------------------------------
 785    This is the write barrier for MUT_VARs, a.k.a. IORefs.  A
 786    MUT_VAR_CLEAN object is not on the mutable list; a MUT_VAR_DIRTY
 787    is.  When written to, a MUT_VAR_CLEAN turns into a MUT_VAR_DIRTY
 788    and is put on the mutable list.
 789    -------------------------------------------------------------------------- */
 790
 791 void
 792 dirty_MUT_VAR(StgRegTable *reg, StgClosure *p)
 793 {
 794     Capability *cap = regTableToCapability(reg);
 795     bdescr *bd;
 796     if (p->header.info == &stg_MUT_VAR_CLEAN_info) {
 797         p->header.info = &stg_MUT_VAR_DIRTY_info;
 798         bd = Bdescr((StgPtr)p);
 799         if (bd->gen_no > 0) recordMutableCap(p,cap,bd->gen_no);
 800     }
 801 }
 802
 803 /* -----------------------------------------------------------------------------
 804    Allocation functions for GMP.
 805
 806    These all use the allocate() interface - we can't have any garbage
 807    collection going on during a gmp operation, so we use allocate()
 808    which always succeeds.  The gmp operations which might need to
 809    allocate will ask the storage manager (via doYouWantToGC()) whether
 810    a garbage collection is required, in case we get into a loop doing
 811    only allocate() style allocation.
 812    -------------------------------------------------------------------------- */
 813
 814 static void *
 815 stgAllocForGMP (size_t size_in_bytes)
 816 {
 817   StgArrWords* arr;
 818   nat data_size_in_words, total_size_in_words;
 819
 820   /* round up to a whole number of words */
 821   data_size_in_words  = (size_in_bytes + sizeof(W_) + 1) / sizeof(W_);
 822   total_size_in_words = sizeofW(StgArrWords) + data_size_in_words;
 823
 824   /* allocate and fill it in. */
 825 #if defined(THREADED_RTS)
 826   arr = (StgArrWords *)allocateLocal(myTask()->cap, total_size_in_words);
 827 #else
 828   arr = (StgArrWords *)allocateLocal(&MainCapability, total_size_in_words);
 829 #endif
 830   SET_ARR_HDR(arr, &stg_ARR_WORDS_info, CCCS, data_size_in_words);
 831
 832   /* and return a ptr to the goods inside the array */
 833   return arr->payload;
 834 }
 835
 836 static void *
 837 stgReallocForGMP (void *ptr, size_t old_size, size_t new_size)
 838 {
 839     void *new_stuff_ptr = stgAllocForGMP(new_size);
 840     nat i = 0;
 841     char *p = (char *) ptr;
 842     char *q = (char *) new_stuff_ptr;
 843
 844     for (; i < old_size; i++, p++, q++) {
 845         *q = *p;
 846     }
 847
 848     return(new_stuff_ptr);
 849 }
 850
 851 static void
 852 stgDeallocForGMP (void *ptr STG_UNUSED,
 853                   size_t size STG_UNUSED)
 854 {
 855     /* easy for us: the garbage collector does the dealloc'n */
 856 }
 857
 858 /* -----------------------------------------------------------------------------
 859  * Stats and stuff
 860  * -------------------------------------------------------------------------- */
 861
 862 /* -----------------------------------------------------------------------------
 863  * calcAllocated()
 864  *
 865  * Approximate how much we've allocated: number of blocks in the
 866  * nursery + blocks allocated via allocate() - unused nusery blocks.
 867  * This leaves a little slop at the end of each block, and doesn't
 868  * take into account large objects (ToDo).
 869  * -------------------------------------------------------------------------- */
 870
 871 lnat
 872 calcAllocated( void )
 873 {
 874   nat allocated;
 875   bdescr *bd;
 876
 877   allocated = allocatedBytes();
 878   allocated += countNurseryBlocks() * BLOCK_SIZE_W;
 879
 880   {
 881 #ifdef THREADED_RTS
 882   nat i;
 883   for (i = 0; i < n_nurseries; i++) {
 884       Capability *cap;
 885       for ( bd = capabilities[i].r.rCurrentNursery->link;
 886             bd != NULL; bd = bd->link ) {
 887           allocated -= BLOCK_SIZE_W;
 888       }
 889       cap = &capabilities[i];
 890       if (cap->r.rCurrentNursery->free <
 891           cap->r.rCurrentNursery->start + BLOCK_SIZE_W) {
 892           allocated -= (cap->r.rCurrentNursery->start + BLOCK_SIZE_W)
 893               - cap->r.rCurrentNursery->free;
 894       }
 895   }
 896 #else
 897   bdescr *current_nursery = MainCapability.r.rCurrentNursery;
 898
 899   for ( bd = current_nursery->link; bd != NULL; bd = bd->link ) {
 900       allocated -= BLOCK_SIZE_W;
 901   }
 902   if (current_nursery->free < current_nursery->start + BLOCK_SIZE_W) {
 903       allocated -= (current_nursery->start + BLOCK_SIZE_W)
 904           - current_nursery->free;
 905   }
 906 #endif
 907   }
 908
 909   total_allocated += allocated;
 910   return allocated;
 911 }
 912
 913 /* Approximate the amount of live data in the heap.  To be called just
 914  * after garbage collection (see GarbageCollect()).
 915  */
 916 extern lnat
 917 calcLive(void)
 918 {
 919   nat g, s;
 920   lnat live = 0;
 921   step *stp;
 922
 923   if (RtsFlags.GcFlags.generations == 1) {
 924     live = (g0s0->n_blocks - 1) * BLOCK_SIZE_W +
 925       ((lnat)g0s0->hp_bd->free - (lnat)g0s0->hp_bd->start) / sizeof(W_);
 926     return live;
 927   }
 928
 929   for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
 930     for (s = 0; s < generations[g].n_steps; s++) {
 931       /* approximate amount of live data (doesn't take into account slop
 932        * at end of each block).
 933        */
 934       if (g == 0 && s == 0) {
 935           continue;
 936       }
 937       stp = &generations[g].steps[s];
 938       live += (stp->n_large_blocks + stp->n_blocks - 1) * BLOCK_SIZE_W;
 939       if (stp->hp_bd != NULL) {
 940           live += ((lnat)stp->hp_bd->free - (lnat)stp->hp_bd->start)
 941               / sizeof(W_);
 942       }
 943       if (stp->scavd_hp != NULL) {
 944           live -= (P_)(BLOCK_ROUND_UP(stp->scavd_hp)) - stp->scavd_hp;
 945       }
 946     }
 947   }
 948   return live;
 949 }
 950
 951 /* Approximate the number of blocks that will be needed at the next
 952  * garbage collection.
 953  *
 954  * Assume: all data currently live will remain live.  Steps that will
 955  * be collected next time will therefore need twice as many blocks
 956  * since all the data will be copied.
 957  */
 958 extern lnat
 959 calcNeeded(void)
 960 {
 961     lnat needed = 0;
 962     nat g, s;
 963     step *stp;
 964
 965     for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
 966         for (s = 0; s < generations[g].n_steps; s++) {
 967             if (g == 0 && s == 0) { continue; }
 968             stp = &generations[g].steps[s];
 969             if (generations[g].steps[0].n_blocks +
 970                 generations[g].steps[0].n_large_blocks
 971                 > generations[g].max_blocks
 972                 && stp->is_compacted == 0) {
 973                 needed += 2 * stp->n_blocks;
 974             } else {
 975                 needed += stp->n_blocks;
 976             }
 977         }
 978     }
 979     return needed;
 980 }
 981
 982 /* ----------------------------------------------------------------------------
 983    Executable memory
 984
 985    Executable memory must be managed separately from non-executable
 986    memory.  Most OSs these days require you to jump through hoops to
 987    dynamically allocate executable memory, due to various security
 988    measures.
 989
 990    Here we provide a small memory allocator for executable memory.
 991    Memory is managed with a page granularity; we allocate linearly
 992    in the page, and when the page is emptied (all objects on the page
 993    are free) we free the page again, not forgetting to make it
 994    non-executable.
 995    ------------------------------------------------------------------------- */
 996
 997 static bdescr *exec_block;
 998
 999 void *allocateExec (nat bytes)
1000 {
1001     void *ret;
1002     nat n;
1003
1004     ACQUIRE_SM_LOCK;
1005
1006     // round up to words.
1007     n  = (bytes + sizeof(W_) + 1) / sizeof(W_);
1008
1009     if (n+1 > BLOCK_SIZE_W) {
1010         barf("allocateExec: can't handle large objects");
1011     }
1012
1013     if (exec_block == NULL ||
1014         exec_block->free + n + 1 > exec_block->start + BLOCK_SIZE_W) {
1015         bdescr *bd;
1016         lnat pagesize = getPageSize();
1017         bd = allocGroup(stg_max(1, pagesize / BLOCK_SIZE));
1018         debugTrace(DEBUG_gc, "allocate exec block %p", bd->start);
1019         bd->gen_no = 0;
1020         bd->flags = BF_EXEC;
1021         bd->link = exec_block;
1022         if (exec_block != NULL) {
1023             exec_block->u.back = bd;
1024         }
1025         bd->u.back = NULL;
1026         setExecutable(bd->start, bd->blocks * BLOCK_SIZE, rtsTrue);
1027         exec_block = bd;
1028     }
1029     *(exec_block->free) = n;  // store the size of this chunk
1030     exec_block->gen_no += n;  // gen_no stores the number of words allocated
1031     ret = exec_block->free + 1;
1032     exec_block->free += n + 1;
1033
1034     RELEASE_SM_LOCK
1035     return ret;
1036 }
1037
1038 void freeExec (void *addr)
1039 {
1040     StgPtr p = (StgPtr)addr - 1;
1041     bdescr *bd = Bdescr((StgPtr)p);
1042
1043     if ((bd->flags & BF_EXEC) == 0) {
1044         barf("freeExec: not executable");
1045     }
1046
1047     if (*(StgPtr)p == 0) {
1048         barf("freeExec: already free?");
1049     }
1050
1051     ACQUIRE_SM_LOCK;
1052
1053     bd->gen_no -= *(StgPtr)p;
1054     *(StgPtr)p = 0;
1055
1056     // Free the block if it is empty, but not if it is the block at
1057     // the head of the queue.
1058     if (bd->gen_no == 0 && bd != exec_block) {
1059         debugTrace(DEBUG_gc, "free exec block %p", bd->start);
1060         if (bd->u.back) {
1061             bd->u.back->link = bd->link;
1062         } else {
1063             exec_block = bd->link;
1064         }
1065         if (bd->link) {
1066             bd->link->u.back = bd->u.back;
1067         }
1068         setExecutable(bd->start, bd->blocks * BLOCK_SIZE, rtsFalse);
1069         freeGroup(bd);
1070     }
1071
1072     RELEASE_SM_LOCK
1073 }
1074
1075 /* -----------------------------------------------------------------------------
1076    Debugging
1077
1078    memInventory() checks for memory leaks by counting up all the
1079    blocks we know about and comparing that to the number of blocks
1080    allegedly floating around in the system.
1081    -------------------------------------------------------------------------- */
1082
1083 #ifdef DEBUG
1084
1085 static lnat
1086 stepBlocks (step *stp)
1087 {
1088     lnat total_blocks;
1089     bdescr *bd;
1090
1091     total_blocks = stp->n_blocks;
1092     total_blocks += stp->n_old_blocks;
1093     for (bd = stp->large_objects; bd; bd = bd->link) {
1094         total_blocks += bd->blocks;
1095         /* hack for megablock groups: they have an extra block or two in
1096            the second and subsequent megablocks where the block
1097            descriptors would normally go.
1098         */
1099         if (bd->blocks > BLOCKS_PER_MBLOCK) {
1100             total_blocks -= (MBLOCK_SIZE / BLOCK_SIZE - BLOCKS_PER_MBLOCK)
1101                 * (bd->blocks/(MBLOCK_SIZE/BLOCK_SIZE));
1102         }
1103     }
1104     return total_blocks;
1105 }
1106
1107 void
1108 memInventory(void)
1109 {
1110   nat g, s, i;
1111   step *stp;
1112   bdescr *bd;
1113   lnat total_blocks = 0, free_blocks = 0;
1114
1115   /* count the blocks we current have */
1116
1117   for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
1118       for (i = 0; i < n_capabilities; i++) {
1119           for (bd = capabilities[i].mut_lists[g]; bd != NULL; bd = bd->link) {
1120               total_blocks += bd->blocks;
1121           }
1122       }
1123       for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) {
1124           total_blocks += bd->blocks;
1125       }
1126       for (s = 0; s < generations[g].n_steps; s++) {
1127           if (g==0 && s==0) continue;
1128           stp = &generations[g].steps[s];
1129           total_blocks += stepBlocks(stp);
1130       }
1131   }
1132
1133   for (i = 0; i < n_nurseries; i++) {
1134       total_blocks += stepBlocks(&nurseries[i]);
1135   }
1136 #ifdef THREADED_RTS
1137   // We put pinned object blocks in g0s0, so better count blocks there too.
1138   total_blocks += stepBlocks(g0s0);
1139 #endif
1140
1141   /* any blocks held by allocate() */
1142   for (bd = small_alloc_list; bd; bd = bd->link) {
1143     total_blocks += bd->blocks;
1144   }
1145
1146 #ifdef PROFILING
1147   if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_RETAINER) {
1148       total_blocks += retainerStackBlocks();
1149   }
1150 #endif
1151
1152   // count the blocks allocated by the arena allocator
1153   total_blocks += arenaBlocks();
1154
1155   // count the blocks containing executable memory
1156   for (bd = exec_block; bd; bd = bd->link) {
1157     total_blocks += bd->blocks;
1158   }
1159
1160   /* count the blocks on the free list */
1161   free_blocks = countFreeList();
1162
1163   if (total_blocks + free_blocks != mblocks_allocated *
1164       BLOCKS_PER_MBLOCK) {
1165     debugBelch("Blocks: %ld live + %ld free  = %ld total (%ld around)\n",
1166             total_blocks, free_blocks, total_blocks + free_blocks,
1167             mblocks_allocated * BLOCKS_PER_MBLOCK);
1168   }
1169
1170   ASSERT(total_blocks + free_blocks == mblocks_allocated * BLOCKS_PER_MBLOCK);
1171 }
1172
1173
1174 nat
1175 countBlocks(bdescr *bd)
1176 {
1177     nat n;
1178     for (n=0; bd != NULL; bd=bd->link) {
1179         n += bd->blocks;
1180     }
1181     return n;
1182 }
1183
1184 /* Full heap sanity check. */
1185 void
1186 checkSanity( void )
1187 {
1188     nat g, s;
1189
1190     if (RtsFlags.GcFlags.generations == 1) {
1191         checkHeap(g0s0->blocks);
1192         checkChain(g0s0->large_objects);
1193     } else {
1194
1195         for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
1196             for (s = 0; s < generations[g].n_steps; s++) {
1197                 if (g == 0 && s == 0) { continue; }
1198                 ASSERT(countBlocks(generations[g].steps[s].blocks)
1199                        == generations[g].steps[s].n_blocks);
1200                 ASSERT(countBlocks(generations[g].steps[s].large_objects)
1201                        == generations[g].steps[s].n_large_blocks);
1202                 checkHeap(generations[g].steps[s].blocks);
1203                 checkChain(generations[g].steps[s].large_objects);
1204                 if (g > 0) {
1205                     checkMutableList(generations[g].mut_list, g);
1206                 }
1207             }
1208         }
1209
1210         for (s = 0; s < n_nurseries; s++) {
1211             ASSERT(countBlocks(nurseries[s].blocks)
1212                    == nurseries[s].n_blocks);
1213             ASSERT(countBlocks(nurseries[s].large_objects)
1214                    == nurseries[s].n_large_blocks);
1215         }
1216
1217         checkFreeListSanity();
1218     }
1219 }
1220
1221 /* Nursery sanity check */
1222 void
1223 checkNurserySanity( step *stp )
1224 {
1225     bdescr *bd, *prev;
1226     nat blocks = 0;
1227
1228     prev = NULL;
1229     for (bd = stp->blocks; bd != NULL; bd = bd->link) {
1230         ASSERT(bd->u.back == prev);
1231         prev = bd;
1232         blocks += bd->blocks;
1233     }
1234     ASSERT(blocks == stp->n_blocks);
1235 }
1236
1237 // handy function for use in gdb, because Bdescr() is inlined.
1238 extern bdescr *_bdescr( StgPtr p );
1239
1240 bdescr *
1241 _bdescr( StgPtr p )
1242 {
1243     return Bdescr(p);
1244 }
1245
1246 #endif