#endif
#include "Trace.h"
#include "RetainerProfile.h"
+#include "LdvProfile.h"
#include "RaiseAsync.h"
#include "Papi.h"
#include "Stable.h"
#include "Evac.h"
#include "Scav.h"
#include "GCUtils.h"
+#include "MarkStack.h"
#include "MarkWeak.h"
#include "Sparks.h"
#include "Sweep.h"
#endif
/* -----------------------------------------------------------------------------
- The mark bitmap & stack.
+ The mark stack.
-------------------------------------------------------------------------- */
-#define MARK_STACK_BLOCKS 4
-
-bdescr *mark_stack_bdescr;
-StgPtr *mark_stack;
-StgPtr *mark_sp;
-StgPtr *mark_splim;
-
-// Flag and pointers used for falling back to a linear scan when the
-// mark stack overflows.
-rtsBool mark_stack_overflowed;
-bdescr *oldgen_scan_bd;
-StgPtr oldgen_scan;
+bdescr *mark_stack_top_bd; // topmost block in the mark stack
+bdescr *mark_stack_bd; // current block in the mark stack
+StgPtr mark_sp; // pointer to the next unallocated mark stack entry
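
/* Illustrative sketch (not part of this patch): how push and pop can work
 * on the block-chained representation above. Spent blocks stay chained via
 * u.back so they can be reused without reallocating. mark_stack_empty() is
 * the predicate used later in any_work(); the push/pop names and the use
 * of plain allocBlock() are assumptions here (a threaded build may need a
 * synchronised variant), so the real MarkStack.h may differ in detail.
 */
static rtsBool
mark_stack_empty (void)
{
    return mark_sp == mark_stack_bd->start && mark_stack_bd->link == NULL;
}

static void
push_mark_stack (StgPtr p)
{
    bdescr *bd;

    if (mark_sp == mark_stack_bd->start + BLOCK_SIZE_W) {
        // current block is full: move up the chain, reusing a spare
        // block if one is there, otherwise allocating a fresh one
        if (mark_stack_bd->u.back != NULL) {
            mark_stack_bd = mark_stack_bd->u.back;
        } else {
            bd = allocBlock();
            bd->link = mark_stack_bd;
            bd->u.back = NULL;
            mark_stack_bd->u.back = bd;
            mark_stack_top_bd = bd;
            mark_stack_bd = bd;
        }
        mark_sp = mark_stack_bd->start;
    }
    *mark_sp++ = (StgWord)p;
}

static StgPtr
pop_mark_stack (void)
{
    // caller must check mark_stack_empty() first
    if (mark_sp == mark_stack_bd->start) {
        // current block is empty: step down to the previous block
        mark_stack_bd = mark_stack_bd->link;
        mark_sp = mark_stack_bd->start + BLOCK_SIZE_W;
    }
    return (StgPtr)*--mark_sp;
}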
/* -----------------------------------------------------------------------------
GarbageCollect: the main entry point to the garbage collector.
n = initialise_N(force_major_gc);
#if defined(THREADED_RTS)
- work_stealing = RtsFlags.ParFlags.parGcLoadBalancing;
+ work_stealing = RtsFlags.ParFlags.parGcLoadBalancingEnabled &&
+ N >= RtsFlags.ParFlags.parGcLoadBalancingGen;
// It's not always a good idea to do load balancing in parallel
// GC. In particular, for a parallel program we don't want to
// lose locality by moving cached data into another CPU's cache
#ifdef DEBUG
// check for memory leaks if DEBUG is on
- memInventory(traceClass(DEBUG_gc));
+ memInventory(DEBUG_gc);
#endif
- // check stack sanity *before* GC
- IF_DEBUG(sanity, checkFreeListSanity());
- IF_DEBUG(sanity, checkMutableLists(rtsTrue));
+ // check sanity *before* GC
+ IF_DEBUG(sanity, checkSanity(rtsTrue));
// Initialise all our gc_thread structures
for (t = 0; t < n_gc_threads; t++) {
/* Allocate a mark stack if we're doing a major collection.
*/
if (major_gc && oldest_gen->steps[0].mark) {
- nat mark_stack_blocks;
- mark_stack_blocks = stg_max(MARK_STACK_BLOCKS,
- oldest_gen->steps[0].n_old_blocks / 100);
- mark_stack_bdescr = allocGroup(mark_stack_blocks);
- mark_stack = (StgPtr *)mark_stack_bdescr->start;
- mark_sp = mark_stack;
- mark_splim = mark_stack + (mark_stack_blocks * BLOCK_SIZE_W);
+ mark_stack_bd = allocBlock();
+ mark_stack_top_bd = mark_stack_bd;
+ mark_stack_bd->link = NULL;
+ mark_stack_bd->u.back = NULL;
+ mark_sp = mark_stack_bd->start;
} else {
- mark_stack_bdescr = NULL;
+ mark_stack_bd = NULL;
+ mark_stack_top_bd = NULL;
+ mark_sp = NULL;
}
// this is the main thread
// g0s0->old_blocks is the old nursery
// g0s0->blocks is to-space from the previous GC
if (RtsFlags.GcFlags.generations == 1) {
- if (g0s0->blocks != NULL) {
- freeChain(g0s0->blocks);
- g0s0->blocks = NULL;
+ if (g0->steps[0].blocks != NULL) {
+ freeChain(g0->steps[0].blocks);
+ g0->steps[0].blocks = NULL;
}
}
/* LARGE OBJECTS. The current live large objects are chained on
- * scavenged_large, having been moved during garbage
- * collection from large_objects. Any objects left on
+ * scavenged_large_objects, having been moved during garbage
+ * collection from large_objects. Any objects left on the
* large_objects list are therefore dead, so we free them here.
*/
- for (bd = stp->large_objects; bd != NULL; bd = next) {
- next = bd->link;
- freeGroup(bd);
- bd = next;
- }
-
+ freeChain(stp->large_objects);
stp->large_objects = stp->scavenged_large_objects;
stp->n_large_blocks = stp->n_scavenged_large_blocks;
-
+ ASSERT(countBlocks(stp->large_objects) == stp->n_large_blocks);
}
else // for older generations...
{
// add the new blocks we promoted during this GC
stp->n_large_blocks += stp->n_scavenged_large_blocks;
+ ASSERT(countBlocks(stp->large_objects) == stp->n_large_blocks);
}
}
}
// Free the small objects allocated via allocate(), since these will
// all have been copied into G0S1 now.
if (RtsFlags.GcFlags.generations > 1) {
- if (g0s0->blocks != NULL) {
- freeChain(g0s0->blocks);
- g0s0->blocks = NULL;
+ if (g0->steps[0].blocks != NULL) {
+ freeChain(g0->steps[0].blocks);
+ g0->steps[0].blocks = NULL;
}
- g0s0->n_blocks = 0;
- g0s0->n_words = 0;
+ g0->steps[0].n_blocks = 0;
+ g0->steps[0].n_words = 0;
}
- alloc_blocks = 0;
alloc_blocks_lim = RtsFlags.GcFlags.minAllocAreaSize;
// Start a new pinned_object_block
- pinned_object_block = NULL;
+ for (n = 0; n < n_capabilities; n++) {
+ capabilities[n].pinned_object_block = NULL;
+ }
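
/* Illustrative sketch: with pinned_object_block now per-Capability, pinned
 * allocation can bump-allocate from the Capability's own block without
 * taking a global lock. allocatePinned_sketch is a hypothetical
 * simplification of the real allocator: it omits retiring the old block
 * onto the large object list, which the real code must do.
 */
static StgPtr
allocatePinned_sketch (Capability *cap, nat n)
{
    StgPtr p;
    bdescr *bd;

    bd = cap->pinned_object_block;

    // start a new pinned block if there is none, or no room is left
    if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) {
        bd = allocBlock();
        bd->free = bd->start;
        cap->pinned_object_block = bd;
    }

    p = bd->free;
    bd->free += n;
    return p;
}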
// Free the mark stack.
- if (mark_stack_bdescr != NULL) {
- freeGroup(mark_stack_bdescr);
+ if (mark_stack_top_bd != NULL) {
+ debugTrace(DEBUG_gc, "mark stack: %d blocks",
+ countBlocks(mark_stack_top_bd));
+ freeChain(mark_stack_top_bd);
}
// Free any bitmaps.
// Update the stable pointer hash table.
updateStablePtrTable(major_gc);
- // check sanity after GC
- IF_DEBUG(sanity, checkSanity());
+ // check sanity after GC
+ IF_DEBUG(sanity, checkSanity(rtsTrue));
// extra GC trace info
IF_DEBUG(gc, statDescribeGens());
#ifdef DEBUG
// check for memory leaks if DEBUG is on
- memInventory(traceClass(DEBUG_gc));
+ memInventory(DEBUG_gc);
#endif
#ifdef RTS_GTK_FRONTPANEL
void
freeGcThreads (void)
{
+ nat s;
if (gc_threads != NULL) {
#if defined(THREADED_RTS)
nat i;
- for (i = 0; i < RtsFlags.ParFlags.nNodes; i++) {
+ for (i = 0; i < n_capabilities; i++) {
+ for (s = 0; s < total_steps; s++)
+ {
+ freeWSDeque(gc_threads[i]->steps[s].todo_q);
+ }
stgFree (gc_threads[i]);
}
stgFree (gc_threads);
#else
+ for (s = 0; s < total_steps; s++)
+ {
+ freeWSDeque(gc_threads[0]->steps[s].todo_q);
+ }
stgFree (gc_threads);
#endif
gc_threads = NULL;
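
/* For symmetry with the freeWSDeque() calls above, a sketch of where the
 * todo_q deques come from: each per-step workspace is given a
 * work-stealing deque when its gc_thread is initialised. The function
 * name and the initial size are illustrative, not the actual values.
 */
static void
init_gc_thread_queues_sketch (gc_thread *t)
{
    nat s;
    for (s = 0; s < total_steps; s++) {
        t->steps[s].todo_q = newWSDeque(128);
    }
}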
write_barrier();
// scavenge objects in compacted generation
- if (mark_stack_overflowed || oldgen_scan_bd != NULL ||
- (mark_stack_bdescr != NULL && !mark_stack_empty())) {
+ if (mark_stack_bd != NULL && !mark_stack_empty()) {
return rtsTrue;
}
{
nat r;
- debugTrace(DEBUG_gc, "GC thread %d working", gct->thread_index);
loop:
+ traceEvent(&capabilities[gct->thread_index], EVENT_GC_WORK);
+
#if defined(THREADED_RTS)
if (n_gc_threads > 1) {
scavenge_loop();
// scavenge_loop() only exits when there's no work to do
r = dec_running();
- debugTrace(DEBUG_gc, "GC thread %d idle (%d still running)",
- gct->thread_index, r);
+ traceEvent(&capabilities[gct->thread_index], EVENT_GC_IDLE);
+
+ debugTrace(DEBUG_gc, "%d GC threads still running", r);
while (gc_running_threads != 0) {
// usleep(1);
// scavenge_loop() to perform any pending work.
}
- // All threads are now stopped
- debugTrace(DEBUG_gc, "GC thread %d finished.", gct->thread_index);
+ traceEvent(&capabilities[gct->thread_index], EVENT_GC_DONE);
}
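
/* Sketch of the termination protocol used above: gc_running_threads counts
 * the threads currently scavenging, maintained with the RTS atomic ops
 * (assuming atomic_dec from SMP.h). A thread with no local work decrements
 * the count and spins; any thread still running may push new work, so the
 * spinner re-checks any_work() and re-enters scavenge_loop() after
 * incrementing the count again. The collection is complete when the count
 * reaches zero.
 */
static StgWord
dec_running_sketch (void)
{
    ASSERT(gc_running_threads != 0);
    return atomic_dec(&gc_running_threads);
}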
#if defined(THREADED_RTS)
void
gcWorkerThread (Capability *cap)
{
- cap->in_gc = rtsTrue;
+ gc_thread *saved_gct;
+
+ // necessary if we stole a callee-saves register for gct:
+ saved_gct = gct;
gct = gc_threads[cap->no];
gct->id = osThreadId();
debugTrace(DEBUG_gc, "GC thread %d waiting to continue...",
           gct->thread_index);
ACQUIRE_SPIN_LOCK(&gct->mut_spin);
debugTrace(DEBUG_gc, "GC thread %d on my way...", gct->thread_index);
+
+ SET_GCT(saved_gct);
}
#endif
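
/* Roughly what the save/restore above is for (a sketch; the real
 * GCThread.h has several configurations): gct may be an ordinary
 * thread-local, or it may be pinned to a global callee-saves register, in
 * which case the worker must restore the caller's value on exit. GCT_REG
 * is a hypothetical configuration macro and "r13" an example register.
 */
#if defined(GCT_REG)
register gc_thread *gct __asm__("r13");   // gcc global register variable
#else
extern __thread gc_thread *gct;           // one gct per OS thread
#endif

#define SET_GCT(to) (gct = (to))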
prodCapability(&capabilities[i], cap->running_task);
}
}
- for (j=0; j < 10000000; j++) {
+ for (j=0; j < 10; j++) {
retry = rtsFalse;
for (i=0; i < n_threads; i++) {
if (i == me) continue;
}
}
if (!retry) break;
+ yieldThread();
}
}
}
}
}
+ if (g == 0) {
+ for (i = 0; i < n_capabilities; i++) {
+ stp = &nurseries[i];
+ stp->old_threads = stp->threads;
+ stp->threads = END_TSO_QUEUE;
+ }
+ }
+
for (s = 0; s < generations[g].n_steps; s++) {
+ // generation 0, step 0 doesn't need to-space, unless -G1
+ if (g == 0 && s == 0 && RtsFlags.GcFlags.generations > 1) {
+ continue;
+ }
+
stp = &generations[g].steps[s];
ASSERT(stp->gen_no == g);
stp->old_threads = stp->threads;
stp->threads = END_TSO_QUEUE;
- // generation 0, step 0 doesn't need to-space
- if (g == 0 && s == 0 && RtsFlags.GcFlags.generations > 1) {
- continue;
- }
-
// deprecate the existing blocks
stp->old_blocks = stp->blocks;
stp->n_old_blocks = stp->n_blocks;
size = stg_max(live * RtsFlags.GcFlags.oldGenFactor,
RtsFlags.GcFlags.minOldGenSize);
+ if (RtsFlags.GcFlags.heapSizeSuggestionAuto) {
+ RtsFlags.GcFlags.heapSizeSuggestion = size;
+ }
+
// minimum size for generation zero
min_alloc = stg_max((RtsFlags.GcFlags.pcFreeHeap * max) / 200,
RtsFlags.GcFlags.minAllocAreaSize);
static void
resize_nursery (void)
{
+ lnat min_nursery = RtsFlags.GcFlags.minAllocAreaSize * n_capabilities;
+
if (RtsFlags.GcFlags.generations == 1)
{ // Two-space collector:
nat blocks;
* performance we get from 3L bytes, reducing to the same
* performance at 2L bytes.
*/
- blocks = g0s0->n_blocks;
+ blocks = generations[0].steps[0].n_blocks;
if ( RtsFlags.GcFlags.maxHeapSize != 0 &&
blocks * RtsFlags.GcFlags.oldGenFactor * 2 >
else
{
blocks *= RtsFlags.GcFlags.oldGenFactor;
- if (blocks < RtsFlags.GcFlags.minAllocAreaSize)
+ if (blocks < min_nursery)
{
- blocks = RtsFlags.GcFlags.minAllocAreaSize;
+ blocks = min_nursery;
}
}
resizeNurseries(blocks);
(((long)RtsFlags.GcFlags.heapSizeSuggestion - (long)needed) * 100) /
(100 + (long)g0s0_pcnt_kept);
- if (blocks < (long)RtsFlags.GcFlags.minAllocAreaSize) {
- blocks = RtsFlags.GcFlags.minAllocAreaSize;
+ if (blocks < (long)min_nursery) {
+ blocks = min_nursery;
}
resizeNurseries((nat)blocks);
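
/* Worked example of the sizing formula above: with a heap size suggestion
 * of 1000 blocks, needed = 200 blocks of live data, and g0s0_pcnt_kept =
 * 25 (i.e. 25% of nursery allocation survives a GC), we get
 *
 *   blocks = (1000 - 200) * 100 / (100 + 25) = 640
 *
 * and indeed 640 (nursery) + 200 (live) + 640 * 25/100 (survivors of the
 * next collection) = 1000, so the suggestion covers the nursery, the live
 * data, and the data the next collection will retain.
 */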