Fix a race in the deadlock-detection code

[ghc-hetmet.git] / rts / Schedule.c
diff --git a/rts/Schedule.c b/rts/Schedule.c

index 499cf77..7dd0634 100644 (file)
--- a/rts/Schedule.c
+++ b/rts/Schedule.c
@@ -340,7 +340,14 @@ schedule (Capability *initialCapability, Task *task)
  #endif
         /* scheduleDoGC() deletes all the threads */
         cap = scheduleDoGC(cap,task,rtsFalse);
-       break;
+
+        // After scheduleDoGC(), we must be shutting down.  Either some
+        // other Capability did the final GC, or we did it above;
+        // either way we can fall through to the SCHED_SHUTTING_DOWN
+        // case now.
+        ASSERT(sched_state == SCHED_SHUTTING_DOWN);
+        // fall through
+
      case SCHED_SHUTTING_DOWN:
         debugTrace(DEBUG_sched, "SCHED_SHUTTING_DOWN");
         // If we are a worker, just exit.  If we're a bound thread
@@ -453,6 +460,15 @@ schedule (Capability *initialCapability, Task *task)
      }
  #endif
  
+    // If we're shutting down, and this thread has not yet been
+    // killed, kill it now.  This sometimes happens when a finalizer
+    // thread is created by the final GC, or a thread previously
+    // in a foreign call returns.
+    if (sched_state >= SCHED_INTERRUPTING &&
+        !(t->what_next == ThreadComplete || t->what_next == ThreadKilled)) {
+        deleteThread(cap,t);
+    }
+
      /* context switches are initiated by the timer signal, unless
       * the user specified "context switch as often as possible", with
       * +RTS -C0
@@ -985,12 +1001,11 @@ scheduleDetectDeadlock (Capability *cap, Task *task)
         // they are unreachable and will therefore be sent an
         // exception.  Any threads thus released will be immediately
         // runnable.
-       cap = scheduleDoGC (cap, task, rtsTrue/*force  major GC*/);
+       cap = scheduleDoGC (cap, task, rtsTrue/*force major GC*/);
+        // When force_major == rtsTrue, scheduleDoGC sets
+        // recent_activity to ACTIVITY_DONE_GC and turns off the timer
+        // signal.
  
-       recent_activity = ACTIVITY_DONE_GC;
-        // disable timer signals (see #1623)
-        stopTimer();
-       
         if ( !emptyRunQueue(cap) ) return;
  
  #if defined(RTS_USER_SIGNALS) && !defined(THREADED_RTS)
@@ -1148,7 +1163,7 @@ schedulePostRunThread (Capability *cap, StgTSO *t)
              // ATOMICALLY_FRAME, aborting the (nested)
              // transaction, and saving the stack of any
              // partially-evaluated thunks on the heap.
-            throwToSingleThreaded_(cap, t, NULL, rtsTrue, NULL);
+            throwToSingleThreaded_(cap, t, NULL, rtsTrue);
              
              ASSERT(get_itbl((StgClosure *)t->sp)->type == ATOMICALLY_FRAME);
          }
@@ -1467,6 +1482,13 @@ scheduleDoGC (Capability *cap, Task *task USED_IF_THREADS, rtsBool force_major)
      nat i;
  #endif
  
+    if (sched_state == SCHED_SHUTTING_DOWN) {
+        // The final GC has already been done, and the system is
+        // shutting down.  We'll probably deadlock if we try to GC
+        // now.
+        return cap;
+    }
+
  #ifdef THREADED_RTS
      // In order to GC, there must be no threads running Haskell code.
      // Therefore, the GC thread needs to hold *all* the capabilities,
@@ -1554,6 +1576,16 @@ scheduleDoGC (Capability *cap, Task *task USED_IF_THREADS, rtsBool force_major)
      balanceSparkPoolsCaps(n_capabilities, capabilities);
  #endif
  
+    if (force_major)
+    {
+        // We've just done a major GC and we don't need the timer
+        // signal turned on any more (#1623).
+        // NB. do this *before* releasing the Capabilities, to avoid
+        // deadlocks!
+        recent_activity = ACTIVITY_DONE_GC;
+        stopTimer();
+    }
+
  #if defined(THREADED_RTS)
      // release our stash of capabilities.
      for (i = 0; i < n_capabilities; i++) {
@@ -1993,9 +2025,22 @@ workerStart(Task *task)
      // schedule() runs without a lock.
      cap = schedule(cap,task);
  
-    // On exit from schedule(), we have a Capability.
-    releaseCapability(cap);
+    // On exit from schedule(), we have a Capability, but possibly not
+    // the same one we started with.
+
+    // During shutdown, the requirement is that after all the
+    // Capabilities are shut down, all workers that are shutting down
+    // have finished workerTaskStop().  This is why we hold on to
+    // cap->lock until we've finished workerTaskStop() below.
+    //
+    // There may be workers still involved in foreign calls; those
+    // will just block in waitForReturnCapability() because the
+    // Capability has been shut down.
+    //
+    ACQUIRE_LOCK(&cap->lock);
+    releaseCapability_(cap,rtsFalse);
      workerTaskStop(task);
+    RELEASE_LOCK(&cap->lock);
  }
  #endif
  
@@ -2098,7 +2143,6 @@ exitScheduler(
             shutdownCapability(&capabilities[i], task, wait_foreign);
         }
         boundTaskExiting(task);
-       stopTaskManager();
      }
  #endif
  }
@@ -2106,11 +2150,23 @@ exitScheduler(
  void
  freeScheduler( void )
  {
-    freeCapabilities();
-    freeTaskManager();
-    if (n_capabilities != 1) {
-        stgFree(capabilities);
+    nat still_running;
+
+    ACQUIRE_LOCK(&sched_mutex);
+    still_running = freeTaskManager();
+    // We can only free the Capabilities if there are no Tasks still
+    // running.  We might have a Task about to return from a foreign
+    // call into waitForReturnCapability(), for example (actually,
+    // this should be the *only* thing that a still-running Task can
+    // do at this point, and it will block waiting for the
+    // Capability).
+    if (still_running == 0) {
+        freeCapabilities();
+        if (n_capabilities != 1) {
+            stgFree(capabilities);
+        }
      }
+    RELEASE_LOCK(&sched_mutex);
  #if defined(THREADED_RTS)
      closeMutex(&sched_mutex);
  #endif