Fix a race in the deadlock-detection code
[ghc-hetmet.git] / rts / Schedule.c
index 552c2c9..7dd0634 100644 (file)
@@ -1001,12 +1001,11 @@ scheduleDetectDeadlock (Capability *cap, Task *task)
        // they are unreachable and will therefore be sent an
        // exception.  Any threads thus released will be immediately
        // runnable.
-       cap = scheduleDoGC (cap, task, rtsTrue/*force  major GC*/);
+       cap = scheduleDoGC (cap, task, rtsTrue/*force major GC*/);
+        // when force_major == rtsTrue. scheduleDoGC sets
+        // recent_activity to ACTIVITY_DONE_GC and turns off the timer
+        // signal.
 
-       recent_activity = ACTIVITY_DONE_GC;
-        // disable timer signals (see #1623)
-        stopTimer();
-       
        if ( !emptyRunQueue(cap) ) return;
 
 #if defined(RTS_USER_SIGNALS) && !defined(THREADED_RTS)
@@ -1577,6 +1576,16 @@ scheduleDoGC (Capability *cap, Task *task USED_IF_THREADS, rtsBool force_major)
     balanceSparkPoolsCaps(n_capabilities, capabilities);
 #endif
 
+    if (force_major)
+    {
+        // We've just done a major GC and we don't need the timer
+        // signal turned on any more (#1623).
+        // NB. do this *before* releasing the Capabilities, to avoid
+        // deadlocks!
+        recent_activity = ACTIVITY_DONE_GC;
+        stopTimer();
+    }
+
 #if defined(THREADED_RTS)
     // release our stash of capabilities.
     for (i = 0; i < n_capabilities; i++) {
@@ -2016,9 +2025,22 @@ workerStart(Task *task)
     // schedule() runs without a lock.
     cap = schedule(cap,task);
 
-    // On exit from schedule(), we have a Capability.
-    releaseCapability(cap);
+    // On exit from schedule(), we have a Capability, but possibly not
+    // the same one we started with.
+
+    // During shutdown, the requirement is that after all the
+    // Capabilities are shut down, all workers that are shutting down
+    // have finished workerTaskStop().  This is why we hold on to
+    // cap->lock until we've finished workerTaskStop() below.
+    //
+    // There may be workers still involved in foreign calls; those
+    // will just block in waitForReturnCapability() because the
+    // Capability has been shut down.
+    //
+    ACQUIRE_LOCK(&cap->lock);
+    releaseCapability_(cap,rtsFalse);
     workerTaskStop(task);
+    RELEASE_LOCK(&cap->lock);
 }
 #endif
 
@@ -2121,7 +2143,6 @@ exitScheduler(
            shutdownCapability(&capabilities[i], task, wait_foreign);
        }
        boundTaskExiting(task);
-       stopTaskManager();
     }
 #endif
 }
@@ -2129,11 +2150,23 @@ exitScheduler(
 void
 freeScheduler( void )
 {
-    freeCapabilities();
-    freeTaskManager();
-    if (n_capabilities != 1) {
-        stgFree(capabilities);
+    nat still_running;
+
+    ACQUIRE_LOCK(&sched_mutex);
+    still_running = freeTaskManager();
+    // We can only free the Capabilities if there are no Tasks still
+    // running.  We might have a Task about to return from a foreign
+    // call into waitForReturnCapability(), for example (actually,
+    // this should be the *only* thing that a still-running Task can
+    // do at this point, and it will block waiting for the
+    // Capability).
+    if (still_running == 0) {
+        freeCapabilities();
+        if (n_capabilities != 1) {
+            stgFree(capabilities);
+        }
     }
+    RELEASE_LOCK(&sched_mutex);
 #if defined(THREADED_RTS)
     closeMutex(&sched_mutex);
 #endif