Fix some more shutdown races
authorSimon Marlow <marlowsd@gmail.com>
Wed, 19 Nov 2008 12:48:48 +0000 (12:48 +0000)
committerSimon Marlow <marlowsd@gmail.com>
Wed, 19 Nov 2008 12:48:48 +0000 (12:48 +0000)
There were races between workerTaskStop() and freeTaskManager(): we
need to be sure that all Tasks have exited properly before we start
tearing things down.  This isn't completely straighforward, see
comments for details.

rts/RtsAPI.c
rts/Schedule.c
rts/Task.c
rts/Task.h

index 525ead2..d0bdead 100644 (file)
@@ -587,18 +587,20 @@ rts_unlock (Capability *cap)
     task = cap->running_task;
     ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
 
-    // slightly delicate ordering of operations below, pay attention!
-
-    // We are no longer a bound task/thread.  This is important,
-    // because the GC can run when we release the Capability below,
-    // and we don't want it to treat this as a live TSO pointer.
-    task->tso = NULL;
-
     // Now release the Capability.  With the capability released, GC
     // may happen.  NB. does not try to put the current Task on the
     // worker queue.
-    releaseCapability(cap);
+    // NB. keep cap->lock held while we call boundTaskExiting().  This
+    // is necessary during shutdown, where we want the invariant that
+    // after shutdownCapability(), all the Tasks associated with the
+    // Capability have completed their shutdown too.  Otherwise we
+    // could have boundTaskExiting()/workerTaskStop() running at some
+    // random point in the future, which causes problems for
+    // freeTaskManager().
+    ACQUIRE_LOCK(&cap->lock);
+    releaseCapability_(cap,rtsFalse);
 
     // Finally, we can release the Task to the free list.
     boundTaskExiting(task);
+    RELEASE_LOCK(&cap->lock);
 }
index 552c2c9..ed4d4b0 100644 (file)
@@ -2016,9 +2016,22 @@ workerStart(Task *task)
     // schedule() runs without a lock.
     cap = schedule(cap,task);
 
-    // On exit from schedule(), we have a Capability.
-    releaseCapability(cap);
+    // On exit from schedule(), we have a Capability, but possibly not
+    // the same one we started with.
+
+    // During shutdown, the requirement is that after all the
+    // Capabilities are shut down, all workers that are shutting down
+    // have finished workerTaskStop().  This is why we hold on to
+    // cap->lock until we've finished workerTaskStop() below.
+    //
+    // There may be workers still involved in foreign calls; those
+    // will just block in waitForReturnCapability() because the
+    // Capability has been shut down.
+    //
+    ACQUIRE_LOCK(&cap->lock);
+    releaseCapability_(cap,rtsFalse);
     workerTaskStop(task);
+    RELEASE_LOCK(&cap->lock);
 }
 #endif
 
@@ -2121,7 +2134,6 @@ exitScheduler(
            shutdownCapability(&capabilities[i], task, wait_foreign);
        }
        boundTaskExiting(task);
-       stopTaskManager();
     }
 #endif
 }
@@ -2129,11 +2141,23 @@ exitScheduler(
 void
 freeScheduler( void )
 {
-    freeCapabilities();
-    freeTaskManager();
-    if (n_capabilities != 1) {
-        stgFree(capabilities);
+    nat still_running;
+
+    ACQUIRE_LOCK(&sched_mutex);
+    still_running = freeTaskManager();
+    // We can only free the Capabilities if there are no Tasks still
+    // running.  We might have a Task about to return from a foreign
+    // call into waitForReturnCapability(), for example (actually,
+    // this should be the *only* thing that a still-running Task can
+    // do at this point, and it will block waiting for the
+    // Capability).
+    if (still_running == 0) {
+        freeCapabilities();
+        if (n_capabilities != 1) {
+            stgFree(capabilities);
+        }
     }
+    RELEASE_LOCK(&sched_mutex);
 #if defined(THREADED_RTS)
     closeMutex(&sched_mutex);
 #endif
index 7120436..9397789 100644 (file)
@@ -64,25 +64,16 @@ initTaskManager (void)
     }
 }
 
-
-void
-stopTaskManager (void)
-{
-    debugTrace(DEBUG_sched, 
-              "stopping task manager, %d tasks still running",
-              tasksRunning);
-    /* nothing to do */
-}
-
-
-void
+nat
 freeTaskManager (void)
 {
     Task *task, *next;
 
-    debugTrace(DEBUG_sched, "freeing task manager");
+    ASSERT_LOCK_HELD(&sched_mutex);
+
+    debugTrace(DEBUG_sched, "freeing task manager, %d tasks still running",
+               tasksRunning);
 
-    ACQUIRE_LOCK(&sched_mutex);
     for (task = all_tasks; task != NULL; task = next) {
         next = task->all_link;
         if (task->stopped) {
@@ -102,7 +93,8 @@ freeTaskManager (void)
 #if defined(THREADED_RTS)
     freeThreadLocalKey(&currentTaskKey);
 #endif
-    RELEASE_LOCK(&sched_mutex);
+
+    return tasksRunning;
 }
 
 
@@ -149,7 +141,6 @@ newTask (void)
     all_tasks = task;
 
     taskCount++;
-    workerCount++;
 
     return task;
 }
@@ -185,6 +176,7 @@ newBoundTask (void)
 void
 boundTaskExiting (Task *task)
 {
+    task->tso = NULL;
     task->stopped = rtsTrue;
     task->cap = NULL;
 
@@ -218,7 +210,11 @@ discardTask (Task *task)
     if (!task->stopped) {
        debugTrace(DEBUG_sched, "discarding task %ld", (long)TASK_ID(task));
        task->cap = NULL;
-       task->tso = NULL;
+        if (task->tso == NULL) {
+            workerCount--;
+        } else {
+            task->tso = NULL;
+        }
        task->stopped = rtsTrue;
        tasksRunning--;
        task->next = task_free_list;
@@ -263,6 +259,7 @@ workerTaskStop (Task *task)
     taskTimeStamp(task);
     task->stopped = rtsTrue;
     tasksRunning--;
+    workerCount--;
 
     ACQUIRE_LOCK(&sched_mutex);
     task->next = task_free_list;
index 3b7a08e..590dd67 100644 (file)
@@ -169,8 +169,7 @@ extern Task *all_tasks;
 // Requires: sched_mutex.
 //
 void initTaskManager (void);
-void stopTaskManager (void);
-void freeTaskManager (void);
+nat  freeTaskManager (void);
 
 // Create a new Task for a bound thread
 // Requires: sched_mutex.