From: Simon Marlow Date: Wed, 19 Nov 2008 12:48:48 +0000 (+0000) Subject: Fix some more shutdown races X-Git-Url: http://git.megacz.com/?p=ghc-hetmet.git;a=commitdiff_plain;h=5cbe7adb6051a9d1738dfb5735c8c923b74c5945 Fix some more shutdown races There were races between workerTaskStop() and freeTaskManager(): we need to be sure that all Tasks have exited properly before we start tearing things down. This isn't completely straighforward, see comments for details. --- diff --git a/rts/RtsAPI.c b/rts/RtsAPI.c index 525ead2..d0bdead 100644 --- a/rts/RtsAPI.c +++ b/rts/RtsAPI.c @@ -587,18 +587,20 @@ rts_unlock (Capability *cap) task = cap->running_task; ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task); - // slightly delicate ordering of operations below, pay attention! - - // We are no longer a bound task/thread. This is important, - // because the GC can run when we release the Capability below, - // and we don't want it to treat this as a live TSO pointer. - task->tso = NULL; - // Now release the Capability. With the capability released, GC // may happen. NB. does not try to put the current Task on the // worker queue. - releaseCapability(cap); + // NB. keep cap->lock held while we call boundTaskExiting(). This + // is necessary during shutdown, where we want the invariant that + // after shutdownCapability(), all the Tasks associated with the + // Capability have completed their shutdown too. Otherwise we + // could have boundTaskExiting()/workerTaskStop() running at some + // random point in the future, which causes problems for + // freeTaskManager(). + ACQUIRE_LOCK(&cap->lock); + releaseCapability_(cap,rtsFalse); // Finally, we can release the Task to the free list. boundTaskExiting(task); + RELEASE_LOCK(&cap->lock); } diff --git a/rts/Schedule.c b/rts/Schedule.c index 552c2c9..ed4d4b0 100644 --- a/rts/Schedule.c +++ b/rts/Schedule.c @@ -2016,9 +2016,22 @@ workerStart(Task *task) // schedule() runs without a lock. cap = schedule(cap,task); - // On exit from schedule(), we have a Capability. - releaseCapability(cap); + // On exit from schedule(), we have a Capability, but possibly not + // the same one we started with. + + // During shutdown, the requirement is that after all the + // Capabilities are shut down, all workers that are shutting down + // have finished workerTaskStop(). This is why we hold on to + // cap->lock until we've finished workerTaskStop() below. + // + // There may be workers still involved in foreign calls; those + // will just block in waitForReturnCapability() because the + // Capability has been shut down. + // + ACQUIRE_LOCK(&cap->lock); + releaseCapability_(cap,rtsFalse); workerTaskStop(task); + RELEASE_LOCK(&cap->lock); } #endif @@ -2121,7 +2134,6 @@ exitScheduler( shutdownCapability(&capabilities[i], task, wait_foreign); } boundTaskExiting(task); - stopTaskManager(); } #endif } @@ -2129,11 +2141,23 @@ exitScheduler( void freeScheduler( void ) { - freeCapabilities(); - freeTaskManager(); - if (n_capabilities != 1) { - stgFree(capabilities); + nat still_running; + + ACQUIRE_LOCK(&sched_mutex); + still_running = freeTaskManager(); + // We can only free the Capabilities if there are no Tasks still + // running. We might have a Task about to return from a foreign + // call into waitForReturnCapability(), for example (actually, + // this should be the *only* thing that a still-running Task can + // do at this point, and it will block waiting for the + // Capability). + if (still_running == 0) { + freeCapabilities(); + if (n_capabilities != 1) { + stgFree(capabilities); + } } + RELEASE_LOCK(&sched_mutex); #if defined(THREADED_RTS) closeMutex(&sched_mutex); #endif diff --git a/rts/Task.c b/rts/Task.c index 7120436..9397789 100644 --- a/rts/Task.c +++ b/rts/Task.c @@ -64,25 +64,16 @@ initTaskManager (void) } } - -void -stopTaskManager (void) -{ - debugTrace(DEBUG_sched, - "stopping task manager, %d tasks still running", - tasksRunning); - /* nothing to do */ -} - - -void +nat freeTaskManager (void) { Task *task, *next; - debugTrace(DEBUG_sched, "freeing task manager"); + ASSERT_LOCK_HELD(&sched_mutex); + + debugTrace(DEBUG_sched, "freeing task manager, %d tasks still running", + tasksRunning); - ACQUIRE_LOCK(&sched_mutex); for (task = all_tasks; task != NULL; task = next) { next = task->all_link; if (task->stopped) { @@ -102,7 +93,8 @@ freeTaskManager (void) #if defined(THREADED_RTS) freeThreadLocalKey(¤tTaskKey); #endif - RELEASE_LOCK(&sched_mutex); + + return tasksRunning; } @@ -149,7 +141,6 @@ newTask (void) all_tasks = task; taskCount++; - workerCount++; return task; } @@ -185,6 +176,7 @@ newBoundTask (void) void boundTaskExiting (Task *task) { + task->tso = NULL; task->stopped = rtsTrue; task->cap = NULL; @@ -218,7 +210,11 @@ discardTask (Task *task) if (!task->stopped) { debugTrace(DEBUG_sched, "discarding task %ld", (long)TASK_ID(task)); task->cap = NULL; - task->tso = NULL; + if (task->tso == NULL) { + workerCount--; + } else { + task->tso = NULL; + } task->stopped = rtsTrue; tasksRunning--; task->next = task_free_list; @@ -263,6 +259,7 @@ workerTaskStop (Task *task) taskTimeStamp(task); task->stopped = rtsTrue; tasksRunning--; + workerCount--; ACQUIRE_LOCK(&sched_mutex); task->next = task_free_list; diff --git a/rts/Task.h b/rts/Task.h index 3b7a08e..590dd67 100644 --- a/rts/Task.h +++ b/rts/Task.h @@ -169,8 +169,7 @@ extern Task *all_tasks; // Requires: sched_mutex. // void initTaskManager (void); -void stopTaskManager (void); -void freeTaskManager (void); +nat freeTaskManager (void); // Create a new Task for a bound thread // Requires: sched_mutex.