rts/Schedule.c

   1 /* ---------------------------------------------------------------------------
   2  *
   3  * (c) The GHC Team, 1998-2006
   4  *
   5  * The scheduler and thread-related functionality
   6  *
   7  * --------------------------------------------------------------------------*/
   8
   9 #include "PosixSource.h"
  10 #define KEEP_LOCKCLOSURE
  11 #include "Rts.h"
  12
  13 #include "sm/Storage.h"
  14 #include "RtsUtils.h"
  15 #include "StgRun.h"
  16 #include "Schedule.h"
  17 #include "Interpreter.h"
  18 #include "Printer.h"
  19 #include "RtsSignals.h"
  20 #include "sm/Sanity.h"
  21 #include "Stats.h"
  22 #include "STM.h"
  23 #include "Prelude.h"
  24 #include "ThreadLabels.h"
  25 #include "Updates.h"
  26 #include "Proftimer.h"
  27 #include "ProfHeap.h"
  28 #include "Weak.h"
  29 #include "sm/GC.h" // waitForGcThreads, releaseGCThreads, N
  30 #include "Sparks.h"
  31 #include "Capability.h"
  32 #include "Task.h"
  33 #include "AwaitEvent.h"
  34 #if defined(mingw32_HOST_OS)
  35 #include "win32/IOManager.h"
  36 #endif
  37 #include "Trace.h"
  38 #include "RaiseAsync.h"
  39 #include "Threads.h"
  40 #include "Timer.h"
  41 #include "ThreadPaused.h"
  42 #include "Messages.h"
  43
  44 #ifdef HAVE_SYS_TYPES_H
  45 #include <sys/types.h>
  46 #endif
  47 #ifdef HAVE_UNISTD_H
  48 #include <unistd.h>
  49 #endif
  50
  51 #include <string.h>
  52 #include <stdlib.h>
  53 #include <stdarg.h>
  54
  55 #ifdef HAVE_ERRNO_H
  56 #include <errno.h>
  57 #endif
  58
  59 #ifdef TRACING
  60 #include "eventlog/EventLog.h"
  61 #endif
  62 /* -----------------------------------------------------------------------------
  63  * Global variables
  64  * -------------------------------------------------------------------------- */
  65
  66 #if !defined(THREADED_RTS)
  67 // Blocked/sleeping thrads
  68 StgTSO *blocked_queue_hd = NULL;
  69 StgTSO *blocked_queue_tl = NULL;
  70 StgTSO *sleeping_queue = NULL;    // perhaps replace with a hash table?
  71 #endif
  72
  73 /* Set to true when the latest garbage collection failed to reclaim
  74  * enough space, and the runtime should proceed to shut itself down in
  75  * an orderly fashion (emitting profiling info etc.)
  76  */
  77 rtsBool heap_overflow = rtsFalse;
  78
  79 /* flag that tracks whether we have done any execution in this time slice.
  80  * LOCK: currently none, perhaps we should lock (but needs to be
  81  * updated in the fast path of the scheduler).
  82  *
  83  * NB. must be StgWord, we do xchg() on it.
  84  */
  85 volatile StgWord recent_activity = ACTIVITY_YES;
  86
  87 /* if this flag is set as well, give up execution
  88  * LOCK: none (changes monotonically)
  89  */
  90 volatile StgWord sched_state = SCHED_RUNNING;
  91
  92 /*  This is used in `TSO.h' and gcc 2.96 insists that this variable actually
  93  *  exists - earlier gccs apparently didn't.
  94  *  -= chak
  95  */
  96 StgTSO dummy_tso;
  97
  98 /*
  99  * Set to TRUE when entering a shutdown state (via shutdownHaskellAndExit()) --
 100  * in an MT setting, needed to signal that a worker thread shouldn't hang around
 101  * in the scheduler when it is out of work.
 102  */
 103 rtsBool shutting_down_scheduler = rtsFalse;
 104
 105 /*
 106  * This mutex protects most of the global scheduler data in
 107  * the THREADED_RTS runtime.
 108  */
 109 #if defined(THREADED_RTS)
 110 Mutex sched_mutex;
 111 #endif
 112
 113 #if !defined(mingw32_HOST_OS)
 114 #define FORKPROCESS_PRIMOP_SUPPORTED
 115 #endif
 116
 117 /* -----------------------------------------------------------------------------
 118  * static function prototypes
 119  * -------------------------------------------------------------------------- */
 120
 121 static Capability *schedule (Capability *initialCapability, Task *task);
 122
 123 //
 124 // These function all encapsulate parts of the scheduler loop, and are
 125 // abstracted only to make the structure and control flow of the
 126 // scheduler clearer.
 127 //
 128 static void schedulePreLoop (void);
 129 static void scheduleFindWork (Capability *cap);
 130 #if defined(THREADED_RTS)
 131 static void scheduleYield (Capability **pcap, Task *task);
 132 #endif
 133 static void scheduleStartSignalHandlers (Capability *cap);
 134 static void scheduleCheckBlockedThreads (Capability *cap);
 135 static void scheduleProcessInbox(Capability *cap);
 136 static void scheduleDetectDeadlock (Capability *cap, Task *task);
 137 static void schedulePushWork(Capability *cap, Task *task);
 138 #if defined(THREADED_RTS)
 139 static void scheduleActivateSpark(Capability *cap);
 140 #endif
 141 static void schedulePostRunThread(Capability *cap, StgTSO *t);
 142 static rtsBool scheduleHandleHeapOverflow( Capability *cap, StgTSO *t );
 143 static void scheduleHandleStackOverflow( Capability *cap, Task *task,
 144                                          StgTSO *t);
 145 static rtsBool scheduleHandleYield( Capability *cap, StgTSO *t,
 146                                     nat prev_what_next );
 147 static void scheduleHandleThreadBlocked( StgTSO *t );
 148 static rtsBool scheduleHandleThreadFinished( Capability *cap, Task *task,
 149                                              StgTSO *t );
 150 static rtsBool scheduleNeedHeapProfile(rtsBool ready_to_gc);
 151 static Capability *scheduleDoGC(Capability *cap, Task *task,
 152                                 rtsBool force_major);
 153
 154 static StgTSO *threadStackOverflow(Capability *cap, StgTSO *tso);
 155 static StgTSO *threadStackUnderflow(Capability *cap, Task *task, StgTSO *tso);
 156
 157 static void deleteThread (Capability *cap, StgTSO *tso);
 158 static void deleteAllThreads (Capability *cap);
 159
 160 #ifdef FORKPROCESS_PRIMOP_SUPPORTED
 161 static void deleteThread_(Capability *cap, StgTSO *tso);
 162 #endif
 163
 164 /* ---------------------------------------------------------------------------
 165    Main scheduling loop.
 166
 167    We use round-robin scheduling, each thread returning to the
 168    scheduler loop when one of these conditions is detected:
 169
 170       * out of heap space
 171       * timer expires (thread yields)
 172       * thread blocks
 173       * thread ends
 174       * stack overflow
 175
 176    GRAN version:
 177      In a GranSim setup this loop iterates over the global event queue.
 178      This revolves around the global event queue, which determines what
 179      to do next. Therefore, it's more complicated than either the
 180      concurrent or the parallel (GUM) setup.
 181   This version has been entirely removed (JB 2008/08).
 182
 183    GUM version:
 184      GUM iterates over incoming messages.
 185      It starts with nothing to do (thus CurrentTSO == END_TSO_QUEUE),
 186      and sends out a fish whenever it has nothing to do; in-between
 187      doing the actual reductions (shared code below) it processes the
 188      incoming messages and deals with delayed operations
 189      (see PendingFetches).
 190      This is not the ugliest code you could imagine, but it's bloody close.
 191
 192   (JB 2008/08) This version was formerly indicated by a PP-Flag PAR,
 193   now by PP-flag PARALLEL_HASKELL. The Eden RTS (in GHC-6.x) uses it,
 194   as well as future GUM versions. This file has been refurbished to
 195   only contain valid code, which is however incomplete, refers to
 196   invalid includes etc.
 197
 198    ------------------------------------------------------------------------ */
 199
 200 static Capability *
 201 schedule (Capability *initialCapability, Task *task)
 202 {
 203   StgTSO *t;
 204   Capability *cap;
 205   StgThreadReturnCode ret;
 206   nat prev_what_next;
 207   rtsBool ready_to_gc;
 208 #if defined(THREADED_RTS)
 209   rtsBool first = rtsTrue;
 210 #endif
 211
 212   cap = initialCapability;
 213
 214   // Pre-condition: this task owns initialCapability.
 215   // The sched_mutex is *NOT* held
 216   // NB. on return, we still hold a capability.
 217
 218   debugTrace (DEBUG_sched, "cap %d: schedule()", initialCapability->no);
 219
 220   schedulePreLoop();
 221
 222   // -----------------------------------------------------------
 223   // Scheduler loop starts here:
 224
 225   while (1) {
 226
 227     // Check whether we have re-entered the RTS from Haskell without
 228     // going via suspendThread()/resumeThread (i.e. a 'safe' foreign
 229     // call).
 230     if (cap->in_haskell) {
 231           errorBelch("schedule: re-entered unsafely.\n"
 232                      "   Perhaps a 'foreign import unsafe' should be 'safe'?");
 233           stg_exit(EXIT_FAILURE);
 234     }
 235
 236     // The interruption / shutdown sequence.
 237     //
 238     // In order to cleanly shut down the runtime, we want to:
 239     //   * make sure that all main threads return to their callers
 240     //     with the state 'Interrupted'.
 241     //   * clean up all OS threads assocated with the runtime
 242     //   * free all memory etc.
 243     //
 244     // So the sequence for ^C goes like this:
 245     //
 246     //   * ^C handler sets sched_state := SCHED_INTERRUPTING and
 247     //     arranges for some Capability to wake up
 248     //
 249     //   * all threads in the system are halted, and the zombies are
 250     //     placed on the run queue for cleaning up.  We acquire all
 251     //     the capabilities in order to delete the threads, this is
 252     //     done by scheduleDoGC() for convenience (because GC already
 253     //     needs to acquire all the capabilities).  We can't kill
 254     //     threads involved in foreign calls.
 255     //
 256     //   * somebody calls shutdownHaskell(), which calls exitScheduler()
 257     //
 258     //   * sched_state := SCHED_SHUTTING_DOWN
 259     //
 260     //   * all workers exit when the run queue on their capability
 261     //     drains.  All main threads will also exit when their TSO
 262     //     reaches the head of the run queue and they can return.
 263     //
 264     //   * eventually all Capabilities will shut down, and the RTS can
 265     //     exit.
 266     //
 267     //   * We might be left with threads blocked in foreign calls,
 268     //     we should really attempt to kill these somehow (TODO);
 269
 270     switch (sched_state) {
 271     case SCHED_RUNNING:
 272         break;
 273     case SCHED_INTERRUPTING:
 274         debugTrace(DEBUG_sched, "SCHED_INTERRUPTING");
 275 #if defined(THREADED_RTS)
 276         discardSparksCap(cap);
 277 #endif
 278         /* scheduleDoGC() deletes all the threads */
 279         cap = scheduleDoGC(cap,task,rtsFalse);
 280
 281         // after scheduleDoGC(), we must be shutting down.  Either some
 282         // other Capability did the final GC, or we did it above,
 283         // either way we can fall through to the SCHED_SHUTTING_DOWN
 284         // case now.
 285         ASSERT(sched_state == SCHED_SHUTTING_DOWN);
 286         // fall through
 287
 288     case SCHED_SHUTTING_DOWN:
 289         debugTrace(DEBUG_sched, "SCHED_SHUTTING_DOWN");
 290         // If we are a worker, just exit.  If we're a bound thread
 291         // then we will exit below when we've removed our TSO from
 292         // the run queue.
 293         if (!isBoundTask(task) && emptyRunQueue(cap)) {
 294             return cap;
 295         }
 296         break;
 297     default:
 298         barf("sched_state: %d", sched_state);
 299     }
 300
 301     scheduleFindWork(cap);
 302
 303     /* work pushing, currently relevant only for THREADED_RTS:
 304        (pushes threads, wakes up idle capabilities for stealing) */
 305     schedulePushWork(cap,task);
 306
 307     scheduleDetectDeadlock(cap,task);
 308
 309 #if defined(THREADED_RTS)
 310     cap = task->cap;    // reload cap, it might have changed
 311 #endif
 312
 313     // Normally, the only way we can get here with no threads to
 314     // run is if a keyboard interrupt received during
 315     // scheduleCheckBlockedThreads() or scheduleDetectDeadlock().
 316     // Additionally, it is not fatal for the
 317     // threaded RTS to reach here with no threads to run.
 318     //
 319     // win32: might be here due to awaitEvent() being abandoned
 320     // as a result of a console event having been delivered.
 321
 322 #if defined(THREADED_RTS)
 323     if (first)
 324     {
 325     // XXX: ToDo
 326     //     // don't yield the first time, we want a chance to run this
 327     //     // thread for a bit, even if there are others banging at the
 328     //     // door.
 329     //     first = rtsFalse;
 330     //     ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
 331     }
 332
 333     scheduleYield(&cap,task);
 334
 335     if (emptyRunQueue(cap)) continue; // look for work again
 336 #endif
 337
 338 #if !defined(THREADED_RTS) && !defined(mingw32_HOST_OS)
 339     if ( emptyRunQueue(cap) ) {
 340         ASSERT(sched_state >= SCHED_INTERRUPTING);
 341     }
 342 #endif
 343
 344     //
 345     // Get a thread to run
 346     //
 347     t = popRunQueue(cap);
 348
 349     // Sanity check the thread we're about to run.  This can be
 350     // expensive if there is lots of thread switching going on...
 351     IF_DEBUG(sanity,checkTSO(t));
 352
 353 #if defined(THREADED_RTS)
 354     // Check whether we can run this thread in the current task.
 355     // If not, we have to pass our capability to the right task.
 356     {
 357         InCall *bound = t->bound;
 358
 359         if (bound) {
 360             if (bound->task == task) {
 361                 // yes, the Haskell thread is bound to the current native thread
 362             } else {
 363                 debugTrace(DEBUG_sched,
 364                            "thread %lu bound to another OS thread",
 365                            (unsigned long)t->id);
 366                 // no, bound to a different Haskell thread: pass to that thread
 367                 pushOnRunQueue(cap,t);
 368                 continue;
 369             }
 370         } else {
 371             // The thread we want to run is unbound.
 372             if (task->incall->tso) {
 373                 debugTrace(DEBUG_sched,
 374                            "this OS thread cannot run thread %lu",
 375                            (unsigned long)t->id);
 376                 // no, the current native thread is bound to a different
 377                 // Haskell thread, so pass it to any worker thread
 378                 pushOnRunQueue(cap,t);
 379                 continue;
 380             }
 381         }
 382     }
 383 #endif
 384
 385     // If we're shutting down, and this thread has not yet been
 386     // killed, kill it now.  This sometimes happens when a finalizer
 387     // thread is created by the final GC, or a thread previously
 388     // in a foreign call returns.
 389     if (sched_state >= SCHED_INTERRUPTING &&
 390         !(t->what_next == ThreadComplete || t->what_next == ThreadKilled)) {
 391         deleteThread(cap,t);
 392     }
 393
 394     /* context switches are initiated by the timer signal, unless
 395      * the user specified "context switch as often as possible", with
 396      * +RTS -C0
 397      */
 398     if (RtsFlags.ConcFlags.ctxtSwitchTicks == 0
 399         && !emptyThreadQueues(cap)) {
 400         cap->context_switch = 1;
 401     }
 402
 403 run_thread:
 404
 405     // CurrentTSO is the thread to run.  t might be different if we
 406     // loop back to run_thread, so make sure to set CurrentTSO after
 407     // that.
 408     cap->r.rCurrentTSO = t;
 409
 410     startHeapProfTimer();
 411
 412     // ----------------------------------------------------------------------
 413     // Run the current thread
 414
 415     ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
 416     ASSERT(t->cap == cap);
 417     ASSERT(t->bound ? t->bound->task->cap == cap : 1);
 418
 419     prev_what_next = t->what_next;
 420
 421     errno = t->saved_errno;
 422 #if mingw32_HOST_OS
 423     SetLastError(t->saved_winerror);
 424 #endif
 425
 426     cap->in_haskell = rtsTrue;
 427
 428     dirty_TSO(cap,t);
 429
 430 #if defined(THREADED_RTS)
 431     if (recent_activity == ACTIVITY_DONE_GC) {
 432         // ACTIVITY_DONE_GC means we turned off the timer signal to
 433         // conserve power (see #1623).  Re-enable it here.
 434         nat prev;
 435         prev = xchg((P_)&recent_activity, ACTIVITY_YES);
 436         if (prev == ACTIVITY_DONE_GC) {
 437             startTimer();
 438         }
 439     } else if (recent_activity != ACTIVITY_INACTIVE) {
 440         // If we reached ACTIVITY_INACTIVE, then don't reset it until
 441         // we've done the GC.  The thread running here might just be
 442         // the IO manager thread that handle_tick() woke up via
 443         // wakeUpRts().
 444         recent_activity = ACTIVITY_YES;
 445     }
 446 #endif
 447
 448     traceEventRunThread(cap, t);
 449
 450     switch (prev_what_next) {
 451
 452     case ThreadKilled:
 453     case ThreadComplete:
 454         /* Thread already finished, return to scheduler. */
 455         ret = ThreadFinished;
 456         break;
 457
 458     case ThreadRunGHC:
 459     {
 460         StgRegTable *r;
 461         r = StgRun((StgFunPtr) stg_returnToStackTop, &cap->r);
 462         cap = regTableToCapability(r);
 463         ret = r->rRet;
 464         break;
 465     }
 466
 467     case ThreadInterpret:
 468         cap = interpretBCO(cap);
 469         ret = cap->r.rRet;
 470         break;
 471
 472     default:
 473         barf("schedule: invalid what_next field");
 474     }
 475
 476     cap->in_haskell = rtsFalse;
 477
 478     // The TSO might have moved, eg. if it re-entered the RTS and a GC
 479     // happened.  So find the new location:
 480     t = cap->r.rCurrentTSO;
 481
 482     // And save the current errno in this thread.
 483     // XXX: possibly bogus for SMP because this thread might already
 484     // be running again, see code below.
 485     t->saved_errno = errno;
 486 #if mingw32_HOST_OS
 487     // Similarly for Windows error code
 488     t->saved_winerror = GetLastError();
 489 #endif
 490
 491     traceEventStopThread(cap, t, ret);
 492
 493     ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
 494     ASSERT(t->cap == cap);
 495
 496     // ----------------------------------------------------------------------
 497
 498     // Costs for the scheduler are assigned to CCS_SYSTEM
 499     stopHeapProfTimer();
 500 #if defined(PROFILING)
 501     CCCS = CCS_SYSTEM;
 502 #endif
 503
 504     schedulePostRunThread(cap,t);
 505
 506     if (ret != StackOverflow) {
 507         t = threadStackUnderflow(cap,task,t);
 508     }
 509
 510     ready_to_gc = rtsFalse;
 511
 512     switch (ret) {
 513     case HeapOverflow:
 514         ready_to_gc = scheduleHandleHeapOverflow(cap,t);
 515         break;
 516
 517     case StackOverflow:
 518         scheduleHandleStackOverflow(cap,task,t);
 519         break;
 520
 521     case ThreadYielding:
 522         if (scheduleHandleYield(cap, t, prev_what_next)) {
 523             // shortcut for switching between compiler/interpreter:
 524             goto run_thread;
 525         }
 526         break;
 527
 528     case ThreadBlocked:
 529         scheduleHandleThreadBlocked(t);
 530         break;
 531
 532     case ThreadFinished:
 533         if (scheduleHandleThreadFinished(cap, task, t)) return cap;
 534         ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
 535         break;
 536
 537     default:
 538       barf("schedule: invalid thread return code %d", (int)ret);
 539     }
 540
 541     if (ready_to_gc || scheduleNeedHeapProfile(ready_to_gc)) {
 542       cap = scheduleDoGC(cap,task,rtsFalse);
 543     }
 544   } /* end of while() */
 545 }
 546
 547 /* -----------------------------------------------------------------------------
 548  * Run queue operations
 549  * -------------------------------------------------------------------------- */
 550
 551 void
 552 removeFromRunQueue (Capability *cap, StgTSO *tso)
 553 {
 554     if (tso->block_info.prev == END_TSO_QUEUE) {
 555         ASSERT(cap->run_queue_hd == tso);
 556         cap->run_queue_hd = tso->_link;
 557     } else {
 558         setTSOLink(cap, tso->block_info.prev, tso->_link);
 559     }
 560     if (tso->_link == END_TSO_QUEUE) {
 561         ASSERT(cap->run_queue_tl == tso);
 562         cap->run_queue_tl = tso->block_info.prev;
 563     } else {
 564         setTSOPrev(cap, tso->_link, tso->block_info.prev);
 565     }
 566     tso->_link = tso->block_info.prev = END_TSO_QUEUE;
 567
 568     IF_DEBUG(sanity, checkRunQueue(cap));
 569 }
 570
 571 /* ----------------------------------------------------------------------------
 572  * Setting up the scheduler loop
 573  * ------------------------------------------------------------------------- */
 574
 575 static void
 576 schedulePreLoop(void)
 577 {
 578   // initialisation for scheduler - what cannot go into initScheduler()
 579 }
 580
 581 /* -----------------------------------------------------------------------------
 582  * scheduleFindWork()
 583  *
 584  * Search for work to do, and handle messages from elsewhere.
 585  * -------------------------------------------------------------------------- */
 586
 587 static void
 588 scheduleFindWork (Capability *cap)
 589 {
 590     scheduleStartSignalHandlers(cap);
 591
 592     scheduleProcessInbox(cap);
 593
 594     scheduleCheckBlockedThreads(cap);
 595
 596 #if defined(THREADED_RTS)
 597     if (emptyRunQueue(cap)) { scheduleActivateSpark(cap); }
 598 #endif
 599 }
 600
 601 #if defined(THREADED_RTS)
 602 STATIC_INLINE rtsBool
 603 shouldYieldCapability (Capability *cap, Task *task)
 604 {
 605     // we need to yield this capability to someone else if..
 606     //   - another thread is initiating a GC
 607     //   - another Task is returning from a foreign call
 608     //   - the thread at the head of the run queue cannot be run
 609     //     by this Task (it is bound to another Task, or it is unbound
 610     //     and this task it bound).
 611     return (waiting_for_gc ||
 612             cap->returning_tasks_hd != NULL ||
 613             (!emptyRunQueue(cap) && (task->incall->tso == NULL
 614                                      ? cap->run_queue_hd->bound != NULL
 615                                      : cap->run_queue_hd->bound != task->incall)));
 616 }
 617
 618 // This is the single place where a Task goes to sleep.  There are
 619 // two reasons it might need to sleep:
 620 //    - there are no threads to run
 621 //    - we need to yield this Capability to someone else
 622 //      (see shouldYieldCapability())
 623 //
 624 // Careful: the scheduler loop is quite delicate.  Make sure you run
 625 // the tests in testsuite/concurrent (all ways) after modifying this,
 626 // and also check the benchmarks in nofib/parallel for regressions.
 627
 628 static void
 629 scheduleYield (Capability **pcap, Task *task)
 630 {
 631     Capability *cap = *pcap;
 632
 633     // if we have work, and we don't need to give up the Capability, continue.
 634     //
 635     if (!shouldYieldCapability(cap,task) &&
 636         (!emptyRunQueue(cap) ||
 637          !emptyInbox(cap) ||
 638          sched_state >= SCHED_INTERRUPTING))
 639         return;
 640
 641     // otherwise yield (sleep), and keep yielding if necessary.
 642     do {
 643         yieldCapability(&cap,task);
 644     }
 645     while (shouldYieldCapability(cap,task));
 646
 647     // note there may still be no threads on the run queue at this
 648     // point, the caller has to check.
 649
 650     *pcap = cap;
 651     return;
 652 }
 653 #endif
 654
 655 /* -----------------------------------------------------------------------------
 656  * schedulePushWork()
 657  *
 658  * Push work to other Capabilities if we have some.
 659  * -------------------------------------------------------------------------- */
 660
 661 static void
 662 schedulePushWork(Capability *cap USED_IF_THREADS,
 663                  Task *task      USED_IF_THREADS)
 664 {
 665   /* following code not for PARALLEL_HASKELL. I kept the call general,
 666      future GUM versions might use pushing in a distributed setup */
 667 #if defined(THREADED_RTS)
 668
 669     Capability *free_caps[n_capabilities], *cap0;
 670     nat i, n_free_caps;
 671
 672     // migration can be turned off with +RTS -qm
 673     if (!RtsFlags.ParFlags.migrate) return;
 674
 675     // Check whether we have more threads on our run queue, or sparks
 676     // in our pool, that we could hand to another Capability.
 677     if (cap->run_queue_hd == END_TSO_QUEUE) {
 678         if (sparkPoolSizeCap(cap) < 2) return;
 679     } else {
 680         if (cap->run_queue_hd->_link == END_TSO_QUEUE &&
 681             sparkPoolSizeCap(cap) < 1) return;
 682     }
 683
 684     // First grab as many free Capabilities as we can.
 685     for (i=0, n_free_caps=0; i < n_capabilities; i++) {
 686         cap0 = &capabilities[i];
 687         if (cap != cap0 && tryGrabCapability(cap0,task)) {
 688             if (!emptyRunQueue(cap0)
 689                 || cap->returning_tasks_hd != NULL
 690                 || cap->inbox != (Message*)END_TSO_QUEUE) {
 691                 // it already has some work, we just grabbed it at
 692                 // the wrong moment.  Or maybe it's deadlocked!
 693                 releaseCapability(cap0);
 694             } else {
 695                 free_caps[n_free_caps++] = cap0;
 696             }
 697         }
 698     }
 699
 700     // we now have n_free_caps free capabilities stashed in
 701     // free_caps[].  Share our run queue equally with them.  This is
 702     // probably the simplest thing we could do; improvements we might
 703     // want to do include:
 704     //
 705     //   - giving high priority to moving relatively new threads, on
 706     //     the gournds that they haven't had time to build up a
 707     //     working set in the cache on this CPU/Capability.
 708     //
 709     //   - giving low priority to moving long-lived threads
 710
 711     if (n_free_caps > 0) {
 712         StgTSO *prev, *t, *next;
 713         rtsBool pushed_to_all;
 714
 715         debugTrace(DEBUG_sched,
 716                    "cap %d: %s and %d free capabilities, sharing...",
 717                    cap->no,
 718                    (!emptyRunQueue(cap) && cap->run_queue_hd->_link != END_TSO_QUEUE)?
 719                    "excess threads on run queue":"sparks to share (>=2)",
 720                    n_free_caps);
 721
 722         i = 0;
 723         pushed_to_all = rtsFalse;
 724
 725         if (cap->run_queue_hd != END_TSO_QUEUE) {
 726             prev = cap->run_queue_hd;
 727             t = prev->_link;
 728             prev->_link = END_TSO_QUEUE;
 729             for (; t != END_TSO_QUEUE; t = next) {
 730                 next = t->_link;
 731                 t->_link = END_TSO_QUEUE;
 732                 if (t->what_next == ThreadRelocated
 733                     || t->bound == task->incall // don't move my bound thread
 734                     || tsoLocked(t)) {  // don't move a locked thread
 735                     setTSOLink(cap, prev, t);
 736                     setTSOPrev(cap, t, prev);
 737                     prev = t;
 738                 } else if (i == n_free_caps) {
 739                     pushed_to_all = rtsTrue;
 740                     i = 0;
 741                     // keep one for us
 742                     setTSOLink(cap, prev, t);
 743                     setTSOPrev(cap, t, prev);
 744                     prev = t;
 745                 } else {
 746                     appendToRunQueue(free_caps[i],t);
 747
 748                     traceEventMigrateThread (cap, t, free_caps[i]->no);
 749
 750                     if (t->bound) { t->bound->task->cap = free_caps[i]; }
 751                     t->cap = free_caps[i];
 752                     i++;
 753                 }
 754             }
 755             cap->run_queue_tl = prev;
 756
 757             IF_DEBUG(sanity, checkRunQueue(cap));
 758         }
 759
 760 #ifdef SPARK_PUSHING
 761         /* JB I left this code in place, it would work but is not necessary */
 762
 763         // If there are some free capabilities that we didn't push any
 764         // threads to, then try to push a spark to each one.
 765         if (!pushed_to_all) {
 766             StgClosure *spark;
 767             // i is the next free capability to push to
 768             for (; i < n_free_caps; i++) {
 769                 if (emptySparkPoolCap(free_caps[i])) {
 770                     spark = tryStealSpark(cap->sparks);
 771                     if (spark != NULL) {
 772                         debugTrace(DEBUG_sched, "pushing spark %p to capability %d", spark, free_caps[i]->no);
 773
 774             traceEventStealSpark(free_caps[i], t, cap->no);
 775
 776                         newSpark(&(free_caps[i]->r), spark);
 777                     }
 778                 }
 779             }
 780         }
 781 #endif /* SPARK_PUSHING */
 782
 783         // release the capabilities
 784         for (i = 0; i < n_free_caps; i++) {
 785             task->cap = free_caps[i];
 786             releaseAndWakeupCapability(free_caps[i]);
 787         }
 788     }
 789     task->cap = cap; // reset to point to our Capability.
 790
 791 #endif /* THREADED_RTS */
 792
 793 }
 794
 795 /* ----------------------------------------------------------------------------
 796  * Start any pending signal handlers
 797  * ------------------------------------------------------------------------- */
 798
 799 #if defined(RTS_USER_SIGNALS) && !defined(THREADED_RTS)
 800 static void
 801 scheduleStartSignalHandlers(Capability *cap)
 802 {
 803     if (RtsFlags.MiscFlags.install_signal_handlers && signals_pending()) {
 804         // safe outside the lock
 805         startSignalHandlers(cap);
 806     }
 807 }
 808 #else
 809 static void
 810 scheduleStartSignalHandlers(Capability *cap STG_UNUSED)
 811 {
 812 }
 813 #endif
 814
 815 /* ----------------------------------------------------------------------------
 816  * Check for blocked threads that can be woken up.
 817  * ------------------------------------------------------------------------- */
 818
 819 static void
 820 scheduleCheckBlockedThreads(Capability *cap USED_IF_NOT_THREADS)
 821 {
 822 #if !defined(THREADED_RTS)
 823     //
 824     // Check whether any waiting threads need to be woken up.  If the
 825     // run queue is empty, and there are no other tasks running, we
 826     // can wait indefinitely for something to happen.
 827     //
 828     if ( !emptyQueue(blocked_queue_hd) || !emptyQueue(sleeping_queue) )
 829     {
 830         awaitEvent (emptyRunQueue(cap));
 831     }
 832 #endif
 833 }
 834
 835 /* ----------------------------------------------------------------------------
 836  * Detect deadlock conditions and attempt to resolve them.
 837  * ------------------------------------------------------------------------- */
 838
 839 static void
 840 scheduleDetectDeadlock (Capability *cap, Task *task)
 841 {
 842     /*
 843      * Detect deadlock: when we have no threads to run, there are no
 844      * threads blocked, waiting for I/O, or sleeping, and all the
 845      * other tasks are waiting for work, we must have a deadlock of
 846      * some description.
 847      */
 848     if ( emptyThreadQueues(cap) )
 849     {
 850 #if defined(THREADED_RTS)
 851         /*
 852          * In the threaded RTS, we only check for deadlock if there
 853          * has been no activity in a complete timeslice.  This means
 854          * we won't eagerly start a full GC just because we don't have
 855          * any threads to run currently.
 856          */
 857         if (recent_activity != ACTIVITY_INACTIVE) return;
 858 #endif
 859
 860         debugTrace(DEBUG_sched, "deadlocked, forcing major GC...");
 861
 862         // Garbage collection can release some new threads due to
 863         // either (a) finalizers or (b) threads resurrected because
 864         // they are unreachable and will therefore be sent an
 865         // exception.  Any threads thus released will be immediately
 866         // runnable.
 867         cap = scheduleDoGC (cap, task, rtsTrue/*force major GC*/);
 868         // when force_major == rtsTrue. scheduleDoGC sets
 869         // recent_activity to ACTIVITY_DONE_GC and turns off the timer
 870         // signal.
 871
 872         if ( !emptyRunQueue(cap) ) return;
 873
 874 #if defined(RTS_USER_SIGNALS) && !defined(THREADED_RTS)
 875         /* If we have user-installed signal handlers, then wait
 876          * for signals to arrive rather then bombing out with a
 877          * deadlock.
 878          */
 879         if ( RtsFlags.MiscFlags.install_signal_handlers && anyUserHandlers() ) {
 880             debugTrace(DEBUG_sched,
 881                        "still deadlocked, waiting for signals...");
 882
 883             awaitUserSignals();
 884
 885             if (signals_pending()) {
 886                 startSignalHandlers(cap);
 887             }
 888
 889             // either we have threads to run, or we were interrupted:
 890             ASSERT(!emptyRunQueue(cap) || sched_state >= SCHED_INTERRUPTING);
 891
 892             return;
 893         }
 894 #endif
 895
 896 #if !defined(THREADED_RTS)
 897         /* Probably a real deadlock.  Send the current main thread the
 898          * Deadlock exception.
 899          */
 900         if (task->incall->tso) {
 901             switch (task->incall->tso->why_blocked) {
 902             case BlockedOnSTM:
 903             case BlockedOnBlackHole:
 904             case BlockedOnMsgThrowTo:
 905             case BlockedOnMVar:
 906                 throwToSingleThreaded(cap, task->incall->tso,
 907                                       (StgClosure *)nonTermination_closure);
 908                 return;
 909             default:
 910                 barf("deadlock: main thread blocked in a strange way");
 911             }
 912         }
 913         return;
 914 #endif
 915     }
 916 }
 917
 918
 919 /* ----------------------------------------------------------------------------
 920  * Send pending messages (PARALLEL_HASKELL only)
 921  * ------------------------------------------------------------------------- */
 922
 923 #if defined(PARALLEL_HASKELL)
 924 static void
 925 scheduleSendPendingMessages(void)
 926 {
 927
 928 # if defined(PAR) // global Mem.Mgmt., omit for now
 929     if (PendingFetches != END_BF_QUEUE) {
 930         processFetches();
 931     }
 932 # endif
 933
 934     if (RtsFlags.ParFlags.BufferTime) {
 935         // if we use message buffering, we must send away all message
 936         // packets which have become too old...
 937         sendOldBuffers();
 938     }
 939 }
 940 #endif
 941
 942 /* ----------------------------------------------------------------------------
 943  * Process message in the current Capability's inbox
 944  * ------------------------------------------------------------------------- */
 945
 946 static void
 947 scheduleProcessInbox (Capability *cap USED_IF_THREADS)
 948 {
 949 #if defined(THREADED_RTS)
 950     Message *m;
 951
 952     while (!emptyInbox(cap)) {
 953         ACQUIRE_LOCK(&cap->lock);
 954         m = cap->inbox;
 955         cap->inbox = m->link;
 956         RELEASE_LOCK(&cap->lock);
 957         executeMessage(cap, (Message *)m);
 958     }
 959 #endif
 960 }
 961
 962 /* ----------------------------------------------------------------------------
 963  * Activate spark threads (PARALLEL_HASKELL and THREADED_RTS)
 964  * ------------------------------------------------------------------------- */
 965
 966 #if defined(THREADED_RTS)
 967 static void
 968 scheduleActivateSpark(Capability *cap)
 969 {
 970     if (anySparks())
 971     {
 972         createSparkThread(cap);
 973         debugTrace(DEBUG_sched, "creating a spark thread");
 974     }
 975 }
 976 #endif // PARALLEL_HASKELL || THREADED_RTS
 977
 978 /* ----------------------------------------------------------------------------
 979  * After running a thread...
 980  * ------------------------------------------------------------------------- */
 981
 982 static void
 983 schedulePostRunThread (Capability *cap, StgTSO *t)
 984 {
 985     // We have to be able to catch transactions that are in an
 986     // infinite loop as a result of seeing an inconsistent view of
 987     // memory, e.g.
 988     //
 989     //   atomically $ do
 990     //       [a,b] <- mapM readTVar [ta,tb]
 991     //       when (a == b) loop
 992     //
 993     // and a is never equal to b given a consistent view of memory.
 994     //
 995     if (t -> trec != NO_TREC && t -> why_blocked == NotBlocked) {
 996         if (!stmValidateNestOfTransactions (t -> trec)) {
 997             debugTrace(DEBUG_sched | DEBUG_stm,
 998                        "trec %p found wasting its time", t);
 999
1000             // strip the stack back to the
1001             // ATOMICALLY_FRAME, aborting the (nested)
1002             // transaction, and saving the stack of any
1003             // partially-evaluated thunks on the heap.
1004             throwToSingleThreaded_(cap, t, NULL, rtsTrue);
1005
1006 //            ASSERT(get_itbl((StgClosure *)t->sp)->type == ATOMICALLY_FRAME);
1007         }
1008     }
1009
1010   /* some statistics gathering in the parallel case */
1011 }
1012
1013 /* -----------------------------------------------------------------------------
1014  * Handle a thread that returned to the scheduler with ThreadHeepOverflow
1015  * -------------------------------------------------------------------------- */
1016
1017 static rtsBool
1018 scheduleHandleHeapOverflow( Capability *cap, StgTSO *t )
1019 {
1020     // did the task ask for a large block?
1021     if (cap->r.rHpAlloc > BLOCK_SIZE) {
1022         // if so, get one and push it on the front of the nursery.
1023         bdescr *bd;
1024         lnat blocks;
1025
1026         blocks = (lnat)BLOCK_ROUND_UP(cap->r.rHpAlloc) / BLOCK_SIZE;
1027
1028         debugTrace(DEBUG_sched,
1029                    "--<< thread %ld (%s) stopped: requesting a large block (size %ld)\n",
1030                    (long)t->id, what_next_strs[t->what_next], blocks);
1031
1032         // don't do this if the nursery is (nearly) full, we'll GC first.
1033         if (cap->r.rCurrentNursery->link != NULL ||
1034             cap->r.rNursery->n_blocks == 1) {  // paranoia to prevent infinite loop
1035                                                // if the nursery has only one block.
1036
1037             bd = allocGroup_lock(blocks);
1038             cap->r.rNursery->n_blocks += blocks;
1039
1040             // link the new group into the list
1041             bd->link = cap->r.rCurrentNursery;
1042             bd->u.back = cap->r.rCurrentNursery->u.back;
1043             if (cap->r.rCurrentNursery->u.back != NULL) {
1044                 cap->r.rCurrentNursery->u.back->link = bd;
1045             } else {
1046                 cap->r.rNursery->blocks = bd;
1047             }
1048             cap->r.rCurrentNursery->u.back = bd;
1049
1050             // initialise it as a nursery block.  We initialise the
1051             // step, gen_no, and flags field of *every* sub-block in
1052             // this large block, because this is easier than making
1053             // sure that we always find the block head of a large
1054             // block whenever we call Bdescr() (eg. evacuate() and
1055             // isAlive() in the GC would both have to do this, at
1056             // least).
1057             {
1058                 bdescr *x;
1059                 for (x = bd; x < bd + blocks; x++) {
1060                     initBdescr(x,g0,g0);
1061                     x->free = x->start;
1062                     x->flags = 0;
1063                 }
1064             }
1065
1066             // This assert can be a killer if the app is doing lots
1067             // of large block allocations.
1068             IF_DEBUG(sanity, checkNurserySanity(cap->r.rNursery));
1069
1070             // now update the nursery to point to the new block
1071             cap->r.rCurrentNursery = bd;
1072
1073             // we might be unlucky and have another thread get on the
1074             // run queue before us and steal the large block, but in that
1075             // case the thread will just end up requesting another large
1076             // block.
1077             pushOnRunQueue(cap,t);
1078             return rtsFalse;  /* not actually GC'ing */
1079         }
1080     }
1081
1082     if (cap->r.rHpLim == NULL || cap->context_switch) {
1083         // Sometimes we miss a context switch, e.g. when calling
1084         // primitives in a tight loop, MAYBE_GC() doesn't check the
1085         // context switch flag, and we end up waiting for a GC.
1086         // See #1984, and concurrent/should_run/1984
1087         cap->context_switch = 0;
1088         appendToRunQueue(cap,t);
1089     } else {
1090         pushOnRunQueue(cap,t);
1091     }
1092     return rtsTrue;
1093     /* actual GC is done at the end of the while loop in schedule() */
1094 }
1095
1096 /* -----------------------------------------------------------------------------
1097  * Handle a thread that returned to the scheduler with ThreadStackOverflow
1098  * -------------------------------------------------------------------------- */
1099
1100 static void
1101 scheduleHandleStackOverflow (Capability *cap, Task *task, StgTSO *t)
1102 {
1103     /* just adjust the stack for this thread, then pop it back
1104      * on the run queue.
1105      */
1106     {
1107         /* enlarge the stack */
1108         StgTSO *new_t = threadStackOverflow(cap, t);
1109
1110         /* The TSO attached to this Task may have moved, so update the
1111          * pointer to it.
1112          */
1113         if (task->incall->tso == t) {
1114             task->incall->tso = new_t;
1115         }
1116         pushOnRunQueue(cap,new_t);
1117     }
1118 }
1119
1120 /* -----------------------------------------------------------------------------
1121  * Handle a thread that returned to the scheduler with ThreadYielding
1122  * -------------------------------------------------------------------------- */
1123
1124 static rtsBool
1125 scheduleHandleYield( Capability *cap, StgTSO *t, nat prev_what_next )
1126 {
1127     /* put the thread back on the run queue.  Then, if we're ready to
1128      * GC, check whether this is the last task to stop.  If so, wake
1129      * up the GC thread.  getThread will block during a GC until the
1130      * GC is finished.
1131      */
1132
1133     ASSERT(t->_link == END_TSO_QUEUE);
1134
1135     // Shortcut if we're just switching evaluators: don't bother
1136     // doing stack squeezing (which can be expensive), just run the
1137     // thread.
1138     if (cap->context_switch == 0 && t->what_next != prev_what_next) {
1139         debugTrace(DEBUG_sched,
1140                    "--<< thread %ld (%s) stopped to switch evaluators",
1141                    (long)t->id, what_next_strs[t->what_next]);
1142         return rtsTrue;
1143     }
1144
1145     // Reset the context switch flag.  We don't do this just before
1146     // running the thread, because that would mean we would lose ticks
1147     // during GC, which can lead to unfair scheduling (a thread hogs
1148     // the CPU because the tick always arrives during GC).  This way
1149     // penalises threads that do a lot of allocation, but that seems
1150     // better than the alternative.
1151     cap->context_switch = 0;
1152
1153     IF_DEBUG(sanity,
1154              //debugBelch("&& Doing sanity check on yielding TSO %ld.", t->id);
1155              checkTSO(t));
1156
1157     appendToRunQueue(cap,t);
1158
1159     return rtsFalse;
1160 }
1161
1162 /* -----------------------------------------------------------------------------
1163  * Handle a thread that returned to the scheduler with ThreadBlocked
1164  * -------------------------------------------------------------------------- */
1165
1166 static void
1167 scheduleHandleThreadBlocked( StgTSO *t
1168 #if !defined(DEBUG)
1169     STG_UNUSED
1170 #endif
1171     )
1172 {
1173
1174       // We don't need to do anything.  The thread is blocked, and it
1175       // has tidied up its stack and placed itself on whatever queue
1176       // it needs to be on.
1177
1178     // ASSERT(t->why_blocked != NotBlocked);
1179     // Not true: for example,
1180     //    - the thread may have woken itself up already, because
1181     //      threadPaused() might have raised a blocked throwTo
1182     //      exception, see maybePerformBlockedException().
1183
1184 #ifdef DEBUG
1185     traceThreadStatus(DEBUG_sched, t);
1186 #endif
1187 }
1188
1189 /* -----------------------------------------------------------------------------
1190  * Handle a thread that returned to the scheduler with ThreadFinished
1191  * -------------------------------------------------------------------------- */
1192
1193 static rtsBool
1194 scheduleHandleThreadFinished (Capability *cap STG_UNUSED, Task *task, StgTSO *t)
1195 {
1196     /* Need to check whether this was a main thread, and if so,
1197      * return with the return value.
1198      *
1199      * We also end up here if the thread kills itself with an
1200      * uncaught exception, see Exception.cmm.
1201      */
1202
1203     // blocked exceptions can now complete, even if the thread was in
1204     // blocked mode (see #2910).
1205     awakenBlockedExceptionQueue (cap, t);
1206
1207       //
1208       // Check whether the thread that just completed was a bound
1209       // thread, and if so return with the result.
1210       //
1211       // There is an assumption here that all thread completion goes
1212       // through this point; we need to make sure that if a thread
1213       // ends up in the ThreadKilled state, that it stays on the run
1214       // queue so it can be dealt with here.
1215       //
1216
1217       if (t->bound) {
1218
1219           if (t->bound != task->incall) {
1220 #if !defined(THREADED_RTS)
1221               // Must be a bound thread that is not the topmost one.  Leave
1222               // it on the run queue until the stack has unwound to the
1223               // point where we can deal with this.  Leaving it on the run
1224               // queue also ensures that the garbage collector knows about
1225               // this thread and its return value (it gets dropped from the
1226               // step->threads list so there's no other way to find it).
1227               appendToRunQueue(cap,t);
1228               return rtsFalse;
1229 #else
1230               // this cannot happen in the threaded RTS, because a
1231               // bound thread can only be run by the appropriate Task.
1232               barf("finished bound thread that isn't mine");
1233 #endif
1234           }
1235
1236           ASSERT(task->incall->tso == t);
1237
1238           if (t->what_next == ThreadComplete) {
1239               if (task->incall->ret) {
1240                   // NOTE: return val is tso->sp[1] (see StgStartup.hc)
1241                   *(task->incall->ret) = (StgClosure *)task->incall->tso->sp[1];
1242               }
1243               task->incall->stat = Success;
1244           } else {
1245               if (task->incall->ret) {
1246                   *(task->incall->ret) = NULL;
1247               }
1248               if (sched_state >= SCHED_INTERRUPTING) {
1249                   if (heap_overflow) {
1250                       task->incall->stat = HeapExhausted;
1251                   } else {
1252                       task->incall->stat = Interrupted;
1253                   }
1254               } else {
1255                   task->incall->stat = Killed;
1256               }
1257           }
1258 #ifdef DEBUG
1259           removeThreadLabel((StgWord)task->incall->tso->id);
1260 #endif
1261
1262           // We no longer consider this thread and task to be bound to
1263           // each other.  The TSO lives on until it is GC'd, but the
1264           // task is about to be released by the caller, and we don't
1265           // want anyone following the pointer from the TSO to the
1266           // defunct task (which might have already been
1267           // re-used). This was a real bug: the GC updated
1268           // tso->bound->tso which lead to a deadlock.
1269           t->bound = NULL;
1270           task->incall->tso = NULL;
1271
1272           return rtsTrue; // tells schedule() to return
1273       }
1274
1275       return rtsFalse;
1276 }
1277
1278 /* -----------------------------------------------------------------------------
1279  * Perform a heap census
1280  * -------------------------------------------------------------------------- */
1281
1282 static rtsBool
1283 scheduleNeedHeapProfile( rtsBool ready_to_gc STG_UNUSED )
1284 {
1285     // When we have +RTS -i0 and we're heap profiling, do a census at
1286     // every GC.  This lets us get repeatable runs for debugging.
1287     if (performHeapProfile ||
1288         (RtsFlags.ProfFlags.profileInterval==0 &&
1289          RtsFlags.ProfFlags.doHeapProfile && ready_to_gc)) {
1290         return rtsTrue;
1291     } else {
1292         return rtsFalse;
1293     }
1294 }
1295
1296 /* -----------------------------------------------------------------------------
1297  * Perform a garbage collection if necessary
1298  * -------------------------------------------------------------------------- */
1299
1300 static Capability *
1301 scheduleDoGC (Capability *cap, Task *task USED_IF_THREADS, rtsBool force_major)
1302 {
1303     rtsBool heap_census;
1304 #ifdef THREADED_RTS
1305     /* extern static volatile StgWord waiting_for_gc;
1306        lives inside capability.c */
1307     rtsBool gc_type, prev_pending_gc;
1308     nat i;
1309 #endif
1310
1311     if (sched_state == SCHED_SHUTTING_DOWN) {
1312         // The final GC has already been done, and the system is
1313         // shutting down.  We'll probably deadlock if we try to GC
1314         // now.
1315         return cap;
1316     }
1317
1318 #ifdef THREADED_RTS
1319     if (sched_state < SCHED_INTERRUPTING
1320         && RtsFlags.ParFlags.parGcEnabled
1321         && N >= RtsFlags.ParFlags.parGcGen
1322         && ! oldest_gen->mark)
1323     {
1324         gc_type = PENDING_GC_PAR;
1325     } else {
1326         gc_type = PENDING_GC_SEQ;
1327     }
1328
1329     // In order to GC, there must be no threads running Haskell code.
1330     // Therefore, the GC thread needs to hold *all* the capabilities,
1331     // and release them after the GC has completed.
1332     //
1333     // This seems to be the simplest way: previous attempts involved
1334     // making all the threads with capabilities give up their
1335     // capabilities and sleep except for the *last* one, which
1336     // actually did the GC.  But it's quite hard to arrange for all
1337     // the other tasks to sleep and stay asleep.
1338     //
1339
1340     /*  Other capabilities are prevented from running yet more Haskell
1341         threads if waiting_for_gc is set. Tested inside
1342         yieldCapability() and releaseCapability() in Capability.c */
1343
1344     prev_pending_gc = cas(&waiting_for_gc, 0, gc_type);
1345     if (prev_pending_gc) {
1346         do {
1347             debugTrace(DEBUG_sched, "someone else is trying to GC (%d)...",
1348                        prev_pending_gc);
1349             ASSERT(cap);
1350             yieldCapability(&cap,task);
1351         } while (waiting_for_gc);
1352         return cap;  // NOTE: task->cap might have changed here
1353     }
1354
1355     setContextSwitches();
1356
1357     // The final shutdown GC is always single-threaded, because it's
1358     // possible that some of the Capabilities have no worker threads.
1359
1360     if (gc_type == PENDING_GC_SEQ)
1361     {
1362         traceEventRequestSeqGc(cap);
1363     }
1364     else
1365     {
1366         traceEventRequestParGc(cap);
1367         debugTrace(DEBUG_sched, "ready_to_gc, grabbing GC threads");
1368     }
1369
1370     if (gc_type == PENDING_GC_SEQ)
1371     {
1372         // single-threaded GC: grab all the capabilities
1373         for (i=0; i < n_capabilities; i++) {
1374             debugTrace(DEBUG_sched, "ready_to_gc, grabbing all the capabilies (%d/%d)", i, n_capabilities);
1375             if (cap != &capabilities[i]) {
1376                 Capability *pcap = &capabilities[i];
1377                 // we better hope this task doesn't get migrated to
1378                 // another Capability while we're waiting for this one.
1379                 // It won't, because load balancing happens while we have
1380                 // all the Capabilities, but even so it's a slightly
1381                 // unsavoury invariant.
1382                 task->cap = pcap;
1383                 waitForReturnCapability(&pcap, task);
1384                 if (pcap != &capabilities[i]) {
1385                     barf("scheduleDoGC: got the wrong capability");
1386                 }
1387             }
1388         }
1389     }
1390     else
1391     {
1392         // multi-threaded GC: make sure all the Capabilities donate one
1393         // GC thread each.
1394         waitForGcThreads(cap);
1395     }
1396
1397 #endif
1398
1399     IF_DEBUG(scheduler, printAllThreads());
1400
1401 delete_threads_and_gc:
1402     /*
1403      * We now have all the capabilities; if we're in an interrupting
1404      * state, then we should take the opportunity to delete all the
1405      * threads in the system.
1406      */
1407     if (sched_state == SCHED_INTERRUPTING) {
1408         deleteAllThreads(cap);
1409         sched_state = SCHED_SHUTTING_DOWN;
1410     }
1411
1412     heap_census = scheduleNeedHeapProfile(rtsTrue);
1413
1414     traceEventGcStart(cap);
1415 #if defined(THREADED_RTS)
1416     // reset waiting_for_gc *before* GC, so that when the GC threads
1417     // emerge they don't immediately re-enter the GC.
1418     waiting_for_gc = 0;
1419     GarbageCollect(force_major || heap_census, gc_type, cap);
1420 #else
1421     GarbageCollect(force_major || heap_census, 0, cap);
1422 #endif
1423     traceEventGcEnd(cap);
1424
1425     if (recent_activity == ACTIVITY_INACTIVE && force_major)
1426     {
1427         // We are doing a GC because the system has been idle for a
1428         // timeslice and we need to check for deadlock.  Record the
1429         // fact that we've done a GC and turn off the timer signal;
1430         // it will get re-enabled if we run any threads after the GC.
1431         recent_activity = ACTIVITY_DONE_GC;
1432         stopTimer();
1433     }
1434     else
1435     {
1436         // the GC might have taken long enough for the timer to set
1437         // recent_activity = ACTIVITY_INACTIVE, but we aren't
1438         // necessarily deadlocked:
1439         recent_activity = ACTIVITY_YES;
1440     }
1441
1442 #if defined(THREADED_RTS)
1443     if (gc_type == PENDING_GC_PAR)
1444     {
1445         releaseGCThreads(cap);
1446     }
1447 #endif
1448
1449     if (heap_census) {
1450         debugTrace(DEBUG_sched, "performing heap census");
1451         heapCensus();
1452         performHeapProfile = rtsFalse;
1453     }
1454
1455     if (heap_overflow && sched_state < SCHED_INTERRUPTING) {
1456         // GC set the heap_overflow flag, so we should proceed with
1457         // an orderly shutdown now.  Ultimately we want the main
1458         // thread to return to its caller with HeapExhausted, at which
1459         // point the caller should call hs_exit().  The first step is
1460         // to delete all the threads.
1461         //
1462         // Another way to do this would be to raise an exception in
1463         // the main thread, which we really should do because it gives
1464         // the program a chance to clean up.  But how do we find the
1465         // main thread?  It should presumably be the same one that
1466         // gets ^C exceptions, but that's all done on the Haskell side
1467         // (GHC.TopHandler).
1468         sched_state = SCHED_INTERRUPTING;
1469         goto delete_threads_and_gc;
1470     }
1471
1472 #ifdef SPARKBALANCE
1473     /* JB
1474        Once we are all together... this would be the place to balance all
1475        spark pools. No concurrent stealing or adding of new sparks can
1476        occur. Should be defined in Sparks.c. */
1477     balanceSparkPoolsCaps(n_capabilities, capabilities);
1478 #endif
1479
1480 #if defined(THREADED_RTS)
1481     if (gc_type == PENDING_GC_SEQ) {
1482         // release our stash of capabilities.
1483         for (i = 0; i < n_capabilities; i++) {
1484             if (cap != &capabilities[i]) {
1485                 task->cap = &capabilities[i];
1486                 releaseCapability(&capabilities[i]);
1487             }
1488         }
1489     }
1490     if (cap) {
1491         task->cap = cap;
1492     } else {
1493         task->cap = NULL;
1494     }
1495 #endif
1496
1497     return cap;
1498 }
1499
1500 /* ---------------------------------------------------------------------------
1501  * Singleton fork(). Do not copy any running threads.
1502  * ------------------------------------------------------------------------- */
1503
1504 pid_t
1505 forkProcess(HsStablePtr *entry
1506 #ifndef FORKPROCESS_PRIMOP_SUPPORTED
1507             STG_UNUSED
1508 #endif
1509            )
1510 {
1511 #ifdef FORKPROCESS_PRIMOP_SUPPORTED
1512     pid_t pid;
1513     StgTSO* t,*next;
1514     Capability *cap;
1515     nat g;
1516
1517 #if defined(THREADED_RTS)
1518     if (RtsFlags.ParFlags.nNodes > 1) {
1519         errorBelch("forking not supported with +RTS -N<n> greater than 1");
1520         stg_exit(EXIT_FAILURE);
1521     }
1522 #endif
1523
1524     debugTrace(DEBUG_sched, "forking!");
1525
1526     // ToDo: for SMP, we should probably acquire *all* the capabilities
1527     cap = rts_lock();
1528
1529     // no funny business: hold locks while we fork, otherwise if some
1530     // other thread is holding a lock when the fork happens, the data
1531     // structure protected by the lock will forever be in an
1532     // inconsistent state in the child.  See also #1391.
1533     ACQUIRE_LOCK(&sched_mutex);
1534     ACQUIRE_LOCK(&cap->lock);
1535     ACQUIRE_LOCK(&cap->running_task->lock);
1536
1537     stopTimer(); // See #4074
1538
1539 #if defined(TRACING)
1540     flushEventLog(); // so that child won't inherit dirty file buffers
1541 #endif
1542
1543     pid = fork();
1544
1545     if (pid) { // parent
1546
1547         startTimer(); // #4074
1548
1549         RELEASE_LOCK(&sched_mutex);
1550         RELEASE_LOCK(&cap->lock);
1551         RELEASE_LOCK(&cap->running_task->lock);
1552
1553         // just return the pid
1554         rts_unlock(cap);
1555         return pid;
1556
1557     } else { // child
1558
1559 #if defined(THREADED_RTS)
1560         initMutex(&sched_mutex);
1561         initMutex(&cap->lock);
1562         initMutex(&cap->running_task->lock);
1563 #endif
1564
1565 #if defined(TRACING)
1566         abortEventLogging(); // abort eventlog inherited from parent
1567         initEventLogging(); // child starts its own eventlog
1568 #endif
1569         // Now, all OS threads except the thread that forked are
1570         // stopped.  We need to stop all Haskell threads, including
1571         // those involved in foreign calls.  Also we need to delete
1572         // all Tasks, because they correspond to OS threads that are
1573         // now gone.
1574
1575         for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
1576           for (t = generations[g].threads; t != END_TSO_QUEUE; t = next) {
1577             if (t->what_next == ThreadRelocated) {
1578                 next = t->_link;
1579             } else {
1580                 next = t->global_link;
1581                 // don't allow threads to catch the ThreadKilled
1582                 // exception, but we do want to raiseAsync() because these
1583                 // threads may be evaluating thunks that we need later.
1584                 deleteThread_(cap,t);
1585
1586                 // stop the GC from updating the InCall to point to
1587                 // the TSO.  This is only necessary because the
1588                 // OSThread bound to the TSO has been killed, and
1589                 // won't get a chance to exit in the usual way (see
1590                 // also scheduleHandleThreadFinished).
1591                 t->bound = NULL;
1592             }
1593           }
1594         }
1595
1596         // Empty the run queue.  It seems tempting to let all the
1597         // killed threads stay on the run queue as zombies to be
1598         // cleaned up later, but some of them correspond to bound
1599         // threads for which the corresponding Task does not exist.
1600         cap->run_queue_hd = END_TSO_QUEUE;
1601         cap->run_queue_tl = END_TSO_QUEUE;
1602
1603         // Any suspended C-calling Tasks are no more, their OS threads
1604         // don't exist now:
1605         cap->suspended_ccalls = NULL;
1606
1607         // Empty the threads lists.  Otherwise, the garbage
1608         // collector may attempt to resurrect some of these threads.
1609         for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
1610             generations[g].threads = END_TSO_QUEUE;
1611         }
1612
1613         discardTasksExcept(cap->running_task);
1614
1615 #if defined(THREADED_RTS)
1616         // Wipe our spare workers list, they no longer exist.  New
1617         // workers will be created if necessary.
1618         cap->spare_workers = NULL;
1619         cap->n_spare_workers = 0;
1620         cap->returning_tasks_hd = NULL;
1621         cap->returning_tasks_tl = NULL;
1622 #endif
1623
1624         // On Unix, all timers are reset in the child, so we need to start
1625         // the timer again.
1626         initTimer();
1627         startTimer();
1628
1629 #if defined(THREADED_RTS)
1630         cap = ioManagerStartCap(cap);
1631 #endif
1632
1633         cap = rts_evalStableIO(cap, entry, NULL);  // run the action
1634         rts_checkSchedStatus("forkProcess",cap);
1635
1636         rts_unlock(cap);
1637         hs_exit();                      // clean up and exit
1638         stg_exit(EXIT_SUCCESS);
1639     }
1640 #else /* !FORKPROCESS_PRIMOP_SUPPORTED */
1641     barf("forkProcess#: primop not supported on this platform, sorry!\n");
1642 #endif
1643 }
1644
1645 /* ---------------------------------------------------------------------------
1646  * Delete all the threads in the system
1647  * ------------------------------------------------------------------------- */
1648
1649 static void
1650 deleteAllThreads ( Capability *cap )
1651 {
1652     // NOTE: only safe to call if we own all capabilities.
1653
1654     StgTSO* t, *next;
1655     nat g;
1656
1657     debugTrace(DEBUG_sched,"deleting all threads");
1658     for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
1659         for (t = generations[g].threads; t != END_TSO_QUEUE; t = next) {
1660             if (t->what_next == ThreadRelocated) {
1661                 next = t->_link;
1662             } else {
1663                 next = t->global_link;
1664                 deleteThread(cap,t);
1665             }
1666         }
1667     }
1668
1669     // The run queue now contains a bunch of ThreadKilled threads.  We
1670     // must not throw these away: the main thread(s) will be in there
1671     // somewhere, and the main scheduler loop has to deal with it.
1672     // Also, the run queue is the only thing keeping these threads from
1673     // being GC'd, and we don't want the "main thread has been GC'd" panic.
1674
1675 #if !defined(THREADED_RTS)
1676     ASSERT(blocked_queue_hd == END_TSO_QUEUE);
1677     ASSERT(sleeping_queue == END_TSO_QUEUE);
1678 #endif
1679 }
1680
1681 /* -----------------------------------------------------------------------------
1682    Managing the suspended_ccalls list.
1683    Locks required: sched_mutex
1684    -------------------------------------------------------------------------- */
1685
1686 STATIC_INLINE void
1687 suspendTask (Capability *cap, Task *task)
1688 {
1689     InCall *incall;
1690
1691     incall = task->incall;
1692     ASSERT(incall->next == NULL && incall->prev == NULL);
1693     incall->next = cap->suspended_ccalls;
1694     incall->prev = NULL;
1695     if (cap->suspended_ccalls) {
1696         cap->suspended_ccalls->prev = incall;
1697     }
1698     cap->suspended_ccalls = incall;
1699 }
1700
1701 STATIC_INLINE void
1702 recoverSuspendedTask (Capability *cap, Task *task)
1703 {
1704     InCall *incall;
1705
1706     incall = task->incall;
1707     if (incall->prev) {
1708         incall->prev->next = incall->next;
1709     } else {
1710         ASSERT(cap->suspended_ccalls == incall);
1711         cap->suspended_ccalls = incall->next;
1712     }
1713     if (incall->next) {
1714         incall->next->prev = incall->prev;
1715     }
1716     incall->next = incall->prev = NULL;
1717 }
1718
1719 /* ---------------------------------------------------------------------------
1720  * Suspending & resuming Haskell threads.
1721  *
1722  * When making a "safe" call to C (aka _ccall_GC), the task gives back
1723  * its capability before calling the C function.  This allows another
1724  * task to pick up the capability and carry on running Haskell
1725  * threads.  It also means that if the C call blocks, it won't lock
1726  * the whole system.
1727  *
1728  * The Haskell thread making the C call is put to sleep for the
1729  * duration of the call, on the suspended_ccalling_threads queue.  We
1730  * give out a token to the task, which it can use to resume the thread
1731  * on return from the C function.
1732  *
1733  * If this is an interruptible C call, this means that the FFI call may be
1734  * unceremoniously terminated and should be scheduled on an
1735  * unbound worker thread.
1736  * ------------------------------------------------------------------------- */
1737
1738 void *
1739 suspendThread (StgRegTable *reg, rtsBool interruptible)
1740 {
1741   Capability *cap;
1742   int saved_errno;
1743   StgTSO *tso;
1744   Task *task;
1745 #if mingw32_HOST_OS
1746   StgWord32 saved_winerror;
1747 #endif
1748
1749   saved_errno = errno;
1750 #if mingw32_HOST_OS
1751   saved_winerror = GetLastError();
1752 #endif
1753
1754   /* assume that *reg is a pointer to the StgRegTable part of a Capability.
1755    */
1756   cap = regTableToCapability(reg);
1757
1758   task = cap->running_task;
1759   tso = cap->r.rCurrentTSO;
1760
1761   traceEventStopThread(cap, tso, THREAD_SUSPENDED_FOREIGN_CALL);
1762
1763   // XXX this might not be necessary --SDM
1764   tso->what_next = ThreadRunGHC;
1765
1766   threadPaused(cap,tso);
1767
1768   if (interruptible) {
1769     tso->why_blocked = BlockedOnCCall_Interruptible;
1770   } else {
1771     tso->why_blocked = BlockedOnCCall;
1772   }
1773
1774   // Hand back capability
1775   task->incall->suspended_tso = tso;
1776   task->incall->suspended_cap = cap;
1777
1778   ACQUIRE_LOCK(&cap->lock);
1779
1780   suspendTask(cap,task);
1781   cap->in_haskell = rtsFalse;
1782   releaseCapability_(cap,rtsFalse);
1783
1784   RELEASE_LOCK(&cap->lock);
1785
1786   errno = saved_errno;
1787 #if mingw32_HOST_OS
1788   SetLastError(saved_winerror);
1789 #endif
1790   return task;
1791 }
1792
1793 StgRegTable *
1794 resumeThread (void *task_)
1795 {
1796     StgTSO *tso;
1797     InCall *incall;
1798     Capability *cap;
1799     Task *task = task_;
1800     int saved_errno;
1801 #if mingw32_HOST_OS
1802     StgWord32 saved_winerror;
1803 #endif
1804
1805     saved_errno = errno;
1806 #if mingw32_HOST_OS
1807     saved_winerror = GetLastError();
1808 #endif
1809
1810     incall = task->incall;
1811     cap = incall->suspended_cap;
1812     task->cap = cap;
1813
1814     // Wait for permission to re-enter the RTS with the result.
1815     waitForReturnCapability(&cap,task);
1816     // we might be on a different capability now... but if so, our
1817     // entry on the suspended_ccalls list will also have been
1818     // migrated.
1819
1820     // Remove the thread from the suspended list
1821     recoverSuspendedTask(cap,task);
1822
1823     tso = incall->suspended_tso;
1824     incall->suspended_tso = NULL;
1825     incall->suspended_cap = NULL;
1826     tso->_link = END_TSO_QUEUE; // no write barrier reqd
1827
1828     traceEventRunThread(cap, tso);
1829
1830     /* Reset blocking status */
1831     tso->why_blocked  = NotBlocked;
1832
1833     if ((tso->flags & TSO_BLOCKEX) == 0) {
1834         // avoid locking the TSO if we don't have to
1835         if (tso->blocked_exceptions != END_BLOCKED_EXCEPTIONS_QUEUE) {
1836             maybePerformBlockedException(cap,tso);
1837         }
1838     }
1839
1840     cap->r.rCurrentTSO = tso;
1841     cap->in_haskell = rtsTrue;
1842     errno = saved_errno;
1843 #if mingw32_HOST_OS
1844     SetLastError(saved_winerror);
1845 #endif
1846
1847     /* We might have GC'd, mark the TSO dirty again */
1848     dirty_TSO(cap,tso);
1849
1850     IF_DEBUG(sanity, checkTSO(tso));
1851
1852     return &cap->r;
1853 }
1854
1855 /* ---------------------------------------------------------------------------
1856  * scheduleThread()
1857  *
1858  * scheduleThread puts a thread on the end  of the runnable queue.
1859  * This will usually be done immediately after a thread is created.
1860  * The caller of scheduleThread must create the thread using e.g.
1861  * createThread and push an appropriate closure
1862  * on this thread's stack before the scheduler is invoked.
1863  * ------------------------------------------------------------------------ */
1864
1865 void
1866 scheduleThread(Capability *cap, StgTSO *tso)
1867 {
1868     // The thread goes at the *end* of the run-queue, to avoid possible
1869     // starvation of any threads already on the queue.
1870     appendToRunQueue(cap,tso);
1871 }
1872
1873 void
1874 scheduleThreadOn(Capability *cap, StgWord cpu USED_IF_THREADS, StgTSO *tso)
1875 {
1876 #if defined(THREADED_RTS)
1877     tso->flags |= TSO_LOCKED; // we requested explicit affinity; don't
1878                               // move this thread from now on.
1879     cpu %= RtsFlags.ParFlags.nNodes;
1880     if (cpu == cap->no) {
1881         appendToRunQueue(cap,tso);
1882     } else {
1883         migrateThread(cap, tso, &capabilities[cpu]);
1884     }
1885 #else
1886     appendToRunQueue(cap,tso);
1887 #endif
1888 }
1889
1890 Capability *
1891 scheduleWaitThread (StgTSO* tso, /*[out]*/HaskellObj* ret, Capability *cap)
1892 {
1893     Task *task;
1894     StgThreadID id;
1895
1896     // We already created/initialised the Task
1897     task = cap->running_task;
1898
1899     // This TSO is now a bound thread; make the Task and TSO
1900     // point to each other.
1901     tso->bound = task->incall;
1902     tso->cap = cap;
1903
1904     task->incall->tso = tso;
1905     task->incall->ret = ret;
1906     task->incall->stat = NoStatus;
1907
1908     appendToRunQueue(cap,tso);
1909
1910     id = tso->id;
1911     debugTrace(DEBUG_sched, "new bound thread (%lu)", (unsigned long)id);
1912
1913     cap = schedule(cap,task);
1914
1915     ASSERT(task->incall->stat != NoStatus);
1916     ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
1917
1918     debugTrace(DEBUG_sched, "bound thread (%lu) finished", (unsigned long)id);
1919     return cap;
1920 }
1921
1922 /* ----------------------------------------------------------------------------
1923  * Starting Tasks
1924  * ------------------------------------------------------------------------- */
1925
1926 #if defined(THREADED_RTS)
1927 void scheduleWorker (Capability *cap, Task *task)
1928 {
1929     // schedule() runs without a lock.
1930     cap = schedule(cap,task);
1931
1932     // On exit from schedule(), we have a Capability, but possibly not
1933     // the same one we started with.
1934
1935     // During shutdown, the requirement is that after all the
1936     // Capabilities are shut down, all workers that are shutting down
1937     // have finished workerTaskStop().  This is why we hold on to
1938     // cap->lock until we've finished workerTaskStop() below.
1939     //
1940     // There may be workers still involved in foreign calls; those
1941     // will just block in waitForReturnCapability() because the
1942     // Capability has been shut down.
1943     //
1944     ACQUIRE_LOCK(&cap->lock);
1945     releaseCapability_(cap,rtsFalse);
1946     workerTaskStop(task);
1947     RELEASE_LOCK(&cap->lock);
1948 }
1949 #endif
1950
1951 /* ---------------------------------------------------------------------------
1952  * initScheduler()
1953  *
1954  * Initialise the scheduler.  This resets all the queues - if the
1955  * queues contained any threads, they'll be garbage collected at the
1956  * next pass.
1957  *
1958  * ------------------------------------------------------------------------ */
1959
1960 void
1961 initScheduler(void)
1962 {
1963 #if !defined(THREADED_RTS)
1964   blocked_queue_hd  = END_TSO_QUEUE;
1965   blocked_queue_tl  = END_TSO_QUEUE;
1966   sleeping_queue    = END_TSO_QUEUE;
1967 #endif
1968
1969   sched_state    = SCHED_RUNNING;
1970   recent_activity = ACTIVITY_YES;
1971
1972 #if defined(THREADED_RTS)
1973   /* Initialise the mutex and condition variables used by
1974    * the scheduler. */
1975   initMutex(&sched_mutex);
1976 #endif
1977
1978   ACQUIRE_LOCK(&sched_mutex);
1979
1980   /* A capability holds the state a native thread needs in
1981    * order to execute STG code. At least one capability is
1982    * floating around (only THREADED_RTS builds have more than one).
1983    */
1984   initCapabilities();
1985
1986   initTaskManager();
1987
1988 #if defined(THREADED_RTS)
1989   initSparkPools();
1990 #endif
1991
1992   RELEASE_LOCK(&sched_mutex);
1993
1994 #if defined(THREADED_RTS)
1995   /*
1996    * Eagerly start one worker to run each Capability, except for
1997    * Capability 0.  The idea is that we're probably going to start a
1998    * bound thread on Capability 0 pretty soon, so we don't want a
1999    * worker task hogging it.
2000    */
2001   {
2002       nat i;
2003       Capability *cap;
2004       for (i = 1; i < n_capabilities; i++) {
2005           cap = &capabilities[i];
2006           ACQUIRE_LOCK(&cap->lock);
2007           startWorkerTask(cap);
2008           RELEASE_LOCK(&cap->lock);
2009       }
2010   }
2011 #endif
2012 }
2013
2014 void
2015 exitScheduler (rtsBool wait_foreign USED_IF_THREADS)
2016                /* see Capability.c, shutdownCapability() */
2017 {
2018     Task *task = NULL;
2019
2020     task = newBoundTask();
2021
2022     // If we haven't killed all the threads yet, do it now.
2023     if (sched_state < SCHED_SHUTTING_DOWN) {
2024         sched_state = SCHED_INTERRUPTING;
2025         waitForReturnCapability(&task->cap,task);
2026         scheduleDoGC(task->cap,task,rtsFalse);
2027         ASSERT(task->incall->tso == NULL);
2028         releaseCapability(task->cap);
2029     }
2030     sched_state = SCHED_SHUTTING_DOWN;
2031
2032 #if defined(THREADED_RTS)
2033     {
2034         nat i;
2035
2036         for (i = 0; i < n_capabilities; i++) {
2037             ASSERT(task->incall->tso == NULL);
2038             shutdownCapability(&capabilities[i], task, wait_foreign);
2039         }
2040     }
2041 #endif
2042
2043     boundTaskExiting(task);
2044 }
2045
2046 void
2047 freeScheduler( void )
2048 {
2049     nat still_running;
2050
2051     ACQUIRE_LOCK(&sched_mutex);
2052     still_running = freeTaskManager();
2053     // We can only free the Capabilities if there are no Tasks still
2054     // running.  We might have a Task about to return from a foreign
2055     // call into waitForReturnCapability(), for example (actually,
2056     // this should be the *only* thing that a still-running Task can
2057     // do at this point, and it will block waiting for the
2058     // Capability).
2059     if (still_running == 0) {
2060         freeCapabilities();
2061         if (n_capabilities != 1) {
2062             stgFree(capabilities);
2063         }
2064     }
2065     RELEASE_LOCK(&sched_mutex);
2066 #if defined(THREADED_RTS)
2067     closeMutex(&sched_mutex);
2068 #endif
2069 }
2070
2071 /* -----------------------------------------------------------------------------
2072    performGC
2073
2074    This is the interface to the garbage collector from Haskell land.
2075    We provide this so that external C code can allocate and garbage
2076    collect when called from Haskell via _ccall_GC.
2077    -------------------------------------------------------------------------- */
2078
2079 static void
2080 performGC_(rtsBool force_major)
2081 {
2082     Task *task;
2083
2084     // We must grab a new Task here, because the existing Task may be
2085     // associated with a particular Capability, and chained onto the
2086     // suspended_ccalls queue.
2087     task = newBoundTask();
2088
2089     waitForReturnCapability(&task->cap,task);
2090     scheduleDoGC(task->cap,task,force_major);
2091     releaseCapability(task->cap);
2092     boundTaskExiting(task);
2093 }
2094
2095 void
2096 performGC(void)
2097 {
2098     performGC_(rtsFalse);
2099 }
2100
2101 void
2102 performMajorGC(void)
2103 {
2104     performGC_(rtsTrue);
2105 }
2106
2107 /* -----------------------------------------------------------------------------
2108    Stack overflow
2109
2110    If the thread has reached its maximum stack size, then raise the
2111    StackOverflow exception in the offending thread.  Otherwise
2112    relocate the TSO into a larger chunk of memory and adjust its stack
2113    size appropriately.
2114    -------------------------------------------------------------------------- */
2115
2116 static StgTSO *
2117 threadStackOverflow(Capability *cap, StgTSO *tso)
2118 {
2119   nat new_stack_size, stack_words;
2120   lnat new_tso_size;
2121   StgPtr new_sp;
2122   StgTSO *dest;
2123
2124   IF_DEBUG(sanity,checkTSO(tso));
2125
2126   if (tso->stack_size >= tso->max_stack_size
2127       && !(tso->flags & TSO_BLOCKEX)) {
2128       // NB. never raise a StackOverflow exception if the thread is
2129       // inside Control.Exceptino.block.  It is impractical to protect
2130       // against stack overflow exceptions, since virtually anything
2131       // can raise one (even 'catch'), so this is the only sensible
2132       // thing to do here.  See bug #767.
2133       //
2134
2135       if (tso->flags & TSO_SQUEEZED) {
2136           return tso;
2137       }
2138       // #3677: In a stack overflow situation, stack squeezing may
2139       // reduce the stack size, but we don't know whether it has been
2140       // reduced enough for the stack check to succeed if we try
2141       // again.  Fortunately stack squeezing is idempotent, so all we
2142       // need to do is record whether *any* squeezing happened.  If we
2143       // are at the stack's absolute -K limit, and stack squeezing
2144       // happened, then we try running the thread again.  The
2145       // TSO_SQUEEZED flag is set by threadPaused() to tell us whether
2146       // squeezing happened or not.
2147
2148       debugTrace(DEBUG_gc,
2149                  "threadStackOverflow of TSO %ld (%p): stack too large (now %ld; max is %ld)",
2150                  (long)tso->id, tso, (long)tso->stack_size, (long)tso->max_stack_size);
2151       IF_DEBUG(gc,
2152                /* If we're debugging, just print out the top of the stack */
2153                printStackChunk(tso->sp, stg_min(tso->stack+tso->stack_size,
2154                                                 tso->sp+64)));
2155
2156       // Send this thread the StackOverflow exception
2157       throwToSingleThreaded(cap, tso, (StgClosure *)stackOverflow_closure);
2158       return tso;
2159   }
2160
2161
2162   // We also want to avoid enlarging the stack if squeezing has
2163   // already released some of it.  However, we don't want to get into
2164   // a pathalogical situation where a thread has a nearly full stack
2165   // (near its current limit, but not near the absolute -K limit),
2166   // keeps allocating a little bit, squeezing removes a little bit,
2167   // and then it runs again.  So to avoid this, if we squeezed *and*
2168   // there is still less than BLOCK_SIZE_W words free, then we enlarge
2169   // the stack anyway.
2170   if ((tso->flags & TSO_SQUEEZED) &&
2171       ((W_)(tso->sp - tso->stack) >= BLOCK_SIZE_W)) {
2172       return tso;
2173   }
2174
2175   /* Try to double the current stack size.  If that takes us over the
2176    * maximum stack size for this thread, then use the maximum instead
2177    * (that is, unless we're already at or over the max size and we
2178    * can't raise the StackOverflow exception (see above), in which
2179    * case just double the size). Finally round up so the TSO ends up as
2180    * a whole number of blocks.
2181    */
2182   if (tso->stack_size >= tso->max_stack_size) {
2183       new_stack_size = tso->stack_size * 2;
2184   } else {
2185       new_stack_size = stg_min(tso->stack_size * 2, tso->max_stack_size);
2186   }
2187   new_tso_size   = (lnat)BLOCK_ROUND_UP(new_stack_size * sizeof(W_) +
2188                                        TSO_STRUCT_SIZE)/sizeof(W_);
2189   new_tso_size = round_to_mblocks(new_tso_size);  /* Be MBLOCK-friendly */
2190   new_stack_size = new_tso_size - TSO_STRUCT_SIZEW;
2191
2192   debugTrace(DEBUG_sched,
2193              "increasing stack size from %ld words to %d.",
2194              (long)tso->stack_size, new_stack_size);
2195
2196   dest = (StgTSO *)allocate(cap,new_tso_size);
2197   TICK_ALLOC_TSO(new_stack_size,0);
2198
2199   /* copy the TSO block and the old stack into the new area */
2200   memcpy(dest,tso,TSO_STRUCT_SIZE);
2201   stack_words = tso->stack + tso->stack_size - tso->sp;
2202   new_sp = (P_)dest + new_tso_size - stack_words;
2203   memcpy(new_sp, tso->sp, stack_words * sizeof(W_));
2204
2205   /* relocate the stack pointers... */
2206   dest->sp         = new_sp;
2207   dest->stack_size = new_stack_size;
2208
2209   /* Mark the old TSO as relocated.  We have to check for relocated
2210    * TSOs in the garbage collector and any primops that deal with TSOs.
2211    *
2212    * It's important to set the sp value to just beyond the end
2213    * of the stack, so we don't attempt to scavenge any part of the
2214    * dead TSO's stack.
2215    */
2216   setTSOLink(cap,tso,dest);
2217   write_barrier(); // other threads seeing ThreadRelocated will look at _link
2218   tso->what_next = ThreadRelocated;
2219   tso->sp = (P_)&(tso->stack[tso->stack_size]);
2220   tso->why_blocked = NotBlocked;
2221
2222   IF_DEBUG(sanity,checkTSO(dest));
2223 #if 0
2224   IF_DEBUG(scheduler,printTSO(dest));
2225 #endif
2226
2227   return dest;
2228 }
2229
2230 static StgTSO *
2231 threadStackUnderflow (Capability *cap, Task *task, StgTSO *tso)
2232 {
2233     bdescr *bd, *new_bd;
2234     lnat free_w, tso_size_w;
2235     StgTSO *new_tso;
2236
2237     tso_size_w = tso_sizeW(tso);
2238
2239     if (tso_size_w < MBLOCK_SIZE_W ||
2240           // TSO is less than 2 mblocks (since the first mblock is
2241           // shorter than MBLOCK_SIZE_W)
2242         (tso_size_w - BLOCKS_PER_MBLOCK*BLOCK_SIZE_W) % MBLOCK_SIZE_W != 0 ||
2243           // or TSO is not a whole number of megablocks (ensuring
2244           // precondition of splitLargeBlock() below)
2245         (tso_size_w <= round_up_to_mblocks(RtsFlags.GcFlags.initialStkSize)) ||
2246           // or TSO is smaller than the minimum stack size (rounded up)
2247         (nat)(tso->stack + tso->stack_size - tso->sp) > tso->stack_size / 4)
2248           // or stack is using more than 1/4 of the available space
2249     {
2250         // then do nothing
2251         return tso;
2252     }
2253
2254     // this is the number of words we'll free
2255     free_w = round_to_mblocks(tso_size_w/2);
2256
2257     bd = Bdescr((StgPtr)tso);
2258     new_bd = splitLargeBlock(bd, free_w / BLOCK_SIZE_W);
2259     bd->free = bd->start + TSO_STRUCT_SIZEW;
2260
2261     new_tso = (StgTSO *)new_bd->start;
2262     memcpy(new_tso,tso,TSO_STRUCT_SIZE);
2263     new_tso->stack_size = new_bd->free - new_tso->stack;
2264
2265     // The original TSO was dirty and probably on the mutable
2266     // list. The new TSO is not yet on the mutable list, so we better
2267     // put it there.
2268     new_tso->dirty = 0;
2269     new_tso->flags &= ~TSO_LINK_DIRTY;
2270     dirty_TSO(cap, new_tso);
2271
2272     debugTrace(DEBUG_sched, "thread %ld: reducing TSO size from %lu words to %lu",
2273                (long)tso->id, tso_size_w, tso_sizeW(new_tso));
2274
2275     tso->_link = new_tso; // no write barrier reqd: same generation
2276     write_barrier(); // other threads seeing ThreadRelocated will look at _link
2277     tso->what_next = ThreadRelocated;
2278
2279     // The TSO attached to this Task may have moved, so update the
2280     // pointer to it.
2281     if (task->incall->tso == tso) {
2282         task->incall->tso = new_tso;
2283     }
2284
2285     IF_DEBUG(sanity,checkTSO(new_tso));
2286
2287     return new_tso;
2288 }
2289
2290 /* ---------------------------------------------------------------------------
2291    Interrupt execution
2292    - usually called inside a signal handler so it mustn't do anything fancy.
2293    ------------------------------------------------------------------------ */
2294
2295 void
2296 interruptStgRts(void)
2297 {
2298     sched_state = SCHED_INTERRUPTING;
2299     setContextSwitches();
2300 #if defined(THREADED_RTS)
2301     wakeUpRts();
2302 #endif
2303 }
2304
2305 /* -----------------------------------------------------------------------------
2306    Wake up the RTS
2307
2308    This function causes at least one OS thread to wake up and run the
2309    scheduler loop.  It is invoked when the RTS might be deadlocked, or
2310    an external event has arrived that may need servicing (eg. a
2311    keyboard interrupt).
2312
2313    In the single-threaded RTS we don't do anything here; we only have
2314    one thread anyway, and the event that caused us to want to wake up
2315    will have interrupted any blocking system call in progress anyway.
2316    -------------------------------------------------------------------------- */
2317
2318 #if defined(THREADED_RTS)
2319 void wakeUpRts(void)
2320 {
2321     // This forces the IO Manager thread to wakeup, which will
2322     // in turn ensure that some OS thread wakes up and runs the
2323     // scheduler loop, which will cause a GC and deadlock check.
2324     ioManagerWakeup();
2325 }
2326 #endif
2327
2328 /* -----------------------------------------------------------------------------
2329    Deleting threads
2330
2331    This is used for interruption (^C) and forking, and corresponds to
2332    raising an exception but without letting the thread catch the
2333    exception.
2334    -------------------------------------------------------------------------- */
2335
2336 static void
2337 deleteThread (Capability *cap STG_UNUSED, StgTSO *tso)
2338 {
2339     // NOTE: must only be called on a TSO that we have exclusive
2340     // access to, because we will call throwToSingleThreaded() below.
2341     // The TSO must be on the run queue of the Capability we own, or
2342     // we must own all Capabilities.
2343
2344     if (tso->why_blocked != BlockedOnCCall &&
2345         tso->why_blocked != BlockedOnCCall_Interruptible) {
2346         throwToSingleThreaded(tso->cap,tso,NULL);
2347     }
2348 }
2349
2350 #ifdef FORKPROCESS_PRIMOP_SUPPORTED
2351 static void
2352 deleteThread_(Capability *cap, StgTSO *tso)
2353 { // for forkProcess only:
2354   // like deleteThread(), but we delete threads in foreign calls, too.
2355
2356     if (tso->why_blocked == BlockedOnCCall ||
2357         tso->why_blocked == BlockedOnCCall_Interruptible) {
2358         tso->what_next = ThreadKilled;
2359         appendToRunQueue(tso->cap, tso);
2360     } else {
2361         deleteThread(cap,tso);
2362     }
2363 }
2364 #endif
2365
2366 /* -----------------------------------------------------------------------------
2367    raiseExceptionHelper
2368
2369    This function is called by the raise# primitve, just so that we can
2370    move some of the tricky bits of raising an exception from C-- into
2371    C.  Who knows, it might be a useful re-useable thing here too.
2372    -------------------------------------------------------------------------- */
2373
2374 StgWord
2375 raiseExceptionHelper (StgRegTable *reg, StgTSO *tso, StgClosure *exception)
2376 {
2377     Capability *cap = regTableToCapability(reg);
2378     StgThunk *raise_closure = NULL;
2379     StgPtr p, next;
2380     StgRetInfoTable *info;
2381     //
2382     // This closure represents the expression 'raise# E' where E
2383     // is the exception raise.  It is used to overwrite all the
2384     // thunks which are currently under evaluataion.
2385     //
2386
2387     // OLD COMMENT (we don't have MIN_UPD_SIZE now):
2388     // LDV profiling: stg_raise_info has THUNK as its closure
2389     // type. Since a THUNK takes at least MIN_UPD_SIZE words in its
2390     // payload, MIN_UPD_SIZE is more approprate than 1.  It seems that
2391     // 1 does not cause any problem unless profiling is performed.
2392     // However, when LDV profiling goes on, we need to linearly scan
2393     // small object pool, where raise_closure is stored, so we should
2394     // use MIN_UPD_SIZE.
2395     //
2396     // raise_closure = (StgClosure *)RET_STGCALL1(P_,allocate,
2397     //                                 sizeofW(StgClosure)+1);
2398     //
2399
2400     //
2401     // Walk up the stack, looking for the catch frame.  On the way,
2402     // we update any closures pointed to from update frames with the
2403     // raise closure that we just built.
2404     //
2405     p = tso->sp;
2406     while(1) {
2407         info = get_ret_itbl((StgClosure *)p);
2408         next = p + stack_frame_sizeW((StgClosure *)p);
2409         switch (info->i.type) {
2410
2411         case UPDATE_FRAME:
2412             // Only create raise_closure if we need to.
2413             if (raise_closure == NULL) {
2414                 raise_closure =
2415                     (StgThunk *)allocate(cap,sizeofW(StgThunk)+1);
2416                 SET_HDR(raise_closure, &stg_raise_info, CCCS);
2417                 raise_closure->payload[0] = exception;
2418             }
2419             updateThunk(cap, tso, ((StgUpdateFrame *)p)->updatee,
2420                         (StgClosure *)raise_closure);
2421             p = next;
2422             continue;
2423
2424         case ATOMICALLY_FRAME:
2425             debugTrace(DEBUG_stm, "found ATOMICALLY_FRAME at %p", p);
2426             tso->sp = p;
2427             return ATOMICALLY_FRAME;
2428
2429         case CATCH_FRAME:
2430             tso->sp = p;
2431             return CATCH_FRAME;
2432
2433         case CATCH_STM_FRAME:
2434             debugTrace(DEBUG_stm, "found CATCH_STM_FRAME at %p", p);
2435             tso->sp = p;
2436             return CATCH_STM_FRAME;
2437
2438         case STOP_FRAME:
2439             tso->sp = p;
2440             return STOP_FRAME;
2441
2442         case CATCH_RETRY_FRAME:
2443         default:
2444             p = next;
2445             continue;
2446         }
2447     }
2448 }
2449
2450
2451 /* -----------------------------------------------------------------------------
2452    findRetryFrameHelper
2453
2454    This function is called by the retry# primitive.  It traverses the stack
2455    leaving tso->sp referring to the frame which should handle the retry.
2456
2457    This should either be a CATCH_RETRY_FRAME (if the retry# is within an orElse#)
2458    or should be a ATOMICALLY_FRAME (if the retry# reaches the top level).
2459
2460    We skip CATCH_STM_FRAMEs (aborting and rolling back the nested tx that they
2461    create) because retries are not considered to be exceptions, despite the
2462    similar implementation.
2463
2464    We should not expect to see CATCH_FRAME or STOP_FRAME because those should
2465    not be created within memory transactions.
2466    -------------------------------------------------------------------------- */
2467
2468 StgWord
2469 findRetryFrameHelper (StgTSO *tso)
2470 {
2471   StgPtr           p, next;
2472   StgRetInfoTable *info;
2473
2474   p = tso -> sp;
2475   while (1) {
2476     info = get_ret_itbl((StgClosure *)p);
2477     next = p + stack_frame_sizeW((StgClosure *)p);
2478     switch (info->i.type) {
2479
2480     case ATOMICALLY_FRAME:
2481         debugTrace(DEBUG_stm,
2482                    "found ATOMICALLY_FRAME at %p during retry", p);
2483         tso->sp = p;
2484         return ATOMICALLY_FRAME;
2485
2486     case CATCH_RETRY_FRAME:
2487         debugTrace(DEBUG_stm,
2488                    "found CATCH_RETRY_FRAME at %p during retrry", p);
2489         tso->sp = p;
2490         return CATCH_RETRY_FRAME;
2491
2492     case CATCH_STM_FRAME: {
2493         StgTRecHeader *trec = tso -> trec;
2494         StgTRecHeader *outer = trec -> enclosing_trec;
2495         debugTrace(DEBUG_stm,
2496                    "found CATCH_STM_FRAME at %p during retry", p);
2497         debugTrace(DEBUG_stm, "trec=%p outer=%p", trec, outer);
2498         stmAbortTransaction(tso -> cap, trec);
2499         stmFreeAbortedTRec(tso -> cap, trec);
2500         tso -> trec = outer;
2501         p = next;
2502         continue;
2503     }
2504
2505
2506     default:
2507       ASSERT(info->i.type != CATCH_FRAME);
2508       ASSERT(info->i.type != STOP_FRAME);
2509       p = next;
2510       continue;
2511     }
2512   }
2513 }
2514
2515 /* -----------------------------------------------------------------------------
2516    resurrectThreads is called after garbage collection on the list of
2517    threads found to be garbage.  Each of these threads will be woken
2518    up and sent a signal: BlockedOnDeadMVar if the thread was blocked
2519    on an MVar, or NonTermination if the thread was blocked on a Black
2520    Hole.
2521
2522    Locks: assumes we hold *all* the capabilities.
2523    -------------------------------------------------------------------------- */
2524
2525 void
2526 resurrectThreads (StgTSO *threads)
2527 {
2528     StgTSO *tso, *next;
2529     Capability *cap;
2530     generation *gen;
2531
2532     for (tso = threads; tso != END_TSO_QUEUE; tso = next) {
2533         next = tso->global_link;
2534
2535         gen = Bdescr((P_)tso)->gen;
2536         tso->global_link = gen->threads;
2537         gen->threads = tso;
2538
2539         debugTrace(DEBUG_sched, "resurrecting thread %lu", (unsigned long)tso->id);
2540
2541         // Wake up the thread on the Capability it was last on
2542         cap = tso->cap;
2543
2544         switch (tso->why_blocked) {
2545         case BlockedOnMVar:
2546             /* Called by GC - sched_mutex lock is currently held. */
2547             throwToSingleThreaded(cap, tso,
2548                                   (StgClosure *)blockedIndefinitelyOnMVar_closure);
2549             break;
2550         case BlockedOnBlackHole:
2551             throwToSingleThreaded(cap, tso,
2552                                   (StgClosure *)nonTermination_closure);
2553             break;
2554         case BlockedOnSTM:
2555             throwToSingleThreaded(cap, tso,
2556                                   (StgClosure *)blockedIndefinitelyOnSTM_closure);
2557             break;
2558         case NotBlocked:
2559             /* This might happen if the thread was blocked on a black hole
2560              * belonging to a thread that we've just woken up (raiseAsync
2561              * can wake up threads, remember...).
2562              */
2563             continue;
2564         case BlockedOnMsgThrowTo:
2565             // This can happen if the target is masking, blocks on a
2566             // black hole, and then is found to be unreachable.  In
2567             // this case, we want to let the target wake up and carry
2568             // on, and do nothing to this thread.
2569             continue;
2570         default:
2571             barf("resurrectThreads: thread blocked in a strange way: %d",
2572                  tso->why_blocked);
2573         }
2574     }
2575 }