[project @ 2005-04-05 12:19:54 by simonmar]
[ghc-hetmet.git] / ghc / rts / Schedule.c
index 5ca5666..6e363a6 100644 (file)
@@ -1,7 +1,6 @@
 /* ---------------------------------------------------------------------------
 /* ---------------------------------------------------------------------------
- * $Id: Schedule.c,v 1.188 2004/02/26 16:19:32 simonmar Exp $
  *
  *
- * (c) The GHC Team, 1998-2003
+ * (c) The GHC Team, 1998-2004
  *
  * Scheduler
  *
  *
  * Scheduler
  *
@@ -10,7 +9,7 @@
  *
  * WAY  Name     CPP flag  What's it for
  * --------------------------------------
  *
  * WAY  Name     CPP flag  What's it for
  * --------------------------------------
- * mp   GUM      PAR          Parallel execution on a distrib. memory machine
+ * mp   GUM      PARALLEL_HASKELL          Parallel execution on a distrib. memory machine
  * s    SMP      SMP          Parallel execution on a shared memory machine
  * mg   GranSim  GRAN         Simulation of parallel execution
  * md   GUM/GdH  DIST         Distributed execution (based on GUM)
  * s    SMP      SMP          Parallel execution on a shared memory machine
  * mg   GranSim  GRAN         Simulation of parallel execution
  * md   GUM/GdH  DIST         Distributed execution (based on GUM)
 #include "SchedAPI.h"
 #include "RtsUtils.h"
 #include "RtsFlags.h"
 #include "SchedAPI.h"
 #include "RtsUtils.h"
 #include "RtsFlags.h"
+#include "BlockAlloc.h"
+#include "OSThreads.h"
 #include "Storage.h"
 #include "StgRun.h"
 #include "Storage.h"
 #include "StgRun.h"
-#include "StgStartup.h"
 #include "Hooks.h"
 #define COMPILING_SCHEDULER
 #include "Schedule.h"
 #include "StgMiscClosures.h"
 #include "Hooks.h"
 #define COMPILING_SCHEDULER
 #include "Schedule.h"
 #include "StgMiscClosures.h"
-#include "Storage.h"
 #include "Interpreter.h"
 #include "Exception.h"
 #include "Printer.h"
 #include "Signals.h"
 #include "Sanity.h"
 #include "Stats.h"
 #include "Interpreter.h"
 #include "Exception.h"
 #include "Printer.h"
 #include "Signals.h"
 #include "Sanity.h"
 #include "Stats.h"
+#include "STM.h"
 #include "Timer.h"
 #include "Prelude.h"
 #include "ThreadLabels.h"
 #include "Timer.h"
 #include "Prelude.h"
 #include "ThreadLabels.h"
+#include "LdvProfile.h"
+#include "Updates.h"
 #ifdef PROFILING
 #include "Proftimer.h"
 #include "ProfHeap.h"
 #endif
 #ifdef PROFILING
 #include "Proftimer.h"
 #include "ProfHeap.h"
 #endif
-#if defined(GRAN) || defined(PAR)
+#if defined(GRAN) || defined(PARALLEL_HASKELL)
 # include "GranSimRts.h"
 # include "GranSim.h"
 # include "ParallelRts.h"
 # include "GranSimRts.h"
 # include "GranSim.h"
 # include "ParallelRts.h"
@@ -74,7 +76,6 @@
 #endif
 #include "Sparks.h"
 #include "Capability.h"
 #endif
 #include "Sparks.h"
 #include "Capability.h"
-#include "OSThreads.h"
 #include  "Task.h"
 
 #ifdef HAVE_SYS_TYPES_H
 #include  "Task.h"
 
 #ifdef HAVE_SYS_TYPES_H
 #include <errno.h>
 #endif
 
 #include <errno.h>
 #endif
 
+// Turn off inlining when debugging - it obfuscates things
+#ifdef DEBUG
+# undef  STATIC_INLINE
+# define STATIC_INLINE static
+#endif
+
 #ifdef THREADED_RTS
 #define USED_IN_THREADED_RTS
 #else
 #ifdef THREADED_RTS
 #define USED_IN_THREADED_RTS
 #else
  */
 StgMainThread *main_threads = NULL;
 
  */
 StgMainThread *main_threads = NULL;
 
-/* Thread queues.
- * Locks required: sched_mutex.
- */
 #if defined(GRAN)
 
 StgTSO* ActiveTSO = NULL; /* for assigning system costs; GranSim-Light only */
 #if defined(GRAN)
 
 StgTSO* ActiveTSO = NULL; /* for assigning system costs; GranSim-Light only */
@@ -134,14 +138,23 @@ StgTSO *ccalling_threadss[MAX_PROC];
 
 #else /* !GRAN */
 
 
 #else /* !GRAN */
 
+/* Thread queues.
+ * Locks required: sched_mutex.
+ */
 StgTSO *run_queue_hd = NULL;
 StgTSO *run_queue_tl = NULL;
 StgTSO *blocked_queue_hd = NULL;
 StgTSO *blocked_queue_tl = NULL;
 StgTSO *run_queue_hd = NULL;
 StgTSO *run_queue_tl = NULL;
 StgTSO *blocked_queue_hd = NULL;
 StgTSO *blocked_queue_tl = NULL;
+StgTSO *blackhole_queue = NULL;
 StgTSO *sleeping_queue = NULL;    /* perhaps replace with a hash table? */
 
 #endif
 
 StgTSO *sleeping_queue = NULL;    /* perhaps replace with a hash table? */
 
 #endif
 
+/* The blackhole_queue should be checked for threads to wake up.  See
+ * Schedule.h for more thorough comment.
+ */
+rtsBool blackholes_need_checking = rtsFalse;
+
 /* Linked list of all threads.
  * Used for detecting garbage collected threads.
  */
 /* Linked list of all threads.
  * Used for detecting garbage collected threads.
  */
@@ -153,19 +166,22 @@ StgTSO *all_threads = NULL;
  */
 static StgTSO *suspended_ccalling_threads;
 
  */
 static StgTSO *suspended_ccalling_threads;
 
-static StgTSO *threadStackOverflow(StgTSO *tso);
-
 /* KH: The following two flags are shared memory locations.  There is no need
        to lock them, since they are only unset at the end of a scheduler
        operation.
 */
 
 /* flag set by signal handler to precipitate a context switch */
 /* KH: The following two flags are shared memory locations.  There is no need
        to lock them, since they are only unset at the end of a scheduler
        operation.
 */
 
 /* flag set by signal handler to precipitate a context switch */
-nat context_switch = 0;
+int context_switch = 0;
 
 /* if this flag is set as well, give up execution */
 rtsBool interrupted = rtsFalse;
 
 
 /* if this flag is set as well, give up execution */
 rtsBool interrupted = rtsFalse;
 
+/* If this flag is set, we are running Haskell code.  Used to detect
+ * uses of 'foreign import unsafe' that should be 'safe'.
+ */
+static rtsBool in_haskell = rtsFalse;
+
 /* Next thread ID to allocate.
  * Locks required: thread_id_mutex
  */
 /* Next thread ID to allocate.
  * Locks required: thread_id_mutex
  */
@@ -201,6 +217,10 @@ StgTSO *CurrentTSO;
  */
 StgTSO dummy_tso;
 
  */
 StgTSO dummy_tso;
 
+# if defined(SMP)
+static Condition gc_pending_cond = INIT_COND_VAR;
+# endif
+
 static rtsBool ready_to_gc;
 
 /*
 static rtsBool ready_to_gc;
 
 /*
@@ -210,13 +230,6 @@ static rtsBool ready_to_gc;
  */
 static rtsBool shutting_down_scheduler = rtsFalse;
 
  */
 static rtsBool shutting_down_scheduler = rtsFalse;
 
-void            addToBlockedQueue ( StgTSO *tso );
-
-static void     schedule          ( StgMainThread *mainThread, Capability *initialCapability );
-       void     interruptStgRts   ( void );
-
-static void     detectBlackHoles  ( void );
-
 #if defined(RTS_SUPPORTS_THREADS)
 /* ToDo: carefully document the invariants that go together
  *       with these synchronisation objects.
 #if defined(RTS_SUPPORTS_THREADS)
 /* ToDo: carefully document the invariants that go together
  *       with these synchronisation objects.
@@ -226,7 +239,7 @@ Mutex     term_mutex        = INIT_MUTEX_VAR;
 
 #endif /* RTS_SUPPORTS_THREADS */
 
 
 #endif /* RTS_SUPPORTS_THREADS */
 
-#if defined(PAR)
+#if defined(PARALLEL_HASKELL)
 StgTSO *LastTSO;
 rtsTime TimeOfLastYield;
 rtsBool emitSchedule = rtsTrue;
 StgTSO *LastTSO;
 rtsTime TimeOfLastYield;
 rtsBool emitSchedule = rtsTrue;
@@ -234,6 +247,7 @@ rtsBool emitSchedule = rtsTrue;
 
 #if DEBUG
 static char *whatNext_strs[] = {
 
 #if DEBUG
 static char *whatNext_strs[] = {
+  "(unknown)",
   "ThreadRunGHC",
   "ThreadInterpret",
   "ThreadKilled",
   "ThreadRunGHC",
   "ThreadInterpret",
   "ThreadKilled",
@@ -242,7 +256,66 @@ static char *whatNext_strs[] = {
 };
 #endif
 
 };
 #endif
 
-#if defined(PAR)
+/* -----------------------------------------------------------------------------
+ * static function prototypes
+ * -------------------------------------------------------------------------- */
+
+#if defined(RTS_SUPPORTS_THREADS)
+static void taskStart(void);
+#endif
+
+static void schedule( StgMainThread *mainThread USED_WHEN_RTS_SUPPORTS_THREADS,
+                     Capability *initialCapability );
+
+//
+// These function all encapsulate parts of the scheduler loop, and are
+// abstracted only to make the structure and control flow of the
+// scheduler clearer.
+//
+static void schedulePreLoop(void);
+static void scheduleHandleInterrupt(void);
+static void scheduleStartSignalHandlers(void);
+static void scheduleCheckBlockedThreads(void);
+static void scheduleCheckBlackHoles(void);
+static void scheduleDetectDeadlock(void);
+#if defined(GRAN)
+static StgTSO *scheduleProcessEvent(rtsEvent *event);
+#endif
+#if defined(PARALLEL_HASKELL)
+static StgTSO *scheduleSendPendingMessages(void);
+static void scheduleActivateSpark(void);
+static rtsBool scheduleGetRemoteWork(rtsBool *receivedFinish);
+#endif
+#if defined(PAR) || defined(GRAN)
+static void scheduleGranParReport(void);
+#endif
+static void schedulePostRunThread(void);
+static rtsBool scheduleHandleHeapOverflow( Capability *cap, StgTSO *t );
+static void scheduleHandleStackOverflow( StgTSO *t);
+static rtsBool scheduleHandleYield( StgTSO *t, nat prev_what_next );
+static void scheduleHandleThreadBlocked( StgTSO *t );
+static rtsBool scheduleHandleThreadFinished( StgMainThread *mainThread, 
+                                            Capability *cap, StgTSO *t );
+static void scheduleDoHeapProfile(void);
+static void scheduleDoGC(void);
+
+static void unblockThread(StgTSO *tso);
+static rtsBool checkBlackHoles(void);
+static SchedulerStatus waitThread_(/*out*/StgMainThread* m,
+                                  Capability *initialCapability
+                                  );
+static void scheduleThread_ (StgTSO* tso);
+static void AllRoots(evac_fn evac);
+
+static StgTSO *threadStackOverflow(StgTSO *tso);
+
+static void raiseAsync_(StgTSO *tso, StgClosure *exception, 
+                       rtsBool stop_at_atomically);
+
+static void printThreadBlockage(StgTSO *tso);
+static void printThreadStatus(StgTSO *tso);
+
+#if defined(PARALLEL_HASKELL)
 StgTSO * createSparkThread(rtsSpark spark);
 StgTSO * activateSpark (rtsSpark spark);  
 #endif
 StgTSO * createSparkThread(rtsSpark spark);
 StgTSO * activateSpark (rtsSpark spark);  
 #endif
@@ -254,11 +327,11 @@ StgTSO * activateSpark (rtsSpark spark);
 #if defined(RTS_SUPPORTS_THREADS)
 static rtsBool startingWorkerThread = rtsFalse;
 
 #if defined(RTS_SUPPORTS_THREADS)
 static rtsBool startingWorkerThread = rtsFalse;
 
-static void taskStart(void);
 static void
 taskStart(void)
 {
   ACQUIRE_LOCK(&sched_mutex);
 static void
 taskStart(void)
 {
   ACQUIRE_LOCK(&sched_mutex);
+  startingWorkerThread = rtsFalse;
   schedule(NULL,NULL);
   RELEASE_LOCK(&sched_mutex);
 }
   schedule(NULL,NULL);
   RELEASE_LOCK(&sched_mutex);
 }
@@ -275,12 +348,35 @@ startSchedulerTaskIfNecessary(void)
       // just because the last one hasn't yet reached the
       // "waiting for capability" state
       startingWorkerThread = rtsTrue;
       // just because the last one hasn't yet reached the
       // "waiting for capability" state
       startingWorkerThread = rtsTrue;
-      startTask(taskStart);
+      if (!startTask(taskStart)) {
+         startingWorkerThread = rtsFalse;
+      }
     }
   }
 }
 #endif
 
     }
   }
 }
 #endif
 
+/* -----------------------------------------------------------------------------
+ * Putting a thread on the run queue: different scheduling policies
+ * -------------------------------------------------------------------------- */
+
+STATIC_INLINE void
+addToRunQueue( StgTSO *t )
+{
+#if defined(PARALLEL_HASKELL)
+    if (RtsFlags.ParFlags.doFairScheduling) { 
+       // this does round-robin scheduling; good for concurrency
+       APPEND_TO_RUN_QUEUE(t);
+    } else {
+       // this does unfair scheduling; good for parallelism
+       PUSH_ON_RUN_QUEUE(t);
+    }
+#else
+    // this does round-robin scheduling; good for concurrency
+    APPEND_TO_RUN_QUEUE(t);
+#endif
+}
+    
 /* ---------------------------------------------------------------------------
    Main scheduling loop.
 
 /* ---------------------------------------------------------------------------
    Main scheduling loop.
 
@@ -316,18 +412,17 @@ startSchedulerTaskIfNecessary(void)
      This is not the ugliest code you could imagine, but it's bloody close.
 
    ------------------------------------------------------------------------ */
      This is not the ugliest code you could imagine, but it's bloody close.
 
    ------------------------------------------------------------------------ */
+
 static void
 schedule( StgMainThread *mainThread USED_WHEN_RTS_SUPPORTS_THREADS,
           Capability *initialCapability )
 {
   StgTSO *t;
 static void
 schedule( StgMainThread *mainThread USED_WHEN_RTS_SUPPORTS_THREADS,
           Capability *initialCapability )
 {
   StgTSO *t;
-  Capability *cap = initialCapability;
+  Capability *cap;
   StgThreadReturnCode ret;
 #if defined(GRAN)
   rtsEvent *event;
   StgThreadReturnCode ret;
 #if defined(GRAN)
   rtsEvent *event;
-#elif defined(PAR)
-  StgSparkPool *pool;
-  rtsSpark spark;
+#elif defined(PARALLEL_HASKELL)
   StgTSO *tso;
   GlobalTaskId pe;
   rtsBool receivedFinish = rtsFalse;
   StgTSO *tso;
   GlobalTaskId pe;
   rtsBool receivedFinish = rtsFalse;
@@ -335,62 +430,58 @@ schedule( StgMainThread *mainThread USED_WHEN_RTS_SUPPORTS_THREADS,
   nat tp_size, sp_size; // stats only
 # endif
 #endif
   nat tp_size, sp_size; // stats only
 # endif
 #endif
-  rtsBool was_interrupted = rtsFalse;
-  StgTSOWhatNext prev_what_next;
+  nat prev_what_next;
   
   // Pre-condition: sched_mutex is held.
   
   // Pre-condition: sched_mutex is held.
-#if defined(RTS_SUPPORTS_THREADS)
-  //
-  // in the threaded case, the capability is either passed in via the
-  // initialCapability parameter, or initialized inside the scheduler
-  // loop 
-  //
-  IF_DEBUG(scheduler,
-          sched_belch("### NEW SCHEDULER LOOP (main thr: %p, cap: %p)",
-                      mainThread, initialCapability);
-      );
-#else
+  // We might have a capability, passed in as initialCapability.
+  cap = initialCapability;
+
+#if !defined(RTS_SUPPORTS_THREADS)
   // simply initialise it in the non-threaded case
   grabCapability(&cap);
 #endif
 
   // simply initialise it in the non-threaded case
   grabCapability(&cap);
 #endif
 
-#if defined(GRAN)
-  /* set up first event to get things going */
-  /* ToDo: assign costs for system setup and init MainTSO ! */
-  new_event(CurrentProc, CurrentProc, CurrentTime[CurrentProc],
-           ContinueThread, 
-           CurrentTSO, (StgClosure*)NULL, (rtsSpark*)NULL);
-
-  IF_DEBUG(gran,
-          fprintf(stderr, "GRAN: Init CurrentTSO (in schedule) = %p\n", CurrentTSO);
-          G_TSO(CurrentTSO, 5));
-
-  if (RtsFlags.GranFlags.Light) {
-    /* Save current time; GranSim Light only */
-    CurrentTSO->gran.clock = CurrentTime[CurrentProc];
-  }      
+  IF_DEBUG(scheduler,
+          sched_belch("### NEW SCHEDULER LOOP (main thr: %p, cap: %p)",
+                      mainThread, initialCapability);
+      );
 
 
-  event = get_next_event();
+  schedulePreLoop();
 
 
-  while (event!=(rtsEvent*)NULL) {
-    /* Choose the processor with the next event */
-    CurrentProc = event->proc;
-    CurrentTSO = event->tso;
+  // -----------------------------------------------------------
+  // Scheduler loop starts here:
 
 
-#elif defined(PAR)
+#if defined(PARALLEL_HASKELL)
+#define TERMINATION_CONDITION        (!receivedFinish)
+#elif defined(GRAN)
+#define TERMINATION_CONDITION        ((event = get_next_event()) != (rtsEvent*)NULL) 
+#else
+#define TERMINATION_CONDITION        rtsTrue
+#endif
 
 
-  while (!receivedFinish) {    /* set by processMessages */
-                               /* when receiving PP_FINISH message         */ 
+  while (TERMINATION_CONDITION) {
 
 
-#else // everything except GRAN and PAR
+#if defined(GRAN)
+      /* Choose the processor with the next event */
+      CurrentProc = event->proc;
+      CurrentTSO = event->tso;
+#endif
 
 
-  while (1) {
+      IF_DEBUG(scheduler, printAllThreads());
 
 
+#if defined(SMP)
+      // 
+      // Wait until GC has completed, if necessary.
+      //
+      if (ready_to_gc) {
+         if (cap != NULL) {
+             releaseCapability(cap);
+             IF_DEBUG(scheduler,sched_belch("waiting for GC"));
+             waitCondition( &gc_pending_cond, &sched_mutex );
+         }
+      }
 #endif
 
 #endif
 
-     IF_DEBUG(scheduler, printAllThreads());
-
 #if defined(RTS_SUPPORTS_THREADS)
       // Yield the capability to higher-priority tasks if necessary.
       //
 #if defined(RTS_SUPPORTS_THREADS)
       // Yield the capability to higher-priority tasks if necessary.
       //
@@ -408,238 +499,466 @@ schedule( StgMainThread *mainThread USED_WHEN_RTS_SUPPORTS_THREADS,
       // We now have a capability...
 #endif
 
       // We now have a capability...
 #endif
 
+    // Check whether we have re-entered the RTS from Haskell without
+    // going via suspendThread()/resumeThread (i.e. a 'safe' foreign
+    // call).
+    if (in_haskell) {
+         errorBelch("schedule: re-entered unsafely.\n"
+                    "   Perhaps a 'foreign import unsafe' should be 'safe'?");
+         stg_exit(1);
+    }
+
+    scheduleHandleInterrupt();
+
+#if defined(not_yet) && defined(SMP)
     //
     //
-    // If we're interrupted (the user pressed ^C, or some other
-    // termination condition occurred), kill all the currently running
-    // threads.
+    // Top up the run queue from our spark pool.  We try to make the
+    // number of threads in the run queue equal to the number of
+    // free capabilities.
     //
     //
-    if (interrupted) {
-       IF_DEBUG(scheduler, sched_belch("interrupted"));
-       interrupted = rtsFalse;
-       was_interrupted = rtsTrue;
-#if defined(RTS_SUPPORTS_THREADS)
-       // In the threaded RTS, deadlock detection doesn't work,
-       // so just exit right away.
-       prog_belch("interrupted");
-       releaseCapability(cap);
-       RELEASE_LOCK(&sched_mutex);
-       shutdownHaskellAndExit(EXIT_SUCCESS);
-#else
-       deleteAllThreads();
-#endif
+    {
+       StgClosure *spark;
+       if (EMPTY_RUN_QUEUE()) {
+           spark = findSpark(rtsFalse);
+           if (spark == NULL) {
+               break; /* no more sparks in the pool */
+           } else {
+               createSparkThread(spark);         
+               IF_DEBUG(scheduler,
+                        sched_belch("==^^ turning spark of closure %p into a thread",
+                                    (StgClosure *)spark));
+           }
+       }
     }
     }
+#endif // SMP
 
 
+    scheduleStartSignalHandlers();
+
+    // Only check the black holes here if we've nothing else to do.
+    // During normal execution, the black hole list only gets checked
+    // at GC time, to avoid repeatedly traversing this possibly long
+    // list each time around the scheduler.
+    if (EMPTY_RUN_QUEUE()) { scheduleCheckBlackHoles(); }
+
+    scheduleCheckBlockedThreads();
+
+    scheduleDetectDeadlock();
+
+    // Normally, the only way we can get here with no threads to
+    // run is if a keyboard interrupt received during 
+    // scheduleCheckBlockedThreads() or scheduleDetectDeadlock().
+    // Additionally, it is not fatal for the
+    // threaded RTS to reach here with no threads to run.
     //
     //
-    // Go through the list of main threads and wake up any
-    // clients whose computations have finished.  ToDo: this
-    // should be done more efficiently without a linear scan
-    // of the main threads list, somehow...
-    //
-#if defined(RTS_SUPPORTS_THREADS)
-    { 
-       StgMainThread *m, **prev;
-       prev = &main_threads;
-       for (m = main_threads; m != NULL; prev = &m->link, m = m->link) {
-         if (m->tso->what_next == ThreadComplete
-             || m->tso->what_next == ThreadKilled)
-         {
-           if (m == mainThread)
-           {
-              if (m->tso->what_next == ThreadComplete)
-              {
-                if (m->ret)
-                {
-                  // NOTE: return val is tso->sp[1] (see StgStartup.hc)
-                  *(m->ret) = (StgClosure *)m->tso->sp[1]; 
-                }
-                m->stat = Success;
-              }
-              else
-              {
-                if (m->ret)
-                {
-                  *(m->ret) = NULL;
-                }
-                if (was_interrupted)
-                {
-                  m->stat = Interrupted;
-                }
-                else
-                {
-                  m->stat = Killed;
-                }
-              }
-              *prev = m->link;
-           
-#ifdef DEBUG
-             removeThreadLabel((StgWord)m->tso->id);
+    // win32: might be here due to awaitEvent() being abandoned
+    // as a result of a console event having been delivered.
+    if ( EMPTY_RUN_QUEUE() ) {
+#if !defined(RTS_SUPPORTS_THREADS) && !defined(mingw32_HOST_OS)
+       ASSERT(interrupted);
 #endif
 #endif
-              releaseCapability(cap);
-              return;
-            }
-            else
-            {
-                // The current OS thread can not handle the fact that
-                // the Haskell thread "m" has ended.  "m" is bound;
-                // the scheduler loop in it's bound OS thread has to
-                // return, so let's pass our capability directly to
-                // that thread.
-               passCapability(&m->bound_thread_cond);
-               continue;
-            }
-          }
-       }
+       continue; // nothing to do
     }
     }
-    
-#else /* not threaded */
 
 
-# if defined(PAR)
-    /* in GUM do this only on the Main PE */
-    if (IAmMainThread)
-# endif
-    /* If our main thread has finished or been killed, return.
-     */
-    {
-      StgMainThread *m = main_threads;
-      if (m->tso->what_next == ThreadComplete
-         || m->tso->what_next == ThreadKilled) {
-#ifdef DEBUG
-       removeThreadLabel((StgWord)m->tso->id);
-#endif
-       main_threads = main_threads->link;
-       if (m->tso->what_next == ThreadComplete) {
-           // We finished successfully, fill in the return value
-           // NOTE: return val is tso->sp[1] (see StgStartup.hc)
-           if (m->ret) { *(m->ret) = (StgClosure *)m->tso->sp[1]; };
-           m->stat = Success;
-           return;
-       } else {
-         if (m->ret) { *(m->ret) = NULL; };
-         if (was_interrupted) {
-           m->stat = Interrupted;
-         } else {
-           m->stat = Killed;
-         }
-         return;
-       }
-      }
+#if defined(PARALLEL_HASKELL)
+    scheduleSendPendingMessages();
+    if (EMPTY_RUN_QUEUE() && scheduleActivateSpark()) 
+       continue;
+
+#if defined(SPARKS)
+    ASSERT(next_fish_to_send_at==0);  // i.e. no delayed fishes left!
+#endif
+
+    /* If we still have no work we need to send a FISH to get a spark
+       from another PE */
+    if (EMPTY_RUN_QUEUE()) {
+       if (!scheduleGetRemoteWork(&receivedFinish)) continue;
+       ASSERT(rtsFalse); // should not happen at the moment
+    }
+    // from here: non-empty run queue.
+    //  TODO: merge above case with this, only one call processMessages() !
+    if (PacketsWaiting()) {  /* process incoming messages, if
+                               any pending...  only in else
+                               because getRemoteWork waits for
+                               messages as well */
+       receivedFinish = processMessages();
     }
 #endif
 
     }
 #endif
 
+#if defined(GRAN)
+    scheduleProcessEvent(event);
+#endif
 
 
-#if defined(RTS_USER_SIGNALS)
-    // check for signals each time around the scheduler
-    if (signals_pending()) {
-      RELEASE_LOCK(&sched_mutex); /* ToDo: kill */
-      startSignalHandlers();
-      ACQUIRE_LOCK(&sched_mutex);
-    }
+    // 
+    // Get a thread to run
+    //
+    ASSERT(run_queue_hd != END_TSO_QUEUE);
+    POP_RUN_QUEUE(t);
+
+#if defined(GRAN) || defined(PAR)
+    scheduleGranParReport(); // some kind of debuging output
+#else
+    // Sanity check the thread we're about to run.  This can be
+    // expensive if there is lots of thread switching going on...
+    IF_DEBUG(sanity,checkTSO(t));
 #endif
 
 #endif
 
-    /* Check whether any waiting threads need to be woken up.  If the
-     * run queue is empty, and there are no other tasks running, we
-     * can wait indefinitely for something to happen.
-     */
-    if ( !EMPTY_QUEUE(blocked_queue_hd) || !EMPTY_QUEUE(sleeping_queue) 
 #if defined(RTS_SUPPORTS_THREADS)
 #if defined(RTS_SUPPORTS_THREADS)
-               || EMPTY_RUN_QUEUE()
-#endif
-        )
+    // Check whether we can run this thread in the current task.
+    // If not, we have to pass our capability to the right task.
     {
     {
-      awaitEvent( EMPTY_RUN_QUEUE() );
+      StgMainThread *m = t->main;
+      
+      if(m)
+      {
+       if(m == mainThread)
+       {
+         IF_DEBUG(scheduler,
+           sched_belch("### Running thread %d in bound thread", t->id));
+         // yes, the Haskell thread is bound to the current native thread
+       }
+       else
+       {
+         IF_DEBUG(scheduler,
+           sched_belch("### thread %d bound to another OS thread", t->id));
+         // no, bound to a different Haskell thread: pass to that thread
+         PUSH_ON_RUN_QUEUE(t);
+         passCapability(&m->bound_thread_cond);
+         continue;
+       }
+      }
+      else
+      {
+       if(mainThread != NULL)
+        // The thread we want to run is bound.
+       {
+         IF_DEBUG(scheduler,
+           sched_belch("### this OS thread cannot run thread %d", t->id));
+         // no, the current native thread is bound to a different
+         // Haskell thread, so pass it to any worker thread
+         PUSH_ON_RUN_QUEUE(t);
+         passCapabilityToWorker();
+         continue; 
+       }
+      }
     }
     }
-    /* we can be interrupted while waiting for I/O... */
-    if (interrupted) continue;
+#endif
 
 
-    /* 
-     * Detect deadlock: when we have no threads to run, there are no
-     * threads waiting on I/O or sleeping, and all the other tasks are
-     * waiting for work, we must have a deadlock of some description.
-     *
-     * We first try to find threads blocked on themselves (ie. black
-     * holes), and generate NonTermination exceptions where necessary.
-     *
-     * If no threads are black holed, we have a deadlock situation, so
-     * inform all the main threads.
+    cap->r.rCurrentTSO = t;
+    
+    /* context switches are now initiated by the timer signal, unless
+     * the user specified "context switch as often as possible", with
+     * +RTS -C0
      */
      */
-#if !defined(PAR) && !defined(RTS_SUPPORTS_THREADS)
-    if (   EMPTY_THREAD_QUEUES() )
-    {
-       IF_DEBUG(scheduler, sched_belch("deadlocked, forcing major GC..."));
-       // Garbage collection can release some new threads due to
-       // either (a) finalizers or (b) threads resurrected because
-       // they are about to be send BlockedOnDeadMVar.  Any threads
-       // thus released will be immediately runnable.
-       GarbageCollect(GetRoots,rtsTrue);
+    if ((RtsFlags.ConcFlags.ctxtSwitchTicks == 0
+        && (run_queue_hd != END_TSO_QUEUE
+            || blocked_queue_hd != END_TSO_QUEUE
+            || sleeping_queue != END_TSO_QUEUE)))
+       context_switch = 1;
 
 
-       if ( !EMPTY_RUN_QUEUE() ) { goto not_deadlocked; }
+run_thread:
 
 
-       IF_DEBUG(scheduler, 
-                sched_belch("still deadlocked, checking for black holes..."));
-       detectBlackHoles();
+    RELEASE_LOCK(&sched_mutex);
 
 
-       if ( !EMPTY_RUN_QUEUE() ) { goto not_deadlocked; }
+    IF_DEBUG(scheduler, sched_belch("-->> running thread %ld %s ...", 
+                             (long)t->id, whatNext_strs[t->what_next]));
 
 
-#if defined(RTS_USER_SIGNALS)
-       /* If we have user-installed signal handlers, then wait
-        * for signals to arrive rather then bombing out with a
-        * deadlock.
-        */
-       if ( anyUserHandlers() ) {
-           IF_DEBUG(scheduler, 
-                    sched_belch("still deadlocked, waiting for signals..."));
+#if defined(PROFILING)
+    startHeapProfTimer();
+#endif
 
 
-           awaitUserSignals();
+    // ----------------------------------------------------------------------
+    // Run the current thread 
 
 
-           // we might be interrupted...
-           if (interrupted) { continue; }
+    prev_what_next = t->what_next;
 
 
-           if (signals_pending()) {
-               RELEASE_LOCK(&sched_mutex);
-               startSignalHandlers();
-               ACQUIRE_LOCK(&sched_mutex);
-           }
-           ASSERT(!EMPTY_RUN_QUEUE());
-           goto not_deadlocked;
-       }
-#endif
+    errno = t->saved_errno;
+    in_haskell = rtsTrue;
 
 
-       /* Probably a real deadlock.  Send the current main thread the
-        * Deadlock exception (or in the SMP build, send *all* main
-        * threads the deadlock exception, since none of them can make
-        * progress).
-        */
-       {
-           StgMainThread *m;
-           m = main_threads;
-           switch (m->tso->why_blocked) {
-           case BlockedOnBlackHole:
-               raiseAsync(m->tso, (StgClosure *)NonTermination_closure);
-               break;
-           case BlockedOnException:
-           case BlockedOnMVar:
-               raiseAsync(m->tso, (StgClosure *)Deadlock_closure);
-               break;
-           default:
-               barf("deadlock: main thread blocked in a strange way");
-           }
-       }
-    }
-  not_deadlocked:
+    switch (prev_what_next) {
 
 
-#elif defined(RTS_SUPPORTS_THREADS)
-    // ToDo: add deadlock detection in threaded RTS
-#elif defined(PAR)
-    // ToDo: add deadlock detection in GUM (similar to SMP) -- HWL
-#endif
+    case ThreadKilled:
+    case ThreadComplete:
+       /* Thread already finished, return to scheduler. */
+       ret = ThreadFinished;
+       break;
 
 
-#if defined(RTS_SUPPORTS_THREADS)
-    if ( EMPTY_RUN_QUEUE() ) {
-      continue; // nothing to do
+    case ThreadRunGHC:
+       ret = StgRun((StgFunPtr) stg_returnToStackTop, &cap->r);
+       break;
+
+    case ThreadInterpret:
+       ret = interpretBCO(cap);
+       break;
+
+    default:
+      barf("schedule: invalid what_next field");
+    }
+
+    // We have run some Haskell code: there might be blackhole-blocked
+    // threads to wake up now.
+    if ( blackhole_queue != END_TSO_QUEUE ) {
+       blackholes_need_checking = rtsTrue;
+    }
+
+    in_haskell = rtsFalse;
+
+    // The TSO might have moved, eg. if it re-entered the RTS and a GC
+    // happened.  So find the new location:
+    t = cap->r.rCurrentTSO;
+
+    // And save the current errno in this thread.
+    t->saved_errno = errno;
+
+    // ----------------------------------------------------------------------
+    
+    /* Costs for the scheduler are assigned to CCS_SYSTEM */
+#if defined(PROFILING)
+    stopHeapProfTimer();
+    CCCS = CCS_SYSTEM;
+#endif
+    
+    ACQUIRE_LOCK(&sched_mutex);
+    
+#if defined(RTS_SUPPORTS_THREADS)
+    IF_DEBUG(scheduler,debugBelch("sched (task %p): ", osThreadId()););
+#elif !defined(GRAN) && !defined(PARALLEL_HASKELL)
+    IF_DEBUG(scheduler,debugBelch("sched: "););
+#endif
+    
+    schedulePostRunThread();
+
+    switch (ret) {
+    case HeapOverflow:
+       ready_to_gc = scheduleHandleHeapOverflow(cap,t);
+       break;
+
+    case StackOverflow:
+       scheduleHandleStackOverflow(t);
+       break;
+
+    case ThreadYielding:
+       if (scheduleHandleYield(t, prev_what_next)) {
+            // shortcut for switching between compiler/interpreter:
+           goto run_thread; 
+       }
+       break;
+
+    case ThreadBlocked:
+       scheduleHandleThreadBlocked(t);
+       threadPaused(t);
+       break;
+
+    case ThreadFinished:
+       if (scheduleHandleThreadFinished(mainThread, cap, t)) return;;
+       break;
+
+    default:
+      barf("schedule: invalid thread return code %d", (int)ret);
+    }
+
+    scheduleDoHeapProfile();
+    scheduleDoGC();
+  } /* end of while() */
+
+  IF_PAR_DEBUG(verbose,
+              debugBelch("== Leaving schedule() after having received Finish\n"));
+}
+
+/* ----------------------------------------------------------------------------
+ * Setting up the scheduler loop
+ * ASSUMES: sched_mutex
+ * ------------------------------------------------------------------------- */
+
+static void
+schedulePreLoop(void)
+{
+#if defined(GRAN) 
+    /* set up first event to get things going */
+    /* ToDo: assign costs for system setup and init MainTSO ! */
+    new_event(CurrentProc, CurrentProc, CurrentTime[CurrentProc],
+             ContinueThread, 
+             CurrentTSO, (StgClosure*)NULL, (rtsSpark*)NULL);
+    
+    IF_DEBUG(gran,
+            debugBelch("GRAN: Init CurrentTSO (in schedule) = %p\n", 
+                       CurrentTSO);
+            G_TSO(CurrentTSO, 5));
+    
+    if (RtsFlags.GranFlags.Light) {
+       /* Save current time; GranSim Light only */
+       CurrentTSO->gran.clock = CurrentTime[CurrentProc];
+    }      
+#endif
+}
+
+/* ----------------------------------------------------------------------------
+ * Deal with the interrupt flag
+ * ASSUMES: sched_mutex
+ * ------------------------------------------------------------------------- */
+
+static
+void scheduleHandleInterrupt(void)
+{
+    //
+    // Test for interruption.  If interrupted==rtsTrue, then either
+    // we received a keyboard interrupt (^C), or the scheduler is
+    // trying to shut down all the tasks (shutting_down_scheduler) in
+    // the threaded RTS.
+    //
+    if (interrupted) {
+       if (shutting_down_scheduler) {
+           IF_DEBUG(scheduler, sched_belch("shutting down"));
+#if defined(RTS_SUPPORTS_THREADS)
+           shutdownThread();
+#endif
+       } else {
+           IF_DEBUG(scheduler, sched_belch("interrupted"));
+           deleteAllThreads();
+       }
+    }
+}
+
+/* ----------------------------------------------------------------------------
+ * Start any pending signal handlers
+ * ASSUMES: sched_mutex
+ * ------------------------------------------------------------------------- */
+
+static void
+scheduleStartSignalHandlers(void)
+{
+#if defined(RTS_USER_SIGNALS)
+    if (signals_pending()) {
+      RELEASE_LOCK(&sched_mutex); /* ToDo: kill */
+      startSignalHandlers();
+      ACQUIRE_LOCK(&sched_mutex);
+    }
+#endif
+}
+
+/* ----------------------------------------------------------------------------
+ * Check for blocked threads that can be woken up.
+ * ASSUMES: sched_mutex
+ * ------------------------------------------------------------------------- */
+
+static void
+scheduleCheckBlockedThreads(void)
+{
+    //
+    // Check whether any waiting threads need to be woken up.  If the
+    // run queue is empty, and there are no other tasks running, we
+    // can wait indefinitely for something to happen.
+    //
+    if ( !EMPTY_QUEUE(blocked_queue_hd) || !EMPTY_QUEUE(sleeping_queue) )
+    {
+#if defined(RTS_SUPPORTS_THREADS)
+       // We shouldn't be here...
+       barf("schedule: awaitEvent() in threaded RTS");
+#endif
+       awaitEvent( EMPTY_RUN_QUEUE() && !blackholes_need_checking );
+    }
+}
+
+
+/* ----------------------------------------------------------------------------
+ * Check for threads blocked on BLACKHOLEs that can be woken up
+ * ASSUMES: sched_mutex
+ * ------------------------------------------------------------------------- */
+static void
+scheduleCheckBlackHoles( void )
+{
+    if ( blackholes_need_checking )
+    {
+       checkBlackHoles();
+       blackholes_need_checking = rtsFalse;
     }
     }
+}
+
+/* ----------------------------------------------------------------------------
+ * Detect deadlock conditions and attempt to resolve them.
+ * ASSUMES: sched_mutex
+ * ------------------------------------------------------------------------- */
+
+static void
+scheduleDetectDeadlock(void)
+{
+    /* 
+     * Detect deadlock: when we have no threads to run, there are no
+     * threads blocked, waiting for I/O, or sleeping, and all the
+     * other tasks are waiting for work, we must have a deadlock of
+     * some description.
+     */
+    if ( EMPTY_THREAD_QUEUES() )
+    {
+#if !defined(PARALLEL_HASKELL) && !defined(RTS_SUPPORTS_THREADS)
+       IF_DEBUG(scheduler, sched_belch("deadlocked, forcing major GC..."));
+
+       // Garbage collection can release some new threads due to
+       // either (a) finalizers or (b) threads resurrected because
+       // they are unreachable and will therefore be sent an
+       // exception.  Any threads thus released will be immediately
+       // runnable.
+       GarbageCollect(GetRoots,rtsTrue);
+       if ( !EMPTY_RUN_QUEUE() ) return;
+
+#if defined(RTS_USER_SIGNALS)
+       /* If we have user-installed signal handlers, then wait
+        * for signals to arrive rather then bombing out with a
+        * deadlock.
+        */
+       if ( anyUserHandlers() ) {
+           IF_DEBUG(scheduler, 
+                    sched_belch("still deadlocked, waiting for signals..."));
+
+           awaitUserSignals();
+
+           if (signals_pending()) {
+               RELEASE_LOCK(&sched_mutex);
+               startSignalHandlers();
+               ACQUIRE_LOCK(&sched_mutex);
+           }
+
+           // either we have threads to run, or we were interrupted:
+           ASSERT(!EMPTY_RUN_QUEUE() || interrupted);
+       }
+#endif
+
+       /* Probably a real deadlock.  Send the current main thread the
+        * Deadlock exception (or in the SMP build, send *all* main
+        * threads the deadlock exception, since none of them can make
+        * progress).
+        */
+       {
+           StgMainThread *m;
+           m = main_threads;
+           switch (m->tso->why_blocked) {
+           case BlockedOnBlackHole:
+           case BlockedOnException:
+           case BlockedOnMVar:
+               raiseAsync(m->tso, (StgClosure *)NonTermination_closure);
+               return;
+           default:
+               barf("deadlock: main thread blocked in a strange way");
+           }
+       }
+
+#elif defined(RTS_SUPPORTS_THREADS)
+    // ToDo: add deadlock detection in threaded RTS
+#elif defined(PARALLEL_HASKELL)
+    // ToDo: add deadlock detection in GUM (similar to SMP) -- HWL
 #endif
 #endif
+    }
+}
+
+/* ----------------------------------------------------------------------------
+ * Process an event (GRAN only)
+ * ------------------------------------------------------------------------- */
 
 #if defined(GRAN)
 
 #if defined(GRAN)
+static StgTSO *
+scheduleProcessEvent(rtsEvent *event)
+{
+    StgTSO *t;
+
     if (RtsFlags.GranFlags.Light)
       GranSimLight_enter_system(event, &ActiveTSO); // adjust ActiveTSO etc
 
     if (RtsFlags.GranFlags.Light)
       GranSimLight_enter_system(event, &ActiveTSO); // adjust ActiveTSO etc
 
@@ -652,13 +971,13 @@ schedule( StgMainThread *mainThread USED_WHEN_RTS_SUPPORTS_THREADS,
     if (!RtsFlags.GranFlags.Light)
       handleIdlePEs();
 
     if (!RtsFlags.GranFlags.Light)
       handleIdlePEs();
 
-    IF_DEBUG(gran, fprintf(stderr, "GRAN: switch by event-type\n"));
+    IF_DEBUG(gran, debugBelch("GRAN: switch by event-type\n"));
 
     /* main event dispatcher in GranSim */
     switch (event->evttype) {
       /* Should just be continuing execution */
     case ContinueThread:
 
     /* main event dispatcher in GranSim */
     switch (event->evttype) {
       /* Should just be continuing execution */
     case ContinueThread:
-      IF_DEBUG(gran, fprintf(stderr, "GRAN: doing ContinueThread\n"));
+      IF_DEBUG(gran, debugBelch("GRAN: doing ContinueThread\n"));
       /* ToDo: check assertion
       ASSERT(run_queue_hd != (StgTSO*)NULL &&
             run_queue_hd != END_TSO_QUEUE);
       /* ToDo: check assertion
       ASSERT(run_queue_hd != (StgTSO*)NULL &&
             run_queue_hd != END_TSO_QUEUE);
@@ -666,25 +985,25 @@ schedule( StgMainThread *mainThread USED_WHEN_RTS_SUPPORTS_THREADS,
       /* Ignore ContinueThreads for fetching threads (if synchr comm) */
       if (!RtsFlags.GranFlags.DoAsyncFetch &&
          procStatus[CurrentProc]==Fetching) {
       /* Ignore ContinueThreads for fetching threads (if synchr comm) */
       if (!RtsFlags.GranFlags.DoAsyncFetch &&
          procStatus[CurrentProc]==Fetching) {
-       belch("ghuH: Spurious ContinueThread while Fetching ignored; TSO %d (%p) [PE %d]",
+       debugBelch("ghuH: Spurious ContinueThread while Fetching ignored; TSO %d (%p) [PE %d]\n",
              CurrentTSO->id, CurrentTSO, CurrentProc);
        goto next_thread;
       }        
       /* Ignore ContinueThreads for completed threads */
       if (CurrentTSO->what_next == ThreadComplete) {
              CurrentTSO->id, CurrentTSO, CurrentProc);
        goto next_thread;
       }        
       /* Ignore ContinueThreads for completed threads */
       if (CurrentTSO->what_next == ThreadComplete) {
-       belch("ghuH: found a ContinueThread event for completed thread %d (%p) [PE %d] (ignoring ContinueThread)", 
+       debugBelch("ghuH: found a ContinueThread event for completed thread %d (%p) [PE %d] (ignoring ContinueThread)\n", 
              CurrentTSO->id, CurrentTSO, CurrentProc);
        goto next_thread;
       }        
       /* Ignore ContinueThreads for threads that are being migrated */
       if (PROCS(CurrentTSO)==Nowhere) { 
              CurrentTSO->id, CurrentTSO, CurrentProc);
        goto next_thread;
       }        
       /* Ignore ContinueThreads for threads that are being migrated */
       if (PROCS(CurrentTSO)==Nowhere) { 
-       belch("ghuH: trying to run the migrating TSO %d (%p) [PE %d] (ignoring ContinueThread)",
+       debugBelch("ghuH: trying to run the migrating TSO %d (%p) [PE %d] (ignoring ContinueThread)\n",
              CurrentTSO->id, CurrentTSO, CurrentProc);
        goto next_thread;
       }
       /* The thread should be at the beginning of the run queue */
       if (CurrentTSO!=run_queue_hds[CurrentProc]) { 
              CurrentTSO->id, CurrentTSO, CurrentProc);
        goto next_thread;
       }
       /* The thread should be at the beginning of the run queue */
       if (CurrentTSO!=run_queue_hds[CurrentProc]) { 
-       belch("ghuH: TSO %d (%p) [PE %d] is not at the start of the run_queue when doing a ContinueThread",
+       debugBelch("ghuH: TSO %d (%p) [PE %d] is not at the start of the run_queue when doing a ContinueThread\n",
              CurrentTSO->id, CurrentTSO, CurrentProc);
        break; // run the thread anyway
       }
              CurrentTSO->id, CurrentTSO, CurrentProc);
        break; // run the thread anyway
       }
@@ -741,14 +1060,14 @@ schedule( StgMainThread *mainThread USED_WHEN_RTS_SUPPORTS_THREADS,
     
     /* This point was scheduler_loop in the old RTS */
 
     
     /* This point was scheduler_loop in the old RTS */
 
-    IF_DEBUG(gran, belch("GRAN: after main switch"));
+    IF_DEBUG(gran, debugBelch("GRAN: after main switch\n"));
 
     TimeOfLastEvent = CurrentTime[CurrentProc];
     TimeOfNextEvent = get_time_of_next_event();
     IgnoreEvents=(TimeOfNextEvent==0); // HWL HACK
     // CurrentTSO = ThreadQueueHd;
 
 
     TimeOfLastEvent = CurrentTime[CurrentProc];
     TimeOfNextEvent = get_time_of_next_event();
     IgnoreEvents=(TimeOfNextEvent==0); // HWL HACK
     // CurrentTSO = ThreadQueueHd;
 
-    IF_DEBUG(gran, belch("GRAN: time of next event is: %ld", 
+    IF_DEBUG(gran, debugBelch("GRAN: time of next event is: %ld\n", 
                         TimeOfNextEvent));
 
     if (RtsFlags.GranFlags.Light) 
                         TimeOfNextEvent));
 
     if (RtsFlags.GranFlags.Light) 
@@ -757,7 +1076,7 @@ schedule( StgMainThread *mainThread USED_WHEN_RTS_SUPPORTS_THREADS,
     EndOfTimeSlice = CurrentTime[CurrentProc]+RtsFlags.GranFlags.time_slice;
 
     IF_DEBUG(gran, 
     EndOfTimeSlice = CurrentTime[CurrentProc]+RtsFlags.GranFlags.time_slice;
 
     IF_DEBUG(gran, 
-            belch("GRAN: end of time-slice is %#lx", EndOfTimeSlice));
+            debugBelch("GRAN: end of time-slice is %#lx\n", EndOfTimeSlice));
 
     /* in a GranSim setup the TSO stays on the run queue */
     t = CurrentTSO;
 
     /* in a GranSim setup the TSO stays on the run queue */
     t = CurrentTSO;
@@ -765,7 +1084,7 @@ schedule( StgMainThread *mainThread USED_WHEN_RTS_SUPPORTS_THREADS,
     POP_RUN_QUEUE(t); // take_off_run_queue(t);
 
     IF_DEBUG(gran, 
     POP_RUN_QUEUE(t); // take_off_run_queue(t);
 
     IF_DEBUG(gran, 
-            fprintf(stderr, "GRAN: About to run current thread, which is\n");
+            debugBelch("GRAN: About to run current thread, which is\n");
             G_TSO(t,5));
 
     context_switch = 0; // turned on via GranYield, checking events and time slice
             G_TSO(t,5));
 
     context_switch = 0; // turned on via GranYield, checking events and time slice
@@ -774,18 +1093,54 @@ schedule( StgMainThread *mainThread USED_WHEN_RTS_SUPPORTS_THREADS,
             DumpGranEvent(GR_SCHEDULE, t));
 
     procStatus[CurrentProc] = Busy;
             DumpGranEvent(GR_SCHEDULE, t));
 
     procStatus[CurrentProc] = Busy;
+}
+#endif // GRAN
 
 
-#elif defined(PAR)
+/* ----------------------------------------------------------------------------
+ * Send pending messages (PARALLEL_HASKELL only)
+ * ------------------------------------------------------------------------- */
+
+#if defined(PARALLEL_HASKELL)
+static StgTSO *
+scheduleSendPendingMessages(void)
+{
+    StgSparkPool *pool;
+    rtsSpark spark;
+    StgTSO *t;
+
+# if defined(PAR) // global Mem.Mgmt., omit for now
     if (PendingFetches != END_BF_QUEUE) {
         processFetches();
     }
     if (PendingFetches != END_BF_QUEUE) {
         processFetches();
     }
+# endif
+    
+    if (RtsFlags.ParFlags.BufferTime) {
+       // if we use message buffering, we must send away all message
+       // packets which have become too old...
+       sendOldBuffers(); 
+    }
+}
+#endif
+
+/* ----------------------------------------------------------------------------
+ * Activate spark threads (PARALLEL_HASKELL only)
+ * ------------------------------------------------------------------------- */
+
+#if defined(PARALLEL_HASKELL)
+static void
+scheduleActivateSpark(void)
+{
+#if defined(SPARKS)
+  ASSERT(EMPTY_RUN_QUEUE());
+/* We get here if the run queue is empty and want some work.
+   We try to turn a spark into a thread, and add it to the run queue,
+   from where it will be picked up in the next iteration of the scheduler
+   loop.
+*/
 
 
-    /* ToDo: phps merge with spark activation above */
-    /* check whether we have local work and send requests if we have none */
-    if (EMPTY_RUN_QUEUE()) {  /* no runnable threads */
       /* :-[  no local threads => look out for local sparks */
       /* the spark pool for the current PE */
       /* :-[  no local threads => look out for local sparks */
       /* the spark pool for the current PE */
-      pool = &(MainRegTable.rSparks); // generalise to cap = &MainRegTable
+      pool = &(cap.r.rSparks); // JB: cap = (old) MainCap
       if (advisory_thread_count < RtsFlags.ParFlags.maxThreads &&
          pool->hd < pool->tl) {
        /* 
       if (advisory_thread_count < RtsFlags.ParFlags.maxThreads &&
          pool->hd < pool->tl) {
        /* 
@@ -797,29 +1152,73 @@ schedule( StgMainThread *mainThread USED_WHEN_RTS_SUPPORTS_THREADS,
         * thread... 
         */
 
         * thread... 
         */
 
-       spark = findSpark(rtsFalse);                /* get a spark */
-       if (spark != (rtsSpark) NULL) {
-         tso = activateSpark(spark);       /* turn the spark into a thread */
-         IF_PAR_DEBUG(schedule,
-                      belch("==== schedule: Created TSO %d (%p); %d threads active",
-                            tso->id, tso, advisory_thread_count));
-
-         if (tso==END_TSO_QUEUE) { /* failed to activate spark->back to loop */
-           belch("==^^ failed to activate spark");
-           goto next_thread;
-         }               /* otherwise fall through & pick-up new tso */
-       } else {
-         IF_PAR_DEBUG(verbose,
-                      belch("==^^ no local sparks (spark pool contains only NFs: %d)", 
-                            spark_queue_len(pool)));
-         goto next_thread;
+       spark = findSpark(rtsFalse);            /* get a spark */
+       if (spark != (rtsSpark) NULL) {
+         tso = createThreadFromSpark(spark);       /* turn the spark into a thread */
+         IF_PAR_DEBUG(fish, // schedule,
+                      debugBelch("==== schedule: Created TSO %d (%p); %d threads active\n",
+                            tso->id, tso, advisory_thread_count));
+
+         if (tso==END_TSO_QUEUE) { /* failed to activate spark->back to loop */
+           IF_PAR_DEBUG(fish, // schedule,
+                        debugBelch("==^^ failed to create thread from spark @ %lx\n",
+                            spark));
+           return rtsFalse; /* failed to generate a thread */
+         }                  /* otherwise fall through & pick-up new tso */
+       } else {
+         IF_PAR_DEBUG(fish, // schedule,
+                      debugBelch("==^^ no local sparks (spark pool contains only NFs: %d)\n", 
+                            spark_queue_len(pool)));
+         return rtsFalse;  /* failed to generate a thread */
+       }
+       return rtsTrue;  /* success in generating a thread */
+  } else { /* no more threads permitted or pool empty */
+    return rtsFalse;  /* failed to generateThread */
+  }
+#else
+  tso = NULL; // avoid compiler warning only
+  return rtsFalse;  /* dummy in non-PAR setup */
+#endif // SPARKS
+}
+#endif // PARALLEL_HASKELL
+
+/* ----------------------------------------------------------------------------
+ * Get work from a remote node (PARALLEL_HASKELL only)
+ * ------------------------------------------------------------------------- */
+    
+#if defined(PARALLEL_HASKELL)
+static rtsBool
+scheduleGetRemoteWork(rtsBool *receivedFinish)
+{
+  ASSERT(EMPTY_RUN_QUEUE());
+
+  if (RtsFlags.ParFlags.BufferTime) {
+       IF_PAR_DEBUG(verbose, 
+               debugBelch("...send all pending data,"));
+        {
+         nat i;
+         for (i=1; i<=nPEs; i++)
+           sendImmediately(i); // send all messages away immediately
        }
        }
-      }
+  }
+# ifndef SPARKS
+       //++EDEN++ idle() , i.e. send all buffers, wait for work
+       // suppress fishing in EDEN... just look for incoming messages
+       // (blocking receive)
+  IF_PAR_DEBUG(verbose, 
+              debugBelch("...wait for incoming messages...\n"));
+  *receivedFinish = processMessages(); // blocking receive...
+
+       // and reenter scheduling loop after having received something
+       // (return rtsFalse below)
+
+# else /* activate SPARKS machinery */
+/* We get here, if we have no work, tried to activate a local spark, but still
+   have no work. We try to get a remote spark, by sending a FISH message.
+   Thread migration should be added here, and triggered when a sequence of 
+   fishes returns without work. */
+       delay = (RtsFlags.ParFlags.fishDelay!=0ll ? RtsFlags.ParFlags.fishDelay : 0ll);
 
 
-      /* If we still have no work we need to send a FISH to get a spark
-        from another PE 
-      */
-      if (EMPTY_RUN_QUEUE()) {
       /* =8-[  no local sparks => look for work on other PEs */
        /*
         * We really have absolutely no work.  Send out a fish
       /* =8-[  no local sparks => look for work on other PEs */
        /*
         * We really have absolutely no work.  Send out a fish
@@ -829,48 +1228,106 @@ schedule( StgMainThread *mainThread USED_WHEN_RTS_SUPPORTS_THREADS,
         * we're hoping to see.  (Of course, we still have to
         * respond to other types of messages.)
         */
         * we're hoping to see.  (Of course, we still have to
         * respond to other types of messages.)
         */
-       TIME now = msTime() /*CURRENT_TIME*/;
+       rtsTime now = msTime() /*CURRENT_TIME*/;
        IF_PAR_DEBUG(verbose, 
        IF_PAR_DEBUG(verbose, 
-                    belch("--  now=%ld", now));
-       IF_PAR_DEBUG(verbose,
-                    if (outstandingFishes < RtsFlags.ParFlags.maxFishes &&
-                        (last_fish_arrived_at!=0 &&
-                         last_fish_arrived_at+RtsFlags.ParFlags.fishDelay > now)) {
-                      belch("--$$ delaying FISH until %ld (last fish %ld, delay %ld, now %ld)",
-                            last_fish_arrived_at+RtsFlags.ParFlags.fishDelay,
-                            last_fish_arrived_at,
-                            RtsFlags.ParFlags.fishDelay, now);
-                    });
-       
+                    debugBelch("--  now=%ld\n", now));
+       IF_PAR_DEBUG(fish, // verbose,
+            if (outstandingFishes < RtsFlags.ParFlags.maxFishes &&
+                (last_fish_arrived_at!=0 &&
+                 last_fish_arrived_at+delay > now)) {
+              debugBelch("--$$ <%llu> delaying FISH until %llu (last fish %llu, delay %llu)\n",
+                    now, last_fish_arrived_at+delay, 
+                    last_fish_arrived_at,
+                    delay);
+            });
+  
        if (outstandingFishes < RtsFlags.ParFlags.maxFishes &&
        if (outstandingFishes < RtsFlags.ParFlags.maxFishes &&
-           (last_fish_arrived_at==0 ||
-            (last_fish_arrived_at+RtsFlags.ParFlags.fishDelay <= now))) {
-         /* outstandingFishes is set in sendFish, processFish;
-            avoid flooding system with fishes via delay */
-         pe = choosePE();
-         sendFish(pe, mytid, NEW_FISH_AGE, NEW_FISH_HISTORY, 
-                  NEW_FISH_HUNGER);
-
-         // Global statistics: count no. of fishes
-         if (RtsFlags.ParFlags.ParStats.Global &&
-             RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
-           globalParStats.tot_fish_mess++;
-         }
-       }
-      
-       receivedFinish = processMessages();
-       goto next_thread;
+           advisory_thread_count < RtsFlags.ParFlags.maxThreads) { // send a FISH, but when?
+         if (last_fish_arrived_at==0 ||
+             (last_fish_arrived_at+delay <= now)) {           // send FISH now!
+           /* outstandingFishes is set in sendFish, processFish;
+              avoid flooding system with fishes via delay */
+    next_fish_to_send_at = 0;  
+  } else {
+    /* ToDo: this should be done in the main scheduling loop to avoid the
+             busy wait here; not so bad if fish delay is very small  */
+    int iq = 0; // DEBUGGING -- HWL
+    next_fish_to_send_at = last_fish_arrived_at+delay; // remember when to send  
+    /* send a fish when ready, but process messages that arrive in the meantime */
+    do {
+      if (PacketsWaiting()) {
+        iq++; // DEBUGGING
+        *receivedFinish = processMessages();
       }
       }
-    } else if (PacketsWaiting()) {  /* Look for incoming messages */
-      receivedFinish = processMessages();
-    }
+      now = msTime();
+    } while (!*receivedFinish || now<next_fish_to_send_at);
+    // JB: This means the fish could become obsolete, if we receive
+    // work. Better check for work again? 
+    // last line: while (!receivedFinish || !haveWork || now<...)
+    // next line: if (receivedFinish || haveWork )
+
+    if (*receivedFinish) // no need to send a FISH if we are finishing anyway
+      return rtsFalse;  // NB: this will leave scheduler loop
+                       // immediately after return!
+                         
+    IF_PAR_DEBUG(fish, // verbose,
+              debugBelch("--$$ <%llu> sent delayed fish (%d processMessages); active/total threads=%d/%d\n",now,iq,run_queue_len(),advisory_thread_count));
 
 
-    /* Now we are sure that we have some work available */
-    ASSERT(run_queue_hd != END_TSO_QUEUE);
+  }
 
 
-    /* Take a thread from the run queue, if we have work */
-    POP_RUN_QUEUE(t);  // take_off_run_queue(END_TSO_QUEUE);
-    IF_DEBUG(sanity,checkTSO(t));
+    // JB: IMHO, this should all be hidden inside sendFish(...)
+    /* pe = choosePE(); 
+       sendFish(pe, thisPE, NEW_FISH_AGE, NEW_FISH_HISTORY, 
+                NEW_FISH_HUNGER);
+
+    // Global statistics: count no. of fishes
+    if (RtsFlags.ParFlags.ParStats.Global &&
+         RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+          globalParStats.tot_fish_mess++;
+          }
+    */ 
+
+  /* delayed fishes must have been sent by now! */
+  next_fish_to_send_at = 0;  
+  }
+      
+  *receivedFinish = processMessages();
+# endif /* SPARKS */
+
+ return rtsFalse;
+ /* NB: this function always returns rtsFalse, meaning the scheduler
+    loop continues with the next iteration; 
+    rationale: 
+      return code means success in finding work; we enter this function
+      if there is no local work, thus have to send a fish which takes
+      time until it arrives with work; in the meantime we should process
+      messages in the main loop;
+ */
+}
+#endif // PARALLEL_HASKELL
+
+/* ----------------------------------------------------------------------------
+ * PAR/GRAN: Report stats & debugging info(?)
+ * ------------------------------------------------------------------------- */
+
+#if defined(PAR) || defined(GRAN)
+static void
+scheduleGranParReport(void)
+{
+  ASSERT(run_queue_hd != END_TSO_QUEUE);
+
+  /* Take a thread from the run queue, if we have work */
+  POP_RUN_QUEUE(t);  // take_off_run_queue(END_TSO_QUEUE);
+
+    /* If this TSO has got its outport closed in the meantime, 
+     *   it mustn't be run. Instead, we have to clean it up as if it was finished.
+     * It has to be marked as TH_DEAD for this purpose.
+     * If it is TH_TERM instead, it is supposed to have finished in the normal way.
+
+JB: TODO: investigate wether state change field could be nuked
+     entirely and replaced by the normal tso state (whatnext
+     field). All we want to do is to kill tsos from outside.
+     */
 
     /* ToDo: write something to the log-file
     if (RTSflags.ParFlags.granSimStats && !sameThread)
 
     /* ToDo: write something to the log-file
     if (RTSflags.ParFlags.granSimStats && !sameThread)
@@ -879,25 +1336,20 @@ schedule( StgMainThread *mainThread USED_WHEN_RTS_SUPPORTS_THREADS,
     CurrentTSO = t;
     */
     /* the spark pool for the current PE */
     CurrentTSO = t;
     */
     /* the spark pool for the current PE */
-    pool = &(MainRegTable.rSparks); // generalise to cap = &MainRegTable
+    pool = &(cap.r.rSparks); //  cap = (old) MainCap
 
     IF_DEBUG(scheduler, 
 
     IF_DEBUG(scheduler, 
-            belch("--=^ %d threads, %d sparks on [%#x]", 
+            debugBelch("--=^ %d threads, %d sparks on [%#x]\n", 
                   run_queue_len(), spark_queue_len(pool), CURRENT_PROC));
 
                   run_queue_len(), spark_queue_len(pool), CURRENT_PROC));
 
-# if 1
-    if (0 && RtsFlags.ParFlags.ParStats.Full && 
-       t && LastTSO && t->id != LastTSO->id && 
-       LastTSO->why_blocked == NotBlocked && 
-       LastTSO->what_next != ThreadComplete) {
-      // if previously scheduled TSO not blocked we have to record the context switch
-      DumpVeryRawGranEvent(TimeOfLastYield, CURRENT_PROC, CURRENT_PROC,
-                          GR_DESCHEDULE, LastTSO, (StgClosure *)NULL, 0, 0);
-    }
+    IF_PAR_DEBUG(fish,
+            debugBelch("--=^ %d threads, %d sparks on [%#x]\n", 
+                  run_queue_len(), spark_queue_len(pool), CURRENT_PROC));
 
     if (RtsFlags.ParFlags.ParStats.Full && 
 
     if (RtsFlags.ParFlags.ParStats.Full && 
-       (emitSchedule /* forced emit */ ||
-        (t && LastTSO && t->id != LastTSO->id))) {
+       (t->par.sparkname != (StgInt)0) && // only log spark generated threads
+       (emitSchedule || // forced emit
+         (t && LastTSO && t->id != LastTSO->id))) {
       /* 
         we are running a different TSO, so write a schedule event to log file
         NB: If we use fair scheduling we also have to write  a deschedule 
       /* 
         we are running a different TSO, so write a schedule event to log file
         NB: If we use fair scheduling we also have to write  a deschedule 
@@ -905,130 +1357,24 @@ schedule( StgMainThread *mainThread USED_WHEN_RTS_SUPPORTS_THREADS,
             previous tso has blocked whenever we switch to another tso, so
             we don't need it in GUM for now
       */
             previous tso has blocked whenever we switch to another tso, so
             we don't need it in GUM for now
       */
+      IF_PAR_DEBUG(fish, // schedule,
+                  debugBelch("____ scheduling spark generated thread %d (%lx) (%lx) via a forced emit\n",t->id,t,t->par.sparkname));
+
       DumpRawGranEvent(CURRENT_PROC, CURRENT_PROC,
                       GR_SCHEDULE, t, (StgClosure *)NULL, 0, 0);
       emitSchedule = rtsFalse;
     }
       DumpRawGranEvent(CURRENT_PROC, CURRENT_PROC,
                       GR_SCHEDULE, t, (StgClosure *)NULL, 0, 0);
       emitSchedule = rtsFalse;
     }
-     
-# endif
-#else /* !GRAN && !PAR */
-  
-    // grab a thread from the run queue
-    ASSERT(run_queue_hd != END_TSO_QUEUE);
-    POP_RUN_QUEUE(t);
+}     
+#endif
 
 
-    // Sanity check the thread we're about to run.  This can be
-    // expensive if there is lots of thread switching going on...
-    IF_DEBUG(sanity,checkTSO(t));
-#endif
-
-#ifdef THREADED_RTS
-    {
-      StgMainThread *m;
-      for(m = main_threads; m; m = m->link)
-      {
-       if(m->tso == t)
-         break;
-      }
-      
-      if(m)
-      {
-       if(m == mainThread)
-       {
-         IF_DEBUG(scheduler,
-           sched_belch("### Running thread %d in bound thread", t->id));
-         // yes, the Haskell thread is bound to the current native thread
-       }
-       else
-       {
-         IF_DEBUG(scheduler,
-           sched_belch("### thread %d bound to another OS thread", t->id));
-         // no, bound to a different Haskell thread: pass to that thread
-         PUSH_ON_RUN_QUEUE(t);
-         passCapability(&m->bound_thread_cond);
-         continue;
-       }
-      }
-      else
-      {
-       if(mainThread != NULL)
-        // The thread we want to run is bound.
-       {
-         IF_DEBUG(scheduler,
-           sched_belch("### this OS thread cannot run thread %d", t->id));
-         // no, the current native thread is bound to a different
-         // Haskell thread, so pass it to any worker thread
-         PUSH_ON_RUN_QUEUE(t);
-         passCapabilityToWorker();
-         continue; 
-       }
-      }
-    }
-#endif
-
-    cap->r.rCurrentTSO = t;
-    
-    /* context switches are now initiated by the timer signal, unless
-     * the user specified "context switch as often as possible", with
-     * +RTS -C0
-     */
-    if ((RtsFlags.ConcFlags.ctxtSwitchTicks == 0
-        && (run_queue_hd != END_TSO_QUEUE
-            || blocked_queue_hd != END_TSO_QUEUE
-            || sleeping_queue != END_TSO_QUEUE)))
-       context_switch = 1;
-    else
-       context_switch = 0;
-
-run_thread:
-
-    RELEASE_LOCK(&sched_mutex);
-
-    IF_DEBUG(scheduler, sched_belch("-->> running thread %ld %s ...", 
-                             t->id, whatNext_strs[t->what_next]));
-
-#ifdef PROFILING
-    startHeapProfTimer();
-#endif
+/* ----------------------------------------------------------------------------
+ * After running a thread...
+ * ASSUMES: sched_mutex
+ * ------------------------------------------------------------------------- */
 
 
-    /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
-    /* Run the current thread 
-     */
-    prev_what_next = t->what_next;
-    switch (prev_what_next) {
-    case ThreadKilled:
-    case ThreadComplete:
-       /* Thread already finished, return to scheduler. */
-       ret = ThreadFinished;
-       break;
-    case ThreadRunGHC:
-       errno = t->saved_errno;
-       ret = StgRun((StgFunPtr) stg_returnToStackTop, &cap->r);
-       t->saved_errno = errno;
-       break;
-    case ThreadInterpret:
-       ret = interpretBCO(cap);
-       break;
-    default:
-      barf("schedule: invalid what_next field");
-    }
-    /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
-    
-    /* Costs for the scheduler are assigned to CCS_SYSTEM */
-#ifdef PROFILING
-    stopHeapProfTimer();
-    CCCS = CCS_SYSTEM;
-#endif
-    
-    ACQUIRE_LOCK(&sched_mutex);
-    
-#ifdef RTS_SUPPORTS_THREADS
-    IF_DEBUG(scheduler,fprintf(stderr,"sched (task %p): ", osThreadId()););
-#elif !defined(GRAN) && !defined(PAR)
-    IF_DEBUG(scheduler,fprintf(stderr,"sched: "););
-#endif
-    t = cap->r.rCurrentTSO;
-    
+static void
+schedulePostRunThread(void)
+{
 #if defined(PAR)
     /* HACK 675: if the last thread didn't yield, make sure to print a 
        SCHEDULE event to the log file when StgRunning the next thread, even
 #if defined(PAR)
     /* HACK 675: if the last thread didn't yield, make sure to print a 
        SCHEDULE event to the log file when StgRunning the next thread, even
@@ -1037,218 +1383,52 @@ run_thread:
     TimeOfLastYield = CURRENT_TIME;
 #endif
 
     TimeOfLastYield = CURRENT_TIME;
 #endif
 
-    switch (ret) {
+  /* some statistics gathering in the parallel case */
+
+#if defined(GRAN) || defined(PAR) || defined(EDEN)
+  switch (ret) {
     case HeapOverflow:
     case HeapOverflow:
-#if defined(GRAN)
+# if defined(GRAN)
       IF_DEBUG(gran, DumpGranEvent(GR_DESCHEDULE, t));
       globalGranStats.tot_heapover++;
       IF_DEBUG(gran, DumpGranEvent(GR_DESCHEDULE, t));
       globalGranStats.tot_heapover++;
-#elif defined(PAR)
+# elif defined(PAR)
       globalParStats.tot_heapover++;
       globalParStats.tot_heapover++;
-#endif
-
-      // did the task ask for a large block?
-      if (cap->r.rHpAlloc > BLOCK_SIZE_W) {
-         // if so, get one and push it on the front of the nursery.
-         bdescr *bd;
-         nat blocks;
-         
-         blocks = (nat)BLOCK_ROUND_UP(cap->r.rHpAlloc * sizeof(W_)) / BLOCK_SIZE;
-
-         IF_DEBUG(scheduler,belch("--<< thread %ld (%s) stopped: requesting a large block (size %d)", 
-                                  t->id, whatNext_strs[t->what_next], blocks));
-
-         // don't do this if it would push us over the
-         // alloc_blocks_lim limit; we'll GC first.
-         if (alloc_blocks + blocks < alloc_blocks_lim) {
-
-             alloc_blocks += blocks;
-             bd = allocGroup( blocks );
-
-             // link the new group into the list
-             bd->link = cap->r.rCurrentNursery;
-             bd->u.back = cap->r.rCurrentNursery->u.back;
-             if (cap->r.rCurrentNursery->u.back != NULL) {
-                 cap->r.rCurrentNursery->u.back->link = bd;
-             } else {
-                 ASSERT(g0s0->blocks == cap->r.rCurrentNursery &&
-                        g0s0->blocks == cap->r.rNursery);
-                 cap->r.rNursery = g0s0->blocks = bd;
-             }           
-             cap->r.rCurrentNursery->u.back = bd;
-
-             // initialise it as a nursery block.  We initialise the
-             // step, gen_no, and flags field of *every* sub-block in
-             // this large block, because this is easier than making
-             // sure that we always find the block head of a large
-             // block whenever we call Bdescr() (eg. evacuate() and
-             // isAlive() in the GC would both have to do this, at
-             // least).
-             { 
-                 bdescr *x;
-                 for (x = bd; x < bd + blocks; x++) {
-                     x->step = g0s0;
-                     x->gen_no = 0;
-                     x->flags = 0;
-                 }
-             }
-
-             // don't forget to update the block count in g0s0.
-             g0s0->n_blocks += blocks;
-             // This assert can be a killer if the app is doing lots
-             // of large block allocations.
-             ASSERT(countBlocks(g0s0->blocks) == g0s0->n_blocks);
-
-             // now update the nursery to point to the new block
-             cap->r.rCurrentNursery = bd;
-
-             // we might be unlucky and have another thread get on the
-             // run queue before us and steal the large block, but in that
-             // case the thread will just end up requesting another large
-             // block.
-             PUSH_ON_RUN_QUEUE(t);
-             break;
-         }
-      }
-
-      /* make all the running tasks block on a condition variable,
-       * maybe set context_switch and wait till they all pile in,
-       * then have them wait on a GC condition variable.
-       */
-      IF_DEBUG(scheduler,belch("--<< thread %ld (%s) stopped: HeapOverflow", 
-                              t->id, whatNext_strs[t->what_next]));
-      threadPaused(t);
-#if defined(GRAN)
-      ASSERT(!is_on_queue(t,CurrentProc));
-#elif defined(PAR)
-      /* Currently we emit a DESCHEDULE event before GC in GUM.
-         ToDo: either add separate event to distinguish SYSTEM time from rest
-              or just nuke this DESCHEDULE (and the following SCHEDULE) */
-      if (0 && RtsFlags.ParFlags.ParStats.Full) {
-       DumpRawGranEvent(CURRENT_PROC, CURRENT_PROC,
-                        GR_DESCHEDULE, t, (StgClosure *)NULL, 0, 0);
-       emitSchedule = rtsTrue;
-      }
-#endif
-      
-      ready_to_gc = rtsTrue;
-      context_switch = 1;              /* stop other threads ASAP */
-      PUSH_ON_RUN_QUEUE(t);
-      /* actual GC is done at the end of the while loop */
+# endif
       break;
       break;
-      
-    case StackOverflow:
-#if defined(GRAN)
+
+     case StackOverflow:
+# if defined(GRAN)
       IF_DEBUG(gran, 
               DumpGranEvent(GR_DESCHEDULE, t));
       globalGranStats.tot_stackover++;
       IF_DEBUG(gran, 
               DumpGranEvent(GR_DESCHEDULE, t));
       globalGranStats.tot_stackover++;
-#elif defined(PAR)
+# elif defined(PAR)
       // IF_DEBUG(par, 
       // DumpGranEvent(GR_DESCHEDULE, t);
       globalParStats.tot_stackover++;
       // IF_DEBUG(par, 
       // DumpGranEvent(GR_DESCHEDULE, t);
       globalParStats.tot_stackover++;
-#endif
-      IF_DEBUG(scheduler,belch("--<< thread %ld (%s) stopped, StackOverflow", 
-                              t->id, whatNext_strs[t->what_next]));
-      /* just adjust the stack for this thread, then pop it back
-       * on the run queue.
-       */
-      threadPaused(t);
-      { 
-       StgMainThread *m;
-       /* enlarge the stack */
-       StgTSO *new_t = threadStackOverflow(t);
-       
-       /* This TSO has moved, so update any pointers to it from the
-        * main thread stack.  It better not be on any other queues...
-        * (it shouldn't be).
-        */
-       for (m = main_threads; m != NULL; m = m->link) {
-         if (m->tso == t) {
-           m->tso = new_t;
-         }
-       }
-       threadPaused(new_t);
-       PUSH_ON_RUN_QUEUE(new_t);
-      }
+# endif
       break;
 
     case ThreadYielding:
       break;
 
     case ThreadYielding:
-#if defined(GRAN)
+# if defined(GRAN)
       IF_DEBUG(gran, 
               DumpGranEvent(GR_DESCHEDULE, t));
       globalGranStats.tot_yields++;
       IF_DEBUG(gran, 
               DumpGranEvent(GR_DESCHEDULE, t));
       globalGranStats.tot_yields++;
-#elif defined(PAR)
+# elif defined(PAR)
       // IF_DEBUG(par, 
       // DumpGranEvent(GR_DESCHEDULE, t);
       globalParStats.tot_yields++;
       // IF_DEBUG(par, 
       // DumpGranEvent(GR_DESCHEDULE, t);
       globalParStats.tot_yields++;
-#endif
-      /* put the thread back on the run queue.  Then, if we're ready to
-       * GC, check whether this is the last task to stop.  If so, wake
-       * up the GC thread.  getThread will block during a GC until the
-       * GC is finished.
-       */
-      IF_DEBUG(scheduler,
-               if (t->what_next != prev_what_next) {
-                  belch("--<< thread %ld (%s) stopped to switch evaluators", 
-                        t->id, whatNext_strs[t->what_next]);
-               } else {
-                   belch("--<< thread %ld (%s) stopped, yielding", 
-                        t->id, whatNext_strs[t->what_next]);
-               }
-               );
-
-      IF_DEBUG(sanity,
-              //belch("&& Doing sanity check on yielding TSO %ld.", t->id);
-              checkTSO(t));
-      ASSERT(t->link == END_TSO_QUEUE);
-
-      // Shortcut if we're just switching evaluators: don't bother
-      // doing stack squeezing (which can be expensive), just run the
-      // thread.
-      if (t->what_next != prev_what_next) {
-         goto run_thread;
-      }
-
-      threadPaused(t);
-
-#if defined(GRAN)
-      ASSERT(!is_on_queue(t,CurrentProc));
-
-      IF_DEBUG(sanity,
-              //belch("&& Doing sanity check on all ThreadQueues (and their TSOs).");
-              checkThreadQsSanity(rtsTrue));
-#endif
-
-#if defined(PAR)
-      if (RtsFlags.ParFlags.doFairScheduling) { 
-       /* this does round-robin scheduling; good for concurrency */
-       APPEND_TO_RUN_QUEUE(t);
-      } else {
-       /* this does unfair scheduling; good for parallelism */
-       PUSH_ON_RUN_QUEUE(t);
-      }
-#else
-      // this does round-robin scheduling; good for concurrency
-      APPEND_TO_RUN_QUEUE(t);
-#endif
-
-#if defined(GRAN)
-      /* add a ContinueThread event to actually process the thread */
-      new_event(CurrentProc, CurrentProc, CurrentTime[CurrentProc],
-               ContinueThread,
-               t, (StgClosure*)NULL, (rtsSpark*)NULL);
-      IF_GRAN_DEBUG(bq, 
-              belch("GRAN: eventq and runnableq after adding yielded thread to queue again:");
-              G_EVENTQ(0);
-              G_CURR_THREADQ(0));
-#endif /* GRAN */
-      break;
+# endif
+      break; 
 
     case ThreadBlocked:
 
     case ThreadBlocked:
-#if defined(GRAN)
+# if defined(GRAN)
       IF_DEBUG(scheduler,
       IF_DEBUG(scheduler,
-              belch("--<< thread %ld (%p; %s) stopped, blocking on node %p [PE %d] with BQ: ", 
-                              t->id, t, whatNext_strs[t->what_next], t->block_info.closure, (t->block_info.closure==(StgClosure*)NULL ? 99 : where_is(t->block_info.closure)));
-              if (t->block_info.closure!=(StgClosure*)NULL) print_bq(t->block_info.closure));
+              debugBelch("--<< thread %ld (%p; %s) stopped, blocking on node %p [PE %d] with BQ: ", 
+                         t->id, t, whatNext_strs[t->what_next], t->block_info.closure, 
+                         (t->block_info.closure==(StgClosure*)NULL ? 99 : where_is(t->block_info.closure)));
+              if (t->block_info.closure!=(StgClosure*)NULL)
+                print_bq(t->block_info.closure);
+              debugBelch("\n"));
 
       // ??? needed; should emit block before
       IF_DEBUG(gran, 
 
       // ??? needed; should emit block before
       IF_DEBUG(gran, 
@@ -1264,75 +1444,422 @@ run_thread:
            procStatus[CurrentProc]==Fetching)) 
        procStatus[CurrentProc] = Idle;
       */
            procStatus[CurrentProc]==Fetching)) 
        procStatus[CurrentProc] = Idle;
       */
-#elif defined(PAR)
-      IF_DEBUG(scheduler,
-              belch("--<< thread %ld (%p; %s) stopped, blocking on node %p with BQ: ", 
-                    t->id, t, whatNext_strs[t->what_next], t->block_info.closure));
-      IF_PAR_DEBUG(bq,
+# elif defined(PAR)
+//++PAR++  blockThread() writes the event (change?)
+# endif
+    break;
 
 
-                  if (t->block_info.closure!=(StgClosure*)NULL) 
-                    print_bq(t->block_info.closure));
+  case ThreadFinished:
+    break;
 
 
-      /* Send a fetch (if BlockedOnGA) and dump event to log file */
-      blockThread(t);
+  default:
+    barf("parGlobalStats: unknown return code");
+    break;
+    }
+#endif
+}
 
 
-      /* whatever we schedule next, we must log that schedule */
-      emitSchedule = rtsTrue;
+/* -----------------------------------------------------------------------------
+ * Handle a thread that returned to the scheduler with ThreadHeepOverflow
+ * ASSUMES: sched_mutex
+ * -------------------------------------------------------------------------- */
+
+static rtsBool
+scheduleHandleHeapOverflow( Capability *cap, StgTSO *t )
+{
+    // did the task ask for a large block?
+    if (cap->r.rHpAlloc > BLOCK_SIZE) {
+       // if so, get one and push it on the front of the nursery.
+       bdescr *bd;
+       lnat blocks;
+       
+       blocks = (lnat)BLOCK_ROUND_UP(cap->r.rHpAlloc) / BLOCK_SIZE;
+       
+       IF_DEBUG(scheduler,
+                debugBelch("--<< thread %ld (%s) stopped: requesting a large block (size %d)\n", 
+                           (long)t->id, whatNext_strs[t->what_next], blocks));
+       
+       // don't do this if it would push us over the
+       // alloc_blocks_lim limit; we'll GC first.
+       if (alloc_blocks + blocks < alloc_blocks_lim) {
+           
+           alloc_blocks += blocks;
+           bd = allocGroup( blocks );
+           
+           // link the new group into the list
+           bd->link = cap->r.rCurrentNursery;
+           bd->u.back = cap->r.rCurrentNursery->u.back;
+           if (cap->r.rCurrentNursery->u.back != NULL) {
+               cap->r.rCurrentNursery->u.back->link = bd;
+           } else {
+               ASSERT(g0s0->blocks == cap->r.rCurrentNursery &&
+                      g0s0->blocks == cap->r.rNursery);
+               cap->r.rNursery = g0s0->blocks = bd;
+           }             
+           cap->r.rCurrentNursery->u.back = bd;
+           
+           // initialise it as a nursery block.  We initialise the
+           // step, gen_no, and flags field of *every* sub-block in
+           // this large block, because this is easier than making
+           // sure that we always find the block head of a large
+           // block whenever we call Bdescr() (eg. evacuate() and
+           // isAlive() in the GC would both have to do this, at
+           // least).
+           { 
+               bdescr *x;
+               for (x = bd; x < bd + blocks; x++) {
+                   x->step = g0s0;
+                   x->gen_no = 0;
+                   x->flags = 0;
+               }
+           }
+           
+           // don't forget to update the block count in g0s0.
+           g0s0->n_blocks += blocks;
+           // This assert can be a killer if the app is doing lots
+           // of large block allocations.
+           ASSERT(countBlocks(g0s0->blocks) == g0s0->n_blocks);
+           
+           // now update the nursery to point to the new block
+           cap->r.rCurrentNursery = bd;
+           
+           // we might be unlucky and have another thread get on the
+           // run queue before us and steal the large block, but in that
+           // case the thread will just end up requesting another large
+           // block.
+           PUSH_ON_RUN_QUEUE(t);
+           return rtsFalse;  /* not actually GC'ing */
+       }
+    }
+    
+    /* make all the running tasks block on a condition variable,
+     * maybe set context_switch and wait till they all pile in,
+     * then have them wait on a GC condition variable.
+     */
+    IF_DEBUG(scheduler,
+            debugBelch("--<< thread %ld (%s) stopped: HeapOverflow\n", 
+                       (long)t->id, whatNext_strs[t->what_next]));
+    threadPaused(t);
+#if defined(GRAN)
+    ASSERT(!is_on_queue(t,CurrentProc));
+#elif defined(PARALLEL_HASKELL)
+    /* Currently we emit a DESCHEDULE event before GC in GUM.
+       ToDo: either add separate event to distinguish SYSTEM time from rest
+       or just nuke this DESCHEDULE (and the following SCHEDULE) */
+    if (0 && RtsFlags.ParFlags.ParStats.Full) {
+       DumpRawGranEvent(CURRENT_PROC, CURRENT_PROC,
+                        GR_DESCHEDULE, t, (StgClosure *)NULL, 0, 0);
+       emitSchedule = rtsTrue;
+    }
+#endif
+      
+    PUSH_ON_RUN_QUEUE(t);
+    return rtsTrue;
+    /* actual GC is done at the end of the while loop in schedule() */
+}
+
+/* -----------------------------------------------------------------------------
+ * Handle a thread that returned to the scheduler with ThreadStackOverflow
+ * ASSUMES: sched_mutex
+ * -------------------------------------------------------------------------- */
+
+static void
+scheduleHandleStackOverflow( StgTSO *t)
+{
+    IF_DEBUG(scheduler,debugBelch("--<< thread %ld (%s) stopped, StackOverflow\n", 
+                                 (long)t->id, whatNext_strs[t->what_next]));
+    /* just adjust the stack for this thread, then pop it back
+     * on the run queue.
+     */
+    threadPaused(t);
+    { 
+       /* enlarge the stack */
+       StgTSO *new_t = threadStackOverflow(t);
+       
+       /* This TSO has moved, so update any pointers to it from the
+        * main thread stack.  It better not be on any other queues...
+        * (it shouldn't be).
+        */
+       if (t->main != NULL) {
+           t->main->tso = new_t;
+       }
+       PUSH_ON_RUN_QUEUE(new_t);
+    }
+}
+
+/* -----------------------------------------------------------------------------
+ * Handle a thread that returned to the scheduler with ThreadYielding
+ * ASSUMES: sched_mutex
+ * -------------------------------------------------------------------------- */
+
+static rtsBool
+scheduleHandleYield( StgTSO *t, nat prev_what_next )
+{
+    // Reset the context switch flag.  We don't do this just before
+    // running the thread, because that would mean we would lose ticks
+    // during GC, which can lead to unfair scheduling (a thread hogs
+    // the CPU because the tick always arrives during GC).  This way
+    // penalises threads that do a lot of allocation, but that seems
+    // better than the alternative.
+    context_switch = 0;
+    
+    /* put the thread back on the run queue.  Then, if we're ready to
+     * GC, check whether this is the last task to stop.  If so, wake
+     * up the GC thread.  getThread will block during a GC until the
+     * GC is finished.
+     */
+    IF_DEBUG(scheduler,
+            if (t->what_next != prev_what_next) {
+                debugBelch("--<< thread %ld (%s) stopped to switch evaluators\n", 
+                           (long)t->id, whatNext_strs[t->what_next]);
+            } else {
+                debugBelch("--<< thread %ld (%s) stopped, yielding\n",
+                           (long)t->id, whatNext_strs[t->what_next]);
+            }
+       );
+    
+    IF_DEBUG(sanity,
+            //debugBelch("&& Doing sanity check on yielding TSO %ld.", t->id);
+            checkTSO(t));
+    ASSERT(t->link == END_TSO_QUEUE);
+    
+    // Shortcut if we're just switching evaluators: don't bother
+    // doing stack squeezing (which can be expensive), just run the
+    // thread.
+    if (t->what_next != prev_what_next) {
+       return rtsTrue;
+    }
+    
+    threadPaused(t);
+    
+#if defined(GRAN)
+    ASSERT(!is_on_queue(t,CurrentProc));
+      
+    IF_DEBUG(sanity,
+            //debugBelch("&& Doing sanity check on all ThreadQueues (and their TSOs).");
+            checkThreadQsSanity(rtsTrue));
+
+#endif
+
+    addToRunQueue(t);
+
+#if defined(GRAN)
+    /* add a ContinueThread event to actually process the thread */
+    new_event(CurrentProc, CurrentProc, CurrentTime[CurrentProc],
+             ContinueThread,
+             t, (StgClosure*)NULL, (rtsSpark*)NULL);
+    IF_GRAN_DEBUG(bq, 
+                 debugBelch("GRAN: eventq and runnableq after adding yielded thread to queue again:\n");
+                 G_EVENTQ(0);
+                 G_CURR_THREADQ(0));
+#endif
+    return rtsFalse;
+}
+
+/* -----------------------------------------------------------------------------
+ * Handle a thread that returned to the scheduler with ThreadBlocked
+ * ASSUMES: sched_mutex
+ * -------------------------------------------------------------------------- */
 
 
+static void
+scheduleHandleThreadBlocked( StgTSO *t
+#if !defined(GRAN) && !defined(DEBUG)
+    STG_UNUSED
+#endif
+    )
+{
+#if defined(GRAN)
+    IF_DEBUG(scheduler,
+            debugBelch("--<< thread %ld (%p; %s) stopped, blocking on node %p [PE %d] with BQ: \n", 
+                       t->id, t, whatNext_strs[t->what_next], t->block_info.closure, (t->block_info.closure==(StgClosure*)NULL ? 99 : where_is(t->block_info.closure)));
+            if (t->block_info.closure!=(StgClosure*)NULL) print_bq(t->block_info.closure));
+    
+    // ??? needed; should emit block before
+    IF_DEBUG(gran, 
+            DumpGranEvent(GR_DESCHEDULE, t)); 
+    prune_eventq(t, (StgClosure *)NULL); // prune ContinueThreads for t
+    /*
+      ngoq Dogh!
+      ASSERT(procStatus[CurrentProc]==Busy || 
+      ((procStatus[CurrentProc]==Fetching) && 
+      (t->block_info.closure!=(StgClosure*)NULL)));
+      if (run_queue_hds[CurrentProc] == END_TSO_QUEUE &&
+      !(!RtsFlags.GranFlags.DoAsyncFetch &&
+      procStatus[CurrentProc]==Fetching)) 
+      procStatus[CurrentProc] = Idle;
+    */
+#elif defined(PAR)
+    IF_DEBUG(scheduler,
+            debugBelch("--<< thread %ld (%p; %s) stopped, blocking on node %p with BQ: \n", 
+                       t->id, t, whatNext_strs[t->what_next], t->block_info.closure));
+    IF_PAR_DEBUG(bq,
+                
+                if (t->block_info.closure!=(StgClosure*)NULL) 
+                print_bq(t->block_info.closure));
+    
+    /* Send a fetch (if BlockedOnGA) and dump event to log file */
+    blockThread(t);
+    
+    /* whatever we schedule next, we must log that schedule */
+    emitSchedule = rtsTrue;
+    
 #else /* !GRAN */
       /* don't need to do anything.  Either the thread is blocked on
        * I/O, in which case we'll have called addToBlockedQueue
        * previously, or it's blocked on an MVar or Blackhole, in which
        * case it'll be on the relevant queue already.
        */
 #else /* !GRAN */
       /* don't need to do anything.  Either the thread is blocked on
        * I/O, in which case we'll have called addToBlockedQueue
        * previously, or it's blocked on an MVar or Blackhole, in which
        * case it'll be on the relevant queue already.
        */
-      IF_DEBUG(scheduler,
-              fprintf(stderr, "--<< thread %d (%s) stopped: ", 
-                      t->id, whatNext_strs[t->what_next]);
-              printThreadBlockage(t);
-              fprintf(stderr, "\n"));
-      fflush(stderr);
-
-      /* Only for dumping event to log file 
-        ToDo: do I need this in GranSim, too?
-      blockThread(t);
-      */
+    ASSERT(t->why_blocked != NotBlocked);
+    IF_DEBUG(scheduler,
+            debugBelch("--<< thread %d (%s) stopped: ", 
+                       t->id, whatNext_strs[t->what_next]);
+            printThreadBlockage(t);
+            debugBelch("\n"));
+    
+    /* Only for dumping event to log file 
+       ToDo: do I need this in GranSim, too?
+       blockThread(t);
+    */
 #endif
 #endif
-      threadPaused(t);
-      break;
+}
+
+/* -----------------------------------------------------------------------------
+ * Handle a thread that returned to the scheduler with ThreadFinished
+ * ASSUMES: sched_mutex
+ * -------------------------------------------------------------------------- */
+
+static rtsBool
+scheduleHandleThreadFinished( StgMainThread *mainThread
+                             USED_WHEN_RTS_SUPPORTS_THREADS,
+                             Capability *cap,
+                             StgTSO *t )
+{
+    /* Need to check whether this was a main thread, and if so,
+     * return with the return value.
+     *
+     * We also end up here if the thread kills itself with an
+     * uncaught exception, see Exception.cmm.
+     */
+    IF_DEBUG(scheduler,debugBelch("--++ thread %d (%s) finished\n", 
+                                 t->id, whatNext_strs[t->what_next]));
 
 
-    case ThreadFinished:
-      /* Need to check whether this was a main thread, and if so, signal
-       * the task that started it with the return value.  If we have no
-       * more main threads, we probably need to stop all the tasks until
-       * we get a new one.
-       */
-      /* We also end up here if the thread kills itself with an
-       * uncaught exception, see Exception.hc.
-       */
-      IF_DEBUG(scheduler,belch("--++ thread %d (%s) finished", 
-                              t->id, whatNext_strs[t->what_next]));
 #if defined(GRAN)
       endThread(t, CurrentProc); // clean-up the thread
 #if defined(GRAN)
       endThread(t, CurrentProc); // clean-up the thread
-#elif defined(PAR)
+#elif defined(PARALLEL_HASKELL)
       /* For now all are advisory -- HWL */
       //if(t->priority==AdvisoryPriority) ??
       /* For now all are advisory -- HWL */
       //if(t->priority==AdvisoryPriority) ??
-      advisory_thread_count--;
+      advisory_thread_count--; // JB: Caution with this counter, buggy!
       
       
-# ifdef DIST
+# if defined(DIST)
       if(t->dist.priority==RevalPriority)
        FinishReval(t);
 # endif
       if(t->dist.priority==RevalPriority)
        FinishReval(t);
 # endif
-      
+    
+# if defined(EDENOLD)
+      // the thread could still have an outport... (BUG)
+      if (t->eden.outport != -1) {
+      // delete the outport for the tso which has finished...
+       IF_PAR_DEBUG(eden_ports,
+                  debugBelch("WARNING: Scheduler removes outport %d for TSO %d.\n",
+                             t->eden.outport, t->id));
+       deleteOPT(t);
+      }
+      // thread still in the process (HEAVY BUG! since outport has just been closed...)
+      if (t->eden.epid != -1) {
+       IF_PAR_DEBUG(eden_ports,
+                  debugBelch("WARNING: Scheduler removes TSO %d from process %d .\n",
+                          t->id, t->eden.epid));
+       removeTSOfromProcess(t);
+      }
+# endif 
+
+# if defined(PAR)
       if (RtsFlags.ParFlags.ParStats.Full &&
          !RtsFlags.ParFlags.ParStats.Suppressed) 
        DumpEndEvent(CURRENT_PROC, t, rtsFalse /* not mandatory */);
       if (RtsFlags.ParFlags.ParStats.Full &&
          !RtsFlags.ParFlags.ParStats.Suppressed) 
        DumpEndEvent(CURRENT_PROC, t, rtsFalse /* not mandatory */);
+
+      //  t->par only contains statistics: left out for now...
+      IF_PAR_DEBUG(fish,
+                  debugBelch("**** end thread: ended sparked thread %d (%lx); sparkname: %lx\n",
+                             t->id,t,t->par.sparkname));
+# endif
+#endif // PARALLEL_HASKELL
+
+      //
+      // Check whether the thread that just completed was a main
+      // thread, and if so return with the result.  
+      //
+      // There is an assumption here that all thread completion goes
+      // through this point; we need to make sure that if a thread
+      // ends up in the ThreadKilled state, that it stays on the run
+      // queue so it can be dealt with here.
+      //
+      if (
+#if defined(RTS_SUPPORTS_THREADS)
+         mainThread != NULL
+#else
+         mainThread->tso == t
 #endif
 #endif
-      break;
-      
-    default:
-      barf("schedule: invalid thread return code %d", (int)ret);
-    }
+         )
+      {
+         // We are a bound thread: this must be our thread that just
+         // completed.
+         ASSERT(mainThread->tso == t);
+
+         if (t->what_next == ThreadComplete) {
+             if (mainThread->ret) {
+                 // NOTE: return val is tso->sp[1] (see StgStartup.hc)
+                 *(mainThread->ret) = (StgClosure *)mainThread->tso->sp[1]; 
+             }
+             mainThread->stat = Success;
+         } else {
+             if (mainThread->ret) {
+                 *(mainThread->ret) = NULL;
+             }
+             if (interrupted) {
+                 mainThread->stat = Interrupted;
+             } else {
+                 mainThread->stat = Killed;
+             }
+         }
+#ifdef DEBUG
+         removeThreadLabel((StgWord)mainThread->tso->id);
+#endif
+         if (mainThread->prev == NULL) {
+             main_threads = mainThread->link;
+         } else {
+             mainThread->prev->link = mainThread->link;
+         }
+         if (mainThread->link != NULL) {
+             mainThread->link->prev = NULL;
+         }
+         releaseCapability(cap);
+         return rtsTrue; // tells schedule() to return
+      }
 
 
+#ifdef RTS_SUPPORTS_THREADS
+      ASSERT(t->main == NULL);
+#else
+      if (t->main != NULL) {
+         // Must be a main thread that is not the topmost one.  Leave
+         // it on the run queue until the stack has unwound to the
+         // point where we can deal with this.  Leaving it on the run
+         // queue also ensures that the garbage collector knows about
+         // this thread and its return value (it gets dropped from the
+         // all_threads list so there's no other way to find it).
+         APPEND_TO_RUN_QUEUE(t);
+      }
+#endif
+      return rtsFalse;
+}
+
+/* -----------------------------------------------------------------------------
+ * Perform a heap census, if PROFILING
+ * -------------------------------------------------------------------------- */
+
+static void
+scheduleDoHeapProfile(void)
+{
 #ifdef PROFILING
     // When we have +RTS -i0 and we're heap profiling, do a census at
     // every GC.  This lets us get repeatable runs for debugging.
 #ifdef PROFILING
     // When we have +RTS -i0 and we're heap profiling, do a census at
     // every GC.  This lets us get repeatable runs for debugging.
@@ -1345,45 +1872,74 @@ run_thread:
        ready_to_gc = rtsFalse; // we already GC'd
     }
 #endif
        ready_to_gc = rtsFalse; // we already GC'd
     }
 #endif
+}
+
+/* -----------------------------------------------------------------------------
+ * Perform a garbage collection if necessary
+ * ASSUMES: sched_mutex
+ * -------------------------------------------------------------------------- */
+
+static void
+scheduleDoGC(void)
+{
+    StgTSO *t;
+
+#ifdef SMP
+    // The last task to stop actually gets to do the GC.  The rest
+    // of the tasks release their capabilities and wait gc_pending_cond.
+    if (ready_to_gc && allFreeCapabilities())
+#else
+    if (ready_to_gc)
+#endif
+    {
+       /* Kick any transactions which are invalid back to their
+        * atomically frames.  When next scheduled they will try to
+        * commit, this commit will fail and they will retry.
+        */
+       for (t = all_threads; t != END_TSO_QUEUE; t = t -> link) {
+           if (t -> what_next != ThreadRelocated && t -> trec != NO_TREC && t -> why_blocked == NotBlocked) {
+               if (!stmValidateTransaction (t -> trec)) {
+                   IF_DEBUG(stm, sched_belch("trec %p found wasting its time", t));
+                   
+                   // strip the stack back to the ATOMICALLY_FRAME, aborting
+                   // the (nested) transaction, and saving the stack of any
+                   // partially-evaluated thunks on the heap.
+                   raiseAsync_(t, NULL, rtsTrue);
+                   
+#ifdef REG_R1
+                   ASSERT(get_itbl((StgClosure *)t->sp)->type == ATOMICALLY_FRAME);
+#endif
+               }
+           }
+       }
 
 
-    if (ready_to_gc) {
-      /* everybody back, start the GC.
-       * Could do it in this thread, or signal a condition var
-       * to do it in another thread.  Either way, we need to
-       * broadcast on gc_pending_cond afterward.
-       */
+       // so this happens periodically:
+       scheduleCheckBlackHoles();
+
+       /* everybody back, start the GC.
+        * Could do it in this thread, or signal a condition var
+        * to do it in another thread.  Either way, we need to
+        * broadcast on gc_pending_cond afterward.
+        */
 #if defined(RTS_SUPPORTS_THREADS)
 #if defined(RTS_SUPPORTS_THREADS)
-      IF_DEBUG(scheduler,sched_belch("doing GC"));
+       IF_DEBUG(scheduler,sched_belch("doing GC"));
+#endif
+       GarbageCollect(GetRoots,rtsFalse);
+       ready_to_gc = rtsFalse;
+#if defined(SMP)
+       broadcastCondition(&gc_pending_cond);
 #endif
 #endif
-      GarbageCollect(GetRoots,rtsFalse);
-      ready_to_gc = rtsFalse;
 #if defined(GRAN)
 #if defined(GRAN)
-      /* add a ContinueThread event to continue execution of current thread */
-      new_event(CurrentProc, CurrentProc, CurrentTime[CurrentProc],
-               ContinueThread,
-               t, (StgClosure*)NULL, (rtsSpark*)NULL);
-      IF_GRAN_DEBUG(bq, 
-              fprintf(stderr, "GRAN: eventq and runnableq after Garbage collection:\n");
-              G_EVENTQ(0);
-              G_CURR_THREADQ(0));
+       /* add a ContinueThread event to continue execution of current thread */
+       new_event(CurrentProc, CurrentProc, CurrentTime[CurrentProc],
+                 ContinueThread,
+                 t, (StgClosure*)NULL, (rtsSpark*)NULL);
+       IF_GRAN_DEBUG(bq, 
+                     debugBelch("GRAN: eventq and runnableq after Garbage collection:\n\n");
+                     G_EVENTQ(0);
+                     G_CURR_THREADQ(0));
 #endif /* GRAN */
     }
 #endif /* GRAN */
     }
-
-#if defined(GRAN)
-  next_thread:
-    IF_GRAN_DEBUG(unused,
-                 print_eventq(EventHd));
-
-    event = get_next_event();
-#elif defined(PAR)
-  next_thread:
-    /* ToDo: wait for next message to arrive rather than busy wait */
-#endif /* GRAN */
-
-  } /* end of while(1) */
-
-  IF_PAR_DEBUG(verbose,
-              belch("== Leaving schedule() after having received Finish"));
 }
 
 /* ---------------------------------------------------------------------------
 }
 
 /* ---------------------------------------------------------------------------
@@ -1409,12 +1965,7 @@ StgBool
 isThreadBound(StgTSO* tso USED_IN_THREADED_RTS)
 {
 #ifdef THREADED_RTS
 isThreadBound(StgTSO* tso USED_IN_THREADED_RTS)
 {
 #ifdef THREADED_RTS
-  StgMainThread *m;
-  for(m = main_threads; m; m = m->link)
-  {
-    if(m->tso == tso)
-      return rtsTrue;
-  }
+  return (tso->main != NULL);
 #endif
   return rtsFalse;
 }
 #endif
   return rtsFalse;
 }
@@ -1423,13 +1974,22 @@ isThreadBound(StgTSO* tso USED_IN_THREADED_RTS)
  * Singleton fork(). Do not copy any running threads.
  * ------------------------------------------------------------------------- */
 
  * Singleton fork(). Do not copy any running threads.
  * ------------------------------------------------------------------------- */
 
+#ifndef mingw32_HOST_OS
+#define FORKPROCESS_PRIMOP_SUPPORTED
+#endif
+
+#ifdef FORKPROCESS_PRIMOP_SUPPORTED
 static void 
 deleteThreadImmediately(StgTSO *tso);
 static void 
 deleteThreadImmediately(StgTSO *tso);
-
+#endif
 StgInt
 StgInt
-forkProcess(HsStablePtr *entry)
+forkProcess(HsStablePtr *entry
+#ifndef FORKPROCESS_PRIMOP_SUPPORTED
+           STG_UNUSED
+#endif
+           )
 {
 {
-#ifndef mingw32_TARGET_OS
+#ifdef FORKPROCESS_PRIMOP_SUPPORTED
   pid_t pid;
   StgTSO* t,*next;
   StgMainThread *m;
   pid_t pid;
   StgTSO* t,*next;
   StgMainThread *m;
@@ -1463,18 +2023,12 @@ forkProcess(HsStablePtr *entry)
       // wipe the main thread list
     while((m = main_threads) != NULL) {
       main_threads = m->link;
       // wipe the main thread list
     while((m = main_threads) != NULL) {
       main_threads = m->link;
-#ifdef THREADED_RTS
+# ifdef THREADED_RTS
       closeCondition(&m->bound_thread_cond);
       closeCondition(&m->bound_thread_cond);
-#endif
+# endif
       stgFree(m);
     }
     
       stgFree(m);
     }
     
-#ifdef RTS_SUPPORTS_THREADS
-    resetTaskManagerAfterFork();      // tell startTask() and friends that
-    startingWorkerThread = rtsFalse;  // we have no worker threads any more
-    resetWorkerWakeupPipeAfterFork();
-#endif
-    
     rc = rts_evalStableIO(entry, NULL);  // run the action
     rts_checkSchedStatus("forkProcess",rc);
     
     rc = rts_evalStableIO(entry, NULL);  // run the action
     rts_checkSchedStatus("forkProcess",rc);
     
@@ -1483,10 +2037,10 @@ forkProcess(HsStablePtr *entry)
     hs_exit();                      // clean up and exit
     stg_exit(0);
   }
     hs_exit();                      // clean up and exit
     stg_exit(0);
   }
-#else /* mingw32 */
-  barf("forkProcess#: primop not implemented for mingw32, sorry!\n");
+#else /* !FORKPROCESS_PRIMOP_SUPPORTED */
+  barf("forkProcess#: primop not supported, sorry!\n");
   return -1;
   return -1;
-#endif /* mingw32 */
+#endif
 }
 
 /* ---------------------------------------------------------------------------
 }
 
 /* ---------------------------------------------------------------------------
@@ -1507,9 +2061,16 @@ deleteAllThreads ( void )
       next = t->global_link;
       deleteThread(t);
   }      
       next = t->global_link;
       deleteThread(t);
   }      
-  run_queue_hd = run_queue_tl = END_TSO_QUEUE;
-  blocked_queue_hd = blocked_queue_tl = END_TSO_QUEUE;
-  sleeping_queue = END_TSO_QUEUE;
+
+  // The run queue now contains a bunch of ThreadKilled threads.  We
+  // must not throw these away: the main thread(s) will be in there
+  // somewhere, and the main scheduler loop has to deal with it.
+  // Also, the run queue is the only thing keeping these threads from
+  // being GC'd, and we don't want the "main thread has been GC'd" panic.
+
+  ASSERT(blocked_queue_hd == END_TSO_QUEUE);
+  ASSERT(blackhole_queue == END_TSO_QUEUE);
+  ASSERT(sleeping_queue == END_TSO_QUEUE);
 }
 
 /* startThread and  insertThread are now in GranSim.c -- HWL */
 }
 
 /* startThread and  insertThread are now in GranSim.c -- HWL */
@@ -1531,12 +2092,7 @@ deleteAllThreads ( void )
  * ------------------------------------------------------------------------- */
    
 StgInt
  * ------------------------------------------------------------------------- */
    
 StgInt
-suspendThread( StgRegTable *reg, 
-              rtsBool concCall
-#if !defined(DEBUG)
-              STG_UNUSED
-#endif
-              )
+suspendThread( StgRegTable *reg )
 {
   nat tok;
   Capability *cap;
 {
   nat tok;
   Capability *cap;
@@ -1550,7 +2106,7 @@ suspendThread( StgRegTable *reg,
   ACQUIRE_LOCK(&sched_mutex);
 
   IF_DEBUG(scheduler,
   ACQUIRE_LOCK(&sched_mutex);
 
   IF_DEBUG(scheduler,
-          sched_belch("thread %d did a _ccall_gc (is_concurrent: %d)", cap->r.rCurrentTSO->id,concCall));
+          sched_belch("thread %d did a _ccall_gc", cap->r.rCurrentTSO->id));
 
   // XXX this might not be necessary --SDM
   cap->r.rCurrentTSO->what_next = ThreadRunGHC;
 
   // XXX this might not be necessary --SDM
   cap->r.rCurrentTSO->what_next = ThreadRunGHC;
@@ -1559,17 +2115,12 @@ suspendThread( StgRegTable *reg,
   cap->r.rCurrentTSO->link = suspended_ccalling_threads;
   suspended_ccalling_threads = cap->r.rCurrentTSO;
 
   cap->r.rCurrentTSO->link = suspended_ccalling_threads;
   suspended_ccalling_threads = cap->r.rCurrentTSO;
 
-#if defined(RTS_SUPPORTS_THREADS)
-  if(cap->r.rCurrentTSO->blocked_exceptions == NULL)
-  {
+  if(cap->r.rCurrentTSO->blocked_exceptions == NULL)  {
       cap->r.rCurrentTSO->why_blocked = BlockedOnCCall;
       cap->r.rCurrentTSO->blocked_exceptions = END_TSO_QUEUE;
       cap->r.rCurrentTSO->why_blocked = BlockedOnCCall;
       cap->r.rCurrentTSO->blocked_exceptions = END_TSO_QUEUE;
-  }
-  else
-  {
+  } else {
       cap->r.rCurrentTSO->why_blocked = BlockedOnCCall_NoUnblockExc;
   }
       cap->r.rCurrentTSO->why_blocked = BlockedOnCCall_NoUnblockExc;
   }
-#endif
 
   /* Use the thread ID as the token; it should be unique */
   tok = cap->r.rCurrentTSO->id;
 
   /* Use the thread ID as the token; it should be unique */
   tok = cap->r.rCurrentTSO->id;
@@ -1584,8 +2135,7 @@ suspendThread( StgRegTable *reg,
   IF_DEBUG(scheduler, sched_belch("worker (token %d): leaving RTS", tok));
 #endif
 
   IF_DEBUG(scheduler, sched_belch("worker (token %d): leaving RTS", tok));
 #endif
 
-  /* Other threads _might_ be available for execution; signal this */
-  THREAD_RUNNABLE();
+  in_haskell = rtsFalse;
   RELEASE_LOCK(&sched_mutex);
   
   errno = saved_errno;
   RELEASE_LOCK(&sched_mutex);
   
   errno = saved_errno;
@@ -1593,8 +2143,7 @@ suspendThread( StgRegTable *reg,
 }
 
 StgRegTable *
 }
 
 StgRegTable *
-resumeThread( StgInt tok,
-             rtsBool concCall STG_UNUSED )
+resumeThread( StgInt tok )
 {
   StgTSO *tso, **prev;
   Capability *cap;
 {
   StgTSO *tso, **prev;
   Capability *cap;
@@ -1625,29 +2174,21 @@ resumeThread( StgInt tok,
   }
   tso->link = END_TSO_QUEUE;
   
   }
   tso->link = END_TSO_QUEUE;
   
-#if defined(RTS_SUPPORTS_THREADS)
-  if(tso->why_blocked == BlockedOnCCall)
-  {
+  if(tso->why_blocked == BlockedOnCCall) {
       awakenBlockedQueueNoLock(tso->blocked_exceptions);
       tso->blocked_exceptions = NULL;
   }
       awakenBlockedQueueNoLock(tso->blocked_exceptions);
       tso->blocked_exceptions = NULL;
   }
-#endif
   
   /* Reset blocking status */
   tso->why_blocked  = NotBlocked;
 
   cap->r.rCurrentTSO = tso;
   
   /* Reset blocking status */
   tso->why_blocked  = NotBlocked;
 
   cap->r.rCurrentTSO = tso;
+  in_haskell = rtsTrue;
   RELEASE_LOCK(&sched_mutex);
   errno = saved_errno;
   return &cap->r;
 }
 
   RELEASE_LOCK(&sched_mutex);
   errno = saved_errno;
   return &cap->r;
 }
 
-
-/* ---------------------------------------------------------------------------
- * Static functions
- * ------------------------------------------------------------------------ */
-static void unblockThread(StgTSO *tso);
-
 /* ---------------------------------------------------------------------------
  * Comparing Thread ids.
  *
 /* ---------------------------------------------------------------------------
  * Comparing Thread ids.
  *
@@ -1720,11 +2261,11 @@ createThread(nat size)
     nat stack_size;
 
     /* First check whether we should create a thread at all */
     nat stack_size;
 
     /* First check whether we should create a thread at all */
-#if defined(PAR)
+#if defined(PARALLEL_HASKELL)
   /* check that no more than RtsFlags.ParFlags.maxThreads threads are created */
   if (advisory_thread_count >= RtsFlags.ParFlags.maxThreads) {
     threadsIgnored++;
   /* check that no more than RtsFlags.ParFlags.maxThreads threads are created */
   if (advisory_thread_count >= RtsFlags.ParFlags.maxThreads) {
     threadsIgnored++;
-    belch("{createThread}Daq ghuH: refusing to create another thread; no more than %d threads allowed (currently %d)",
+    debugBelch("{createThread}Daq ghuH: refusing to create another thread; no more than %d threads allowed (currently %d)\n",
          RtsFlags.ParFlags.maxThreads, advisory_thread_count);
     return END_TSO_QUEUE;
   }
          RtsFlags.ParFlags.maxThreads, advisory_thread_count);
     return END_TSO_QUEUE;
   }
@@ -1760,12 +2301,15 @@ createThread(nat size)
   tso->blocked_exceptions = NULL;
 
   tso->saved_errno = 0;
   tso->blocked_exceptions = NULL;
 
   tso->saved_errno = 0;
+  tso->main = NULL;
   
   tso->stack_size   = stack_size;
   tso->max_stack_size = round_to_mblocks(RtsFlags.GcFlags.maxStkSize) 
                               - TSO_STRUCT_SIZEW;
   tso->sp           = (P_)&(tso->stack) + stack_size;
 
   
   tso->stack_size   = stack_size;
   tso->max_stack_size = round_to_mblocks(RtsFlags.GcFlags.maxStkSize) 
                               - TSO_STRUCT_SIZEW;
   tso->sp           = (P_)&(tso->stack) + stack_size;
 
+  tso->trec = NO_TREC;
+
 #ifdef PROFILING
   tso->prof.CCCS = CCS_MAIN;
 #endif
 #ifdef PROFILING
   tso->prof.CCCS = CCS_MAIN;
 #endif
@@ -1773,9 +2317,10 @@ createThread(nat size)
   /* put a stop frame on the stack */
   tso->sp -= sizeofW(StgStopFrame);
   SET_HDR((StgClosure*)tso->sp,(StgInfoTable *)&stg_stop_thread_info,CCS_SYSTEM);
   /* put a stop frame on the stack */
   tso->sp -= sizeofW(StgStopFrame);
   SET_HDR((StgClosure*)tso->sp,(StgInfoTable *)&stg_stop_thread_info,CCS_SYSTEM);
+  tso->link = END_TSO_QUEUE;
+
   // ToDo: check this
 #if defined(GRAN)
   // ToDo: check this
 #if defined(GRAN)
-  tso->link = END_TSO_QUEUE;
   /* uses more flexible routine in GranSim */
   insertThread(tso, CurrentProc);
 #else
   /* uses more flexible routine in GranSim */
   insertThread(tso, CurrentProc);
 #else
@@ -1787,7 +2332,7 @@ createThread(nat size)
 #if defined(GRAN) 
   if (RtsFlags.GranFlags.GranSimStats.Full) 
     DumpGranEvent(GR_START,tso);
 #if defined(GRAN) 
   if (RtsFlags.GranFlags.GranSimStats.Full) 
     DumpGranEvent(GR_START,tso);
-#elif defined(PAR)
+#elif defined(PARALLEL_HASKELL)
   if (RtsFlags.ParFlags.ParStats.Full) 
     DumpGranEvent(GR_STARTQ,tso);
   /* HACk to avoid SCHEDULE 
   if (RtsFlags.ParFlags.ParStats.Full) 
     DumpGranEvent(GR_STARTQ,tso);
   /* HACk to avoid SCHEDULE 
@@ -1827,7 +2372,7 @@ createThread(nat size)
     tso->gran.clock  = 0;
 
   IF_DEBUG(gran,printTSO(tso));
     tso->gran.clock  = 0;
 
   IF_DEBUG(gran,printTSO(tso));
-#elif defined(PAR)
+#elif defined(PARALLEL_HASKELL)
 # if defined(DEBUG)
   tso->par.magic = TSO_MAGIC; // debugging only
 # endif
 # if defined(DEBUG)
   tso->par.magic = TSO_MAGIC; // debugging only
 # endif
@@ -1851,26 +2396,26 @@ createThread(nat size)
   globalGranStats.threads_created_on_PE[CurrentProc]++;
   globalGranStats.tot_sq_len += spark_queue_len(CurrentProc);
   globalGranStats.tot_sq_probes++;
   globalGranStats.threads_created_on_PE[CurrentProc]++;
   globalGranStats.tot_sq_len += spark_queue_len(CurrentProc);
   globalGranStats.tot_sq_probes++;
-#elif defined(PAR)
+#elif defined(PARALLEL_HASKELL)
   // collect parallel global statistics (currently done together with GC stats)
   if (RtsFlags.ParFlags.ParStats.Global &&
       RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
   // collect parallel global statistics (currently done together with GC stats)
   if (RtsFlags.ParFlags.ParStats.Global &&
       RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
-    //fprintf(stderr, "Creating thread %d @ %11.2f\n", tso->id, usertime()); 
+    //debugBelch("Creating thread %d @ %11.2f\n", tso->id, usertime()); 
     globalParStats.tot_threads_created++;
   }
 #endif 
 
 #if defined(GRAN)
   IF_GRAN_DEBUG(pri,
     globalParStats.tot_threads_created++;
   }
 #endif 
 
 #if defined(GRAN)
   IF_GRAN_DEBUG(pri,
-               belch("==__ schedule: Created TSO %d (%p);",
+               sched_belch("==__ schedule: Created TSO %d (%p);",
                      CurrentProc, tso, tso->id));
                      CurrentProc, tso, tso->id));
-#elif defined(PAR)
-    IF_PAR_DEBUG(verbose,
-                belch("==__ schedule: Created TSO %d (%p); %d threads active",
-                      tso->id, tso, advisory_thread_count));
+#elif defined(PARALLEL_HASKELL)
+  IF_PAR_DEBUG(verbose,
+              sched_belch("==__ schedule: Created TSO %d (%p); %d threads active",
+                          (long)tso->id, tso, advisory_thread_count));
 #else
   IF_DEBUG(scheduler,sched_belch("created thread %ld, stack size = %lx words", 
 #else
   IF_DEBUG(scheduler,sched_belch("created thread %ld, stack size = %lx words", 
-                                tso->id, tso->stack_size));
+                                (long)tso->id, (long)tso->stack_size));
 #endif    
   return tso;
 }
 #endif    
   return tso;
 }
@@ -1880,9 +2425,10 @@ createThread(nat size)
    all parallel thread creation calls should fall through the following routine.
 */
 StgTSO *
    all parallel thread creation calls should fall through the following routine.
 */
 StgTSO *
-createSparkThread(rtsSpark spark) 
+createThreadFromSpark(rtsSpark spark) 
 { StgTSO *tso;
   ASSERT(spark != (rtsSpark)NULL);
 { StgTSO *tso;
   ASSERT(spark != (rtsSpark)NULL);
+// JB: TAKE CARE OF THIS COUNTER! BUGGY
   if (advisory_thread_count >= RtsFlags.ParFlags.maxThreads) 
   { threadsIgnored++;
     barf("{createSparkThread}Daq ghuH: refusing to create another thread; no more than %d threads allowed (currently %d)",
   if (advisory_thread_count >= RtsFlags.ParFlags.maxThreads) 
   { threadsIgnored++;
     barf("{createSparkThread}Daq ghuH: refusing to create another thread; no more than %d threads allowed (currently %d)",
@@ -1898,8 +2444,8 @@ createSparkThread(rtsSpark spark)
     tso->priority = AdvisoryPriority;
 #endif
     pushClosure(tso,spark);
     tso->priority = AdvisoryPriority;
 #endif
     pushClosure(tso,spark);
-    PUSH_ON_RUN_QUEUE(tso);
-    advisory_thread_count++;    
+    addToRunQueue(tso);
+    advisory_thread_count++;  // JB: TAKE CARE OF THIS COUNTER! BUGGY
   }
   return tso;
 }
   }
   return tso;
 }
@@ -1909,7 +2455,7 @@ createSparkThread(rtsSpark spark)
   Turn a spark into a thread.
   ToDo: fix for SMP (needs to acquire SCHED_MUTEX!)
 */
   Turn a spark into a thread.
   ToDo: fix for SMP (needs to acquire SCHED_MUTEX!)
 */
-#if defined(PAR)
+#if 0
 StgTSO *
 activateSpark (rtsSpark spark) 
 {
 StgTSO *
 activateSpark (rtsSpark spark) 
 {
@@ -1918,9 +2464,9 @@ activateSpark (rtsSpark spark)
   tso = createSparkThread(spark);
   if (RtsFlags.ParFlags.ParStats.Full) {   
     //ASSERT(run_queue_hd == END_TSO_QUEUE); // I think ...
   tso = createSparkThread(spark);
   if (RtsFlags.ParFlags.ParStats.Full) {   
     //ASSERT(run_queue_hd == END_TSO_QUEUE); // I think ...
-    IF_PAR_DEBUG(verbose,
-                belch("==^^ activateSpark: turning spark of closure %p (%s) into a thread",
-                      (StgClosure *)spark, info_type((StgClosure *)spark)));
+      IF_PAR_DEBUG(verbose,
+                  debugBelch("==^^ activateSpark: turning spark of closure %p (%s) into a thread\n",
+                             (StgClosure *)spark, info_type((StgClosure *)spark)));
   }
   // ToDo: fwd info on local/global spark to thread -- HWL
   // tso->gran.exported =  spark->exported;
   }
   // ToDo: fwd info on local/global spark to thread -- HWL
   // tso->gran.exported =  spark->exported;
@@ -1931,11 +2477,6 @@ activateSpark (rtsSpark spark)
 }
 #endif
 
 }
 #endif
 
-static SchedulerStatus waitThread_(/*out*/StgMainThread* m,
-                                  Capability *initialCapability
-                                  );
-
-
 /* ---------------------------------------------------------------------------
  * scheduleThread()
  *
 /* ---------------------------------------------------------------------------
  * scheduleThread()
  *
@@ -1946,27 +2487,17 @@ static SchedulerStatus waitThread_(/*out*/StgMainThread* m,
  * on this thread's stack before the scheduler is invoked.
  * ------------------------------------------------------------------------ */
 
  * on this thread's stack before the scheduler is invoked.
  * ------------------------------------------------------------------------ */
 
-static void scheduleThread_ (StgTSO* tso);
-
-void
+static void
 scheduleThread_(StgTSO *tso)
 {
 scheduleThread_(StgTSO *tso)
 {
-  // Precondition: sched_mutex must be held.
-
-  /* Put the new thread on the head of the runnable queue.  The caller
-   * better push an appropriate closure on this thread's stack
-   * beforehand.  In the SMP case, the thread may start running as
-   * soon as we release the scheduler lock below.
-   */
-  PUSH_ON_RUN_QUEUE(tso);
-  THREAD_RUNNABLE();
-
-#if 0
-  IF_DEBUG(scheduler,printTSO(tso));
-#endif
+  // The thread goes at the *end* of the run-queue, to avoid possible
+  // starvation of any threads already on the queue.
+  APPEND_TO_RUN_QUEUE(tso);
+  threadRunnable();
 }
 
 }
 
-void scheduleThread(StgTSO* tso)
+void
+scheduleThread(StgTSO* tso)
 {
   ACQUIRE_LOCK(&sched_mutex);
   scheduleThread_(tso);
 {
   ACQUIRE_LOCK(&sched_mutex);
   scheduleThread_(tso);
@@ -1974,9 +2505,11 @@ void scheduleThread(StgTSO* tso)
 }
 
 #if defined(RTS_SUPPORTS_THREADS)
 }
 
 #if defined(RTS_SUPPORTS_THREADS)
-static Condition *bound_cond_cache = NULL;
+static Condition bound_cond_cache;
+static int bound_cond_cache_full = 0;
 #endif
 
 #endif
 
+
 SchedulerStatus
 scheduleWaitThread(StgTSO* tso, /*[out]*/HaskellObj* ret,
                   Capability *initialCapability)
 SchedulerStatus
 scheduleWaitThread(StgTSO* tso, /*[out]*/HaskellObj* ret,
                   Capability *initialCapability)
@@ -1986,15 +2519,23 @@ scheduleWaitThread(StgTSO* tso, /*[out]*/HaskellObj* ret,
 
     m = stgMallocBytes(sizeof(StgMainThread), "waitThread");
     m->tso = tso;
 
     m = stgMallocBytes(sizeof(StgMainThread), "waitThread");
     m->tso = tso;
+    tso->main = m;
     m->ret = ret;
     m->stat = NoStatus;
     m->ret = ret;
     m->stat = NoStatus;
+    m->link = main_threads;
+    m->prev = NULL;
+    if (main_threads != NULL) {
+       main_threads->prev = m;
+    }
+    main_threads = m;
+
 #if defined(RTS_SUPPORTS_THREADS)
     // Allocating a new condition for each thread is expensive, so we
     // cache one.  This is a pretty feeble hack, but it helps speed up
     // consecutive call-ins quite a bit.
 #if defined(RTS_SUPPORTS_THREADS)
     // Allocating a new condition for each thread is expensive, so we
     // cache one.  This is a pretty feeble hack, but it helps speed up
     // consecutive call-ins quite a bit.
-    if (bound_cond_cache != NULL) {
-       m->bound_thread_cond = *bound_cond_cache;
-       bound_cond_cache = NULL;
+    if (bound_cond_cache_full) {
+       m->bound_thread_cond = bound_cond_cache;
+       bound_cond_cache_full = 0;
     } else {
        initCondition(&m->bound_thread_cond);
     }
     } else {
        initCondition(&m->bound_thread_cond);
     }
@@ -2011,10 +2552,10 @@ scheduleWaitThread(StgTSO* tso, /*[out]*/HaskellObj* ret,
     */
     IF_DEBUG(scheduler, sched_belch("waiting for thread (%d)", tso->id));
     
     */
     IF_DEBUG(scheduler, sched_belch("waiting for thread (%d)", tso->id));
     
-    m->link = main_threads;
-    main_threads = m;
-    
-    scheduleThread_(tso);
+    APPEND_TO_RUN_QUEUE(tso);
+    // NB. Don't call threadRunnable() here, because the thread is
+    // bound and only runnable by *this* OS thread, so waking up other
+    // workers will just slow things down.
 
     return waitThread_(m, initialCapability);
 }
 
     return waitThread_(m, initialCapability);
 }
@@ -2040,6 +2581,7 @@ initScheduler(void)
     blocked_queue_hds[i]  = END_TSO_QUEUE;
     blocked_queue_tls[i]  = END_TSO_QUEUE;
     ccalling_threadss[i]  = END_TSO_QUEUE;
     blocked_queue_hds[i]  = END_TSO_QUEUE;
     blocked_queue_tls[i]  = END_TSO_QUEUE;
     ccalling_threadss[i]  = END_TSO_QUEUE;
+    blackhole_queue[i]    = END_TSO_QUEUE;
     sleeping_queue        = END_TSO_QUEUE;
   }
 #else
     sleeping_queue        = END_TSO_QUEUE;
   }
 #else
@@ -2047,6 +2589,7 @@ initScheduler(void)
   run_queue_tl      = END_TSO_QUEUE;
   blocked_queue_hd  = END_TSO_QUEUE;
   blocked_queue_tl  = END_TSO_QUEUE;
   run_queue_tl      = END_TSO_QUEUE;
   blocked_queue_hd  = END_TSO_QUEUE;
   blocked_queue_tl  = END_TSO_QUEUE;
+  blackhole_queue   = END_TSO_QUEUE;
   sleeping_queue    = END_TSO_QUEUE;
 #endif 
 
   sleeping_queue    = END_TSO_QUEUE;
 #endif 
 
@@ -2068,9 +2611,7 @@ initScheduler(void)
   initMutex(&term_mutex);
 #endif
   
   initMutex(&term_mutex);
 #endif
   
-#if defined(RTS_SUPPORTS_THREADS)
   ACQUIRE_LOCK(&sched_mutex);
   ACQUIRE_LOCK(&sched_mutex);
-#endif
 
   /* A capability holds the state a native thread needs in
    * order to execute STG code. At least one capability is
 
   /* A capability holds the state a native thread needs in
    * order to execute STG code. At least one capability is
@@ -2083,12 +2624,11 @@ initScheduler(void)
     startTaskManager(0,taskStart);
 #endif
 
     startTaskManager(0,taskStart);
 #endif
 
-#if /* defined(SMP) ||*/ defined(PAR)
+#if /* defined(SMP) ||*/ defined(PARALLEL_HASKELL)
   initSparkPools();
 #endif
 
   RELEASE_LOCK(&sched_mutex);
   initSparkPools();
 #endif
 
   RELEASE_LOCK(&sched_mutex);
-
 }
 
 void
 }
 
 void
@@ -2097,6 +2637,7 @@ exitScheduler( void )
 #if defined(RTS_SUPPORTS_THREADS)
   stopTaskManager();
 #endif
 #if defined(RTS_SUPPORTS_THREADS)
   stopTaskManager();
 #endif
+  interrupted = rtsTrue;
   shutting_down_scheduler = rtsTrue;
 }
 
   shutting_down_scheduler = rtsTrue;
 }
 
@@ -2109,8 +2650,7 @@ exitScheduler( void )
    ToDo: no support for two-space collection at the moment???
    ------------------------------------------------------------------------- */
 
    ToDo: no support for two-space collection at the moment???
    ------------------------------------------------------------------------- */
 
-static
-SchedulerStatus
+static SchedulerStatus
 waitThread_(StgMainThread* m, Capability *initialCapability)
 {
   SchedulerStatus stat;
 waitThread_(StgMainThread* m, Capability *initialCapability)
 {
   SchedulerStatus stat;
@@ -2133,8 +2673,9 @@ waitThread_(StgMainThread* m, Capability *initialCapability)
 
 #if defined(RTS_SUPPORTS_THREADS)
   // Free the condition variable, returning it to the cache if possible.
 
 #if defined(RTS_SUPPORTS_THREADS)
   // Free the condition variable, returning it to the cache if possible.
-  if (bound_cond_cache == NULL) {
-      *bound_cond_cache = m->bound_thread_cond;
+  if (!bound_cond_cache_full) {
+      bound_cond_cache = m->bound_thread_cond;
+      bound_cond_cache_full = 1;
   } else {
       closeCondition(&m->bound_thread_cond);
   }
   } else {
       closeCondition(&m->bound_thread_cond);
   }
@@ -2204,11 +2745,15 @@ GetRoots( evac_fn evac )
   }
 #endif 
 
   }
 #endif 
 
+  if (blackhole_queue != END_TSO_QUEUE) {
+      evac((StgClosure **)&blackhole_queue);
+  }
+
   if (suspended_ccalling_threads != END_TSO_QUEUE) {
       evac((StgClosure **)&suspended_ccalling_threads);
   }
 
   if (suspended_ccalling_threads != END_TSO_QUEUE) {
       evac((StgClosure **)&suspended_ccalling_threads);
   }
 
-#if defined(PAR) || defined(GRAN)
+#if defined(PARALLEL_HASKELL) || defined(GRAN)
   markSparkQueue(evac);
 #endif
 
   markSparkQueue(evac);
 #endif
 
@@ -2216,25 +2761,6 @@ GetRoots( evac_fn evac )
   // mark the signal handlers (signals should be already blocked)
   markSignalHandlers(evac);
 #endif
   // mark the signal handlers (signals should be already blocked)
   markSignalHandlers(evac);
 #endif
-
-  // main threads which have completed need to be retained until they
-  // are dealt with in the main scheduler loop.  They won't be
-  // retained any other way: the GC will drop them from the
-  // all_threads list, so we have to be careful to treat them as roots
-  // here.
-  { 
-      StgMainThread *m;
-      for (m = main_threads; m != NULL; m = m->link) {
-         switch (m->tso->what_next) {
-         case ThreadComplete:
-         case ThreadKilled:
-             evac((StgClosure **)&m->tso);
-             break;
-         default:
-             break;
-         }
-      }
-  }
 }
 
 /* -----------------------------------------------------------------------------
 }
 
 /* -----------------------------------------------------------------------------
@@ -2297,7 +2823,8 @@ performGCWithRoots(void (*get_roots)(evac_fn))
 static StgTSO *
 threadStackOverflow(StgTSO *tso)
 {
 static StgTSO *
 threadStackOverflow(StgTSO *tso)
 {
-  nat new_stack_size, new_tso_size, stack_words;
+  nat new_stack_size, stack_words;
+  lnat new_tso_size;
   StgPtr new_sp;
   StgTSO *dest;
 
   StgPtr new_sp;
   StgTSO *dest;
 
@@ -2305,8 +2832,8 @@ threadStackOverflow(StgTSO *tso)
   if (tso->stack_size >= tso->max_stack_size) {
 
     IF_DEBUG(gc,
   if (tso->stack_size >= tso->max_stack_size) {
 
     IF_DEBUG(gc,
-            belch("@@ threadStackOverflow of TSO %d (%p): stack too large (now %ld; max is %ld)",
-                  tso->id, tso, tso->stack_size, tso->max_stack_size);
+            debugBelch("@@ threadStackOverflow of TSO %ld (%p): stack too large (now %ld; max is %ld)\n",
+                  (long)tso->id, tso, (long)tso->stack_size, (long)tso->max_stack_size);
             /* If we're debugging, just print out the top of the stack */
             printStackChunk(tso->sp, stg_min(tso->stack+tso->stack_size, 
                                              tso->sp+64)));
             /* If we're debugging, just print out the top of the stack */
             printStackChunk(tso->sp, stg_min(tso->stack+tso->stack_size, 
                                              tso->sp+64)));
@@ -2321,12 +2848,12 @@ threadStackOverflow(StgTSO *tso)
    * Finally round up so the TSO ends up as a whole number of blocks.
    */
   new_stack_size = stg_min(tso->stack_size * 2, tso->max_stack_size);
    * Finally round up so the TSO ends up as a whole number of blocks.
    */
   new_stack_size = stg_min(tso->stack_size * 2, tso->max_stack_size);
-  new_tso_size   = (nat)BLOCK_ROUND_UP(new_stack_size * sizeof(W_) + 
+  new_tso_size   = (lnat)BLOCK_ROUND_UP(new_stack_size * sizeof(W_) + 
                                       TSO_STRUCT_SIZE)/sizeof(W_);
   new_tso_size = round_to_mblocks(new_tso_size);  /* Be MBLOCK-friendly */
   new_stack_size = new_tso_size - TSO_STRUCT_SIZEW;
 
                                       TSO_STRUCT_SIZE)/sizeof(W_);
   new_tso_size = round_to_mblocks(new_tso_size);  /* Be MBLOCK-friendly */
   new_stack_size = new_tso_size - TSO_STRUCT_SIZEW;
 
-  IF_DEBUG(scheduler, fprintf(stderr,"== sched: increasing stack size from %d words to %d.\n", tso->stack_size, new_stack_size));
+  IF_DEBUG(scheduler, debugBelch("== sched: increasing stack size from %d words to %d.\n", tso->stack_size, new_stack_size));
 
   dest = (StgTSO *)allocate(new_tso_size);
   TICK_ALLOC_TSO(new_stack_size,0);
 
   dest = (StgTSO *)allocate(new_tso_size);
   TICK_ALLOC_TSO(new_stack_size,0);
@@ -2352,10 +2879,9 @@ threadStackOverflow(StgTSO *tso)
   tso->link = dest;
   tso->sp = (P_)&(tso->stack[tso->stack_size]);
   tso->why_blocked = NotBlocked;
   tso->link = dest;
   tso->sp = (P_)&(tso->stack[tso->stack_size]);
   tso->why_blocked = NotBlocked;
-  dest->mut_link = NULL;
 
   IF_PAR_DEBUG(verbose,
 
   IF_PAR_DEBUG(verbose,
-              belch("@@ threadStackOverflow of TSO %d (now at %p): stack size increased to %ld",
+              debugBelch("@@ threadStackOverflow of TSO %d (now at %p): stack size increased to %ld\n",
                     tso->id, tso, tso->stack_size);
               /* If we're debugging, just print out the top of the stack */
               printStackChunk(tso->sp, stg_min(tso->stack+tso->stack_size, 
                     tso->id, tso, tso->stack_size);
               /* If we're debugging, just print out the top of the stack */
               printStackChunk(tso->sp, stg_min(tso->stack+tso->stack_size, 
@@ -2378,7 +2904,7 @@ STATIC_INLINE void
 unblockCount ( StgBlockingQueueElement *bqe, StgClosure *node )
 {
 }
 unblockCount ( StgBlockingQueueElement *bqe, StgClosure *node )
 {
 }
-#elif defined(PAR)
+#elif defined(PARALLEL_HASKELL)
 STATIC_INLINE void
 unblockCount ( StgBlockingQueueElement *bqe, StgClosure *node )
 {
 STATIC_INLINE void
 unblockCount ( StgBlockingQueueElement *bqe, StgClosure *node )
 {
@@ -2444,14 +2970,14 @@ unblockOneLocked(StgBlockingQueueElement *bqe, StgClosure *node)
     }
     /* the thread-queue-overhead is accounted for in either Resume or UnblockThread */
     IF_GRAN_DEBUG(bq,
     }
     /* the thread-queue-overhead is accounted for in either Resume or UnblockThread */
     IF_GRAN_DEBUG(bq,
-                 fprintf(stderr," %s TSO %d (%p) [PE %d] (block_info.closure=%p) (next=%p) ,",
+                 debugBelch(" %s TSO %d (%p) [PE %d] (block_info.closure=%p) (next=%p) ,",
                          (node_loc==tso_loc ? "Local" : "Global"), 
                          tso->id, tso, CurrentProc, tso->block_info.closure, tso->link));
     tso->block_info.closure = NULL;
                          (node_loc==tso_loc ? "Local" : "Global"), 
                          tso->id, tso, CurrentProc, tso->block_info.closure, tso->link));
     tso->block_info.closure = NULL;
-    IF_DEBUG(scheduler,belch("-- Waking up thread %ld (%p)", 
+    IF_DEBUG(scheduler,debugBelch("-- Waking up thread %ld (%p)\n", 
                             tso->id, tso));
 }
                             tso->id, tso));
 }
-#elif defined(PAR)
+#elif defined(PARALLEL_HASKELL)
 static StgBlockingQueueElement *
 unblockOneLocked(StgBlockingQueueElement *bqe, StgClosure *node)
 {
 static StgBlockingQueueElement *
 unblockOneLocked(StgBlockingQueueElement *bqe, StgClosure *node)
 {
@@ -2462,9 +2988,9 @@ unblockOneLocked(StgBlockingQueueElement *bqe, StgClosure *node)
       ASSERT(((StgTSO *)bqe)->why_blocked != NotBlocked);
       /* if it's a TSO just push it onto the run_queue */
       next = bqe->link;
       ASSERT(((StgTSO *)bqe)->why_blocked != NotBlocked);
       /* if it's a TSO just push it onto the run_queue */
       next = bqe->link;
-      // ((StgTSO *)bqe)->link = END_TSO_QUEUE; // debugging?
-      PUSH_ON_RUN_QUEUE((StgTSO *)bqe); 
-      THREAD_RUNNABLE();
+      ((StgTSO *)bqe)->link = END_TSO_QUEUE; // debugging?
+      APPEND_TO_RUN_QUEUE((StgTSO *)bqe); 
+      threadRunnable();
       unblockCount(bqe, node);
       /* reset blocking status after dumping event */
       ((StgTSO *)bqe)->why_blocked = NotBlocked;
       unblockCount(bqe, node);
       /* reset blocking status after dumping event */
       ((StgTSO *)bqe)->why_blocked = NotBlocked;
@@ -2493,11 +3019,11 @@ unblockOneLocked(StgBlockingQueueElement *bqe, StgClosure *node)
           (StgClosure *)bqe);
 # endif
     }
           (StgClosure *)bqe);
 # endif
     }
-  IF_PAR_DEBUG(bq, fprintf(stderr, ", %p (%s)", bqe, info_type((StgClosure*)bqe)));
+  IF_PAR_DEBUG(bq, debugBelch(", %p (%s)\n", bqe, info_type((StgClosure*)bqe)));
   return next;
 }
 
   return next;
 }
 
-#else /* !GRAN && !PAR */
+#else /* !GRAN && !PARALLEL_HASKELL */
 static StgTSO *
 unblockOneLocked(StgTSO *tso)
 {
 static StgTSO *
 unblockOneLocked(StgTSO *tso)
 {
@@ -2507,14 +3033,15 @@ unblockOneLocked(StgTSO *tso)
   ASSERT(tso->why_blocked != NotBlocked);
   tso->why_blocked = NotBlocked;
   next = tso->link;
   ASSERT(tso->why_blocked != NotBlocked);
   tso->why_blocked = NotBlocked;
   next = tso->link;
-  PUSH_ON_RUN_QUEUE(tso);
-  THREAD_RUNNABLE();
-  IF_DEBUG(scheduler,sched_belch("waking up thread %ld", tso->id));
+  tso->link = END_TSO_QUEUE;
+  APPEND_TO_RUN_QUEUE(tso);
+  threadRunnable();
+  IF_DEBUG(scheduler,sched_belch("waking up thread %ld", (long)tso->id));
   return next;
 }
 #endif
 
   return next;
 }
 #endif
 
-#if defined(GRAN) || defined(PAR)
+#if defined(GRAN) || defined(PARALLEL_HASKELL)
 INLINE_ME StgBlockingQueueElement *
 unblockOne(StgBlockingQueueElement *bqe, StgClosure *node)
 {
 INLINE_ME StgBlockingQueueElement *
 unblockOne(StgBlockingQueueElement *bqe, StgClosure *node)
 {
@@ -2543,7 +3070,7 @@ awakenBlockedQueue(StgBlockingQueueElement *q, StgClosure *node)
   nat len = 0; 
 
   IF_GRAN_DEBUG(bq, 
   nat len = 0; 
 
   IF_GRAN_DEBUG(bq, 
-               belch("##-_ AwBQ for node %p on PE %d @ %ld by TSO %d (%p): ", \
+               debugBelch("##-_ AwBQ for node %p on PE %d @ %ld by TSO %d (%p): \n", \
                      node, CurrentProc, CurrentTime[CurrentProc], 
                      CurrentTSO->id, CurrentTSO));
 
                      node, CurrentProc, CurrentTime[CurrentProc], 
                      CurrentTSO->id, CurrentTSO));
 
@@ -2560,13 +3087,13 @@ awakenBlockedQueue(StgBlockingQueueElement *q, StgClosure *node)
   */
   if (CurrentProc!=node_loc) {
     IF_GRAN_DEBUG(bq, 
   */
   if (CurrentProc!=node_loc) {
     IF_GRAN_DEBUG(bq, 
-                 belch("## node %p is on PE %d but CurrentProc is %d (TSO %d); assuming fake fetch and adjusting bitmask (old: %#x)",
+                 debugBelch("## node %p is on PE %d but CurrentProc is %d (TSO %d); assuming fake fetch and adjusting bitmask (old: %#x)\n",
                        node, node_loc, CurrentProc, CurrentTSO->id, 
                        // CurrentTSO, where_is(CurrentTSO),
                        node->header.gran.procs));
     node->header.gran.procs = (node->header.gran.procs) | PE_NUMBER(CurrentProc);
     IF_GRAN_DEBUG(bq, 
                        node, node_loc, CurrentProc, CurrentTSO->id, 
                        // CurrentTSO, where_is(CurrentTSO),
                        node->header.gran.procs));
     node->header.gran.procs = (node->header.gran.procs) | PE_NUMBER(CurrentProc);
     IF_GRAN_DEBUG(bq, 
-                 belch("## new bitmask of node %p is %#x",
+                 debugBelch("## new bitmask of node %p is %#x\n",
                        node, node->header.gran.procs));
     if (RtsFlags.GranFlags.GranSimStats.Global) {
       globalGranStats.tot_fake_fetches++;
                        node, node->header.gran.procs));
     if (RtsFlags.GranFlags.GranSimStats.Global) {
       globalGranStats.tot_fake_fetches++;
@@ -2601,7 +3128,7 @@ awakenBlockedQueue(StgBlockingQueueElement *q, StgClosure *node)
     ((StgRBH *)node)->mut_link       = (StgMutClosure *)((StgRBHSave *)bqe)->payload[1];
 
     IF_GRAN_DEBUG(bq,
     ((StgRBH *)node)->mut_link       = (StgMutClosure *)((StgRBHSave *)bqe)->payload[1];
 
     IF_GRAN_DEBUG(bq,
-                 belch("## Filled in RBH_Save for %p (%s) at end of AwBQ",
+                 debugBelch("## Filled in RBH_Save for %p (%s) at end of AwBQ\n",
                        node, info_type(node)));
   }
 
                        node, info_type(node)));
   }
 
@@ -2613,10 +3140,10 @@ awakenBlockedQueue(StgBlockingQueueElement *q, StgClosure *node)
     globalGranStats.tot_awbq++;             // total no. of bqs awakened
   }
   IF_GRAN_DEBUG(bq,
     globalGranStats.tot_awbq++;             // total no. of bqs awakened
   }
   IF_GRAN_DEBUG(bq,
-               fprintf(stderr,"## BQ Stats of %p: [%d entries] %s\n",
+               debugBelch("## BQ Stats of %p: [%d entries] %s\n",
                        node, len, (bqe!=END_BQ_QUEUE) ? "RBH" : ""));
 }
                        node, len, (bqe!=END_BQ_QUEUE) ? "RBH" : ""));
 }
-#elif defined(PAR)
+#elif defined(PARALLEL_HASKELL)
 void 
 awakenBlockedQueue(StgBlockingQueueElement *q, StgClosure *node)
 {
 void 
 awakenBlockedQueue(StgBlockingQueueElement *q, StgClosure *node)
 {
@@ -2625,12 +3152,12 @@ awakenBlockedQueue(StgBlockingQueueElement *q, StgClosure *node)
   ACQUIRE_LOCK(&sched_mutex);
 
   IF_PAR_DEBUG(verbose, 
   ACQUIRE_LOCK(&sched_mutex);
 
   IF_PAR_DEBUG(verbose, 
-              belch("##-_ AwBQ for node %p on [%x]: ",
+              debugBelch("##-_ AwBQ for node %p on [%x]: \n",
                     node, mytid));
 #ifdef DIST  
   //RFP
   if(get_itbl(q)->type == CONSTR || q==END_BQ_QUEUE) {
                     node, mytid));
 #ifdef DIST  
   //RFP
   if(get_itbl(q)->type == CONSTR || q==END_BQ_QUEUE) {
-    IF_PAR_DEBUG(verbose, belch("## ... nothing to unblock so lets just return. RFP (BUG?)"));
+    IF_PAR_DEBUG(verbose, debugBelch("## ... nothing to unblock so lets just return. RFP (BUG?)\n"));
     return;
   }
 #endif
     return;
   }
 #endif
@@ -2648,9 +3175,8 @@ awakenBlockedQueue(StgBlockingQueueElement *q, StgClosure *node)
   RELEASE_LOCK(&sched_mutex);
 }
 
   RELEASE_LOCK(&sched_mutex);
 }
 
-#else   /* !GRAN && !PAR */
+#else   /* !GRAN && !PARALLEL_HASKELL */
 
 
-#ifdef RTS_SUPPORTS_THREADS
 void
 awakenBlockedQueueNoLock(StgTSO *tso)
 {
 void
 awakenBlockedQueueNoLock(StgTSO *tso)
 {
@@ -2658,7 +3184,6 @@ awakenBlockedQueueNoLock(StgTSO *tso)
     tso = unblockOneLocked(tso);
   }
 }
     tso = unblockOneLocked(tso);
   }
 }
-#endif
 
 void
 awakenBlockedQueue(StgTSO *tso)
 
 void
 awakenBlockedQueue(StgTSO *tso)
@@ -2681,9 +3206,6 @@ interruptStgRts(void)
 {
     interrupted    = 1;
     context_switch = 1;
 {
     interrupted    = 1;
     context_switch = 1;
-#ifdef RTS_SUPPORTS_THREADS
-    wakeBlockedWorkerThread();
-#endif
 }
 
 /* -----------------------------------------------------------------------------
 }
 
 /* -----------------------------------------------------------------------------
@@ -2694,7 +3216,7 @@ interruptStgRts(void)
    This has nothing to do with the UnblockThread event in GranSim. -- HWL
    -------------------------------------------------------------------------- */
 
    This has nothing to do with the UnblockThread event in GranSim. -- HWL
    -------------------------------------------------------------------------- */
 
-#if defined(GRAN) || defined(PAR)
+#if defined(GRAN) || defined(PARALLEL_HASKELL)
 /*
   NB: only the type of the blocking queue is different in GranSim and GUM
       the operations on the queue-elements are the same
 /*
   NB: only the type of the blocking queue is different in GranSim and GUM
       the operations on the queue-elements are the same
@@ -2713,6 +3235,14 @@ unblockThread(StgTSO *tso)
   case NotBlocked:
     return;  /* not blocked */
 
   case NotBlocked:
     return;  /* not blocked */
 
+  case BlockedOnSTM:
+    // Be careful: nothing to do here!  We tell the scheduler that the thread
+    // is runnable and we leave it to the stack-walking code to abort the 
+    // transaction while unwinding the stack.  We should perhaps have a debugging
+    // test to make sure that this really happens and that the 'zombie' transaction
+    // does not get committed.
+    goto done;
+
   case BlockedOnMVar:
     ASSERT(get_itbl(tso->block_info.closure)->type == MVAR);
     {
   case BlockedOnMVar:
     ASSERT(get_itbl(tso->block_info.closure)->type == MVAR);
     {
@@ -2779,7 +3309,7 @@ unblockThread(StgTSO *tso)
 
   case BlockedOnRead:
   case BlockedOnWrite:
 
   case BlockedOnRead:
   case BlockedOnWrite:
-#if defined(mingw32_TARGET_OS)
+#if defined(mingw32_HOST_OS)
   case BlockedOnDoProc:
 #endif
     {
   case BlockedOnDoProc:
 #endif
     {
@@ -2846,6 +3376,14 @@ unblockThread(StgTSO *tso)
 
   switch (tso->why_blocked) {
 
 
   switch (tso->why_blocked) {
 
+  case BlockedOnSTM:
+    // Be careful: nothing to do here!  We tell the scheduler that the thread
+    // is runnable and we leave it to the stack-walking code to abort the 
+    // transaction while unwinding the stack.  We should perhaps have a debugging
+    // test to make sure that this really happens and that the 'zombie' transaction
+    // does not get committed.
+    goto done;
+
   case BlockedOnMVar:
     ASSERT(get_itbl(tso->block_info.closure)->type == MVAR);
     {
   case BlockedOnMVar:
     ASSERT(get_itbl(tso->block_info.closure)->type == MVAR);
     {
@@ -2867,12 +3405,9 @@ unblockThread(StgTSO *tso)
     }
 
   case BlockedOnBlackHole:
     }
 
   case BlockedOnBlackHole:
-    ASSERT(get_itbl(tso->block_info.closure)->type == BLACKHOLE_BQ);
     {
     {
-      StgBlockingQueue *bq = (StgBlockingQueue *)(tso->block_info.closure);
-
-      last = &bq->blocking_queue;
-      for (t = bq->blocking_queue; t != END_TSO_QUEUE; 
+      last = &blackhole_queue;
+      for (t = blackhole_queue; t != END_TSO_QUEUE; 
           last = &t->link, t = t->link) {
        if (t == tso) {
          *last = tso->link;
           last = &t->link, t = t->link) {
        if (t == tso) {
          *last = tso->link;
@@ -2909,7 +3444,7 @@ unblockThread(StgTSO *tso)
 
   case BlockedOnRead:
   case BlockedOnWrite:
 
   case BlockedOnRead:
   case BlockedOnWrite:
-#if defined(mingw32_TARGET_OS)
+#if defined(mingw32_HOST_OS)
   case BlockedOnDoProc:
 #endif
     {
   case BlockedOnDoProc:
 #endif
     {
@@ -2959,11 +3494,54 @@ unblockThread(StgTSO *tso)
   tso->link = END_TSO_QUEUE;
   tso->why_blocked = NotBlocked;
   tso->block_info.closure = NULL;
   tso->link = END_TSO_QUEUE;
   tso->why_blocked = NotBlocked;
   tso->block_info.closure = NULL;
-  PUSH_ON_RUN_QUEUE(tso);
+  APPEND_TO_RUN_QUEUE(tso);
 }
 #endif
 
 /* -----------------------------------------------------------------------------
 }
 #endif
 
 /* -----------------------------------------------------------------------------
+ * checkBlackHoles()
+ *
+ * Check the blackhole_queue for threads that can be woken up.  We do
+ * this periodically: before every GC, and whenever the run queue is
+ * empty.
+ *
+ * An elegant solution might be to just wake up all the blocked
+ * threads with awakenBlockedQueue occasionally: they'll go back to
+ * sleep again if the object is still a BLACKHOLE.  Unfortunately this
+ * doesn't give us a way to tell whether we've actually managed to
+ * wake up any threads, so we would be busy-waiting.
+ *
+ * -------------------------------------------------------------------------- */
+
+static rtsBool
+checkBlackHoles( void )
+{
+    StgTSO **prev, *t;
+    rtsBool any_woke_up = rtsFalse;
+    StgHalfWord type;
+
+    IF_DEBUG(scheduler, sched_belch("checking threads blocked on black holes"));
+
+    // ASSUMES: sched_mutex
+    prev = &blackhole_queue;
+    t = blackhole_queue;
+    while (t != END_TSO_QUEUE) {
+       ASSERT(t->why_blocked == BlockedOnBlackHole);
+       type = get_itbl(t->block_info.closure)->type;
+       if (type != BLACKHOLE && type != CAF_BLACKHOLE) {
+           t = unblockOneLocked(t);
+           *prev = t;
+           any_woke_up = rtsTrue;
+       } else {
+           prev = &t->link;
+           t = t->link;
+       }
+    }
+
+    return any_woke_up;
+}
+
+/* -----------------------------------------------------------------------------
  * raiseAsync()
  *
  * The following function implements the magic for raising an
  * raiseAsync()
  *
  * The following function implements the magic for raising an
@@ -3000,9 +3578,13 @@ unblockThread(StgTSO *tso)
 void 
 deleteThread(StgTSO *tso)
 {
 void 
 deleteThread(StgTSO *tso)
 {
-  raiseAsync(tso,NULL);
+  if (tso->why_blocked != BlockedOnCCall &&
+      tso->why_blocked != BlockedOnCCall_NoUnblockExc) {
+      raiseAsync(tso,NULL);
+  }
 }
 
 }
 
+#ifdef FORKPROCESS_PRIMOP_SUPPORTED
 static void 
 deleteThreadImmediately(StgTSO *tso)
 { // for forkProcess only:
 static void 
 deleteThreadImmediately(StgTSO *tso)
 { // for forkProcess only:
@@ -3011,13 +3593,15 @@ deleteThreadImmediately(StgTSO *tso)
   if (tso->what_next == ThreadComplete || tso->what_next == ThreadKilled) {
       return;
   }
   if (tso->what_next == ThreadComplete || tso->what_next == ThreadKilled) {
       return;
   }
-#if defined(RTS_SUPPORTS_THREADS)
-  if (tso->why_blocked != BlockedOnCCall
-      && tso->why_blocked != BlockedOnCCall_NoUnblockExc)
-#endif
+
+  if (tso->why_blocked != BlockedOnCCall &&
+      tso->why_blocked != BlockedOnCCall_NoUnblockExc) {
     unblockThread(tso);
     unblockThread(tso);
+  }
+
   tso->what_next = ThreadKilled;
 }
   tso->what_next = ThreadKilled;
 }
+#endif
 
 void
 raiseAsyncWithLock(StgTSO *tso, StgClosure *exception)
 
 void
 raiseAsyncWithLock(StgTSO *tso, StgClosure *exception)
@@ -3032,6 +3616,12 @@ raiseAsyncWithLock(StgTSO *tso, StgClosure *exception)
 void
 raiseAsync(StgTSO *tso, StgClosure *exception)
 {
 void
 raiseAsync(StgTSO *tso, StgClosure *exception)
 {
+    raiseAsync_(tso, exception, rtsFalse);
+}
+
+static void
+raiseAsync_(StgTSO *tso, StgClosure *exception, rtsBool stop_at_atomically)
+{
     StgRetInfoTable *info;
     StgPtr sp;
   
     StgRetInfoTable *info;
     StgPtr sp;
   
@@ -3041,7 +3631,7 @@ raiseAsync(StgTSO *tso, StgClosure *exception)
     }
 
     IF_DEBUG(scheduler, 
     }
 
     IF_DEBUG(scheduler, 
-            sched_belch("raising exception in thread %ld.", tso->id));
+            sched_belch("raising exception in thread %ld.", (long)tso->id));
     
     // Remove it from any blocking queues
     unblockThread(tso);
     
     // Remove it from any blocking queues
     unblockThread(tso);
@@ -3075,6 +3665,10 @@ raiseAsync(StgTSO *tso, StgClosure *exception)
        // top of the stack applied to the exception.
        // 
        // 5. If it's a STOP_FRAME, then kill the thread.
        // top of the stack applied to the exception.
        // 
        // 5. If it's a STOP_FRAME, then kill the thread.
+        // 
+        // NB: if we pass an ATOMICALLY_FRAME then abort the associated 
+        // transaction
+       
        
        StgPtr frame;
        
        
        StgPtr frame;
        
@@ -3083,13 +3677,45 @@ raiseAsync(StgTSO *tso, StgClosure *exception)
        
        while (info->i.type != UPDATE_FRAME
               && (info->i.type != CATCH_FRAME || exception == NULL)
        
        while (info->i.type != UPDATE_FRAME
               && (info->i.type != CATCH_FRAME || exception == NULL)
-              && info->i.type != STOP_FRAME) {
+              && info->i.type != STOP_FRAME
+              && (info->i.type != ATOMICALLY_FRAME || stop_at_atomically == rtsFalse))
+       {
+            if (info->i.type == CATCH_RETRY_FRAME || info->i.type == ATOMICALLY_FRAME) {
+              // IF we find an ATOMICALLY_FRAME then we abort the
+              // current transaction and propagate the exception.  In
+              // this case (unlike ordinary exceptions) we do not care
+              // whether the transaction is valid or not because its
+              // possible validity cannot have caused the exception
+              // and will not be visible after the abort.
+              IF_DEBUG(stm,
+                       debugBelch("Found atomically block delivering async exception\n"));
+              stmAbortTransaction(tso -> trec);
+              tso -> trec = stmGetEnclosingTRec(tso -> trec);
+            }
            frame += stack_frame_sizeW((StgClosure *)frame);
            info = get_ret_itbl((StgClosure *)frame);
        }
        
        switch (info->i.type) {
            
            frame += stack_frame_sizeW((StgClosure *)frame);
            info = get_ret_itbl((StgClosure *)frame);
        }
        
        switch (info->i.type) {
            
+       case ATOMICALLY_FRAME:
+           ASSERT(stop_at_atomically);
+           ASSERT(stmGetEnclosingTRec(tso->trec) == NO_TREC);
+           stmCondemnTransaction(tso -> trec);
+#ifdef REG_R1
+           tso->sp = frame;
+#else
+           // R1 is not a register: the return convention for IO in
+           // this case puts the return value on the stack, so we
+           // need to set up the stack to return to the atomically
+           // frame properly...
+           tso->sp = frame - 2;
+           tso->sp[1] = (StgWord) &stg_NO_FINALIZER_closure; // why not?
+           tso->sp[0] = (StgWord) &stg_ut_1_0_unreg_info;
+#endif
+           tso->what_next = ThreadRunGHC;
+           return;
+
        case CATCH_FRAME:
            // If we find a CATCH_FRAME, and we've got an exception to raise,
            // then build the THUNK raise(exception), and leave it on
        case CATCH_FRAME:
            // If we find a CATCH_FRAME, and we've got an exception to raise,
            // then build the THUNK raise(exception), and leave it on
@@ -3156,9 +3782,9 @@ raiseAsync(StgTSO *tso, StgClosure *exception)
            TICK_ALLOC_UP_THK(words+1,0);
            
            IF_DEBUG(scheduler,
            TICK_ALLOC_UP_THK(words+1,0);
            
            IF_DEBUG(scheduler,
-                    fprintf(stderr,  "sched: Updating ");
+                    debugBelch("sched: Updating ");
                     printPtr((P_)((StgUpdateFrame *)frame)->updatee); 
                     printPtr((P_)((StgUpdateFrame *)frame)->updatee); 
-                    fprintf(stderr,  " with ");
+                    debugBelch(" with ");
                     printObj((StgClosure *)ap);
                );
 
                     printObj((StgClosure *)ap);
                );
 
@@ -3176,7 +3802,8 @@ raiseAsync(StgTSO *tso, StgClosure *exception)
            //
            if (!closure_IND(((StgUpdateFrame *)frame)->updatee)) {
                // revert the black hole
            //
            if (!closure_IND(((StgUpdateFrame *)frame)->updatee)) {
                // revert the black hole
-               UPD_IND_NOLOCK(((StgUpdateFrame *)frame)->updatee,ap);
+               UPD_IND_NOLOCK(((StgUpdateFrame *)frame)->updatee,
+                              (StgClosure *)ap);
            }
            sp += sizeofW(StgUpdateFrame) - 1;
            sp[0] = (W_)ap; // push onto stack
            }
            sp += sizeofW(StgUpdateFrame) - 1;
            sp[0] = (W_)ap; // push onto stack
@@ -3198,6 +3825,137 @@ raiseAsync(StgTSO *tso, StgClosure *exception)
 }
 
 /* -----------------------------------------------------------------------------
 }
 
 /* -----------------------------------------------------------------------------
+   raiseExceptionHelper
+   
+   This function is called by the raise# primitve, just so that we can
+   move some of the tricky bits of raising an exception from C-- into
+   C.  Who knows, it might be a useful re-useable thing here too.
+   -------------------------------------------------------------------------- */
+
+StgWord
+raiseExceptionHelper (StgTSO *tso, StgClosure *exception)
+{
+    StgClosure *raise_closure = NULL;
+    StgPtr p, next;
+    StgRetInfoTable *info;
+    //
+    // This closure represents the expression 'raise# E' where E
+    // is the exception raise.  It is used to overwrite all the
+    // thunks which are currently under evaluataion.
+    //
+
+    //    
+    // LDV profiling: stg_raise_info has THUNK as its closure
+    // type. Since a THUNK takes at least MIN_UPD_SIZE words in its
+    // payload, MIN_UPD_SIZE is more approprate than 1.  It seems that
+    // 1 does not cause any problem unless profiling is performed.
+    // However, when LDV profiling goes on, we need to linearly scan
+    // small object pool, where raise_closure is stored, so we should
+    // use MIN_UPD_SIZE.
+    //
+    // raise_closure = (StgClosure *)RET_STGCALL1(P_,allocate,
+    //                                        sizeofW(StgClosure)+1);
+    //
+
+    //
+    // Walk up the stack, looking for the catch frame.  On the way,
+    // we update any closures pointed to from update frames with the
+    // raise closure that we just built.
+    //
+    p = tso->sp;
+    while(1) {
+       info = get_ret_itbl((StgClosure *)p);
+       next = p + stack_frame_sizeW((StgClosure *)p);
+       switch (info->i.type) {
+           
+       case UPDATE_FRAME:
+           // Only create raise_closure if we need to.
+           if (raise_closure == NULL) {
+               raise_closure = 
+                   (StgClosure *)allocate(sizeofW(StgClosure)+MIN_UPD_SIZE);
+               SET_HDR(raise_closure, &stg_raise_info, CCCS);
+               raise_closure->payload[0] = exception;
+           }
+           UPD_IND(((StgUpdateFrame *)p)->updatee,raise_closure);
+           p = next;
+           continue;
+
+        case ATOMICALLY_FRAME:
+            IF_DEBUG(stm, debugBelch("Found ATOMICALLY_FRAME at %p\n", p));
+            tso->sp = p;
+            return ATOMICALLY_FRAME;
+           
+       case CATCH_FRAME:
+           tso->sp = p;
+           return CATCH_FRAME;
+
+        case CATCH_STM_FRAME:
+            IF_DEBUG(stm, debugBelch("Found CATCH_STM_FRAME at %p\n", p));
+            tso->sp = p;
+            return CATCH_STM_FRAME;
+           
+       case STOP_FRAME:
+           tso->sp = p;
+           return STOP_FRAME;
+
+        case CATCH_RETRY_FRAME:
+       default:
+           p = next; 
+           continue;
+       }
+    }
+}
+
+
+/* -----------------------------------------------------------------------------
+   findRetryFrameHelper
+
+   This function is called by the retry# primitive.  It traverses the stack
+   leaving tso->sp referring to the frame which should handle the retry.  
+
+   This should either be a CATCH_RETRY_FRAME (if the retry# is within an orElse#) 
+   or should be a ATOMICALLY_FRAME (if the retry# reaches the top level).  
+
+   We skip CATCH_STM_FRAMEs because retries are not considered to be exceptions,
+   despite the similar implementation.
+
+   We should not expect to see CATCH_FRAME or STOP_FRAME because those should
+   not be created within memory transactions.
+   -------------------------------------------------------------------------- */
+
+StgWord
+findRetryFrameHelper (StgTSO *tso)
+{
+  StgPtr           p, next;
+  StgRetInfoTable *info;
+
+  p = tso -> sp;
+  while (1) {
+    info = get_ret_itbl((StgClosure *)p);
+    next = p + stack_frame_sizeW((StgClosure *)p);
+    switch (info->i.type) {
+      
+    case ATOMICALLY_FRAME:
+      IF_DEBUG(stm, debugBelch("Found ATOMICALLY_FRAME at %p during retrry\n", p));
+      tso->sp = p;
+      return ATOMICALLY_FRAME;
+      
+    case CATCH_RETRY_FRAME:
+      IF_DEBUG(stm, debugBelch("Found CATCH_RETRY_FRAME at %p during retrry\n", p));
+      tso->sp = p;
+      return CATCH_RETRY_FRAME;
+      
+    case CATCH_STM_FRAME:
+    default:
+      ASSERT(info->i.type != CATCH_FRAME);
+      ASSERT(info->i.type != STOP_FRAME);
+      p = next; 
+      continue;
+    }
+  }
+}
+
+/* -----------------------------------------------------------------------------
    resurrectThreads is called after garbage collection on the list of
    threads found to be garbage.  Each of these threads will be woken
    up and sent a signal: BlockedOnDeadMVar if the thread was blocked
    resurrectThreads is called after garbage collection on the list of
    threads found to be garbage.  Each of these threads will be woken
    up and sent a signal: BlockedOnDeadMVar if the thread was blocked
@@ -3227,6 +3985,9 @@ resurrectThreads( StgTSO *threads )
     case BlockedOnBlackHole:
       raiseAsync(tso,(StgClosure *)NonTermination_closure);
       break;
     case BlockedOnBlackHole:
       raiseAsync(tso,(StgClosure *)NonTermination_closure);
       break;
+    case BlockedOnSTM:
+      raiseAsync(tso,(StgClosure *)BlockedIndefinitely_closure);
+      break;
     case NotBlocked:
       /* This might happen if the thread was blocked on a black hole
        * belonging to a thread that we've just woken up (raiseAsync
     case NotBlocked:
       /* This might happen if the thread was blocked on a black hole
        * belonging to a thread that we've just woken up (raiseAsync
@@ -3239,140 +4000,77 @@ resurrectThreads( StgTSO *threads )
   }
 }
 
   }
 }
 
-/* -----------------------------------------------------------------------------
- * Blackhole detection: if we reach a deadlock, test whether any
- * threads are blocked on themselves.  Any threads which are found to
- * be self-blocked get sent a NonTermination exception.
- *
- * This is only done in a deadlock situation in order to avoid
- * performance overhead in the normal case.
- *
- * Locks: sched_mutex is held upon entry and exit.
- * -------------------------------------------------------------------------- */
-
-static void
-detectBlackHoles( void )
-{
-    StgTSO *tso = all_threads;
-    StgClosure *frame;
-    StgClosure *blocked_on;
-    StgRetInfoTable *info;
-
-    for (tso = all_threads; tso != END_TSO_QUEUE; tso = tso->global_link) {
-
-       while (tso->what_next == ThreadRelocated) {
-           tso = tso->link;
-           ASSERT(get_itbl(tso)->type == TSO);
-       }
-      
-       if (tso->why_blocked != BlockedOnBlackHole) {
-           continue;
-       }
-       blocked_on = tso->block_info.closure;
-
-       frame = (StgClosure *)tso->sp;
-
-       while(1) {
-           info = get_ret_itbl(frame);
-           switch (info->i.type) {
-           case UPDATE_FRAME:
-               if (((StgUpdateFrame *)frame)->updatee == blocked_on) {
-                   /* We are blocking on one of our own computations, so
-                    * send this thread the NonTermination exception.  
-                    */
-                   IF_DEBUG(scheduler, 
-                            sched_belch("thread %d is blocked on itself", tso->id));
-                   raiseAsync(tso, (StgClosure *)NonTermination_closure);
-                   goto done;
-               }
-               
-               frame = (StgClosure *) ((StgUpdateFrame *)frame + 1);
-               continue;
-
-           case STOP_FRAME:
-               goto done;
-
-               // normal stack frames; do nothing except advance the pointer
-           default:
-               (StgPtr)frame += stack_frame_sizeW(frame);
-           }
-       }   
-       done: ;
-    }
-}
-
 /* ----------------------------------------------------------------------------
  * Debugging: why is a thread blocked
  * [Also provides useful information when debugging threaded programs
  *  at the Haskell source code level, so enable outside of DEBUG. --sof 7/02]
    ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------------
  * Debugging: why is a thread blocked
  * [Also provides useful information when debugging threaded programs
  *  at the Haskell source code level, so enable outside of DEBUG. --sof 7/02]
    ------------------------------------------------------------------------- */
 
-static
-void
+static void
 printThreadBlockage(StgTSO *tso)
 {
   switch (tso->why_blocked) {
   case BlockedOnRead:
 printThreadBlockage(StgTSO *tso)
 {
   switch (tso->why_blocked) {
   case BlockedOnRead:
-    fprintf(stderr,"is blocked on read from fd %d", tso->block_info.fd);
+    debugBelch("is blocked on read from fd %ld", tso->block_info.fd);
     break;
   case BlockedOnWrite:
     break;
   case BlockedOnWrite:
-    fprintf(stderr,"is blocked on write to fd %d", tso->block_info.fd);
+    debugBelch("is blocked on write to fd %ld", tso->block_info.fd);
     break;
     break;
-#if defined(mingw32_TARGET_OS)
+#if defined(mingw32_HOST_OS)
     case BlockedOnDoProc:
     case BlockedOnDoProc:
-    fprintf(stderr,"is blocked on proc (request: %d)", tso->block_info.async_result->reqID);
+    debugBelch("is blocked on proc (request: %ld)", tso->block_info.async_result->reqID);
     break;
 #endif
   case BlockedOnDelay:
     break;
 #endif
   case BlockedOnDelay:
-    fprintf(stderr,"is blocked until %d", tso->block_info.target);
+    debugBelch("is blocked until %ld", tso->block_info.target);
     break;
   case BlockedOnMVar:
     break;
   case BlockedOnMVar:
-    fprintf(stderr,"is blocked on an MVar");
+    debugBelch("is blocked on an MVar");
     break;
   case BlockedOnException:
     break;
   case BlockedOnException:
-    fprintf(stderr,"is blocked on delivering an exception to thread %d",
+    debugBelch("is blocked on delivering an exception to thread %d",
            tso->block_info.tso->id);
     break;
   case BlockedOnBlackHole:
            tso->block_info.tso->id);
     break;
   case BlockedOnBlackHole:
-    fprintf(stderr,"is blocked on a black hole");
+    debugBelch("is blocked on a black hole");
     break;
   case NotBlocked:
     break;
   case NotBlocked:
-    fprintf(stderr,"is not blocked");
+    debugBelch("is not blocked");
     break;
     break;
-#if defined(PAR)
+#if defined(PARALLEL_HASKELL)
   case BlockedOnGA:
   case BlockedOnGA:
-    fprintf(stderr,"is blocked on global address; local FM_BQ is %p (%s)",
+    debugBelch("is blocked on global address; local FM_BQ is %p (%s)",
            tso->block_info.closure, info_type(tso->block_info.closure));
     break;
   case BlockedOnGA_NoSend:
            tso->block_info.closure, info_type(tso->block_info.closure));
     break;
   case BlockedOnGA_NoSend:
-    fprintf(stderr,"is blocked on global address (no send); local FM_BQ is %p (%s)",
+    debugBelch("is blocked on global address (no send); local FM_BQ is %p (%s)",
            tso->block_info.closure, info_type(tso->block_info.closure));
     break;
 #endif
            tso->block_info.closure, info_type(tso->block_info.closure));
     break;
 #endif
-#if defined(RTS_SUPPORTS_THREADS)
   case BlockedOnCCall:
   case BlockedOnCCall:
-    fprintf(stderr,"is blocked on an external call");
+    debugBelch("is blocked on an external call");
     break;
   case BlockedOnCCall_NoUnblockExc:
     break;
   case BlockedOnCCall_NoUnblockExc:
-    fprintf(stderr,"is blocked on an external call (exceptions were already blocked)");
+    debugBelch("is blocked on an external call (exceptions were already blocked)");
+    break;
+  case BlockedOnSTM:
+    debugBelch("is blocked on an STM operation");
     break;
     break;
-#endif
   default:
     barf("printThreadBlockage: strange tso->why_blocked: %d for TSO %d (%d)",
         tso->why_blocked, tso->id, tso);
   }
 }
 
   default:
     barf("printThreadBlockage: strange tso->why_blocked: %d for TSO %d (%d)",
         tso->why_blocked, tso->id, tso);
   }
 }
 
-static
-void
+static void
 printThreadStatus(StgTSO *tso)
 {
   switch (tso->what_next) {
   case ThreadKilled:
 printThreadStatus(StgTSO *tso)
 {
   switch (tso->what_next) {
   case ThreadKilled:
-    fprintf(stderr,"has been killed");
+    debugBelch("has been killed");
     break;
   case ThreadComplete:
     break;
   case ThreadComplete:
-    fprintf(stderr,"has completed");
+    debugBelch("has completed");
     break;
   default:
     printThreadBlockage(tso);
     break;
   default:
     printThreadBlockage(tso);
@@ -3383,30 +4081,33 @@ void
 printAllThreads(void)
 {
   StgTSO *t;
 printAllThreads(void)
 {
   StgTSO *t;
-  void *label;
 
 # if defined(GRAN)
   char time_string[TIME_STR_LEN], node_str[NODE_STR_LEN];
   ullong_format_string(TIME_ON_PROC(CurrentProc), 
                       time_string, rtsFalse/*no commas!*/);
 
 
 # if defined(GRAN)
   char time_string[TIME_STR_LEN], node_str[NODE_STR_LEN];
   ullong_format_string(TIME_ON_PROC(CurrentProc), 
                       time_string, rtsFalse/*no commas!*/);
 
-  fprintf(stderr, "all threads at [%s]:\n", time_string);
-# elif defined(PAR)
+  debugBelch("all threads at [%s]:\n", time_string);
+# elif defined(PARALLEL_HASKELL)
   char time_string[TIME_STR_LEN], node_str[NODE_STR_LEN];
   ullong_format_string(CURRENT_TIME,
                       time_string, rtsFalse/*no commas!*/);
 
   char time_string[TIME_STR_LEN], node_str[NODE_STR_LEN];
   ullong_format_string(CURRENT_TIME,
                       time_string, rtsFalse/*no commas!*/);
 
-  fprintf(stderr,"all threads at [%s]:\n", time_string);
+  debugBelch("all threads at [%s]:\n", time_string);
 # else
 # else
-  fprintf(stderr,"all threads:\n");
+  debugBelch("all threads:\n");
 # endif
 
   for (t = all_threads; t != END_TSO_QUEUE; t = t->global_link) {
 # endif
 
   for (t = all_threads; t != END_TSO_QUEUE; t = t->global_link) {
-    fprintf(stderr, "\tthread %d @ %p ", t->id, (void *)t);
-    label = lookupThreadLabel(t->id);
-    if (label) fprintf(stderr,"[\"%s\"] ",(char *)label);
+    debugBelch("\tthread %d @ %p ", t->id, (void *)t);
+#if defined(DEBUG)
+    {
+      void *label = lookupThreadLabel(t->id);
+      if (label) debugBelch("[\"%s\"] ",(char *)label);
+    }
+#endif
     printThreadStatus(t);
     printThreadStatus(t);
-    fprintf(stderr,"\n");
+    debugBelch("\n");
   }
 }
     
   }
 }
     
@@ -3415,7 +4116,7 @@ printAllThreads(void)
 /* 
    Print a whole blocking queue attached to node (debugging only).
 */
 /* 
    Print a whole blocking queue attached to node (debugging only).
 */
-# if defined(PAR)
+# if defined(PARALLEL_HASKELL)
 void 
 print_bq (StgClosure *node)
 {
 void 
 print_bq (StgClosure *node)
 {
@@ -3423,7 +4124,7 @@ print_bq (StgClosure *node)
   StgTSO *tso;
   rtsBool end;
 
   StgTSO *tso;
   rtsBool end;
 
-  fprintf(stderr,"## BQ of closure %p (%s): ",
+  debugBelch("## BQ of closure %p (%s): ",
          node, info_type(node));
 
   /* should cover all closures that may have a blocking queue */
          node, info_type(node));
 
   /* should cover all closures that may have a blocking queue */
@@ -3463,18 +4164,18 @@ print_bqe (StgBlockingQueueElement *bqe)
 
     switch (get_itbl(bqe)->type) {
     case TSO:
 
     switch (get_itbl(bqe)->type) {
     case TSO:
-      fprintf(stderr," TSO %u (%x),",
+      debugBelch(" TSO %u (%x),",
              ((StgTSO *)bqe)->id, ((StgTSO *)bqe));
       break;
     case BLOCKED_FETCH:
              ((StgTSO *)bqe)->id, ((StgTSO *)bqe));
       break;
     case BLOCKED_FETCH:
-      fprintf(stderr," BF (node=%p, ga=((%x, %d, %x)),",
+      debugBelch(" BF (node=%p, ga=((%x, %d, %x)),",
              ((StgBlockedFetch *)bqe)->node, 
              ((StgBlockedFetch *)bqe)->ga.payload.gc.gtid,
              ((StgBlockedFetch *)bqe)->ga.payload.gc.slot,
              ((StgBlockedFetch *)bqe)->ga.weight);
       break;
     case CONSTR:
              ((StgBlockedFetch *)bqe)->node, 
              ((StgBlockedFetch *)bqe)->ga.payload.gc.gtid,
              ((StgBlockedFetch *)bqe)->ga.payload.gc.slot,
              ((StgBlockedFetch *)bqe)->ga.weight);
       break;
     case CONSTR:
-      fprintf(stderr," %s (IP %p),",
+      debugBelch(" %s (IP %p),",
              (get_itbl(bqe) == &stg_RBH_Save_0_info ? "RBH_Save_0" :
               get_itbl(bqe) == &stg_RBH_Save_1_info ? "RBH_Save_1" :
               get_itbl(bqe) == &stg_RBH_Save_2_info ? "RBH_Save_2" :
              (get_itbl(bqe) == &stg_RBH_Save_0_info ? "RBH_Save_0" :
               get_itbl(bqe) == &stg_RBH_Save_1_info ? "RBH_Save_1" :
               get_itbl(bqe) == &stg_RBH_Save_2_info ? "RBH_Save_2" :
@@ -3486,7 +4187,7 @@ print_bqe (StgBlockingQueueElement *bqe)
       break;
     }
   } /* for */
       break;
     }
   } /* for */
-  fputc('\n', stderr);
+  debugBelch("\n");
 }
 # elif defined(GRAN)
 void 
 }
 # elif defined(GRAN)
 void 
@@ -3504,7 +4205,7 @@ print_bq (StgClosure *node)
   ASSERT(node!=(StgClosure*)NULL);         // sanity check
   node_loc = where_is(node);
 
   ASSERT(node!=(StgClosure*)NULL);         // sanity check
   node_loc = where_is(node);
 
-  fprintf(stderr,"## BQ of closure %p (%s) on [PE %d]: ",
+  debugBelch("## BQ of closure %p (%s) on [PE %d]: ",
          node, info_type(node), node_loc);
 
   /* 
          node, info_type(node), node_loc);
 
   /* 
@@ -3524,11 +4225,11 @@ print_bq (StgClosure *node)
     tso_loc = where_is((StgClosure *)bqe);
     switch (get_itbl(bqe)->type) {
     case TSO:
     tso_loc = where_is((StgClosure *)bqe);
     switch (get_itbl(bqe)->type) {
     case TSO:
-      fprintf(stderr," TSO %d (%p) on [PE %d],",
+      debugBelch(" TSO %d (%p) on [PE %d],",
              ((StgTSO *)bqe)->id, (StgTSO *)bqe, tso_loc);
       break;
     case CONSTR:
              ((StgTSO *)bqe)->id, (StgTSO *)bqe, tso_loc);
       break;
     case CONSTR:
-      fprintf(stderr," %s (IP %p),",
+      debugBelch(" %s (IP %p),",
              (get_itbl(bqe) == &stg_RBH_Save_0_info ? "RBH_Save_0" :
               get_itbl(bqe) == &stg_RBH_Save_1_info ? "RBH_Save_1" :
               get_itbl(bqe) == &stg_RBH_Save_2_info ? "RBH_Save_2" :
              (get_itbl(bqe) == &stg_RBH_Save_0_info ? "RBH_Save_0" :
               get_itbl(bqe) == &stg_RBH_Save_1_info ? "RBH_Save_1" :
               get_itbl(bqe) == &stg_RBH_Save_2_info ? "RBH_Save_2" :
@@ -3540,30 +4241,11 @@ print_bq (StgClosure *node)
       break;
     }
   } /* for */
       break;
     }
   } /* for */
-  fputc('\n', stderr);
-}
-#else
-/* 
-   Nice and easy: only TSOs on the blocking queue
-*/
-void 
-print_bq (StgClosure *node)
-{
-  StgTSO *tso;
-
-  ASSERT(node!=(StgClosure*)NULL);         // sanity check
-  for (tso = ((StgBlockingQueue*)node)->blocking_queue;
-       tso != END_TSO_QUEUE; 
-       tso=tso->link) {
-    ASSERT(tso!=NULL && tso!=END_TSO_QUEUE);   // sanity check
-    ASSERT(get_itbl(tso)->type == TSO);  // guess what, sanity check
-    fprintf(stderr," TSO %d (%p),", tso->id, tso);
-  }
-  fputc('\n', stderr);
+  debugBelch("\n");
 }
 # endif
 
 }
 # endif
 
-#if defined(PAR)
+#if defined(PARALLEL_HASKELL)
 static nat
 run_queue_len(void)
 {
 static nat
 run_queue_len(void)
 {
@@ -3585,15 +4267,14 @@ sched_belch(char *s, ...)
   va_list ap;
   va_start(ap,s);
 #ifdef RTS_SUPPORTS_THREADS
   va_list ap;
   va_start(ap,s);
 #ifdef RTS_SUPPORTS_THREADS
-  fprintf(stderr, "sched (task %p): ", osThreadId());
-#elif defined(PAR)
-  fprintf(stderr, "== ");
+  debugBelch("sched (task %p): ", osThreadId());
+#elif defined(PARALLEL_HASKELL)
+  debugBelch("== ");
 #else
 #else
-  fprintf(stderr, "sched: ");
+  debugBelch("sched: ");
 #endif
 #endif
-  vfprintf(stderr, s, ap);
-  fprintf(stderr, "\n");
-  fflush(stderr);
+  vdebugBelch(s, ap);
+  debugBelch("\n");
   va_end(ap);
 }
 
   va_end(ap);
 }