R1 = StackOverflow;                                    \
     }                                                          \
   sched:                                                       \
-    SAVE_THREAD_STATE();                                       \
     StgTSO_what_next(CurrentTSO) = ThreadRunGHC::I16;          \
-    jump StgReturn;
+    jump stg_returnToSched;
 
 #define RETURN_TO_SCHED(why,what_next)                 \
-  SAVE_THREAD_STATE();                                 \
   StgTSO_what_next(CurrentTSO) = what_next::I16;       \
   R1 = why;                                            \
-  jump StgReturn;
+  jump stg_returnToSched;
+
+#define RETURN_TO_SCHED_BUT_FIRST(why,what_next,cont)  \
+  StgTSO_what_next(CurrentTSO) = what_next::I16;       \
+  R1 = why;                                            \
+  R2 = cont;                                           \
+  jump stg_returnToSchedButFirst;
 
 #define HP_GENERIC           RETURN_TO_SCHED(HeapOverflow,   ThreadRunGHC)
 #define YIELD_GENERIC        RETURN_TO_SCHED(ThreadYielding, ThreadRunGHC)
 #define YIELD_TO_INTERPRETER RETURN_TO_SCHED(ThreadYielding, ThreadInterpret)
 #define BLOCK_GENERIC        RETURN_TO_SCHED(ThreadBlocked,  ThreadRunGHC)
+#define BLOCK_BUT_FIRST(c)   RETURN_TO_SCHED_BUT_FIRST(ThreadBlocked, ThreadRunGHC, c)
 
 /* -----------------------------------------------------------------------------
    Heap checks in thunks/functions.
     jump takeMVarzh_fast;
 }
 
+// code fragment executed just before we return to the scheduler
+stg_block_takemvar_finally
+{
+#ifdef SMP
+    foreign "C" unlockClosure(R3 "ptr", stg_EMPTY_MVAR_info);
+#endif
+    jump StgReturn;
+}
+
 stg_block_takemvar
 {
     Sp_adj(-2);
     Sp(1) = R1;
     Sp(0) = stg_block_takemvar_info;
-    BLOCK_GENERIC;
+    R3 = R1;
+    BLOCK_BUT_FIRST(stg_block_takemvar_finally);
 }
 
 INFO_TABLE_RET( stg_block_putmvar, 2/*framesize*/, 0/*bitmap*/, RET_SMALL )
     jump putMVarzh_fast;
 }
 
+// code fragment executed just before we return to the scheduler
+stg_block_putmvar_finally
+{
+#ifdef SMP
+    foreign "C" unlockClosure(R3 "ptr", stg_FULL_MVAR_info);
+#endif
+    jump StgReturn;
+}
+
 stg_block_putmvar
 {
     Sp_adj(-3);
     Sp(2) = R2;
     Sp(1) = R1;
     Sp(0) = stg_block_putmvar_info;
-    BLOCK_GENERIC;
+    R3 = R1;
+    BLOCK_BUT_FIRST(stg_block_putmvar_finally);
 }
 
 #ifdef mingw32_HOST_OS
 
     StgTSO_sp(tso) = StgTSO_sp(tso) + WDS(3);  \
     lval = W_[StgTSO_sp(tso) - WDS(1)];
 
+/*
+ * Only in threaded mode: we have to be careful when manipulating another thread's TSO,
+ * because the scheduler might also be manipulating it.
+ */
+#if defined(RTS_SUPPORTS_THREADS)
+#define ACQUIRE_SCHED_LOCK   foreign "C" ACQUIRE_LOCK(sched_mutex "ptr");
+#define RELEASE_SCHED_LOCK   foreign "C" RELEASE_LOCK(sched_mutex "ptr");
+#else
+#define ACQUIRE_SCHED_LOCK
+#define RELEASE_SCHED_LOCK
+#endif
 
 takeMVarzh_fast
 {
        StgTSO_block_info(CurrentTSO)  = mvar;
        StgMVar_tail(mvar) = CurrentTSO;
        
-#if defined(SMP)
-        SET_INFO(mvar,stg_EMPTY_MVAR_info);
-#endif
-
        jump stg_block_takemvar;
   }
 
 
   if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure)
   {
+      ACQUIRE_SCHED_LOCK;
+
       /* There are putMVar(s) waiting... 
        * wake up the first thread on the queue
        */
 
 #if defined(GRAN) || defined(PAR)
       /* ToDo: check 2nd arg (mvar) is right */
-      "ptr" tso = foreign "C" unblockOne(StgMVar_head(mvar),mvar);
+      "ptr" tso = foreign "C" unblockOneLocked(StgMVar_head(mvar),mvar);
       StgMVar_head(mvar) = tso;
 #else
-      "ptr" tso = foreign "C" unblockOne(StgMVar_head(mvar) "ptr");
+      "ptr" tso = foreign "C" unblockOneLocked(StgMVar_head(mvar) "ptr");
       StgMVar_head(mvar) = tso;
 #endif
+
+      RELEASE_SCHED_LOCK;
+
       if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
          StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
       }
 
 #if defined(SMP)
-      SET_INFO(mvar,stg_FULL_MVAR_info);
+      foreign "C" unlockClosure(mvar "ptr", stg_FULL_MVAR_info);
 #endif
-
       RET_P(val);
   } 
   else
       /* No further putMVars, MVar is now empty */
       StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
  
-      /* unlocks the closure in the SMP case */
+#if defined(SMP)
+      foreign "C" unlockClosure(mvar "ptr", stg_EMPTY_MVAR_info);
+#else
       SET_INFO(mvar,stg_EMPTY_MVAR_info);
+#endif
 
       RET_P(val);
   }
 
     if (info == stg_EMPTY_MVAR_info) {
 #if defined(SMP)
-        SET_INFO(mvar,stg_EMPTY_MVAR_info);
+        foreign "C" unlockClosure(mvar "ptr", stg_EMPTY_MVAR_info);
 #endif
        /* HACK: we need a pointer to pass back, 
         * so we abuse NO_FINALIZER_closure
     val = StgMVar_value(mvar);
 
     if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {
+
+        ACQUIRE_SCHED_LOCK;
+
        /* There are putMVar(s) waiting... 
         * wake up the first thread on the queue
         */
 
 #if defined(GRAN) || defined(PAR)
        /* ToDo: check 2nd arg (mvar) is right */
-       "ptr" tso = foreign "C" unblockOne(StgMVar_head(mvar) "ptr", mvar "ptr");
+       "ptr" tso = foreign "C" unblockOneLocked(StgMVar_head(mvar) "ptr", mvar "ptr");
        StgMVar_head(mvar) = tso;
 #else
-       "ptr" tso = foreign "C" unblockOne(StgMVar_head(mvar) "ptr");
+       "ptr" tso = foreign "C" unblockOneLocked(StgMVar_head(mvar) "ptr");
        StgMVar_head(mvar) = tso;
 #endif
 
+        RELEASE_SCHED_LOCK;
+
        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
        }
 #if defined(SMP)
-        SET_INFO(mvar,stg_FULL_MVAR_info);
+        foreign "C" unlockClosure(mvar "ptr", stg_FULL_MVAR_info);
 #endif
     }
     else 
     {
        /* No further putMVars, MVar is now empty */
        StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
+#if defined(SMP)
+       foreign "C" unlockClosure(mvar "ptr", stg_EMPTY_MVAR_info);
+#else
        SET_INFO(mvar,stg_EMPTY_MVAR_info);
+#endif
     }
     
     RET_NP(1, val);
        StgTSO_block_info(CurrentTSO)  = mvar;
        StgMVar_tail(mvar) = CurrentTSO;
        
-#if defined(SMP)
-        SET_INFO(mvar,stg_FULL_MVAR_info);
-#endif
        jump stg_block_putmvar;
     }
   
     if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {
+
+        ACQUIRE_SCHED_LOCK;
+
        /* There are takeMVar(s) waiting: wake up the first one
         */
        ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);
       
 #if defined(GRAN) || defined(PAR)
        /* ToDo: check 2nd arg (mvar) is right */
-       "ptr" tso = foreign "C" unblockOne(StgMVar_head(mvar) "ptr",mvar "ptr");
+       "ptr" tso = foreign "C" unblockOneLocked(StgMVar_head(mvar) "ptr",mvar "ptr");
        StgMVar_head(mvar) = tso;
 #else
-       "ptr" tso = foreign "C" unblockOne(StgMVar_head(mvar) "ptr");
+       "ptr" tso = foreign "C" unblockOneLocked(StgMVar_head(mvar) "ptr");
        StgMVar_head(mvar) = tso;
 #endif
 
+        RELEASE_SCHED_LOCK;
+
        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
        }
 
 #if defined(SMP)
-        SET_INFO(mvar,stg_EMPTY_MVAR_info);
+       foreign "C" unlockClosure(mvar "ptr", stg_EMPTY_MVAR_info);
 #endif
        jump %ENTRY_CODE(Sp(0));
     }
     {
        /* No further takes, the MVar is now full. */
        StgMVar_value(mvar) = R2;
-       /* unlocks the MVar in the SMP case */
-       SET_INFO(mvar,stg_FULL_MVAR_info);
 
+#if defined(SMP)
+       foreign "C" unlockClosure(mvar "ptr", stg_FULL_MVAR_info);
+#else
+       SET_INFO(mvar,stg_FULL_MVAR_info);
+#endif
        jump %ENTRY_CODE(Sp(0));
     }
     
 
     if (info == stg_FULL_MVAR_info) {
 #if defined(SMP)
-        SET_INFO(mvar,stg_FULL_MVAR_info);
+       foreign "C" unlockClosure(mvar "ptr", stg_FULL_MVAR_info);
 #endif
        RET_N(0);
     }
   
     if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {
+
+        ACQUIRE_SCHED_LOCK;
+
        /* There are takeMVar(s) waiting: wake up the first one
         */
        ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);
       
 #if defined(GRAN) || defined(PAR)
        /* ToDo: check 2nd arg (mvar) is right */
-       "ptr" tso = foreign "C" unblockOne(StgMVar_head(mvar) "ptr",mvar "ptr");
+       "ptr" tso = foreign "C" unblockOneLocked(StgMVar_head(mvar) "ptr",mvar "ptr");
        StgMVar_head(mvar) = tso;
 #else
-       "ptr" tso = foreign "C" unblockOne(StgMVar_head(mvar) "ptr");
+       "ptr" tso = foreign "C" unblockOneLocked(StgMVar_head(mvar) "ptr");
        StgMVar_head(mvar) = tso;
 #endif
 
+        RELEASE_SCHED_LOCK;
+
        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
        }
 
 #if defined(SMP)
-        SET_INFO(mvar,stg_EMPTY_MVAR_info);
+       foreign "C" unlockClosure(mvar "ptr", stg_EMPTY_MVAR_info);
 #endif
        jump %ENTRY_CODE(Sp(0));
     }
     {
        /* No further takes, the MVar is now full. */
        StgMVar_value(mvar) = R2;
-       /* unlocks the MVar in the SMP case */
-       SET_INFO(mvar,stg_FULL_MVAR_info);
 
+#if defined(SMP)
+       foreign "C" unlockClosure(mvar "ptr", stg_FULL_MVAR_info);
+#else
+       SET_INFO(mvar,stg_FULL_MVAR_info);
+#endif
        jump %ENTRY_CODE(Sp(0));
     }
     
 
       CurrentTSO = event->tso;
 #endif
 
-      IF_DEBUG(scheduler, printAllThreads());
-
 #if defined(RTS_SUPPORTS_THREADS)
       // Yield the capability to higher-priority tasks if necessary.
       //
 
     case ThreadBlocked:
        scheduleHandleThreadBlocked(t);
-       threadPaused(t);
        break;
 
     case ThreadFinished:
        // they are unreachable and will therefore be sent an
        // exception.  Any threads thus released will be immediately
        // runnable.
+
        GarbageCollect(GetRoots,rtsTrue);
        recent_activity = ACTIVITY_DONE_GC;
        if ( !EMPTY_RUN_QUEUE() ) return;
     IF_DEBUG(scheduler,
             debugBelch("--<< thread %ld (%s) stopped: HeapOverflow\n", 
                        (long)t->id, whatNext_strs[t->what_next]));
-    threadPaused(t);
 #if defined(GRAN)
     ASSERT(!is_on_queue(t,CurrentProc));
 #elif defined(PARALLEL_HASKELL)
     /* just adjust the stack for this thread, then pop it back
      * on the run queue.
      */
-    threadPaused(t);
     { 
        /* enlarge the stack */
        StgTSO *new_t = threadStackOverflow(t);
        return rtsTrue;
     }
     
-    threadPaused(t);
-    
 #if defined(GRAN)
     ASSERT(!is_on_queue(t,CurrentProc));
       
     emitSchedule = rtsTrue;
     
 #else /* !GRAN */
-      /* don't need to do anything.  Either the thread is blocked on
-       * I/O, in which case we'll have called addToBlockedQueue
-       * previously, or it's blocked on an MVar or Blackhole, in which
-       * case it'll be on the relevant queue already.
-       */
+
+      // We don't need to do anything.  The thread is blocked, and it
+      // has tidied up its stack and placed itself on whatever queue
+      // it needs to be on.
+
+#if !defined(SMP)
     ASSERT(t->why_blocked != NotBlocked);
+            // This might not be true under SMP: we don't have
+            // exclusive access to this TSO, so someone might have
+            // woken it up by now.  This actually happens: try
+            // conc023 +RTS -N2.
+#endif
+
     IF_DEBUG(scheduler,
             debugBelch("--<< thread %d (%s) stopped: ", 
                        t->id, whatNext_strs[t->what_next]);
     // so this happens periodically:
     scheduleCheckBlackHoles();
     
+    IF_DEBUG(scheduler, printAllThreads());
+
     /* everybody back, start the GC.
      * Could do it in this thread, or signal a condition var
      * to do it in another thread.  Either way, we need to