X-Git-Url: http://git.megacz.com/?p=ghc-hetmet.git;a=blobdiff_plain;f=rts%2FPrimOps.cmm;h=e17c6fb3f870b7b923c79570cfe45a087bd787ee;hp=ac6de81e1c6e03c542b7e5f9083bc74a67a68a17;hb=e5c3b478b3cd1707cf122833822f44b2ac09b8e9;hpb=0856ac59cfb455d32a3042317fdba0f5e85cab9c

diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm
index ac6de81..e17c6fb 100644
--- a/rts/PrimOps.cmm
+++ b/rts/PrimOps.cmm
@@ -1,6 +1,6 @@
 /* -----------------------------------------------------------------------------
  *
- * (c) The GHC Team, 1998-2004
+ * (c) The GHC Team, 1998-2011
  *
  * Out-of-line primitive operations
  *
@@ -34,7 +34,10 @@ import pthread_mutex_unlock;
 import base_ControlziExceptionziBase_nestedAtomically_closure;
 import EnterCriticalSection;
 import LeaveCriticalSection;
-import ghczmprim_GHCziBool_False_closure;
+import ghczmprim_GHCziTypes_False_closure;
+#if !defined(mingw32_HOST_OS)
+import sm_mutex;
+#endif
 
 /*-----------------------------------------------------------------------------
   Array Primitives
@@ -58,10 +61,10 @@ stg_newByteArrayzh
     n = R1;
     payload_words = ROUNDUP_BYTES_TO_WDS(n);
     words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
-    ("ptr" p) = foreign "C" allocateLocal(MyCapability() "ptr",words) [];
+    ("ptr" p) = foreign "C" allocate(MyCapability() "ptr",words) [];
     TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
     SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
-    StgArrWords_words(p) = payload_words;
+    StgArrWords_bytes(p) = n;
     RET_P(p);
 }
 
@@ -70,10 +73,11 @@ stg_newByteArrayzh
 
 stg_newPinnedByteArrayzh
 {
-    W_ words, bytes, payload_words, p;
+    W_ words, n, bytes, payload_words, p;
 
     MAYBE_GC(NO_PTRS,stg_newPinnedByteArrayzh);
-    bytes = R1;
+    n = R1;
+    bytes = n;
     /* payload_words is what we will tell the profiler we had to allocate */
     payload_words = ROUNDUP_BYTES_TO_WDS(bytes);
     /* When we actually allocate memory, we need to allow space for the
@@ -85,7 +89,7 @@ stg_newPinnedByteArrayzh
     /* Now we convert to a number of words: */
     words = ROUNDUP_BYTES_TO_WDS(bytes);
 
-    ("ptr" p) = foreign "C" allocatePinned(words) [];
+    ("ptr" p) = foreign "C" allocatePinned(MyCapability() "ptr", words) [];
     TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
 
     /* Now we need to move p forward so that the payload is aligned
@@ -93,18 +97,25 @@ stg_newPinnedByteArrayzh
     p = p + ((-p - SIZEOF_StgArrWords) & BA_MASK);
 
     SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
-    StgArrWords_words(p) = payload_words;
+    StgArrWords_bytes(p) = n;
     RET_P(p);
 }
 
 stg_newAlignedPinnedByteArrayzh
 {
-    W_ words, bytes, payload_words, p, alignment;
+    W_ words, n, bytes, payload_words, p, alignment;
 
     MAYBE_GC(NO_PTRS,stg_newAlignedPinnedByteArrayzh);
-    bytes = R1;
+    n = R1;
     alignment = R2;
 
+    /* we always supply at least word-aligned memory, so there's no
+       need to allow extra space for alignment if the requirement is less
+       than a word.  This also prevents mischief with alignment == 0. */
+    if (alignment <= SIZEOF_W) { alignment = 1; }
+
+    bytes = n;
+
     /* payload_words is what we will tell the profiler we had to allocate */
     payload_words = ROUNDUP_BYTES_TO_WDS(bytes);
 
@@ -117,7 +128,7 @@ stg_newAlignedPinnedByteArrayzh
     /* Now we convert to a number of words: */
     words = ROUNDUP_BYTES_TO_WDS(bytes);
 
-    ("ptr" p) = foreign "C" allocatePinned(words) [];
+    ("ptr" p) = foreign "C" allocatePinned(MyCapability() "ptr", words) [];
     TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
 
     /* Now we need to move p forward so that the payload is aligned
@@ -126,24 +137,29 @@ stg_newAlignedPinnedByteArrayzh
     p = p + ((-p - SIZEOF_StgArrWords) & (alignment - 1));
 
     SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
-    StgArrWords_words(p) = payload_words;
+    StgArrWords_bytes(p) = n;
     RET_P(p);
 }
 
 stg_newArrayzh
 {
-    W_ words, n, init, arr, p;
+    W_ words, n, init, arr, p, size;
     /* Args: R1 = words, R2 = initialisation value */
 
     n = R1;
     MAYBE_GC(R2_PTR,stg_newArrayzh);
 
-    words = BYTES_TO_WDS(SIZEOF_StgMutArrPtrs) + n;
-    ("ptr" arr) = foreign "C" allocateLocal(MyCapability() "ptr",words) [R2];
+    // the mark area contains one byte for each 2^MUT_ARR_PTRS_CARD_BITS words
+    // in the array, making sure we round up, and then rounding up to a whole
+    // number of words.
+    size = n + mutArrPtrsCardWords(n);
+    words = BYTES_TO_WDS(SIZEOF_StgMutArrPtrs) + size;
+    ("ptr" arr) = foreign "C" allocate(MyCapability() "ptr",words) [R2];
     TICK_ALLOC_PRIM(SIZEOF_StgMutArrPtrs, WDS(n), 0);
 
     SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, W_[CCCS]);
     StgMutArrPtrs_ptrs(arr) = n;
+    StgMutArrPtrs_size(arr) = size;
 
     // Initialise all elements of the the array with the value in R2
     init = R2;
@@ -154,6 +170,13 @@ stg_newArrayzh
 	p = p + WDS(1);
 	goto for;
     }
+    // Initialise the mark bits with 0
+  for2:
+    if (p < arr + WDS(size)) {
+	W_[p] = 0;
+	p = p + WDS(1);
+	goto for2;
+    }
 
     RET_P(arr);
 }
@@ -165,7 +188,7 @@ stg_unsafeThawArrayzh
   // A MUT_ARR_PTRS lives on the mutable list, but a MUT_ARR_PTRS_FROZEN 
   // normally doesn't.  However, when we freeze a MUT_ARR_PTRS, we leave
   // it on the mutable list for the GC to remove (removing something from
-  // the mutable list is not easy, because the mut_list is only singly-linked).
+  // the mutable list is not easy).
   // 
   // So that we can tell whether a MUT_ARR_PTRS_FROZEN is on the mutable list,
   // when we freeze it we set the info ptr to be MUT_ARR_PTRS_FROZEN0
@@ -189,6 +212,7 @@ stg_unsafeThawArrayzh
   }
 }
 
+
 /* -----------------------------------------------------------------------------
    MutVar primitives
    -------------------------------------------------------------------------- */
@@ -207,6 +231,25 @@ stg_newMutVarzh
     RET_P(mv);
 }
 
+stg_casMutVarzh
+ /* MutVar# s a -> a -> a -> State# s -> (# State#, Int#, a #) */
+{
+    W_ mv, old, new, h;
+
+    mv  = R1;
+    old = R2;
+    new = R3;
+
+    (h) = foreign "C" cas(mv + SIZEOF_StgHeader + OFFSET_StgMutVar_var,
+                          old, new) [];
+    if (h != old) {
+        RET_NP(1,h);
+    } else {
+        RET_NP(0,h);
+    }
+}
+
+
 stg_atomicModifyMutVarzh
 {
     W_ mv, f, z, x, y, r, h;
@@ -321,8 +364,10 @@ stg_mkWeakzh
   StgWeak_finalizer(w)  = R3;
   StgWeak_cfinalizer(w) = stg_NO_FINALIZER_closure;
 
+  ACQUIRE_LOCK(sm_mutex);
   StgWeak_link(w)	= W_[weak_ptr_list];
   W_[weak_ptr_list] 	= w;
+  RELEASE_LOCK(sm_mutex);
 
   IF_DEBUG(weak, foreign "C" debugBelch(stg_weak_msg,w) []);
 
@@ -356,12 +401,12 @@ stg_mkWeakForeignEnvzh
 
   payload_words = 4;
   words         = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
-  ("ptr" p)     = foreign "C" allocateLocal(MyCapability() "ptr", words) [];
+  ("ptr" p)     = foreign "C" allocate(MyCapability() "ptr", words) [];
 
   TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
   SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
 
-  StgArrWords_words(p)     = payload_words;
+  StgArrWords_bytes(p)     = WDS(payload_words);
   StgArrWords_payload(p,0) = fptr;
   StgArrWords_payload(p,1) = ptr;
   StgArrWords_payload(p,2) = eptr;
@@ -375,8 +420,10 @@ stg_mkWeakForeignEnvzh
   StgWeak_finalizer(w)  = stg_NO_FINALIZER_closure;
   StgWeak_cfinalizer(w) = p;
 
+  ACQUIRE_LOCK(sm_mutex);
   StgWeak_link(w)   = W_[weak_ptr_list];
   W_[weak_ptr_list] = w;
+  RELEASE_LOCK(sm_mutex);
 
   IF_DEBUG(weak, foreign "C" debugBelch(stg_weak_msg,w) []);
 
@@ -525,9 +572,9 @@ stg_forkzh
 				closure "ptr") [];
 
   /* start blocked if the current thread is blocked */
-  StgTSO_flags(threadid) = 
-     StgTSO_flags(threadid) |  (StgTSO_flags(CurrentTSO) & 
-                                (TSO_BLOCKEX::I32 | TSO_INTERRUPTIBLE::I32));
+  StgTSO_flags(threadid) = %lobits16(
+     TO_W_(StgTSO_flags(threadid)) | 
+     TO_W_(StgTSO_flags(CurrentTSO)) & (TSO_BLOCKEX | TSO_INTERRUPTIBLE));
 
   foreign "C" scheduleThread(MyCapability() "ptr", threadid "ptr") [];
 
@@ -555,9 +602,9 @@ stg_forkOnzh
 				closure "ptr") [];
 
   /* start blocked if the current thread is blocked */
-  StgTSO_flags(threadid) = 
-     StgTSO_flags(threadid) |  (StgTSO_flags(CurrentTSO) & 
-                                (TSO_BLOCKEX::I32 | TSO_INTERRUPTIBLE::I32));
+  StgTSO_flags(threadid) = %lobits16(
+     TO_W_(StgTSO_flags(threadid)) | 
+     TO_W_(StgTSO_flags(CurrentTSO)) & (TSO_BLOCKEX | TSO_INTERRUPTIBLE));
 
   foreign "C" scheduleThreadOn(MyCapability() "ptr", cpu, threadid "ptr") [];
 
@@ -604,14 +651,9 @@ stg_threadStatuszh
     W_ tso;
     W_ why_blocked;
     W_ what_next;
-    W_ ret;
+    W_ ret, cap, locked;
 
     tso = R1;
-    loop:
-      if (TO_W_(StgTSO_what_next(tso)) == ThreadRelocated) {
-          tso = StgTSO__link(tso);
-          goto loop;
-      }
 
     what_next   = TO_W_(StgTSO_what_next(tso));
     why_blocked = TO_W_(StgTSO_why_blocked(tso));
@@ -629,7 +671,16 @@ stg_threadStatuszh
             ret = why_blocked;
         }
     }
-    RET_N(ret);
+
+    cap = TO_W_(Capability_no(StgTSO_cap(tso)));
+
+    if ((TO_W_(StgTSO_flags(tso)) & TSO_LOCKED) != 0) {
+        locked = 1;
+    } else {
+        locked = 0;
+    }
+
+    RET_NNN(ret,cap,locked);
 }
 
 /* -----------------------------------------------------------------------------
@@ -912,9 +963,9 @@ stg_retryzh
 
   // Find the enclosing ATOMICALLY_FRAME or CATCH_RETRY_FRAME
 retry_pop_stack:
-  StgTSO_sp(CurrentTSO) = Sp;
-  (frame_type) = foreign "C" findRetryFrameHelper(CurrentTSO "ptr") [];
-  Sp = StgTSO_sp(CurrentTSO);
+  SAVE_THREAD_STATE();
+  (frame_type) = foreign "C" findRetryFrameHelper(MyCapability(), CurrentTSO "ptr") [];
+  LOAD_THREAD_STATE();
   frame = Sp;
   trec = StgTSO_trec(CurrentTSO);
   outer  = StgTRecHeader_enclosing_trec(trec);
@@ -1111,17 +1162,21 @@ stg_newMVarzh
 }
 
 
-#define PerformTake(tso, value)				\
-    W_[StgTSO_sp(tso) + WDS(1)] = value;		\
-    W_[StgTSO_sp(tso) + WDS(0)] = stg_gc_unpt_r1_info;
+#define PerformTake(stack, value)               \
+    W_ sp;                                      \
+    sp = StgStack_sp(stack);                    \
+    W_[sp + WDS(1)] = value;                    \
+    W_[sp + WDS(0)] = stg_gc_unpt_r1_info;
 
-#define PerformPut(tso,lval)			\
-    StgTSO_sp(tso) = StgTSO_sp(tso) + WDS(3);	\
-    lval = W_[StgTSO_sp(tso) - WDS(1)];
+#define PerformPut(stack,lval)                  \
+    W_ sp;                                      \
+    sp = StgStack_sp(stack) + WDS(3);           \
+    StgStack_sp(stack) = sp;                    \
+    lval = W_[sp - WDS(1)];
 
 stg_takeMVarzh
 {
-    W_ mvar, val, info, tso;
+    W_ mvar, val, info, tso, q;
 
     /* args: R1 = MVar closure */
     mvar = R1;
@@ -1140,80 +1195,88 @@ stg_takeMVarzh
      * and wait until we're woken up.
      */
     if (StgMVar_value(mvar) == stg_END_TSO_QUEUE_closure) {
+        
+        // Note [mvar-heap-check] We want to do the heap check in the
+        // branch here, to avoid the conditional in the common case.
+        // However, we've already locked the MVar above, so we better
+        // be careful to unlock it again if the the heap check fails.
+        // Unfortunately we don't have an easy way to inject any code
+        // into the heap check generated by the code generator, so we
+        // have to do it in stg_gc_gen (see HeapStackCheck.cmm).
+        HP_CHK_GEN_TICKY(SIZEOF_StgMVarTSOQueue, R1_PTR, stg_takeMVarzh);
+
+        q = Hp - SIZEOF_StgMVarTSOQueue + WDS(1);
+
+        SET_HDR(q, stg_MVAR_TSO_QUEUE_info, CCS_SYSTEM);
+        StgMVarTSOQueue_link(q) = END_TSO_QUEUE;
+        StgMVarTSOQueue_tso(q)  = CurrentTSO;
+
 	if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
-	    StgMVar_head(mvar) = CurrentTSO;
+	    StgMVar_head(mvar) = q;
 	} else {
-            foreign "C" setTSOLink(MyCapability() "ptr", 
-                                   StgMVar_tail(mvar) "ptr",
-                                   CurrentTSO) [];
+            StgMVarTSOQueue_link(StgMVar_tail(mvar)) = q;
+            foreign "C" recordClosureMutated(MyCapability() "ptr",
+                                             StgMVar_tail(mvar)) [];
 	}
-	StgTSO__link(CurrentTSO)       = stg_END_TSO_QUEUE_closure;
+	StgTSO__link(CurrentTSO)       = q;
 	StgTSO_block_info(CurrentTSO)  = mvar;
-        // write barrier for throwTo(), which looks at block_info
-        // if why_blocked==BlockedOnMVar.
-        prim %write_barrier() [];
 	StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
-	StgMVar_tail(mvar) = CurrentTSO;
+	StgMVar_tail(mvar)             = q;
 	
         R1 = mvar;
 	jump stg_block_takemvar;
-  }
-
-  /* we got the value... */
-  val = StgMVar_value(mvar);
-
-  if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure)
-  {
-      /* There are putMVar(s) waiting... 
-       * wake up the first thread on the queue
-       */
-      ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);
-
-      /* actually perform the putMVar for the thread that we just woke up */
-      tso = StgMVar_head(mvar);
-      PerformPut(tso,StgMVar_value(mvar));
-
-      if (TO_W_(StgTSO_dirty(tso)) == 0) {
-          foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
-      }
+    }
+    
+    /* we got the value... */
+    val = StgMVar_value(mvar);
+    
+    q = StgMVar_head(mvar);
+loop:
+    if (q == stg_END_TSO_QUEUE_closure) {
+        /* No further putMVars, MVar is now empty */
+        StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
+        unlockClosure(mvar, stg_MVAR_DIRTY_info);
+        RET_P(val);
+    }
+    if (StgHeader_info(q) == stg_IND_info ||
+        StgHeader_info(q) == stg_MSG_NULL_info) {
+        q = StgInd_indirectee(q);
+        goto loop;
+    }
+    
+    // There are putMVar(s) waiting... wake up the first thread on the queue
+    
+    tso = StgMVarTSOQueue_tso(q);
+    StgMVar_head(mvar) = StgMVarTSOQueue_link(q);
+    if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
+        StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
+    }
 
-      ("ptr" tso) = foreign "C" unblockOne_(MyCapability() "ptr", 
-                                            StgMVar_head(mvar) "ptr", 1) [];
-      StgMVar_head(mvar) = tso;
+    ASSERT(StgTSO_why_blocked(tso) == BlockedOnMVar::I16);
+    ASSERT(StgTSO_block_info(tso) == mvar);
 
-      if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
-	  StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
-      }
+    // actually perform the putMVar for the thread that we just woke up
+    W_ stack;
+    stack = StgTSO_stackobj(tso);
+    PerformPut(stack, StgMVar_value(mvar));
 
-#if defined(THREADED_RTS)
-      unlockClosure(mvar, stg_MVAR_DIRTY_info);
-#else
-      SET_INFO(mvar,stg_MVAR_DIRTY_info);
-#endif
-      RET_P(val);
-  } 
-  else
-  {
-      /* No further putMVars, MVar is now empty */
-      StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
- 
-#if defined(THREADED_RTS)
-      unlockClosure(mvar, stg_MVAR_DIRTY_info);
-#else
-      SET_INFO(mvar,stg_MVAR_DIRTY_info);
-#endif
+    // indicate that the MVar operation has now completed.
+    StgTSO__link(tso) = stg_END_TSO_QUEUE_closure;
+    
+    // no need to mark the TSO dirty, we have only written END_TSO_QUEUE.
 
-      RET_P(val);
-  }
+    foreign "C" tryWakeupThread(MyCapability() "ptr", tso) [];
+    
+    unlockClosure(mvar, stg_MVAR_DIRTY_info);
+    RET_P(val);
 }
 
 
 stg_tryTakeMVarzh
 {
-    W_ mvar, val, info, tso;
+    W_ mvar, val, info, tso, q;
 
     /* args: R1 = MVar closure */
-
     mvar = R1;
 
 #if defined(THREADED_RTS)
@@ -1221,7 +1284,10 @@ stg_tryTakeMVarzh
 #else
     info = GET_INFO(mvar);
 #endif
-
+        
+    /* If the MVar is empty, put ourselves on its blocking queue,
+     * and wait until we're woken up.
+     */
     if (StgMVar_value(mvar) == stg_END_TSO_QUEUE_closure) {
 #if defined(THREADED_RTS)
         unlockClosure(mvar, info);
@@ -1231,59 +1297,59 @@ stg_tryTakeMVarzh
 	 */
 	RET_NP(0, stg_NO_FINALIZER_closure);
     }
-
+    
     if (info == stg_MVAR_CLEAN_info) {
-        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr");
+        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr") [];
     }
 
     /* we got the value... */
     val = StgMVar_value(mvar);
+    
+    q = StgMVar_head(mvar);
+loop:
+    if (q == stg_END_TSO_QUEUE_closure) {
+        /* No further putMVars, MVar is now empty */
+        StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
+        unlockClosure(mvar, stg_MVAR_DIRTY_info);
+        RET_NP(1, val);
+    }
+    if (StgHeader_info(q) == stg_IND_info ||
+        StgHeader_info(q) == stg_MSG_NULL_info) {
+        q = StgInd_indirectee(q);
+        goto loop;
+    }
+    
+    // There are putMVar(s) waiting... wake up the first thread on the queue
+    
+    tso = StgMVarTSOQueue_tso(q);
+    StgMVar_head(mvar) = StgMVarTSOQueue_link(q);
+    if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
+        StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
+    }
 
-    if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {
+    ASSERT(StgTSO_why_blocked(tso) == BlockedOnMVar::I16);
+    ASSERT(StgTSO_block_info(tso) == mvar);
 
-	/* There are putMVar(s) waiting... 
-	 * wake up the first thread on the queue
-	 */
-	ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);
+    // actually perform the putMVar for the thread that we just woke up
+    W_ stack;
+    stack = StgTSO_stackobj(tso);
+    PerformPut(stack, StgMVar_value(mvar));
 
-	/* actually perform the putMVar for the thread that we just woke up */
-	tso = StgMVar_head(mvar);
-	PerformPut(tso,StgMVar_value(mvar));
-        if (TO_W_(StgTSO_dirty(tso)) == 0) {
-            foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
-        }
-
-        ("ptr" tso) = foreign "C" unblockOne_(MyCapability() "ptr", 
-                                              StgMVar_head(mvar) "ptr", 1) [];
-	StgMVar_head(mvar) = tso;
+    // indicate that the MVar operation has now completed.
+    StgTSO__link(tso) = stg_END_TSO_QUEUE_closure;
+    
+    // no need to mark the TSO dirty, we have only written END_TSO_QUEUE.
 
-	if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
-	    StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
-	}
-#if defined(THREADED_RTS)
-        unlockClosure(mvar, stg_MVAR_DIRTY_info);
-#else
-        SET_INFO(mvar,stg_MVAR_DIRTY_info);
-#endif
-    }
-    else 
-    {
-	/* No further putMVars, MVar is now empty */
-	StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
-#if defined(THREADED_RTS)
-    	unlockClosure(mvar, stg_MVAR_DIRTY_info);
-#else
-	SET_INFO(mvar,stg_MVAR_DIRTY_info);
-#endif
-    }
+    foreign "C" tryWakeupThread(MyCapability() "ptr", tso) [];
     
-    RET_NP(1, val);
+    unlockClosure(mvar, stg_MVAR_DIRTY_info);
+    RET_NP(1,val);
 }
 
 
 stg_putMVarzh
 {
-    W_ mvar, val, info, tso;
+    W_ mvar, val, info, tso, q;
 
     /* args: R1 = MVar, R2 = value */
     mvar = R1;
@@ -1300,84 +1366,95 @@ stg_putMVarzh
     }
 
     if (StgMVar_value(mvar) != stg_END_TSO_QUEUE_closure) {
+
+        // see Note [mvar-heap-check] above
+        HP_CHK_GEN_TICKY(SIZEOF_StgMVarTSOQueue, R1_PTR & R2_PTR, stg_putMVarzh);
+
+        q = Hp - SIZEOF_StgMVarTSOQueue + WDS(1);
+
+        SET_HDR(q, stg_MVAR_TSO_QUEUE_info, CCS_SYSTEM);
+        StgMVarTSOQueue_link(q) = END_TSO_QUEUE;
+        StgMVarTSOQueue_tso(q)  = CurrentTSO;
+
 	if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
-	    StgMVar_head(mvar) = CurrentTSO;
+	    StgMVar_head(mvar) = q;
 	} else {
-            foreign "C" setTSOLink(MyCapability() "ptr", 
-                                   StgMVar_tail(mvar) "ptr",
-                                   CurrentTSO) [];
+            StgMVarTSOQueue_link(StgMVar_tail(mvar)) = q;
+            foreign "C" recordClosureMutated(MyCapability() "ptr",
+                                             StgMVar_tail(mvar)) [];
 	}
-	StgTSO__link(CurrentTSO)       = stg_END_TSO_QUEUE_closure;
+	StgTSO__link(CurrentTSO)       = q;
 	StgTSO_block_info(CurrentTSO)  = mvar;
-        // write barrier for throwTo(), which looks at block_info
-        // if why_blocked==BlockedOnMVar.
-        prim %write_barrier() [];
 	StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
-	StgMVar_tail(mvar) = CurrentTSO;
-	
+	StgMVar_tail(mvar)             = q;
+
         R1 = mvar;
         R2 = val;
 	jump stg_block_putmvar;
     }
   
-    if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {
+    q = StgMVar_head(mvar);
+loop:
+    if (q == stg_END_TSO_QUEUE_closure) {
+	/* No further takes, the MVar is now full. */
+	StgMVar_value(mvar) = val;
+    	unlockClosure(mvar, stg_MVAR_DIRTY_info);
+	jump %ENTRY_CODE(Sp(0));
+    }
+    if (StgHeader_info(q) == stg_IND_info ||
+        StgHeader_info(q) == stg_MSG_NULL_info) {
+        q = StgInd_indirectee(q);
+        goto loop;
+    }
 
-	/* There are takeMVar(s) waiting: wake up the first one
-	 */
-	ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);
+    // There are takeMVar(s) waiting: wake up the first one
+    
+    tso = StgMVarTSOQueue_tso(q);
+    StgMVar_head(mvar) = StgMVarTSOQueue_link(q);
+    if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
+        StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
+    }
 
-	/* actually perform the takeMVar */
-	tso = StgMVar_head(mvar);
-	PerformTake(tso, val);
-        if (TO_W_(StgTSO_dirty(tso)) == 0) {
-            foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
-        }
-      
-        ("ptr" tso) = foreign "C" unblockOne_(MyCapability() "ptr", 
-                                              StgMVar_head(mvar) "ptr", 1) [];
-	StgMVar_head(mvar) = tso;
+    ASSERT(StgTSO_why_blocked(tso) == BlockedOnMVar::I16);
+    ASSERT(StgTSO_block_info(tso) == mvar);
 
-	if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
-	    StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
-	}
+    // actually perform the takeMVar
+    W_ stack;
+    stack = StgTSO_stackobj(tso);
+    PerformTake(stack, val);
 
-#if defined(THREADED_RTS)
-    	unlockClosure(mvar, stg_MVAR_DIRTY_info);
-#else
-        SET_INFO(mvar,stg_MVAR_DIRTY_info);
-#endif
-	jump %ENTRY_CODE(Sp(0));
-    }
-    else
-    {
-	/* No further takes, the MVar is now full. */
-	StgMVar_value(mvar) = val;
+    // indicate that the MVar operation has now completed.
+    StgTSO__link(tso) = stg_END_TSO_QUEUE_closure;
 
-#if defined(THREADED_RTS)
-    	unlockClosure(mvar, stg_MVAR_DIRTY_info);
-#else
-	SET_INFO(mvar,stg_MVAR_DIRTY_info);
-#endif
-	jump %ENTRY_CODE(Sp(0));
+    if (TO_W_(StgStack_dirty(stack)) == 0) {
+        foreign "C" dirty_STACK(MyCapability() "ptr", stack "ptr") [];
     }
     
-    /* ToDo: yield afterward for better communication performance? */
+    foreign "C" tryWakeupThread(MyCapability() "ptr", tso) [];
+
+    unlockClosure(mvar, stg_MVAR_DIRTY_info);
+    jump %ENTRY_CODE(Sp(0));
 }
 
 
 stg_tryPutMVarzh
 {
-    W_ mvar, info, tso;
+    W_ mvar, val, info, tso, q;
 
     /* args: R1 = MVar, R2 = value */
     mvar = R1;
+    val  = R2;
 
 #if defined(THREADED_RTS)
-    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [R2];
+    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [];
 #else
     info = GET_INFO(mvar);
 #endif
 
+    if (info == stg_MVAR_CLEAN_info) {
+        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr");
+    }
+
     if (StgMVar_value(mvar) != stg_END_TSO_QUEUE_closure) {
 #if defined(THREADED_RTS)
     	unlockClosure(mvar, info);
@@ -1385,51 +1462,47 @@ stg_tryPutMVarzh
 	RET_N(0);
     }
   
-    if (info == stg_MVAR_CLEAN_info) {
-        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr");
+    q = StgMVar_head(mvar);
+loop:
+    if (q == stg_END_TSO_QUEUE_closure) {
+	/* No further takes, the MVar is now full. */
+	StgMVar_value(mvar) = val;
+    	unlockClosure(mvar, stg_MVAR_DIRTY_info);
+        RET_N(1);
+    }
+    if (StgHeader_info(q) == stg_IND_info ||
+        StgHeader_info(q) == stg_MSG_NULL_info) {
+        q = StgInd_indirectee(q);
+        goto loop;
     }
 
-    if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {
-
-	/* There are takeMVar(s) waiting: wake up the first one
-	 */
-	ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);
-	
-	/* actually perform the takeMVar */
-	tso = StgMVar_head(mvar);
-	PerformTake(tso, R2);
-        if (TO_W_(StgTSO_dirty(tso)) == 0) {
-            foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
-        }
-      
-        ("ptr" tso) = foreign "C" unblockOne_(MyCapability() "ptr", 
-                                              StgMVar_head(mvar) "ptr", 1) [];
-	StgMVar_head(mvar) = tso;
+    // There are takeMVar(s) waiting: wake up the first one
+    
+    tso = StgMVarTSOQueue_tso(q);
+    StgMVar_head(mvar) = StgMVarTSOQueue_link(q);
+    if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
+        StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
+    }
 
-	if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
-	    StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
-	}
+    ASSERT(StgTSO_why_blocked(tso) == BlockedOnMVar::I16);
+    ASSERT(StgTSO_block_info(tso) == mvar);
 
-#if defined(THREADED_RTS)
-    	unlockClosure(mvar, stg_MVAR_DIRTY_info);
-#else
-        SET_INFO(mvar,stg_MVAR_DIRTY_info);
-#endif
-    }
-    else
-    {
-	/* No further takes, the MVar is now full. */
-	StgMVar_value(mvar) = R2;
+    // actually perform the takeMVar
+    W_ stack;
+    stack = StgTSO_stackobj(tso);
+    PerformTake(stack, val);
 
-#if defined(THREADED_RTS)
-    	unlockClosure(mvar, stg_MVAR_DIRTY_info);
-#else
-	SET_INFO(mvar,stg_MVAR_DIRTY_info);
-#endif
+    // indicate that the MVar operation has now completed.
+    StgTSO__link(tso) = stg_END_TSO_QUEUE_closure;
+    
+    if (TO_W_(StgStack_dirty(stack)) == 0) {
+        foreign "C" dirty_STACK(MyCapability() "ptr", stack "ptr") [];
     }
     
+    foreign "C" tryWakeupThread(MyCapability() "ptr", tso) [];
+
+    unlockClosure(mvar, stg_MVAR_DIRTY_info);
     RET_N(1);
-    /* ToDo: yield afterward for better communication performance? */
 }
 
 
@@ -1495,7 +1568,7 @@ stg_newBCOzh
     
     bitmap_arr = R5;
 
-    words = BYTES_TO_WDS(SIZEOF_StgBCO) + StgArrWords_words(bitmap_arr);
+    words = BYTES_TO_WDS(SIZEOF_StgBCO) + BYTE_ARR_WDS(bitmap_arr);
     bytes = WDS(words);
 
     ALLOC_PRIM( bytes, R1_PTR&R2_PTR&R3_PTR&R5_PTR, stg_newBCOzh );
@@ -1513,7 +1586,7 @@ stg_newBCOzh
     W_ i;
     i = 0;
 for:
-    if (i < StgArrWords_words(bitmap_arr)) {
+    if (i < BYTE_ARR_WDS(bitmap_arr)) {
 	StgBCO_bitmap(bco,i) = StgArrWords_payload(bitmap_arr,i);
 	i = i + 1;
 	goto for;
@@ -1578,9 +1651,10 @@ stg_unpackClosurezh
     }}
 out:
 
-    W_ ptrs_arr_sz, nptrs_arr_sz;
+    W_ ptrs_arr_sz, ptrs_arr_cards, nptrs_arr_sz;
     nptrs_arr_sz = SIZEOF_StgArrWords   + WDS(nptrs);
-    ptrs_arr_sz  = SIZEOF_StgMutArrPtrs + WDS(ptrs);
+    ptrs_arr_cards = mutArrPtrsCardWords(ptrs);
+    ptrs_arr_sz  = SIZEOF_StgMutArrPtrs + WDS(ptrs) + WDS(ptrs_arr_cards);
 
     ALLOC_PRIM (ptrs_arr_sz + nptrs_arr_sz, R1_PTR, stg_unpackClosurezh);
 
@@ -1592,6 +1666,8 @@ out:
 
     SET_HDR(ptrs_arr, stg_MUT_ARR_PTRS_FROZEN_info, W_[CCCS]);
     StgMutArrPtrs_ptrs(ptrs_arr) = ptrs;
+    StgMutArrPtrs_size(ptrs_arr) = ptrs + ptrs_arr_cards;
+
     p = 0;
 for:
     if(p < ptrs) {
@@ -1599,9 +1675,12 @@ for:
 	 p = p + 1;
 	 goto for;
     }
+    /* We can leave the card table uninitialised, since the array is
+       allocated in the nursery.  The GC will fill it in if/when the array
+       is promoted. */
     
     SET_HDR(nptrs_arr, stg_ARR_WORDS_info, W_[CCCS]);
-    StgArrWords_words(nptrs_arr) = nptrs;
+    StgArrWords_bytes(nptrs_arr) = WDS(nptrs);
     p = 0;
 for2:
     if(p < nptrs) {
@@ -1822,12 +1901,71 @@ stg_asyncDoProczh
 }
 #endif
 
-// noDuplicate# tries to ensure that none of the thunks under
-// evaluation by the current thread are also under evaluation by
-// another thread.  It relies on *both* threads doing noDuplicate#;
-// the second one will get blocked if they are duplicating some work.
+/* -----------------------------------------------------------------------------
+ * noDuplicate#
+ *
+ * noDuplicate# tries to ensure that none of the thunks under
+ * evaluation by the current thread are also under evaluation by
+ * another thread.  It relies on *both* threads doing noDuplicate#;
+ * the second one will get blocked if they are duplicating some work.
+ *
+ * The idea is that noDuplicate# is used within unsafePerformIO to
+ * ensure that the IO operation is performed at most once.
+ * noDuplicate# calls threadPaused which acquires an exclusive lock on
+ * all the thunks currently under evaluation by the current thread.
+ *
+ * Consider the following scenario.  There is a thunk A, whose
+ * evaluation requires evaluating thunk B, where thunk B is an
+ * unsafePerformIO.  Two threads, 1 and 2, bother enter A.  Thread 2
+ * is pre-empted before it enters B, and claims A by blackholing it
+ * (in threadPaused).  Thread 1 now enters B, and calls noDuplicate#.
+ *
+ *      thread 1                      thread 2
+ *   +-----------+                 +---------------+
+ *   |    -------+-----> A <-------+-------        |
+ *   |  update   |   BLACKHOLE     | marked_update |
+ *   +-----------+                 +---------------+
+ *   |           |                 |               | 
+ *        ...                             ...
+ *   |           |                 +---------------+
+ *   +-----------+
+ *   |     ------+-----> B
+ *   |  update   |   BLACKHOLE
+ *   +-----------+
+ *
+ * At this point: A is a blackhole, owned by thread 2.  noDuplicate#
+ * calls threadPaused, which walks up the stack and
+ *  - claims B on behalf of thread 1
+ *  - then it reaches the update frame for A, which it sees is already
+ *    a BLACKHOLE and is therefore owned by another thread.  Since
+ *    thread 1 is duplicating work, the computation up to the update
+ *    frame for A is suspended, including thunk B.
+ *  - thunk B, which is an unsafePerformIO, has now been reverted to
+ *    an AP_STACK which could be duplicated - BAD!
+ *  - The solution is as follows: before calling threadPaused, we
+ *    leave a frame on the stack (stg_noDuplicate_info) that will call
+ *    noDuplicate# again if the current computation is suspended and
+ *    restarted.
+ *
+ * See the test program in concurrent/prog003 for a way to demonstrate
+ * this.  It needs to be run with +RTS -N3 or greater, and the bug
+ * only manifests occasionally (once very 10 runs or so).
+ * -------------------------------------------------------------------------- */
+
+INFO_TABLE_RET(stg_noDuplicate, RET_SMALL)
+{
+    Sp_adj(1);
+    jump stg_noDuplicatezh;
+}
+
 stg_noDuplicatezh
 {
+    STK_CHK_GEN( WDS(1), NO_PTRS, stg_noDuplicatezh );
+    // leave noDuplicate frame in case the current
+    // computation is suspended and restarted (see above).
+    Sp_adj(-1);
+    Sp(0) = stg_noDuplicate_info;
+
     SAVE_THREAD_STATE();
     ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
     foreign "C" threadPaused (MyCapability() "ptr", CurrentTSO "ptr") [];
@@ -1837,10 +1975,18 @@ stg_noDuplicatezh
     } else {
         LOAD_THREAD_STATE();
         ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
+        // remove the stg_noDuplicate frame if it is still there.
+        if (Sp(0) == stg_noDuplicate_info) {
+            Sp_adj(1);
+        }
         jump %ENTRY_CODE(Sp(0));
     }
 }
 
+/* -----------------------------------------------------------------------------
+   Misc. primitives
+   -------------------------------------------------------------------------- */
+
 stg_getApStackValzh
 {
    W_ ap_stack, offset, val, ok;
@@ -1859,10 +2005,6 @@ stg_getApStackValzh
    RET_NP(ok,val);
 }
 
-/* -----------------------------------------------------------------------------
-   Misc. primitives
-   -------------------------------------------------------------------------- */
-
 // Write the cost center stack of the first argument on stderr; return
 // the second.  Possibly only makes sense for already evaluated
 // things?
@@ -1884,23 +2026,62 @@ stg_getSparkzh
    W_ spark;
 
 #ifndef THREADED_RTS
-   RET_NP(0,ghczmprim_GHCziBool_False_closure);
+   RET_NP(0,ghczmprim_GHCziTypes_False_closure);
 #else
    (spark) = foreign "C" findSpark(MyCapability());
    if (spark != 0) {
       RET_NP(1,spark);
    } else {
-      RET_NP(0,ghczmprim_GHCziBool_False_closure);
+      RET_NP(0,ghczmprim_GHCziTypes_False_closure);
    }
 #endif
 }
 
+stg_numSparkszh
+{
+  W_ n;
+#ifdef THREADED_RTS
+  (n) = foreign "C" dequeElements(Capability_sparks(MyCapability()));
+#else
+  n = 0;
+#endif
+  RET_N(n);
+}
+
 stg_traceEventzh
 {
    W_ msg;
    msg = R1;
+
 #if defined(TRACING) || defined(DEBUG)
+
    foreign "C" traceUserMsg(MyCapability() "ptr", msg "ptr") [];
+
+#elif defined(DTRACE)
+
+   W_ enabled;
+
+   // We should go through the macro HASKELLEVENT_USER_MSG_ENABLED from
+   // RtsProbes.h, but that header file includes unistd.h, which doesn't
+   // work in Cmm
+#if !defined(solaris2_TARGET_OS)
+   (enabled) = foreign "C" __dtrace_isenabled$HaskellEvent$user__msg$v1() [];
+#else
+   // Solaris' DTrace can't handle the
+   //     __dtrace_isenabled$HaskellEvent$user__msg$v1
+   // call above. This call is just for testing whether the user__msg
+   // probe is enabled, and is here for just performance optimization.
+   // Since preparation for the probe is not that complex I disable usage of
+   // this test above for Solaris and enable the probe usage manually
+   // here. Please note that this does not mean that the probe will be
+   // used during the runtime! You still need to enable it by consumption
+   // in your dtrace script as you do with any other probe.
+   enabled = 1;
+#endif
+   if (enabled != 0) {
+     foreign "C" dtraceUserMsgWrapper(MyCapability() "ptr", msg "ptr") [];
+   }
+
 #endif
    jump %ENTRY_CODE(Sp(0));
 }