Add a proper write barrier for MVars
[ghc-hetmet.git] / rts / PrimOps.cmm
index 990d6f3..04a753c 100644 (file)
 
 #include "Cmm.h"
 
+#ifdef __PIC__
+import __gmpz_init;
+import __gmpz_add;
+import __gmpz_sub;
+import __gmpz_mul;
+import __gmpz_gcd;
+import __gmpn_gcd_1;
+import __gmpn_cmp;
+import __gmpz_tdiv_q;
+import __gmpz_tdiv_r;
+import __gmpz_tdiv_qr;
+import __gmpz_fdiv_qr;
+import __gmpz_divexact;
+import __gmpz_and;
+import __gmpz_xor;
+import __gmpz_ior;
+import __gmpz_com;
+import base_GHCziIOBase_NestedAtomically_closure;
+import pthread_mutex_lock;
+import pthread_mutex_unlock;
+#endif
+import EnterCriticalSection;
+import LeaveCriticalSection;
+
 /*-----------------------------------------------------------------------------
   Array Primitives
 
@@ -49,7 +73,7 @@ newByteArrayzh_fast
     n = R1;
     payload_words = ROUNDUP_BYTES_TO_WDS(n);
     words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
-    "ptr" p = foreign "C" allocateLocal(MyCapability() "ptr",words) [];
+    ("ptr" p) = foreign "C" allocateLocal(MyCapability() "ptr",words) [];
     TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
     SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
     StgArrWords_words(p) = payload_words;
@@ -73,7 +97,7 @@ newPinnedByteArrayzh_fast
        words = words + 1;
     }
 
-    "ptr" p = foreign "C" allocatePinned(words) [];
+    ("ptr" p) = foreign "C" allocatePinned(words) [];
     TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
 
     // Again, if the ArrWords header isn't a multiple of 8 bytes, we
@@ -97,7 +121,7 @@ newArrayzh_fast
     MAYBE_GC(R2_PTR,newArrayzh_fast);
 
     words = BYTES_TO_WDS(SIZEOF_StgMutArrPtrs) + n;
-    "ptr" arr = foreign "C" allocateLocal(MyCapability() "ptr",words) [R2];
+    ("ptr" arr) = foreign "C" allocateLocal(MyCapability() "ptr",words) [R2];
     TICK_ALLOC_PRIM(SIZEOF_StgMutArrPtrs, WDS(n), 0);
 
     SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, W_[CCCS]);
@@ -207,7 +231,7 @@ atomicModifyMutVarzh_fast
    HP_CHK_GEN_TICKY(SIZE, R1_PTR & R2_PTR, atomicModifyMutVarzh_fast);
 
 #if defined(THREADED_RTS)
-    foreign "C" ACQUIRE_LOCK(atomic_modify_mutvar_mutex "ptr") [R1,R2];
+    ACQUIRE_LOCK(atomic_modify_mutvar_mutex "ptr") [R1,R2];
 #endif
 
    x = StgMutVar_var(R1);
@@ -238,7 +262,7 @@ atomicModifyMutVarzh_fast
    StgThunk_payload(r,0) = z;
 
 #if defined(THREADED_RTS)
-    foreign "C" RELEASE_LOCK(atomic_modify_mutvar_mutex "ptr") [];
+    RELEASE_LOCK(atomic_modify_mutvar_mutex "ptr") [];
 #endif
 
    RET_P(r);
@@ -418,12 +442,15 @@ int64ToIntegerzh_fast
    /* arguments: L1 = Int64# */
 
    L_ val;
-   W_ hi, s, neg, words_needed, p;
+   W_ hi, lo, s, neg, words_needed, p;
 
    val = L1;
    neg = 0;
 
-   if ( %ge(val,0x100000000::L_) || %le(val,-0x100000000::L_) )  { 
+   hi = TO_W_(val >> 32);
+   lo = TO_W_(val);
+
+   if ( hi != 0 && hi != 0xFFFFFFFF )  { 
        words_needed = 2;
    } else { 
        // minimum is one word
@@ -437,21 +464,24 @@ int64ToIntegerzh_fast
    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
    StgArrWords_words(p) = words_needed;
 
-   if ( %lt(val,0::L_) ) {
+   if ( %lt(hi,0) ) {
      neg = 1;
-     val = -val;
+     lo = -lo;
+     if(lo == 0) {
+       hi = -hi;
+     } else {
+       hi = -hi - 1;
+     }
    }
 
-   hi = TO_W_(val >> 32);
-
    if ( words_needed == 2 )  { 
       s = 2;
-      Hp(-1) = TO_W_(val);
+      Hp(-1) = lo;
       Hp(0) = hi;
    } else { 
-       if ( val != 0::L_ ) {
+       if ( lo != 0 ) {
           s = 1;
-          Hp(0) = TO_W_(val);
+          Hp(0) = lo;
        } else /* val==0 */  {
           s = 0;
        }
@@ -465,16 +495,18 @@ int64ToIntegerzh_fast
    */
    RET_NP(s,p);
 }
-
 word64ToIntegerzh_fast
 {
    /* arguments: L1 = Word64# */
 
    L_ val;
-   W_ hi, s, words_needed, p;
+   W_ hi, lo, s, words_needed, p;
 
    val = L1;
-   if ( val >= 0x100000000::L_ ) {
+   hi = TO_W_(val >> 32);
+   lo = TO_W_(val);
+
+   if ( hi != 0 ) {
       words_needed = 2;
    } else {
       words_needed = 1;
@@ -487,15 +519,14 @@ word64ToIntegerzh_fast
    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
    StgArrWords_words(p) = words_needed;
 
-   hi = TO_W_(val >> 32);
-   if ( val >= 0x100000000::L_ ) { 
+   if ( hi != 0 ) { 
      s = 2;
-     Hp(-1) = TO_W_(val);
+     Hp(-1) = lo;
      Hp(0)  = hi;
    } else {
-      if ( val != 0::L_ ) {
+      if ( lo != 0 ) {
         s = 1;
-        Hp(0) = TO_W_(val);
+        Hp(0) = lo;
      } else /* val==0 */  {
       s = 0;
      }
@@ -508,6 +539,7 @@ word64ToIntegerzh_fast
 }
 
 
+
 #endif /* SUPPORT_LONG_LONGS */
 
 /* ToDo: this is shockingly inefficient */
@@ -668,7 +700,7 @@ gcdIntzh_fast
     FETCH_MP_TEMP(mp_tmp_w);
 
     W_[mp_tmp_w] = R1;
-    r = foreign "C" __gmpn_gcd_1(mp_tmp_w "ptr", 1, R2) [];
+    (r) = foreign "C" __gmpn_gcd_1(mp_tmp_w "ptr", 1, R2) [];
 
     R1 = r;
     /* Result parked in R1, return via info-pointer at TOS */
@@ -679,7 +711,9 @@ gcdIntzh_fast
 gcdIntegerIntzh_fast
 {
     /* R1 = s1; R2 = d1; R3 = the int */
-    R1 = foreign "C" __gmpn_gcd_1( BYTE_ARR_CTS(R2) "ptr", R1, R3) [];
+    W_ s1;
+    (s1) = foreign "C" __gmpn_gcd_1( BYTE_ARR_CTS(R2) "ptr", R1, R3) [];
+    R1 = s1;
     
     /* Result parked in R1, return via info-pointer at TOS */
     jump %ENTRY_CODE(Sp(0));
@@ -760,7 +794,7 @@ cmpIntegerzh_fast
     up = BYTE_ARR_CTS(R2);
     vp = BYTE_ARR_CTS(R4);
 
-    cmp = foreign "C" __gmpn_cmp(up "ptr", vp "ptr", size) [];
+    (cmp) = foreign "C" __gmpn_cmp(up "ptr", vp "ptr", size) [];
 
     if (cmp == 0 :: CInt) {
        R1 = 0; 
@@ -883,7 +917,7 @@ forkzh_fast
   W_ threadid;
   closure = R1;
 
-  "ptr" threadid = foreign "C" createIOThread( MyCapability() "ptr", 
+  ("ptr" threadid) = foreign "C" createIOThread( MyCapability() "ptr", 
                                RtsFlags_GcFlags_initialStkSize(RtsFlags), 
                                closure "ptr") [];
   foreign "C" scheduleThread(MyCapability() "ptr", threadid "ptr") [];
@@ -906,7 +940,7 @@ forkOnzh_fast
   cpu = R1;
   closure = R2;
 
-  "ptr" threadid = foreign "C" createIOThread( MyCapability() "ptr", 
+  ("ptr" threadid) = foreign "C" createIOThread( MyCapability() "ptr", 
                                RtsFlags_GcFlags_initialStkSize(RtsFlags), 
                                closure "ptr") [];
   foreign "C" scheduleThreadOn(MyCapability() "ptr", cpu, threadid "ptr") [];
@@ -943,7 +977,7 @@ isCurrentThreadBoundzh_fast
 {
   /* no args */
   W_ r;
-  r = foreign "C" isThreadBound(CurrentTSO) [];
+  (r) = foreign "C" isThreadBound(CurrentTSO) [];
   RET_N(r);
 }
 
@@ -962,49 +996,19 @@ isCurrentThreadBoundzh_fast
 
 // Catch retry frame ------------------------------------------------------------
 
-#define CATCH_RETRY_FRAME_ERROR(label) \
-  label { foreign "C" barf("catch_retry_frame incorrectly entered!"); }
-
-CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_0_ret)
-CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_1_ret)
-CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_2_ret)
-CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_3_ret)
-CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_4_ret)
-CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_5_ret)
-CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_6_ret)
-CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_7_ret)
-
-#if MAX_VECTORED_RTN > 8
-#error MAX_VECTORED_RTN has changed: please modify stg_catch_retry_frame too.
-#endif
-
+INFO_TABLE_RET(stg_catch_retry_frame, CATCH_RETRY_FRAME,
 #if defined(PROFILING)
-#define CATCH_RETRY_FRAME_BITMAP 7
-#define CATCH_RETRY_FRAME_WORDS  6
-#else
-#define CATCH_RETRY_FRAME_BITMAP 1
-#define CATCH_RETRY_FRAME_WORDS  4
+  W_ unused1, W_ unused2,
 #endif
-
-INFO_TABLE_RET(stg_catch_retry_frame,
-              CATCH_RETRY_FRAME_WORDS, CATCH_RETRY_FRAME_BITMAP,
-              CATCH_RETRY_FRAME,
-              stg_catch_retry_frame_0_ret,
-              stg_catch_retry_frame_1_ret,
-              stg_catch_retry_frame_2_ret,
-              stg_catch_retry_frame_3_ret,
-              stg_catch_retry_frame_4_ret,
-              stg_catch_retry_frame_5_ret,
-              stg_catch_retry_frame_6_ret,
-              stg_catch_retry_frame_7_ret)
+  W_ unused3, "ptr" W_ unused4, "ptr" W_ unused5)
 {
    W_ r, frame, trec, outer;
    IF_NOT_REG_R1(W_ rval;  rval = Sp(0);  Sp_adj(1); )
 
    frame = Sp;
    trec = StgTSO_trec(CurrentTSO);
-   "ptr" outer = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
-   r = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr") [];
+   ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
+   (r) = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr") [];
    if (r != 0) {
      /* Succeeded (either first branch or second branch) */
      StgTSO_trec(CurrentTSO) = outer;
@@ -1012,95 +1016,91 @@ INFO_TABLE_RET(stg_catch_retry_frame,
      IF_NOT_REG_R1(Sp_adj(-1); Sp(0) = rval;)
      jump %ENTRY_CODE(Sp(SP_OFF));
    } else {
-     /* Did not commit: retry */
+     /* Did not commit: re-execute */
      W_ new_trec;
-     "ptr" new_trec = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
+     ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
      StgTSO_trec(CurrentTSO) = new_trec;
      if (StgCatchRetryFrame_running_alt_code(frame) != 0::I32) {
        R1 = StgCatchRetryFrame_alt_code(frame);
      } else {
        R1 = StgCatchRetryFrame_first_code(frame);
-       StgCatchRetryFrame_first_code_trec(frame) = new_trec;
      }
      jump stg_ap_v_fast;
    }
 }
 
 
-// Atomically frame -------------------------------------------------------------
-
-
-#define ATOMICALLY_FRAME_ERROR(label) \
-  label { foreign "C" barf("atomically_frame incorrectly entered!"); }
-
-ATOMICALLY_FRAME_ERROR(stg_atomically_frame_0_ret)
-ATOMICALLY_FRAME_ERROR(stg_atomically_frame_1_ret)
-ATOMICALLY_FRAME_ERROR(stg_atomically_frame_2_ret)
-ATOMICALLY_FRAME_ERROR(stg_atomically_frame_3_ret)
-ATOMICALLY_FRAME_ERROR(stg_atomically_frame_4_ret)
-ATOMICALLY_FRAME_ERROR(stg_atomically_frame_5_ret)
-ATOMICALLY_FRAME_ERROR(stg_atomically_frame_6_ret)
-ATOMICALLY_FRAME_ERROR(stg_atomically_frame_7_ret)
-
-#if MAX_VECTORED_RTN > 8
-#error MAX_VECTORED_RTN has changed: please modify stg_atomically_frame too.
-#endif
+// Atomically frame ------------------------------------------------------------
 
+INFO_TABLE_RET(stg_atomically_frame, ATOMICALLY_FRAME,
 #if defined(PROFILING)
-#define ATOMICALLY_FRAME_BITMAP 3
-#define ATOMICALLY_FRAME_WORDS  3
-#else
-#define ATOMICALLY_FRAME_BITMAP 0
-#define ATOMICALLY_FRAME_WORDS  1
+  W_ unused1, W_ unused2,
 #endif
-
-
-INFO_TABLE_RET(stg_atomically_frame,
-              ATOMICALLY_FRAME_WORDS, ATOMICALLY_FRAME_BITMAP,
-              ATOMICALLY_FRAME,
-              stg_atomically_frame_0_ret,
-              stg_atomically_frame_1_ret,
-              stg_atomically_frame_2_ret,
-              stg_atomically_frame_3_ret,
-              stg_atomically_frame_4_ret,
-              stg_atomically_frame_5_ret,
-              stg_atomically_frame_6_ret,
-              stg_atomically_frame_7_ret)
+  "ptr" W_ unused3, "ptr" W_ unused4)
 {
-  W_ frame, trec, valid;
+  W_ frame, trec, valid, next_invariant, q, outer;
   IF_NOT_REG_R1(W_ rval;  rval = Sp(0);  Sp_adj(1); )
 
   frame = Sp;
   trec = StgTSO_trec(CurrentTSO);
+  ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
+
+  if (outer == NO_TREC) {
+    /* First time back at the atomically frame -- pick up invariants */
+    ("ptr" q) = foreign "C" stmGetInvariantsToCheck(MyCapability() "ptr", trec "ptr") [];
+    StgAtomicallyFrame_next_invariant_to_check(frame) = q;
 
-  /* The TSO is not currently waiting: try to commit the transaction */
-  valid = foreign "C" stmCommitTransaction(MyCapability() "ptr", trec "ptr") [];
-  if (valid != 0) {
-    /* Transaction was valid: commit succeeded */
-    StgTSO_trec(CurrentTSO) = NO_TREC;
-    Sp = Sp + SIZEOF_StgAtomicallyFrame;
-    IF_NOT_REG_R1(Sp_adj(-1); Sp(0) = rval;)
-    jump %ENTRY_CODE(Sp(SP_OFF));
   } else {
-    /* Transaction was not valid: try again */
-    "ptr" trec = foreign "C" stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr") [];
+    /* Second/subsequent time back at the atomically frame -- abort the
+     * tx that's checking the invariant and move on to the next one */
+    StgTSO_trec(CurrentTSO) = outer;
+    q = StgAtomicallyFrame_next_invariant_to_check(frame);
+    StgInvariantCheckQueue_my_execution(q) = trec;
+    foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr") [];
+    /* Don't free trec -- it's linked from q and will be stashed in the
+     * invariant if we eventually commit. */
+    q = StgInvariantCheckQueue_next_queue_entry(q);
+    StgAtomicallyFrame_next_invariant_to_check(frame) = q;
+    trec = outer;
+  }
+
+  q = StgAtomicallyFrame_next_invariant_to_check(frame);
+
+  if (q != END_INVARIANT_CHECK_QUEUE) {
+    /* We can't commit yet: another invariant to check */
+    ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", trec "ptr") [];
     StgTSO_trec(CurrentTSO) = trec;
-    R1 = StgAtomicallyFrame_code(frame);
+
+    next_invariant = StgInvariantCheckQueue_invariant(q);
+    R1 = StgAtomicInvariant_code(next_invariant);
     jump stg_ap_v_fast;
+
+  } else {
+
+    /* We've got no more invariants to check, try to commit */
+    (valid) = foreign "C" stmCommitTransaction(MyCapability() "ptr", trec "ptr") [];
+    if (valid != 0) {
+      /* Transaction was valid: commit succeeded */
+      StgTSO_trec(CurrentTSO) = NO_TREC;
+      Sp = Sp + SIZEOF_StgAtomicallyFrame;
+      IF_NOT_REG_R1(Sp_adj(-1); Sp(0) = rval;)
+      jump %ENTRY_CODE(Sp(SP_OFF));
+    } else {
+      /* Transaction was not valid: try again */
+      ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr") [];
+      StgTSO_trec(CurrentTSO) = trec;
+      StgAtomicallyFrame_next_invariant_to_check(frame) = END_INVARIANT_CHECK_QUEUE;
+      R1 = StgAtomicallyFrame_code(frame);
+      jump stg_ap_v_fast;
+    }
   }
 }
 
-INFO_TABLE_RET(stg_atomically_waiting_frame,
-              ATOMICALLY_FRAME_WORDS, ATOMICALLY_FRAME_BITMAP,
-              ATOMICALLY_FRAME,
-              stg_atomically_frame_0_ret,
-              stg_atomically_frame_1_ret,
-              stg_atomically_frame_2_ret,
-              stg_atomically_frame_3_ret,
-              stg_atomically_frame_4_ret,
-              stg_atomically_frame_5_ret,
-              stg_atomically_frame_6_ret,
-              stg_atomically_frame_7_ret)
+INFO_TABLE_RET(stg_atomically_waiting_frame, ATOMICALLY_FRAME,
+#if defined(PROFILING)
+  W_ unused1, W_ unused2,
+#endif
+  "ptr" W_ unused3, "ptr" W_ unused4)
 {
   W_ frame, trec, valid;
   IF_NOT_REG_R1(W_ rval;  rval = Sp(0);  Sp_adj(1); )
@@ -1108,7 +1108,7 @@ INFO_TABLE_RET(stg_atomically_waiting_frame,
   frame = Sp;
 
   /* The TSO is currently waiting: should we stop waiting? */
-  valid = foreign "C" stmReWait(MyCapability() "ptr", CurrentTSO "ptr") [];
+  (valid) = foreign "C" stmReWait(MyCapability() "ptr", CurrentTSO "ptr") [];
   if (valid != 0) {
     /* Previous attempt is still valid: no point trying again yet */
          IF_NOT_REG_R1(Sp_adj(-2);
@@ -1117,7 +1117,7 @@ INFO_TABLE_RET(stg_atomically_waiting_frame,
     jump stg_block_noregs;
   } else {
     /* Previous attempt is no longer valid: try again */
-    "ptr" trec = foreign "C" stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr") [];
+    ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr") [];
     StgTSO_trec(CurrentTSO) = trec;
     StgHeader_info(frame) = stg_atomically_frame_info;
     R1 = StgAtomicallyFrame_code(frame);
@@ -1127,59 +1127,44 @@ INFO_TABLE_RET(stg_atomically_waiting_frame,
 
 // STM catch frame --------------------------------------------------------------
 
-#define CATCH_STM_FRAME_ENTRY_TEMPLATE(label,ret)          \
-   label                                                   \
-   {                                                       \
-      IF_NOT_REG_R1(W_ rval;  rval = Sp(0);  Sp_adj(1); )  \
-      Sp = Sp + SIZEOF_StgCatchSTMFrame;                   \
-      IF_NOT_REG_R1(Sp_adj(-1); Sp(0) = rval;)             \
-      jump ret;                                            \
-   }
-
 #ifdef REG_R1
 #define SP_OFF 0
 #else
 #define SP_OFF 1
 #endif
 
-CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_0_ret,%RET_VEC(Sp(SP_OFF),0))
-CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_1_ret,%RET_VEC(Sp(SP_OFF),1))
-CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_2_ret,%RET_VEC(Sp(SP_OFF),2))
-CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_3_ret,%RET_VEC(Sp(SP_OFF),3))
-CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_4_ret,%RET_VEC(Sp(SP_OFF),4))
-CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_5_ret,%RET_VEC(Sp(SP_OFF),5))
-CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_6_ret,%RET_VEC(Sp(SP_OFF),6))
-CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_7_ret,%RET_VEC(Sp(SP_OFF),7))
-
-#if MAX_VECTORED_RTN > 8
-#error MAX_VECTORED_RTN has changed: please modify stg_catch_stm_frame too.
-#endif
-
-#if defined(PROFILING)
-#define CATCH_STM_FRAME_BITMAP 3
-#define CATCH_STM_FRAME_WORDS  3
-#else
-#define CATCH_STM_FRAME_BITMAP 0
-#define CATCH_STM_FRAME_WORDS  1
-#endif
-
 /* Catch frames are very similar to update frames, but when entering
  * one we just pop the frame off the stack and perform the correct
  * kind of return to the activation record underneath us on the stack.
  */
 
-INFO_TABLE_RET(stg_catch_stm_frame,
-              CATCH_STM_FRAME_WORDS, CATCH_STM_FRAME_BITMAP,
-              CATCH_STM_FRAME,
-              stg_catch_stm_frame_0_ret,
-              stg_catch_stm_frame_1_ret,
-              stg_catch_stm_frame_2_ret,
-              stg_catch_stm_frame_3_ret,
-              stg_catch_stm_frame_4_ret,
-              stg_catch_stm_frame_5_ret,
-              stg_catch_stm_frame_6_ret,
-              stg_catch_stm_frame_7_ret)
-CATCH_STM_FRAME_ENTRY_TEMPLATE(,%ENTRY_CODE(Sp(SP_OFF)))
+INFO_TABLE_RET(stg_catch_stm_frame, CATCH_STM_FRAME,
+#if defined(PROFILING)
+  W_ unused1, W_ unused2,
+#endif
+  "ptr" W_ unused3, "ptr" W_ unused4)
+   {
+      IF_NOT_REG_R1(W_ rval;  rval = Sp(0);  Sp_adj(1); )
+      W_ r, frame, trec, outer;
+      frame = Sp;
+      trec = StgTSO_trec(CurrentTSO);
+      ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
+      (r) = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr") [];
+      if (r != 0) {
+        /* Commit succeeded */
+        StgTSO_trec(CurrentTSO) = outer;
+        Sp = Sp + SIZEOF_StgCatchSTMFrame;
+        IF_NOT_REG_R1(Sp_adj(-1); Sp(0) = rval;)
+        jump Sp(SP_OFF);
+      } else {
+        /* Commit failed */
+        W_ new_trec;
+        ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
+        StgTSO_trec(CurrentTSO) = new_trec;
+        R1 = StgCatchSTMFrame_code(frame);
+        jump stg_ap_v_fast;
+      }
+   }
 
 
 // Primop definition ------------------------------------------------------------
@@ -1210,9 +1195,10 @@ atomicallyzh_fast
 
   SET_HDR(frame,stg_atomically_frame_info, W_[CCCS]);
   StgAtomicallyFrame_code(frame) = R1;
+  StgAtomicallyFrame_next_invariant_to_check(frame) = END_INVARIANT_CHECK_QUEUE;
 
   /* Start the memory transcation */
-  "ptr" new_trec = foreign "C" stmStartTransaction(MyCapability() "ptr", old_trec "ptr") [R1];
+  ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", old_trec "ptr") [R1];
   StgTSO_trec(CurrentTSO) = new_trec;
 
   /* Apply R1 to the realworld token */
@@ -1234,6 +1220,14 @@ catchSTMzh_fast
 
   SET_HDR(frame, stg_catch_stm_frame_info, W_[CCCS]);
   StgCatchSTMFrame_handler(frame) = R2;
+  StgCatchSTMFrame_code(frame) = R1;
+
+  /* Start a nested transaction to run the body of the try block in */
+  W_ cur_trec;  
+  W_ new_trec;
+  cur_trec = StgTSO_trec(CurrentTSO);
+  ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", cur_trec "ptr");
+  StgTSO_trec(CurrentTSO) = new_trec;
 
   /* Apply R1 to the realworld token */
   jump stg_ap_v_fast;
@@ -1255,7 +1249,7 @@ catchRetryzh_fast
 
   /* Start a nested transaction within which to run the first code */
   trec = StgTSO_trec(CurrentTSO);
-  "ptr" new_trec = foreign "C" stmStartTransaction(MyCapability() "ptr", trec "ptr") [R1,R2];
+  ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", trec "ptr") [R1,R2];
   StgTSO_trec(CurrentTSO) = new_trec;
 
   /* Set up the catch-retry frame */
@@ -1266,7 +1260,6 @@ catchRetryzh_fast
   StgCatchRetryFrame_running_alt_code(frame) = 0 :: CInt; // false;
   StgCatchRetryFrame_first_code(frame) = R1;
   StgCatchRetryFrame_alt_code(frame) = R2;
-  StgCatchRetryFrame_first_code_trec(frame) = new_trec;
 
   /* Apply R1 to the realworld token */
   jump stg_ap_v_fast;
@@ -1285,55 +1278,49 @@ retryzh_fast
 
   // Find the enclosing ATOMICALLY_FRAME or CATCH_RETRY_FRAME
 retry_pop_stack:
-  trec = StgTSO_trec(CurrentTSO);
-  "ptr" outer = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
   StgTSO_sp(CurrentTSO) = Sp;
-  frame_type = foreign "C" findRetryFrameHelper(CurrentTSO "ptr") [];
+  (frame_type) = foreign "C" findRetryFrameHelper(CurrentTSO "ptr") [];
   Sp = StgTSO_sp(CurrentTSO);
   frame = Sp;
+  trec = StgTSO_trec(CurrentTSO);
+  ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
 
   if (frame_type == CATCH_RETRY_FRAME) {
     // The retry reaches a CATCH_RETRY_FRAME before the atomic frame
     ASSERT(outer != NO_TREC);
+    // Abort the transaction attempting the current branch
+    foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr") [];
+    foreign "C" stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr") [];
     if (!StgCatchRetryFrame_running_alt_code(frame) != 0::I32) {
-      // Retry in the first code: try the alternative
-      "ptr" trec = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
+      // Retry in the first branch: try the alternative
+      ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
       StgTSO_trec(CurrentTSO) = trec;
       StgCatchRetryFrame_running_alt_code(frame) = 1 :: CInt; // true;
       R1 = StgCatchRetryFrame_alt_code(frame);
       jump stg_ap_v_fast;
     } else {
-      // Retry in the alternative code: propagate
-      W_ other_trec;
-      other_trec = StgCatchRetryFrame_first_code_trec(frame);
-      r = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", other_trec "ptr") [];
-      if (r != 0) {
-        r = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr") [];
-      } else {
-        foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr") [];
-      }
-      if (r != 0) {
-        // Merge between siblings succeeded: commit it back to enclosing transaction
-        // and then propagate the retry
-        StgTSO_trec(CurrentTSO) = outer;
-        Sp = Sp + SIZEOF_StgCatchRetryFrame;
-        goto retry_pop_stack;
-      } else {
-        // Merge failed: we musn't propagate the retry.  Try both paths again.
-        "ptr" trec = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
-        StgCatchRetryFrame_first_code_trec(frame) = trec;
-        StgCatchRetryFrame_running_alt_code(frame) = 0 :: CInt; // false;
-        StgTSO_trec(CurrentTSO) = trec;
-        R1 = StgCatchRetryFrame_first_code(frame);
-        jump stg_ap_v_fast;
-      }
+      // Retry in the alternative code: propagate the retry
+      StgTSO_trec(CurrentTSO) = outer;
+      Sp = Sp + SIZEOF_StgCatchRetryFrame;
+      goto retry_pop_stack;
     }
   }
 
   // We've reached the ATOMICALLY_FRAME: attempt to wait 
   ASSERT(frame_type == ATOMICALLY_FRAME);
+  if (outer != NO_TREC) {
+    // We called retry while checking invariants, so abort the current
+    // invariant check (merging its TVar accesses into the parents read
+    // set so we'll wait on them)
+    foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr") [];
+    foreign "C" stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr") [];
+    trec = outer;
+    StgTSO_trec(CurrentTSO) = trec;
+    ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
+  }
   ASSERT(outer == NO_TREC);
-  r = foreign "C" stmWait(MyCapability() "ptr", CurrentTSO "ptr", trec "ptr") [];
+
+  (r) = foreign "C" stmWait(MyCapability() "ptr", CurrentTSO "ptr", trec "ptr") [];
   if (r != 0) {
     // Transaction was valid: stmWait put us on the TVars' queues, we now block
     StgHeader_info(frame) = stg_atomically_waiting_frame_info;
@@ -1346,7 +1333,7 @@ retry_pop_stack:
     jump stg_block_stmwait;
   } else {
     // Transaction was not valid: retry immediately
-    "ptr" trec = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
+    ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
     StgTSO_trec(CurrentTSO) = trec;
     R1 = StgAtomicallyFrame_code(frame);
     Sp = frame;
@@ -1355,6 +1342,23 @@ retry_pop_stack:
 }
 
 
+checkzh_fast
+{
+  W_ trec, closure;
+
+  /* Args: R1 = invariant closure */
+  MAYBE_GC (R1_PTR, checkzh_fast); 
+
+  trec = StgTSO_trec(CurrentTSO);
+  closure = R1;
+  foreign "C" stmAddInvariantToCheck(MyCapability() "ptr", 
+                                     trec "ptr",
+                                     closure "ptr") [];
+
+  jump %ENTRY_CODE(Sp(0));
+}
+
+
 newTVarzh_fast
 {
   W_ tv;
@@ -1364,7 +1368,7 @@ newTVarzh_fast
 
   MAYBE_GC (R1_PTR, newTVarzh_fast); 
   new_value = R1;
-  "ptr" tv = foreign "C" stmNewTVar(MyCapability() "ptr", new_value "ptr") [];
+  ("ptr" tv) = foreign "C" stmNewTVar(MyCapability() "ptr", new_value "ptr") [];
   RET_P(tv);
 }
 
@@ -1380,7 +1384,7 @@ readTVarzh_fast
   MAYBE_GC (R1_PTR, readTVarzh_fast); // Call to stmReadTVar may allocate
   trec = StgTSO_trec(CurrentTSO);
   tvar = R1;
-  "ptr" result = foreign "C" stmReadTVar(MyCapability() "ptr", trec "ptr", tvar "ptr") [];
+  ("ptr" result) = foreign "C" stmReadTVar(MyCapability() "ptr", trec "ptr", tvar "ptr") [];
 
   RET_P(result);
 }
@@ -1441,7 +1445,7 @@ isEmptyMVarzh_fast
 {
     /* args: R1 = MVar closure */
 
-    if (GET_INFO(R1) == stg_EMPTY_MVAR_info) {
+    if (StgMVar_value(R1) == stg_END_TSO_QUEUE_closure) {
        RET_N(1);
     } else {
        RET_N(0);
@@ -1456,7 +1460,8 @@ newMVarzh_fast
     ALLOC_PRIM ( SIZEOF_StgMVar, NO_PTRS, newMVarzh_fast );
   
     mvar = Hp - SIZEOF_StgMVar + WDS(1);
-    SET_HDR(mvar,stg_EMPTY_MVAR_info,W_[CCCS]);
+    SET_HDR(mvar,stg_MVAR_DIRTY_info,W_[CCCS]);
+        // MVARs start dirty: generation 0 has no mutable list
     StgMVar_head(mvar)  = stg_END_TSO_QUEUE_closure;
     StgMVar_tail(mvar)  = stg_END_TSO_QUEUE_closure;
     StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
@@ -1487,15 +1492,19 @@ takeMVarzh_fast
     mvar = R1;
 
 #if defined(THREADED_RTS)
-    "ptr" info = foreign "C" lockClosure(mvar "ptr") [];
+    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [];
 #else
     info = GET_INFO(mvar);
 #endif
+        
+    if (info == stg_MVAR_CLEAN_info) {
+        foreign "C" dirty_MVAR(BaseReg "ptr", mvar);
+    }
 
     /* If the MVar is empty, put ourselves on its blocking queue,
      * and wait until we're woken up.
      */
-    if (info == stg_EMPTY_MVAR_info) {
+    if (StgMVar_value(mvar) == stg_END_TSO_QUEUE_closure) {
        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_head(mvar) = CurrentTSO;
        } else {
@@ -1526,10 +1535,10 @@ takeMVarzh_fast
 
 #if defined(GRAN) || defined(PAR)
       /* ToDo: check 2nd arg (mvar) is right */
-      "ptr" tso = foreign "C" unblockOne(StgMVar_head(mvar),mvar) [];
+      ("ptr" tso) = foreign "C" unblockOne(StgMVar_head(mvar),mvar) [];
       StgMVar_head(mvar) = tso;
 #else
-      "ptr" tso = foreign "C" unblockOne(MyCapability() "ptr", 
+      ("ptr" tso) = foreign "C" unblockOne(MyCapability() "ptr", 
                                         StgMVar_head(mvar) "ptr") [];
       StgMVar_head(mvar) = tso;
 #endif
@@ -1539,7 +1548,9 @@ takeMVarzh_fast
       }
 
 #if defined(THREADED_RTS)
-      foreign "C" unlockClosure(mvar "ptr", stg_FULL_MVAR_info) [];
+      unlockClosure(mvar, stg_MVAR_DIRTY_info);
+#else
+      SET_INFO(mvar,stg_MVAR_DIRTY_info);
 #endif
       RET_P(val);
   } 
@@ -1549,9 +1560,9 @@ takeMVarzh_fast
       StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
  
 #if defined(THREADED_RTS)
-      foreign "C" unlockClosure(mvar "ptr", stg_EMPTY_MVAR_info) [];
+      unlockClosure(mvar, stg_MVAR_DIRTY_info);
 #else
-      SET_INFO(mvar,stg_EMPTY_MVAR_info);
+      SET_INFO(mvar,stg_MVAR_DIRTY_info);
 #endif
 
       RET_P(val);
@@ -1568,14 +1579,14 @@ tryTakeMVarzh_fast
     mvar = R1;
 
 #if defined(THREADED_RTS)
-    "ptr" info = foreign "C" lockClosure(mvar "ptr") [];
+    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [];
 #else
     info = GET_INFO(mvar);
 #endif
 
-    if (info == stg_EMPTY_MVAR_info) {
+    if (StgMVar_value(mvar) == stg_END_TSO_QUEUE_closure) {
 #if defined(THREADED_RTS)
-        foreign "C" unlockClosure(mvar "ptr", stg_EMPTY_MVAR_info) [];
+        unlockClosure(mvar, info);
 #endif
        /* HACK: we need a pointer to pass back, 
         * so we abuse NO_FINALIZER_closure
@@ -1583,6 +1594,10 @@ tryTakeMVarzh_fast
        RET_NP(0, stg_NO_FINALIZER_closure);
     }
 
+    if (info == stg_MVAR_CLEAN_info) {
+        foreign "C" dirty_MVAR(BaseReg "ptr", mvar);
+    }
+
     /* we got the value... */
     val = StgMVar_value(mvar);
 
@@ -1600,10 +1615,10 @@ tryTakeMVarzh_fast
 
 #if defined(GRAN) || defined(PAR)
        /* ToDo: check 2nd arg (mvar) is right */
-       "ptr" tso = foreign "C" unblockOne(StgMVar_head(mvar) "ptr", mvar "ptr") [];
+       ("ptr" tso) = foreign "C" unblockOne(StgMVar_head(mvar) "ptr", mvar "ptr") [];
        StgMVar_head(mvar) = tso;
 #else
-       "ptr" tso = foreign "C" unblockOne(MyCapability() "ptr",
+       ("ptr" tso) = foreign "C" unblockOne(MyCapability() "ptr",
                                           StgMVar_head(mvar) "ptr") [];
        StgMVar_head(mvar) = tso;
 #endif
@@ -1612,7 +1627,9 @@ tryTakeMVarzh_fast
            StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
        }
 #if defined(THREADED_RTS)
-        foreign "C" unlockClosure(mvar "ptr", stg_FULL_MVAR_info) [];
+        unlockClosure(mvar, stg_MVAR_DIRTY_info);
+#else
+        SET_INFO(mvar,stg_MVAR_DIRTY_info);
 #endif
     }
     else 
@@ -1620,9 +1637,9 @@ tryTakeMVarzh_fast
        /* No further putMVars, MVar is now empty */
        StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
 #if defined(THREADED_RTS)
-       foreign "C" unlockClosure(mvar "ptr", stg_EMPTY_MVAR_info) [];
+       unlockClosure(mvar, stg_MVAR_DIRTY_info);
 #else
-       SET_INFO(mvar,stg_EMPTY_MVAR_info);
+       SET_INFO(mvar,stg_MVAR_DIRTY_info);
 #endif
     }
     
@@ -1638,12 +1655,16 @@ putMVarzh_fast
     mvar = R1;
 
 #if defined(THREADED_RTS)
-    "ptr" info = foreign "C" lockClosure(mvar "ptr") [R2];
+    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [R2];
 #else
     info = GET_INFO(mvar);
 #endif
 
-    if (info == stg_FULL_MVAR_info) {
+    if (info == stg_MVAR_CLEAN_info) {
+        foreign "C" dirty_MVAR(BaseReg "ptr", mvar);
+    }
+
+    if (StgMVar_value(mvar) != stg_END_TSO_QUEUE_closure) {
        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_head(mvar) = CurrentTSO;
        } else {
@@ -1670,10 +1691,10 @@ putMVarzh_fast
       
 #if defined(GRAN) || defined(PAR)
        /* ToDo: check 2nd arg (mvar) is right */
-       "ptr" tso = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr",mvar "ptr") [];
+       ("ptr" tso) = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr",mvar "ptr") [];
        StgMVar_head(mvar) = tso;
 #else
-       "ptr" tso = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr") [];
+       ("ptr" tso) = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr") [];
        StgMVar_head(mvar) = tso;
 #endif
 
@@ -1682,7 +1703,9 @@ putMVarzh_fast
        }
 
 #if defined(THREADED_RTS)
-       foreign "C" unlockClosure(mvar "ptr", stg_EMPTY_MVAR_info) [];
+       unlockClosure(mvar, stg_MVAR_DIRTY_info);
+#else
+        SET_INFO(mvar,stg_MVAR_DIRTY_info);
 #endif
        jump %ENTRY_CODE(Sp(0));
     }
@@ -1692,9 +1715,9 @@ putMVarzh_fast
        StgMVar_value(mvar) = R2;
 
 #if defined(THREADED_RTS)
-       foreign "C" unlockClosure(mvar "ptr", stg_FULL_MVAR_info) [];
+       unlockClosure(mvar, stg_MVAR_DIRTY_info);
 #else
-       SET_INFO(mvar,stg_FULL_MVAR_info);
+       SET_INFO(mvar,stg_MVAR_DIRTY_info);
 #endif
        jump %ENTRY_CODE(Sp(0));
     }
@@ -1711,18 +1734,22 @@ tryPutMVarzh_fast
     mvar = R1;
 
 #if defined(THREADED_RTS)
-    "ptr" info = foreign "C" lockClosure(mvar "ptr") [R2];
+    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [R2];
 #else
     info = GET_INFO(mvar);
 #endif
 
-    if (info == stg_FULL_MVAR_info) {
+    if (StgMVar_value(mvar) != stg_END_TSO_QUEUE_closure) {
 #if defined(THREADED_RTS)
-       foreign "C" unlockClosure(mvar "ptr", stg_FULL_MVAR_info) [];
+       unlockClosure(mvar, info);
 #endif
        RET_N(0);
     }
   
+    if (info == stg_MVAR_CLEAN_info) {
+        foreign "C" dirty_MVAR(BaseReg "ptr", mvar);
+    }
+
     if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {
 
        /* There are takeMVar(s) waiting: wake up the first one
@@ -1736,10 +1763,10 @@ tryPutMVarzh_fast
       
 #if defined(GRAN) || defined(PAR)
        /* ToDo: check 2nd arg (mvar) is right */
-       "ptr" tso = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr",mvar "ptr") [];
+       ("ptr" tso) = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr",mvar "ptr") [];
        StgMVar_head(mvar) = tso;
 #else
-       "ptr" tso = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr") [];
+       ("ptr" tso) = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr") [];
        StgMVar_head(mvar) = tso;
 #endif
 
@@ -1748,7 +1775,9 @@ tryPutMVarzh_fast
        }
 
 #if defined(THREADED_RTS)
-       foreign "C" unlockClosure(mvar "ptr", stg_EMPTY_MVAR_info) [];
+       unlockClosure(mvar, stg_MVAR_DIRTY_info);
+#else
+        SET_INFO(mvar,stg_MVAR_DIRTY_info);
 #endif
     }
     else
@@ -1757,9 +1786,9 @@ tryPutMVarzh_fast
        StgMVar_value(mvar) = R2;
 
 #if defined(THREADED_RTS)
-       foreign "C" unlockClosure(mvar "ptr", stg_FULL_MVAR_info) [];
+       unlockClosure(mvar, stg_MVAR_DIRTY_info);
 #else
-       SET_INFO(mvar,stg_FULL_MVAR_info);
+       SET_INFO(mvar,stg_MVAR_DIRTY_info);
 #endif
     }
     
@@ -1778,7 +1807,7 @@ makeStableNamezh_fast
 
     ALLOC_PRIM( SIZEOF_StgStableName, R1_PTR, makeStableNamezh_fast );
   
-    index = foreign "C" lookupStableName(R1 "ptr") [];
+    (index) = foreign "C" lookupStableName(R1 "ptr") [];
 
     /* Is there already a StableName for this heap object?
      *  stable_ptr_table is a pointer to an array of snEntry structs.
@@ -1801,7 +1830,7 @@ makeStablePtrzh_fast
     /* Args: R1 = a */
     W_ sp;
     MAYBE_GC(R1_PTR, makeStablePtrzh_fast);
-    "ptr" sp = foreign "C" getStablePtr(R1 "ptr") [];
+    ("ptr" sp) = foreign "C" getStablePtr(R1 "ptr") [];
     RET_N(sp);
 }
 
@@ -1823,17 +1852,17 @@ newBCOzh_fast
     /* R1 = instrs
        R2 = literals
        R3 = ptrs
-       R4 = itbls
-       R5 = arity
-       R6 = bitmap array
+       R4 = arity
+       R5 = bitmap array
     */
     W_ bco, bitmap_arr, bytes, words;
     
-    bitmap_arr = R6;
+    bitmap_arr = R5;
+
     words = BYTES_TO_WDS(SIZEOF_StgBCO) + StgArrWords_words(bitmap_arr);
     bytes = WDS(words);
 
-    ALLOC_PRIM( bytes, R1_PTR&R2_PTR&R3_PTR&R4_PTR&R6_PTR, newBCOzh_fast );
+    ALLOC_PRIM( bytes, R1_PTR&R2_PTR&R3_PTR&R5_PTR, newBCOzh_fast );
 
     bco = Hp - bytes + WDS(1);
     SET_HDR(bco, stg_BCO_info, W_[CCCS]);
@@ -1841,8 +1870,7 @@ newBCOzh_fast
     StgBCO_instrs(bco)     = R1;
     StgBCO_literals(bco)   = R2;
     StgBCO_ptrs(bco)       = R3;
-    StgBCO_itbls(bco)      = R4;
-    StgBCO_arity(bco)      = HALF_W_(R5);
+    StgBCO_arity(bco)      = HALF_W_(R4);
     StgBCO_size(bco)       = HALF_W_(words);
     
     // Copy the arity/bitmap info into the BCO
@@ -1884,6 +1912,70 @@ mkApUpd0zh_fast
     RET_P(ap);
 }
 
+unpackClosurezh_fast
+{
+/* args: R1 = closure to analyze */
+// TODO: Consider the absence of ptrs or nonptrs as a special case ?
+
+    W_ info, ptrs, nptrs, p, ptrs_arr, nptrs_arr;
+    info  = %GET_STD_INFO(UNTAG(R1));
+
+    // Some closures have non-standard layout, so we omit those here.
+    W_ type;
+    type = TO_W_(%INFO_TYPE(info));
+    switch [0 .. N_CLOSURE_TYPES] type {
+    case THUNK_SELECTOR : {
+        ptrs = 1;
+        nptrs = 0;
+        goto out;
+    }
+    case THUNK, THUNK_1_0, THUNK_0_1, THUNK_2_0, THUNK_1_1, 
+         THUNK_0_2, THUNK_STATIC, AP, PAP, AP_STACK, BCO : {
+        ptrs = 0;
+        nptrs = 0;
+        goto out;
+    }
+    default: {
+        ptrs  = TO_W_(%INFO_PTRS(info)); 
+        nptrs = TO_W_(%INFO_NPTRS(info));
+        goto out;
+    }}
+out:
+
+    W_ ptrs_arr_sz, nptrs_arr_sz;
+    nptrs_arr_sz = SIZEOF_StgArrWords   + WDS(nptrs);
+    ptrs_arr_sz  = SIZEOF_StgMutArrPtrs + WDS(ptrs);
+
+    ALLOC_PRIM (ptrs_arr_sz + nptrs_arr_sz, R1_PTR, unpackClosurezh_fast);
+
+    W_ clos;
+    clos = UNTAG(R1);
+
+    ptrs_arr  = Hp - nptrs_arr_sz - ptrs_arr_sz + WDS(1);
+    nptrs_arr = Hp - nptrs_arr_sz + WDS(1);
+
+    SET_HDR(ptrs_arr, stg_MUT_ARR_PTRS_FROZEN_info, W_[CCCS]);
+    StgMutArrPtrs_ptrs(ptrs_arr) = ptrs;
+    p = 0;
+for:
+    if(p < ptrs) {
+        W_[ptrs_arr + SIZEOF_StgMutArrPtrs + WDS(p)] = StgClosure_payload(clos,p);
+        p = p + 1;
+        goto for;
+    }
+    
+    SET_HDR(nptrs_arr, stg_ARR_WORDS_info, W_[CCCS]);
+    StgArrWords_words(nptrs_arr) = nptrs;
+    p = 0;
+for2:
+    if(p < nptrs) {
+        W_[BYTE_ARR_CTS(nptrs_arr) + WDS(p)] = StgClosure_payload(clos, p+ptrs);
+        p = p + 1;
+        goto for2;
+    }
+    RET_NPP(info, ptrs_arr, nptrs_arr);
+}
+
 /* -----------------------------------------------------------------------------
    Thread I/O blocking primitives
    -------------------------------------------------------------------------- */
@@ -1904,7 +1996,7 @@ waitReadzh_fast
 {
     /* args: R1 */
 #ifdef THREADED_RTS
-    foreign "C" barf("waitRead# on threaded RTS");
+    foreign "C" barf("waitRead# on threaded RTS") never returns;
 #else
 
     ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
@@ -1921,7 +2013,7 @@ waitWritezh_fast
 {
     /* args: R1 */
 #ifdef THREADED_RTS
-    foreign "C" barf("waitWrite# on threaded RTS");
+    foreign "C" barf("waitWrite# on threaded RTS") never returns;
 #else
 
     ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
@@ -1946,7 +2038,7 @@ delayzh_fast
 #endif
 
 #ifdef THREADED_RTS
-    foreign "C" barf("delay# on threaded RTS");
+    foreign "C" barf("delay# on threaded RTS") never returns;
 #else
 
     /* args: R1 (microsecond delay amount) */
@@ -1956,9 +2048,9 @@ delayzh_fast
 #ifdef mingw32_HOST_OS
 
     /* could probably allocate this on the heap instead */
-    "ptr" ares = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
+    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
                                            stg_delayzh_malloc_str);
-    reqID = foreign "C" addDelayRequest(R1);
+    (reqID) = foreign "C" addDelayRequest(R1);
     StgAsyncIOResult_reqID(ares)   = reqID;
     StgAsyncIOResult_len(ares)     = 0;
     StgAsyncIOResult_errCode(ares) = 0;
@@ -1975,8 +2067,11 @@ delayzh_fast
 #else
 
     W_ time;
-    time = foreign "C" getourtimeofday() [R1];
-    target = (R1 / (TO_W_(RtsFlags_MiscFlags_tickInterval(RtsFlags))*1000)) + time;
+    W_ divisor;
+    (time) = foreign "C" getourtimeofday() [R1];
+    divisor = TO_W_(RtsFlags_MiscFlags_tickInterval(RtsFlags))*1000;
+    target = ((R1 + divisor - 1) / divisor) /* divide rounding up */
+           + time + 1; /* Add 1 as getourtimeofday rounds down */
     StgTSO_block_info(CurrentTSO) = target;
 
     /* Insert the new thread in the sleeping queue. */
@@ -2009,7 +2104,7 @@ asyncReadzh_fast
     CInt reqID;
 
 #ifdef THREADED_RTS
-    foreign "C" barf("asyncRead# on threaded RTS");
+    foreign "C" barf("asyncRead# on threaded RTS") never returns;
 #else
 
     /* args: R1 = fd, R2 = isSock, R3 = len, R4 = buf */
@@ -2017,10 +2112,10 @@ asyncReadzh_fast
     StgTSO_why_blocked(CurrentTSO) = BlockedOnRead::I16;
 
     /* could probably allocate this on the heap instead */
-    "ptr" ares = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
+    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
                                            stg_asyncReadzh_malloc_str)
                        [R1,R2,R3,R4];
-    reqID = foreign "C" addIORequest(R1, 0/*FALSE*/,R2,R3,R4 "ptr") [];
+    (reqID) = foreign "C" addIORequest(R1, 0/*FALSE*/,R2,R3,R4 "ptr") [];
     StgAsyncIOResult_reqID(ares)   = reqID;
     StgAsyncIOResult_len(ares)     = 0;
     StgAsyncIOResult_errCode(ares) = 0;
@@ -2037,17 +2132,17 @@ asyncWritezh_fast
     CInt reqID;
 
 #ifdef THREADED_RTS
-    foreign "C" barf("asyncWrite# on threaded RTS");
+    foreign "C" barf("asyncWrite# on threaded RTS") never returns;
 #else
 
     /* args: R1 = fd, R2 = isSock, R3 = len, R4 = buf */
     ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
     StgTSO_why_blocked(CurrentTSO) = BlockedOnWrite::I16;
 
-    "ptr" ares = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
+    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
                                            stg_asyncWritezh_malloc_str)
                        [R1,R2,R3,R4];
-    reqID = foreign "C" addIORequest(R1, 1/*TRUE*/,R2,R3,R4 "ptr") [];
+    (reqID) = foreign "C" addIORequest(R1, 1/*TRUE*/,R2,R3,R4 "ptr") [];
 
     StgAsyncIOResult_reqID(ares)   = reqID;
     StgAsyncIOResult_len(ares)     = 0;
@@ -2065,7 +2160,7 @@ asyncDoProczh_fast
     CInt reqID;
 
 #ifdef THREADED_RTS
-    foreign "C" barf("asyncDoProc# on threaded RTS");
+    foreign "C" barf("asyncDoProc# on threaded RTS") never returns;
 #else
 
     /* args: R1 = proc, R2 = param */
@@ -2073,10 +2168,10 @@ asyncDoProczh_fast
     StgTSO_why_blocked(CurrentTSO) = BlockedOnDoProc::I16;
 
     /* could probably allocate this on the heap instead */
-    "ptr" ares = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
+    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
                                            stg_asyncDoProczh_malloc_str) 
                                [R1,R2];
-    reqID = foreign "C" addDoProcRequest(R1 "ptr",R2 "ptr") [];
+    (reqID) = foreign "C" addDoProcRequest(R1 "ptr",R2 "ptr") [];
     StgAsyncIOResult_reqID(ares)   = reqID;
     StgAsyncIOResult_len(ares)     = 0;
     StgAsyncIOResult_errCode(ares) = 0;
@@ -2087,20 +2182,39 @@ asyncDoProczh_fast
 }
 #endif
 
-/* -----------------------------------------------------------------------------
-  ** temporary **
+// noDuplicate# tries to ensure that none of the thunks under
+// evaluation by the current thread are also under evaluation by
+// another thread.  It relies on *both* threads doing noDuplicate#;
+// the second one will get blocked if they are duplicating some work.
+noDuplicatezh_fast
+{
+    SAVE_THREAD_STATE();
+    ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
+    foreign "C" threadPaused (MyCapability() "ptr", CurrentTSO "ptr") [];
+    
+    if (StgTSO_what_next(CurrentTSO) == ThreadKilled::I16) {
+        jump stg_threadFinished;
+    } else {
+        LOAD_THREAD_STATE();
+        ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
+        jump %ENTRY_CODE(Sp(0));
+    }
+}
 
-   classes CCallable and CReturnable don't really exist, but the
-   compiler insists on generating dictionaries containing references
-   to GHC_ZcCCallable_static_info etc., so we provide dummy symbols
-   for these.  Some C compilers can't cope with zero-length static arrays,
-   so we have to make these one element long.
-  --------------------------------------------------------------------------- */
+getApStackValzh_fast
+{
+   W_ ap_stack, offset, val, ok;
 
-section "rodata" {
-  GHC_ZCCCallable_static_info:   W_ 0;
-}
+   /* args: R1 = AP_STACK, R2 = offset */
+   ap_stack = R1;
+   offset   = R2;
 
-section "rodata" {
-  GHC_ZCCReturnable_static_info: W_ 0;
+   if (%INFO_PTR(ap_stack) == stg_AP_STACK_info) {
+        ok = 1;
+        val = StgAP_STACK_payload(ap_stack,offset); 
+   } else {
+        ok = 0;
+        val = R1;
+   }
+   RET_NP(ok,val);
 }