New implementation of BLACKHOLEs
[ghc-hetmet.git] / rts / PrimOps.cmm
index 272c705..5c575f6 100644 (file)
 #include "Cmm.h"
 
 #ifdef __PIC__
-#ifndef mingw32_HOST_OS
-import __gmpz_init;
-import __gmpz_add;
-import __gmpz_sub;
-import __gmpz_mul;
-import __gmpz_gcd;
-import __gmpn_gcd_1;
-import __gmpn_cmp;
-import __gmpz_tdiv_q;
-import __gmpz_tdiv_r;
-import __gmpz_tdiv_qr;
-import __gmpz_fdiv_qr;
-import __gmpz_divexact;
-import __gmpz_and;
-import __gmpz_xor;
-import __gmpz_ior;
-import __gmpz_com;
-#endif
 import pthread_mutex_lock;
 import pthread_mutex_unlock;
 #endif
@@ -53,6 +35,9 @@ import base_ControlziExceptionziBase_nestedAtomically_closure;
 import EnterCriticalSection;
 import LeaveCriticalSection;
 import ghczmprim_GHCziBool_False_closure;
+#if !defined(mingw32_HOST_OS)
+import sm_mutex;
+#endif
 
 /*-----------------------------------------------------------------------------
   Array Primitives
@@ -69,14 +54,14 @@ import ghczmprim_GHCziBool_False_closure;
  * round up to the nearest word for the size of the array.
  */
 
-newByteArrayzh_fast
+stg_newByteArrayzh
 {
     W_ words, payload_words, n, p;
-    MAYBE_GC(NO_PTRS,newByteArrayzh_fast);
+    MAYBE_GC(NO_PTRS,stg_newByteArrayzh);
     n = R1;
     payload_words = ROUNDUP_BYTES_TO_WDS(n);
     words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
-    ("ptr" p) = foreign "C" allocateLocal(MyCapability() "ptr",words) [];
+    ("ptr" p) = foreign "C" allocate(MyCapability() "ptr",words) [];
     TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
     SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
     StgArrWords_words(p) = payload_words;
@@ -86,20 +71,28 @@ newByteArrayzh_fast
 #define BA_ALIGN 16
 #define BA_MASK  (BA_ALIGN-1)
 
-newPinnedByteArrayzh_fast
+stg_newPinnedByteArrayzh
 {
-    W_ words, payload_words, n, p;
-
-    MAYBE_GC(NO_PTRS,newPinnedByteArrayzh_fast);
-    n = R1;
-    payload_words = ROUNDUP_BYTES_TO_WDS(n);
-
-    words = payload_words + ((SIZEOF_StgArrWords + BA_MASK) & ~BA_MASK);
-
-    ("ptr" p) = foreign "C" allocatePinned(words) [];
+    W_ words, bytes, payload_words, p;
+
+    MAYBE_GC(NO_PTRS,stg_newPinnedByteArrayzh);
+    bytes = R1;
+    /* payload_words is what we will tell the profiler we had to allocate */
+    payload_words = ROUNDUP_BYTES_TO_WDS(bytes);
+    /* When we actually allocate memory, we need to allow space for the
+       header: */
+    bytes = bytes + SIZEOF_StgArrWords;
+    /* And we want to align to BA_ALIGN bytes, so we need to allow space
+       to shift up to BA_ALIGN - 1 bytes: */
+    bytes = bytes + BA_ALIGN - 1;
+    /* Now we convert to a number of words: */
+    words = ROUNDUP_BYTES_TO_WDS(bytes);
+
+    ("ptr" p) = foreign "C" allocatePinned(MyCapability() "ptr", words) [];
     TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
 
-    // This bumps p forwards so that the payload falls on an R2-byte boundary.
+    /* Now we need to move p forward so that the payload is aligned
+       to BA_ALIGN bytes: */
     p = p + ((-p - SIZEOF_StgArrWords) & BA_MASK);
 
     SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
@@ -107,52 +100,58 @@ newPinnedByteArrayzh_fast
     RET_P(p);
 }
 
-newAlignedPinnedByteArrayzh_fast
+stg_newAlignedPinnedByteArrayzh
 {
-    W_ words, payload_words, n, p, mask;
+    W_ words, bytes, payload_words, p, alignment;
 
-    MAYBE_GC(NO_PTRS,newAlignedPinnedByteArrayzh_fast);
-    n = R1;
+    MAYBE_GC(NO_PTRS,stg_newAlignedPinnedByteArrayzh);
+    bytes = R1;
+    alignment = R2;
 
-    if (R2 > SIZEOF_W) {
-        mask = R2 - 1;
-    } else {
-        mask = 0;
-    }
-
-    payload_words = ROUNDUP_BYTES_TO_WDS(n);
+    /* payload_words is what we will tell the profiler we had to allocate */
+    payload_words = ROUNDUP_BYTES_TO_WDS(bytes);
 
-    // We want an <align>-byte aligned array.  allocatePinned() gives us
-    // 8-byte aligned memory by default, but we want to align the
-    // *goods* inside the ArrWords object, so we have to check the
-    // size of the ArrWords header and adjust our size accordingly.
-    words = payload_words + ((SIZEOF_StgArrWords + mask) & ~mask);
+    /* When we actually allocate memory, we need to allow space for the
+       header: */
+    bytes = bytes + SIZEOF_StgArrWords;
+    /* And we want to align to <alignment> bytes, so we need to allow space
+       to shift up to <alignment - 1> bytes: */
+    bytes = bytes + alignment - 1;
+    /* Now we convert to a number of words: */
+    words = ROUNDUP_BYTES_TO_WDS(bytes);
 
-    ("ptr" p) = foreign "C" allocatePinned(words) [];
+    ("ptr" p) = foreign "C" allocatePinned(MyCapability() "ptr", words) [];
     TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
 
-    // This bumps p forwards so that the payload falls on an R2-byte boundary.
-    p = p + ((-p - SIZEOF_StgArrWords) & mask);
+    /* Now we need to move p forward so that the payload is aligned
+       to <alignment> bytes. Note that we are assuming that
+       <alignment> is a power of 2, which is technically not guaranteed */
+    p = p + ((-p - SIZEOF_StgArrWords) & (alignment - 1));
 
     SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
     StgArrWords_words(p) = payload_words;
     RET_P(p);
 }
 
-newArrayzh_fast
+stg_newArrayzh
 {
-    W_ words, n, init, arr, p;
+    W_ words, n, init, arr, p, size;
     /* Args: R1 = words, R2 = initialisation value */
 
     n = R1;
-    MAYBE_GC(R2_PTR,newArrayzh_fast);
-
-    words = BYTES_TO_WDS(SIZEOF_StgMutArrPtrs) + n;
-    ("ptr" arr) = foreign "C" allocateLocal(MyCapability() "ptr",words) [R2];
+    MAYBE_GC(R2_PTR,stg_newArrayzh);
+
+    // the mark area contains one byte for each 2^MUT_ARR_PTRS_CARD_BITS words
+    // in the array, making sure we round up, and then rounding up to a whole
+    // number of words.
+    size = n + mutArrPtrsCardWords(n);
+    words = BYTES_TO_WDS(SIZEOF_StgMutArrPtrs) + size;
+    ("ptr" arr) = foreign "C" allocate(MyCapability() "ptr",words) [R2];
     TICK_ALLOC_PRIM(SIZEOF_StgMutArrPtrs, WDS(n), 0);
 
     SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, W_[CCCS]);
     StgMutArrPtrs_ptrs(arr) = n;
+    StgMutArrPtrs_size(arr) = size;
 
     // Initialise all elements of the the array with the value in R2
     init = R2;
@@ -163,18 +162,25 @@ newArrayzh_fast
        p = p + WDS(1);
        goto for;
     }
+    // Initialise the mark bits with 0
+  for2:
+    if (p < arr + WDS(size)) {
+       W_[p] = 0;
+       p = p + WDS(1);
+       goto for2;
+    }
 
     RET_P(arr);
 }
 
-unsafeThawArrayzh_fast
+stg_unsafeThawArrayzh
 {
   // SUBTLETY TO DO WITH THE OLD GEN MUTABLE LIST
   //
   // A MUT_ARR_PTRS lives on the mutable list, but a MUT_ARR_PTRS_FROZEN 
   // normally doesn't.  However, when we freeze a MUT_ARR_PTRS, we leave
   // it on the mutable list for the GC to remove (removing something from
-  // the mutable list is not easy, because the mut_list is only singly-linked).
+  // the mutable list is not easy).
   // 
   // So that we can tell whether a MUT_ARR_PTRS_FROZEN is on the mutable list,
   // when we freeze it we set the info ptr to be MUT_ARR_PTRS_FROZEN0
@@ -202,12 +208,12 @@ unsafeThawArrayzh_fast
    MutVar primitives
    -------------------------------------------------------------------------- */
 
-newMutVarzh_fast
+stg_newMutVarzh
 {
     W_ mv;
     /* Args: R1 = initialisation value */
 
-    ALLOC_PRIM( SIZEOF_StgMutVar, R1_PTR, newMutVarzh_fast);
+    ALLOC_PRIM( SIZEOF_StgMutVar, R1_PTR, stg_newMutVarzh);
 
     mv = Hp - SIZEOF_StgMutVar + WDS(1);
     SET_HDR(mv,stg_MUT_VAR_DIRTY_info,W_[CCCS]);
@@ -216,7 +222,7 @@ newMutVarzh_fast
     RET_P(mv);
 }
 
-atomicModifyMutVarzh_fast
+stg_atomicModifyMutVarzh
 {
     W_ mv, f, z, x, y, r, h;
     /* Args: R1 :: MutVar#,  R2 :: a -> (a,b) */
@@ -255,7 +261,7 @@ atomicModifyMutVarzh_fast
 
 #define SIZE (THUNK_2_SIZE + THUNK_1_SIZE + THUNK_1_SIZE)
 
-   HP_CHK_GEN_TICKY(SIZE, R1_PTR & R2_PTR, atomicModifyMutVarzh_fast);
+   HP_CHK_GEN_TICKY(SIZE, R1_PTR & R2_PTR, stg_atomicModifyMutVarzh);
 
    mv = R1;
    f = R2;
@@ -304,7 +310,7 @@ atomicModifyMutVarzh_fast
 
 STRING(stg_weak_msg,"New weak pointer at %p\n")
 
-mkWeakzh_fast
+stg_mkWeakzh
 {
   /* R1 = key
      R2 = value
@@ -316,7 +322,7 @@ mkWeakzh_fast
     R3 = stg_NO_FINALIZER_closure;
   }
 
-  ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR & R3_PTR, mkWeakzh_fast );
+  ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR & R3_PTR, stg_mkWeakzh );
 
   w = Hp - SIZEOF_StgWeak + WDS(1);
   SET_HDR(w, stg_WEAK_info, W_[CCCS]);
@@ -330,15 +336,17 @@ mkWeakzh_fast
   StgWeak_finalizer(w)  = R3;
   StgWeak_cfinalizer(w) = stg_NO_FINALIZER_closure;
 
+  ACQUIRE_LOCK(sm_mutex);
   StgWeak_link(w)      = W_[weak_ptr_list];
   W_[weak_ptr_list]    = w;
+  RELEASE_LOCK(sm_mutex);
 
   IF_DEBUG(weak, foreign "C" debugBelch(stg_weak_msg,w) []);
 
   RET_P(w);
 }
 
-mkWeakForeignEnvzh_fast
+stg_mkWeakForeignEnvzh
 {
   /* R1 = key
      R2 = value
@@ -358,14 +366,14 @@ mkWeakForeignEnvzh_fast
   flag = R5;
   eptr = R6;
 
-  ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR & R3_PTR, mkWeakForeignEnvzh_fast );
+  ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR, stg_mkWeakForeignEnvzh );
 
   w = Hp - SIZEOF_StgWeak + WDS(1);
   SET_HDR(w, stg_WEAK_info, W_[CCCS]);
 
   payload_words = 4;
   words         = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
-  ("ptr" p)     = foreign "C" allocateLocal(MyCapability() "ptr", words) [];
+  ("ptr" p)     = foreign "C" allocate(MyCapability() "ptr", words) [];
 
   TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
   SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
@@ -384,15 +392,17 @@ mkWeakForeignEnvzh_fast
   StgWeak_finalizer(w)  = stg_NO_FINALIZER_closure;
   StgWeak_cfinalizer(w) = p;
 
+  ACQUIRE_LOCK(sm_mutex);
   StgWeak_link(w)   = W_[weak_ptr_list];
   W_[weak_ptr_list] = w;
+  RELEASE_LOCK(sm_mutex);
 
   IF_DEBUG(weak, foreign "C" debugBelch(stg_weak_msg,w) []);
 
   RET_P(w);
 }
 
-finalizzeWeakzh_fast
+stg_finalizzeWeakzh
 {
   /* R1 = weak ptr
    */
@@ -444,7 +454,7 @@ finalizzeWeakzh_fast
   }
 }
 
-deRefWeakzh_fast
+stg_deRefWeakzh
 {
   /* R1 = weak ptr */
   W_ w, code, val;
@@ -461,522 +471,20 @@ deRefWeakzh_fast
 }
 
 /* -----------------------------------------------------------------------------
-   Arbitrary-precision Integer operations.
-
-   There are some assumptions in this code that mp_limb_t == W_.  This is
-   the case for all the platforms that GHC supports, currently.
+   Floating point operations.
    -------------------------------------------------------------------------- */
 
-int2Integerzh_fast
-{
-   /* arguments: R1 = Int# */
-
-   W_ val, s, p;       /* to avoid aliasing */
-
-   val = R1;
-   ALLOC_PRIM( SIZEOF_StgArrWords + WDS(1), NO_PTRS, int2Integerzh_fast );
-
-   p = Hp - SIZEOF_StgArrWords;
-   SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
-   StgArrWords_words(p) = 1;
-
-   /* mpz_set_si is inlined here, makes things simpler */
-   if (%lt(val,0)) { 
-       s  = -1;
-       Hp(0) = -val;
-   } else { 
-     if (%gt(val,0)) {
-       s = 1;
-       Hp(0) = val;
-     } else {
-       s = 0;
-     }
-  }
-
-   /* returns (# size  :: Int#, 
-                data  :: ByteArray# 
-              #)
-   */
-   RET_NP(s,p);
-}
-
-word2Integerzh_fast
-{
-   /* arguments: R1 = Word# */
-
-   W_ val, s, p;       /* to avoid aliasing */
-
-   val = R1;
-
-   ALLOC_PRIM( SIZEOF_StgArrWords + WDS(1), NO_PTRS, word2Integerzh_fast);
-
-   p = Hp - SIZEOF_StgArrWords;
-   SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
-   StgArrWords_words(p) = 1;
-
-   if (val != 0) {
-       s = 1;
-       W_[Hp] = val;
-   } else {
-       s = 0;
-   }
-
-   /* returns (# size  :: Int#, 
-                data  :: ByteArray# #)
-   */
-   RET_NP(s,p);
-}
-
-
-/*
- * 'long long' primops for converting to/from Integers.
- */
-
-#ifdef SUPPORT_LONG_LONGS
-
-int64ToIntegerzh_fast
-{
-   /* arguments: L1 = Int64# */
-
-   L_ val;
-   W_ hi, lo, s, neg, words_needed, p;
-
-   val = L1;
-   neg = 0;
-
-   hi = TO_W_(val >> 32);
-   lo = TO_W_(val);
-
-   if ( hi == 0 || (hi == 0xFFFFFFFF && lo != 0) )  {
-       // minimum is one word
-       words_needed = 1;
-   } else { 
-       words_needed = 2;
-   }
-
-   ALLOC_PRIM( SIZEOF_StgArrWords + WDS(words_needed),
-              NO_PTRS, int64ToIntegerzh_fast );
-
-   p = Hp - SIZEOF_StgArrWords - WDS(words_needed) + WDS(1);
-   SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
-   StgArrWords_words(p) = words_needed;
-
-   if ( %lt(hi,0) ) {
-     neg = 1;
-     lo = -lo;
-     if(lo == 0) {
-       hi = -hi;
-     } else {
-       hi = -hi - 1;
-     }
-   }
-
-   if ( words_needed == 2 )  { 
-      s = 2;
-      Hp(-1) = lo;
-      Hp(0) = hi;
-   } else { 
-       if ( lo != 0 ) {
-          s = 1;
-          Hp(0) = lo;
-       } else /* val==0 */  {
-          s = 0;
-       }
-   }
-   if ( neg != 0 ) {
-       s = -s;
-   }
-
-   /* returns (# size  :: Int#, 
-                data  :: ByteArray# #)
-   */
-   RET_NP(s,p);
-}
-word64ToIntegerzh_fast
-{
-   /* arguments: L1 = Word64# */
-
-   L_ val;
-   W_ hi, lo, s, words_needed, p;
-
-   val = L1;
-   hi = TO_W_(val >> 32);
-   lo = TO_W_(val);
-
-   if ( hi != 0 ) {
-      words_needed = 2;
-   } else {
-      words_needed = 1;
-   }
-
-   ALLOC_PRIM( SIZEOF_StgArrWords + WDS(words_needed),
-              NO_PTRS, word64ToIntegerzh_fast );
-
-   p = Hp - SIZEOF_StgArrWords - WDS(words_needed) + WDS(1);
-   SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
-   StgArrWords_words(p) = words_needed;
-
-   if ( hi != 0 ) { 
-     s = 2;
-     Hp(-1) = lo;
-     Hp(0)  = hi;
-   } else {
-      if ( lo != 0 ) {
-        s = 1;
-        Hp(0) = lo;
-     } else /* val==0 */  {
-      s = 0;
-     }
-  }
-
-   /* returns (# size  :: Int#, 
-                data  :: ByteArray# #)
-   */
-   RET_NP(s,p);
-}
-
-
-
-#endif /* SUPPORT_LONG_LONGS */
-
-/* ToDo: this is shockingly inefficient */
-
-#ifndef THREADED_RTS
-section "bss" {
-  mp_tmp1:
-    bits8 [SIZEOF_MP_INT];
-}
-
-section "bss" {
-  mp_tmp2:
-    bits8 [SIZEOF_MP_INT];
-}
-
-section "bss" {
-  mp_result1:
-    bits8 [SIZEOF_MP_INT];
-}
-
-section "bss" {
-  mp_result2:
-    bits8 [SIZEOF_MP_INT];
-}
-#endif
-
-#ifdef THREADED_RTS
-#define FETCH_MP_TEMP(X) \
-W_ X; \
-X = BaseReg + (OFFSET_StgRegTable_r ## X);
-#else
-#define FETCH_MP_TEMP(X) /* Nothing */
-#endif
-
-#define GMP_TAKE2_RET1(name,mp_fun)                                     \
-name                                                                    \
-{                                                                       \
-  CInt s1, s2;                                                          \
-  W_ d1, d2;                                                            \
-  FETCH_MP_TEMP(mp_tmp1);                                               \
-  FETCH_MP_TEMP(mp_tmp2);                                               \
-  FETCH_MP_TEMP(mp_result1)                                             \
-  FETCH_MP_TEMP(mp_result2);                                            \
-                                                                        \
-  /* call doYouWantToGC() */                                            \
-  MAYBE_GC(R2_PTR & R4_PTR, name);                                      \
-                                                                        \
-  s1 = W_TO_INT(R1);                                                    \
-  d1 = R2;                                                              \
-  s2 = W_TO_INT(R3);                                                    \
-  d2 = R4;                                                              \
-                                                                        \
-  MP_INT__mp_alloc(mp_tmp1) = W_TO_INT(StgArrWords_words(d1));          \
-  MP_INT__mp_size(mp_tmp1)  = (s1);                                     \
-  MP_INT__mp_d(mp_tmp1)            = BYTE_ARR_CTS(d1);                         \
-  MP_INT__mp_alloc(mp_tmp2) = W_TO_INT(StgArrWords_words(d2));          \
-  MP_INT__mp_size(mp_tmp2)  = (s2);                                     \
-  MP_INT__mp_d(mp_tmp2)            = BYTE_ARR_CTS(d2);                         \
-                                                                        \
-  foreign "C" __gmpz_init(mp_result1 "ptr") [];                            \
-                                                                        \
-  /* Perform the operation */                                           \
-  foreign "C" mp_fun(mp_result1 "ptr",mp_tmp1  "ptr",mp_tmp2  "ptr") []; \
-                                                                        \
-  RET_NP(TO_W_(MP_INT__mp_size(mp_result1)),                            \
-         MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords);                \
-}
-
-#define GMP_TAKE1_RET1(name,mp_fun)                                     \
-name                                                                    \
-{                                                                       \
-  CInt s1;                                                              \
-  W_ d1;                                                                \
-  FETCH_MP_TEMP(mp_tmp1);                                               \
-  FETCH_MP_TEMP(mp_result1)                                             \
-                                                                        \
-  /* call doYouWantToGC() */                                            \
-  MAYBE_GC(R2_PTR, name);                                               \
-                                                                        \
-  d1 = R2;                                                              \
-  s1 = W_TO_INT(R1);                                                    \
-                                                                        \
-  MP_INT__mp_alloc(mp_tmp1)    = W_TO_INT(StgArrWords_words(d1));      \
-  MP_INT__mp_size(mp_tmp1)     = (s1);                                 \
-  MP_INT__mp_d(mp_tmp1)                = BYTE_ARR_CTS(d1);                     \
-                                                                        \
-  foreign "C" __gmpz_init(mp_result1 "ptr") [];                            \
-                                                                        \
-  /* Perform the operation */                                           \
-  foreign "C" mp_fun(mp_result1 "ptr",mp_tmp1 "ptr") [];                \
-                                                                        \
-  RET_NP(TO_W_(MP_INT__mp_size(mp_result1)),                            \
-         MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords);                \
-}
-
-#define GMP_TAKE2_RET2(name,mp_fun)                                                     \
-name                                                                                    \
-{                                                                                       \
-  CInt s1, s2;                                                                          \
-  W_ d1, d2;                                                                            \
-  FETCH_MP_TEMP(mp_tmp1);                                                               \
-  FETCH_MP_TEMP(mp_tmp2);                                                               \
-  FETCH_MP_TEMP(mp_result1)                                                             \
-  FETCH_MP_TEMP(mp_result2)                                                             \
-                                                                                        \
-  /* call doYouWantToGC() */                                                            \
-  MAYBE_GC(R2_PTR & R4_PTR, name);                                                      \
-                                                                                        \
-  s1 = W_TO_INT(R1);                                                                    \
-  d1 = R2;                                                                              \
-  s2 = W_TO_INT(R3);                                                                    \
-  d2 = R4;                                                                              \
-                                                                                        \
-  MP_INT__mp_alloc(mp_tmp1)    = W_TO_INT(StgArrWords_words(d1));                      \
-  MP_INT__mp_size(mp_tmp1)     = (s1);                                                 \
-  MP_INT__mp_d(mp_tmp1)                = BYTE_ARR_CTS(d1);                                     \
-  MP_INT__mp_alloc(mp_tmp2)    = W_TO_INT(StgArrWords_words(d2));                      \
-  MP_INT__mp_size(mp_tmp2)     = (s2);                                                 \
-  MP_INT__mp_d(mp_tmp2)                = BYTE_ARR_CTS(d2);                                     \
-                                                                                        \
-  foreign "C" __gmpz_init(mp_result1 "ptr") [];                                               \
-  foreign "C" __gmpz_init(mp_result2 "ptr") [];                                               \
-                                                                                        \
-  /* Perform the operation */                                                           \
-  foreign "C" mp_fun(mp_result1 "ptr",mp_result2 "ptr",mp_tmp1 "ptr",mp_tmp2 "ptr") [];    \
-                                                                                        \
-  RET_NPNP(TO_W_(MP_INT__mp_size(mp_result1)),                                          \
-           MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords,                               \
-          TO_W_(MP_INT__mp_size(mp_result2)),                                          \
-           MP_INT__mp_d(mp_result2) - SIZEOF_StgArrWords);                              \
-}
-
-GMP_TAKE2_RET1(plusIntegerzh_fast,     __gmpz_add)
-GMP_TAKE2_RET1(minusIntegerzh_fast,    __gmpz_sub)
-GMP_TAKE2_RET1(timesIntegerzh_fast,    __gmpz_mul)
-GMP_TAKE2_RET1(gcdIntegerzh_fast,      __gmpz_gcd)
-GMP_TAKE2_RET1(quotIntegerzh_fast,     __gmpz_tdiv_q)
-GMP_TAKE2_RET1(remIntegerzh_fast,      __gmpz_tdiv_r)
-GMP_TAKE2_RET1(divExactIntegerzh_fast, __gmpz_divexact)
-GMP_TAKE2_RET1(andIntegerzh_fast,      __gmpz_and)
-GMP_TAKE2_RET1(orIntegerzh_fast,       __gmpz_ior)
-GMP_TAKE2_RET1(xorIntegerzh_fast,      __gmpz_xor)
-GMP_TAKE1_RET1(complementIntegerzh_fast, __gmpz_com)
-
-GMP_TAKE2_RET2(quotRemIntegerzh_fast, __gmpz_tdiv_qr)
-GMP_TAKE2_RET2(divModIntegerzh_fast,  __gmpz_fdiv_qr)
-
-#ifndef THREADED_RTS
-section "bss" {
-  mp_tmp_w:  W_; // NB. mp_tmp_w is really an here mp_limb_t
-}
-#endif
-
-gcdIntzh_fast
-{
-    /* R1 = the first Int#; R2 = the second Int# */
-    W_ r; 
-    FETCH_MP_TEMP(mp_tmp_w);
-
-    W_[mp_tmp_w] = R1;
-    (r) = foreign "C" __gmpn_gcd_1(mp_tmp_w "ptr", 1, R2) [];
-
-    R1 = r;
-    /* Result parked in R1, return via info-pointer at TOS */
-    jump %ENTRY_CODE(Sp(0));
-}
-
-
-gcdIntegerIntzh_fast
-{
-    /* R1 = s1; R2 = d1; R3 = the int */
-    W_ s1;
-    (s1) = foreign "C" __gmpn_gcd_1( BYTE_ARR_CTS(R2) "ptr", R1, R3) [];
-    R1 = s1;
-    
-    /* Result parked in R1, return via info-pointer at TOS */
-    jump %ENTRY_CODE(Sp(0));
-}
-
-
-cmpIntegerIntzh_fast
-{
-    /* R1 = s1; R2 = d1; R3 = the int */
-    W_ usize, vsize, v_digit, u_digit;
-
-    usize = R1;
-    vsize = 0;
-    v_digit = R3;
-
-    // paraphrased from __gmpz_cmp_si() in the GMP sources
-    if (%gt(v_digit,0)) {
-       vsize = 1;
-    } else { 
-       if (%lt(v_digit,0)) {
-           vsize = -1;
-           v_digit = -v_digit;
-       }
-    }
-
-    if (usize != vsize) {
-       R1 = usize - vsize; 
-       jump %ENTRY_CODE(Sp(0));
-    }
-
-    if (usize == 0) {
-       R1 = 0; 
-       jump %ENTRY_CODE(Sp(0));
-    }
-
-    u_digit = W_[BYTE_ARR_CTS(R2)];
-
-    if (u_digit == v_digit) {
-       R1 = 0; 
-       jump %ENTRY_CODE(Sp(0));
-    }
-
-    if (%gtu(u_digit,v_digit)) { // NB. unsigned: these are mp_limb_t's
-       R1 = usize; 
-    } else {
-       R1 = -usize; 
-    }
-
-    jump %ENTRY_CODE(Sp(0));
-}
-
-cmpIntegerzh_fast
-{
-    /* R1 = s1; R2 = d1; R3 = s2; R4 = d2 */
-    W_ usize, vsize, size, up, vp;
-    CInt cmp;
-
-    // paraphrased from __gmpz_cmp() in the GMP sources
-    usize = R1;
-    vsize = R3;
-
-    if (usize != vsize) {
-       R1 = usize - vsize; 
-       jump %ENTRY_CODE(Sp(0));
-    }
-
-    if (usize == 0) {
-       R1 = 0; 
-       jump %ENTRY_CODE(Sp(0));
-    }
-
-    if (%lt(usize,0)) { // NB. not <, which is unsigned
-       size = -usize;
-    } else {
-       size = usize;
-    }
-
-    up = BYTE_ARR_CTS(R2);
-    vp = BYTE_ARR_CTS(R4);
-
-    (cmp) = foreign "C" __gmpn_cmp(up "ptr", vp "ptr", size) [];
-
-    if (cmp == 0 :: CInt) {
-       R1 = 0; 
-       jump %ENTRY_CODE(Sp(0));
-    }
-
-    if (%lt(cmp,0 :: CInt) == %lt(usize,0)) {
-       R1 = 1;
-    } else {
-       R1 = (-1); 
-    }
-    /* Result parked in R1, return via info-pointer at TOS */
-    jump %ENTRY_CODE(Sp(0));
-}
-
-integer2Intzh_fast
-{
-    /* R1 = s; R2 = d */
-    W_ r, s;
-
-    s = R1;
-    if (s == 0) {
-       r = 0;
-    } else {
-       r = W_[R2 + SIZEOF_StgArrWords];
-       if (%lt(s,0)) {
-           r = -r;
-       }
-    }
-    /* Result parked in R1, return via info-pointer at TOS */
-    R1 = r;
-    jump %ENTRY_CODE(Sp(0));
-}
-
-integer2Wordzh_fast
-{
-  /* R1 = s; R2 = d */
-  W_ r, s;
-
-  s = R1;
-  if (s == 0) {
-    r = 0;
-  } else {
-    r = W_[R2 + SIZEOF_StgArrWords];
-    if (%lt(s,0)) {
-       r = -r;
-    }
-  }
-  /* Result parked in R1, return via info-pointer at TOS */
-  R1 = r;
-  jump %ENTRY_CODE(Sp(0));
-}
-
-decodeFloatzh_fast
+stg_decodeFloatzuIntzh
 { 
     W_ p;
     F_ arg;
-    FETCH_MP_TEMP(mp_tmp1);
-    FETCH_MP_TEMP(mp_tmp_w);
-    
-    /* arguments: F1 = Float# */
-    arg = F1;
-    
-    ALLOC_PRIM( SIZEOF_StgArrWords + WDS(1), NO_PTRS, decodeFloatzh_fast );
-    
-    /* Be prepared to tell Lennart-coded __decodeFloat
-       where mantissa._mp_d can be put (it does not care about the rest) */
-    p = Hp - SIZEOF_StgArrWords;
-    SET_HDR(p,stg_ARR_WORDS_info,W_[CCCS]);
-    StgArrWords_words(p) = 1;
-    MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(p);
-    
-    /* Perform the operation */
-    foreign "C" __decodeFloat(mp_tmp1 "ptr",mp_tmp_w "ptr" ,arg) [];
-    
-    /* returns: (Int# (expn), Int#, ByteArray#) */
-    RET_NNP(W_[mp_tmp_w], TO_W_(MP_INT__mp_size(mp_tmp1)), p);
-}
+    W_ mp_tmp1;
+    W_ mp_tmp_w;
 
-decodeFloatzuIntzh_fast
-{ 
-    W_ p;
-    F_ arg;
-    FETCH_MP_TEMP(mp_tmp1);
-    FETCH_MP_TEMP(mp_tmp_w);
+    STK_CHK_GEN( WDS(2), NO_PTRS, stg_decodeFloatzuIntzh );
+
+    mp_tmp1  = Sp - WDS(1);
+    mp_tmp_w = Sp - WDS(2);
     
     /* arguments: F1 = Float# */
     arg = F1;
@@ -988,43 +496,21 @@ decodeFloatzuIntzh_fast
     RET_NN(W_[mp_tmp1], W_[mp_tmp_w]);
 }
 
-#define DOUBLE_MANTISSA_SIZE SIZEOF_DOUBLE
-#define ARR_SIZE (SIZEOF_StgArrWords + DOUBLE_MANTISSA_SIZE)
-
-decodeDoublezh_fast
+stg_decodeDoublezu2Intzh
 { 
     D_ arg;
     W_ p;
-    FETCH_MP_TEMP(mp_tmp1);
-    FETCH_MP_TEMP(mp_tmp_w);
-
-    /* arguments: D1 = Double# */
-    arg = D1;
+    W_ mp_tmp1;
+    W_ mp_tmp2;
+    W_ mp_result1;
+    W_ mp_result2;
 
-    ALLOC_PRIM( ARR_SIZE, NO_PTRS, decodeDoublezh_fast );
-    
-    /* Be prepared to tell Lennart-coded __decodeDouble
-       where mantissa.d can be put (it does not care about the rest) */
-    p = Hp - ARR_SIZE + WDS(1);
-    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
-    StgArrWords_words(p) = BYTES_TO_WDS(DOUBLE_MANTISSA_SIZE);
-    MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(p);
+    STK_CHK_GEN( WDS(4), NO_PTRS, stg_decodeDoublezu2Intzh );
 
-    /* Perform the operation */
-    foreign "C" __decodeDouble(mp_tmp1 "ptr", mp_tmp_w "ptr",arg) [];
-    
-    /* returns: (Int# (expn), Int#, ByteArray#) */
-    RET_NNP(W_[mp_tmp_w], TO_W_(MP_INT__mp_size(mp_tmp1)), p);
-}
-
-decodeDoublezu2Intzh_fast
-{ 
-    D_ arg;
-    W_ p;
-    FETCH_MP_TEMP(mp_tmp1);
-    FETCH_MP_TEMP(mp_tmp2);
-    FETCH_MP_TEMP(mp_result1);
-    FETCH_MP_TEMP(mp_result2);
+    mp_tmp1    = Sp - WDS(1);
+    mp_tmp2    = Sp - WDS(2);
+    mp_result1 = Sp - WDS(3);
+    mp_result2 = Sp - WDS(4);
 
     /* arguments: D1 = Double# */
     arg = D1;
@@ -1043,11 +529,11 @@ decodeDoublezu2Intzh_fast
  * Concurrency primitives
  * -------------------------------------------------------------------------- */
 
-forkzh_fast
+stg_forkzh
 {
   /* args: R1 = closure to spark */
 
-  MAYBE_GC(R1_PTR, forkzh_fast);
+  MAYBE_GC(R1_PTR, stg_forkzh);
 
   W_ closure;
   W_ threadid;
@@ -1058,23 +544,24 @@ forkzh_fast
                                closure "ptr") [];
 
   /* start blocked if the current thread is blocked */
-  StgTSO_flags(threadid) = 
-     StgTSO_flags(threadid) |  (StgTSO_flags(CurrentTSO) & 
-                                (TSO_BLOCKEX::I32 | TSO_INTERRUPTIBLE::I32));
+  StgTSO_flags(threadid) = %lobits16(
+     TO_W_(StgTSO_flags(threadid)) | 
+     TO_W_(StgTSO_flags(CurrentTSO)) & (TSO_BLOCKEX | TSO_INTERRUPTIBLE));
 
   foreign "C" scheduleThread(MyCapability() "ptr", threadid "ptr") [];
 
-  // switch at the earliest opportunity
+  // context switch soon, but not immediately: we don't want every
+  // forkIO to force a context-switch.
   Capability_context_switch(MyCapability()) = 1 :: CInt;
   
   RET_P(threadid);
 }
 
-forkOnzh_fast
+stg_forkOnzh
 {
   /* args: R1 = cpu, R2 = closure to spark */
 
-  MAYBE_GC(R2_PTR, forkOnzh_fast);
+  MAYBE_GC(R2_PTR, stg_forkOnzh);
 
   W_ cpu;
   W_ closure;
@@ -1087,30 +574,31 @@ forkOnzh_fast
                                closure "ptr") [];
 
   /* start blocked if the current thread is blocked */
-  StgTSO_flags(threadid) = 
-     StgTSO_flags(threadid) |  (StgTSO_flags(CurrentTSO) & 
-                                (TSO_BLOCKEX::I32 | TSO_INTERRUPTIBLE::I32));
+  StgTSO_flags(threadid) = %lobits16(
+     TO_W_(StgTSO_flags(threadid)) | 
+     TO_W_(StgTSO_flags(CurrentTSO)) & (TSO_BLOCKEX | TSO_INTERRUPTIBLE));
 
   foreign "C" scheduleThreadOn(MyCapability() "ptr", cpu, threadid "ptr") [];
 
-  // switch at the earliest opportunity
+  // context switch soon, but not immediately: we don't want every
+  // forkIO to force a context-switch.
   Capability_context_switch(MyCapability()) = 1 :: CInt;
   
   RET_P(threadid);
 }
 
-yieldzh_fast
+stg_yieldzh
 {
   jump stg_yield_noregs;
 }
 
-myThreadIdzh_fast
+stg_myThreadIdzh
 {
   /* no args. */
   RET_P(CurrentTSO);
 }
 
-labelThreadzh_fast
+stg_labelThreadzh
 {
   /* args: 
        R1 = ThreadId#
@@ -1121,7 +609,7 @@ labelThreadzh_fast
   jump %ENTRY_CODE(Sp(0));
 }
 
-isCurrentThreadBoundzh_fast
+stg_isCurrentThreadBoundzh
 {
   /* no args */
   W_ r;
@@ -1129,7 +617,7 @@ isCurrentThreadBoundzh_fast
   RET_N(r);
 }
 
-threadStatuszh_fast
+stg_threadStatuszh
 {
     /* args: R1 :: ThreadId# */
     W_ tso;
@@ -1181,7 +669,7 @@ INFO_TABLE_RET(stg_catch_retry_frame, CATCH_RETRY_FRAME,
 
    frame = Sp;
    trec = StgTSO_trec(CurrentTSO);
-   ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
+   outer  = StgTRecHeader_enclosing_trec(trec);
    (r) = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr") [];
    if (r != 0) {
      /* Succeeded (either first branch or second branch) */
@@ -1209,18 +697,20 @@ INFO_TABLE_RET(stg_atomically_frame, ATOMICALLY_FRAME,
 #if defined(PROFILING)
   W_ unused1, W_ unused2,
 #endif
-  P_ unused3, P_ unused4)
+  P_ code, P_ next_invariant_to_check, P_ result)
 {
   W_ frame, trec, valid, next_invariant, q, outer;
 
-  frame = Sp;
-  trec = StgTSO_trec(CurrentTSO);
-  ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
+  frame  = Sp;
+  trec   = StgTSO_trec(CurrentTSO);
+  result = R1;
+  outer  = StgTRecHeader_enclosing_trec(trec);
 
   if (outer == NO_TREC) {
     /* First time back at the atomically frame -- pick up invariants */
     ("ptr" q) = foreign "C" stmGetInvariantsToCheck(MyCapability() "ptr", trec "ptr") [];
     StgAtomicallyFrame_next_invariant_to_check(frame) = q;
+    StgAtomicallyFrame_result(frame) = result;
 
   } else {
     /* Second/subsequent time back at the atomically frame -- abort the
@@ -1254,6 +744,7 @@ INFO_TABLE_RET(stg_atomically_frame, ATOMICALLY_FRAME,
     if (valid != 0) {
       /* Transaction was valid: commit succeeded */
       StgTSO_trec(CurrentTSO) = NO_TREC;
+      R1 = StgAtomicallyFrame_result(frame);
       Sp = Sp + SIZEOF_StgAtomicallyFrame;
       jump %ENTRY_CODE(Sp(SP_OFF));
     } else {
@@ -1271,7 +762,7 @@ INFO_TABLE_RET(stg_atomically_waiting_frame, ATOMICALLY_FRAME,
 #if defined(PROFILING)
   W_ unused1, W_ unused2,
 #endif
-  P_ unused3, P_ unused4)
+  P_ code, P_ next_invariant_to_check, P_ result)
 {
   W_ frame, trec, valid;
 
@@ -1310,7 +801,7 @@ INFO_TABLE_RET(stg_catch_stm_frame, CATCH_STM_FRAME,
       W_ r, frame, trec, outer;
       frame = Sp;
       trec = StgTSO_trec(CurrentTSO);
-      ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
+      outer  = StgTRecHeader_enclosing_trec(trec);
       (r) = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr") [];
       if (r != 0) {
         /* Commit succeeded */
@@ -1330,24 +821,24 @@ INFO_TABLE_RET(stg_catch_stm_frame, CATCH_STM_FRAME,
 
 // Primop definition ------------------------------------------------------------
 
-atomicallyzh_fast
+stg_atomicallyzh
 {
   W_ frame;
   W_ old_trec;
   W_ new_trec;
   
   // stmStartTransaction may allocate
-  MAYBE_GC (R1_PTR, atomicallyzh_fast); 
+  MAYBE_GC (R1_PTR, stg_atomicallyzh); 
 
   /* Args: R1 = m :: STM a */
-  STK_CHK_GEN(SIZEOF_StgAtomicallyFrame + WDS(1), R1_PTR, atomicallyzh_fast);
+  STK_CHK_GEN(SIZEOF_StgAtomicallyFrame + WDS(1), R1_PTR, stg_atomicallyzh);
 
   old_trec = StgTSO_trec(CurrentTSO);
 
   /* Nested transactions are not allowed; raise an exception */
   if (old_trec != NO_TREC) {
      R1 = base_ControlziExceptionziBase_nestedAtomically_closure;
-     jump raisezh_fast;
+     jump stg_raisezh;
   }
 
   /* Set up the atomically frame */
@@ -1356,6 +847,7 @@ atomicallyzh_fast
 
   SET_HDR(frame,stg_atomically_frame_info, W_[CCCS]);
   StgAtomicallyFrame_code(frame) = R1;
+  StgAtomicallyFrame_result(frame) = NO_TREC;
   StgAtomicallyFrame_next_invariant_to_check(frame) = END_INVARIANT_CHECK_QUEUE;
 
   /* Start the memory transcation */
@@ -1367,13 +859,13 @@ atomicallyzh_fast
 }
 
 
-catchSTMzh_fast
+stg_catchSTMzh
 {
   W_ frame;
   
   /* Args: R1 :: STM a */
   /* Args: R2 :: Exception -> STM a */
-  STK_CHK_GEN(SIZEOF_StgCatchSTMFrame + WDS(1), R1_PTR & R2_PTR, catchSTMzh_fast);
+  STK_CHK_GEN(SIZEOF_StgCatchSTMFrame + WDS(1), R1_PTR & R2_PTR, stg_catchSTMzh);
 
   /* Set up the catch frame */
   Sp = Sp - SIZEOF_StgCatchSTMFrame;
@@ -1395,18 +887,18 @@ catchSTMzh_fast
 }
 
 
-catchRetryzh_fast
+stg_catchRetryzh
 {
   W_ frame;
   W_ new_trec;
   W_ trec;
 
   // stmStartTransaction may allocate
-  MAYBE_GC (R1_PTR & R2_PTR, catchRetryzh_fast); 
+  MAYBE_GC (R1_PTR & R2_PTR, stg_catchRetryzh); 
 
   /* Args: R1 :: STM a */
   /* Args: R2 :: STM a */
-  STK_CHK_GEN(SIZEOF_StgCatchRetryFrame + WDS(1), R1_PTR & R2_PTR, catchRetryzh_fast);
+  STK_CHK_GEN(SIZEOF_StgCatchRetryFrame + WDS(1), R1_PTR & R2_PTR, stg_catchRetryzh);
 
   /* Start a nested transaction within which to run the first code */
   trec = StgTSO_trec(CurrentTSO);
@@ -1427,7 +919,7 @@ catchRetryzh_fast
 }
 
 
-retryzh_fast
+stg_retryzh
 {
   W_ frame_type;
   W_ frame;
@@ -1435,7 +927,7 @@ retryzh_fast
   W_ outer;
   W_ r;
 
-  MAYBE_GC (NO_PTRS, retryzh_fast); // STM operations may allocate
+  MAYBE_GC (NO_PTRS, stg_retryzh); // STM operations may allocate
 
   // Find the enclosing ATOMICALLY_FRAME or CATCH_RETRY_FRAME
 retry_pop_stack:
@@ -1444,7 +936,7 @@ retry_pop_stack:
   Sp = StgTSO_sp(CurrentTSO);
   frame = Sp;
   trec = StgTSO_trec(CurrentTSO);
-  ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
+  outer  = StgTRecHeader_enclosing_trec(trec);
 
   if (frame_type == CATCH_RETRY_FRAME) {
     // The retry reaches a CATCH_RETRY_FRAME before the atomic frame
@@ -1477,7 +969,7 @@ retry_pop_stack:
     foreign "C" stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr") [];
     trec = outer;
     StgTSO_trec(CurrentTSO) = trec;
-    ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
+    outer  = StgTRecHeader_enclosing_trec(trec);
   }
   ASSERT(outer == NO_TREC);
 
@@ -1500,12 +992,12 @@ retry_pop_stack:
 }
 
 
-checkzh_fast
+stg_checkzh
 {
   W_ trec, closure;
 
   /* Args: R1 = invariant closure */
-  MAYBE_GC (R1_PTR, checkzh_fast); 
+  MAYBE_GC (R1_PTR, stg_checkzh); 
 
   trec = StgTSO_trec(CurrentTSO);
   closure = R1;
@@ -1517,21 +1009,21 @@ checkzh_fast
 }
 
 
-newTVarzh_fast
+stg_newTVarzh
 {
   W_ tv;
   W_ new_value;
 
   /* Args: R1 = initialisation value */
 
-  MAYBE_GC (R1_PTR, newTVarzh_fast); 
+  MAYBE_GC (R1_PTR, stg_newTVarzh); 
   new_value = R1;
   ("ptr" tv) = foreign "C" stmNewTVar(MyCapability() "ptr", new_value "ptr") [];
   RET_P(tv);
 }
 
 
-readTVarzh_fast
+stg_readTVarzh
 {
   W_ trec;
   W_ tvar;
@@ -1539,7 +1031,7 @@ readTVarzh_fast
 
   /* Args: R1 = TVar closure */
 
-  MAYBE_GC (R1_PTR, readTVarzh_fast); // Call to stmReadTVar may allocate
+  MAYBE_GC (R1_PTR, stg_readTVarzh); // Call to stmReadTVar may allocate
   trec = StgTSO_trec(CurrentTSO);
   tvar = R1;
   ("ptr" result) = foreign "C" stmReadTVar(MyCapability() "ptr", trec "ptr", tvar "ptr") [];
@@ -1547,7 +1039,7 @@ readTVarzh_fast
   RET_P(result);
 }
 
-readTVarIOzh_fast
+stg_readTVarIOzh
 {
     W_ result;
 
@@ -1559,7 +1051,7 @@ again:
     RET_P(result);
 }
 
-writeTVarzh_fast
+stg_writeTVarzh
 {
   W_ trec;
   W_ tvar;
@@ -1568,7 +1060,7 @@ writeTVarzh_fast
   /* Args: R1 = TVar closure */
   /*       R2 = New value    */
 
-  MAYBE_GC (R1_PTR & R2_PTR, writeTVarzh_fast); // Call to stmWriteTVar may allocate
+  MAYBE_GC (R1_PTR & R2_PTR, stg_writeTVarzh); // Call to stmWriteTVar may allocate
   trec = StgTSO_trec(CurrentTSO);
   tvar = R1;
   new_value = R2;
@@ -1610,7 +1102,7 @@ writeTVarzh_fast
  *
  * -------------------------------------------------------------------------- */
 
-isEmptyMVarzh_fast
+stg_isEmptyMVarzh
 {
     /* args: R1 = MVar closure */
 
@@ -1621,12 +1113,12 @@ isEmptyMVarzh_fast
     }
 }
 
-newMVarzh_fast
+stg_newMVarzh
 {
     /* args: none */
     W_ mvar;
 
-    ALLOC_PRIM ( SIZEOF_StgMVar, NO_PTRS, newMVarzh_fast );
+    ALLOC_PRIM ( SIZEOF_StgMVar, NO_PTRS, stg_newMVarzh );
   
     mvar = Hp - SIZEOF_StgMVar + WDS(1);
     SET_HDR(mvar,stg_MVAR_DIRTY_info,W_[CCCS]);
@@ -1646,7 +1138,7 @@ newMVarzh_fast
     StgTSO_sp(tso) = StgTSO_sp(tso) + WDS(3);  \
     lval = W_[StgTSO_sp(tso) - WDS(1)];
 
-takeMVarzh_fast
+stg_takeMVarzh
 {
     W_ mvar, val, info, tso;
 
@@ -1700,7 +1192,7 @@ takeMVarzh_fast
       tso = StgMVar_head(mvar);
       PerformPut(tso,StgMVar_value(mvar));
 
-      if (TO_W_(StgTSO_flags(tso)) & TSO_DIRTY == 0) {
+      if (TO_W_(StgTSO_dirty(tso)) == 0) {
           foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
       }
 
@@ -1712,11 +1204,7 @@ takeMVarzh_fast
          StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
       }
 
-#if defined(THREADED_RTS)
       unlockClosure(mvar, stg_MVAR_DIRTY_info);
-#else
-      SET_INFO(mvar,stg_MVAR_DIRTY_info);
-#endif
       RET_P(val);
   } 
   else
@@ -1724,18 +1212,14 @@ takeMVarzh_fast
       /* No further putMVars, MVar is now empty */
       StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
  
-#if defined(THREADED_RTS)
       unlockClosure(mvar, stg_MVAR_DIRTY_info);
-#else
-      SET_INFO(mvar,stg_MVAR_DIRTY_info);
-#endif
 
       RET_P(val);
   }
 }
 
 
-tryTakeMVarzh_fast
+stg_tryTakeMVarzh
 {
     W_ mvar, val, info, tso;
 
@@ -1776,7 +1260,7 @@ tryTakeMVarzh_fast
        /* actually perform the putMVar for the thread that we just woke up */
        tso = StgMVar_head(mvar);
        PerformPut(tso,StgMVar_value(mvar));
-        if (TO_W_(StgTSO_flags(tso)) & TSO_DIRTY == 0) {
+        if (TO_W_(StgTSO_dirty(tso)) == 0) {
             foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
         }
 
@@ -1787,28 +1271,20 @@ tryTakeMVarzh_fast
        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
        }
-#if defined(THREADED_RTS)
         unlockClosure(mvar, stg_MVAR_DIRTY_info);
-#else
-        SET_INFO(mvar,stg_MVAR_DIRTY_info);
-#endif
     }
     else 
     {
        /* No further putMVars, MVar is now empty */
        StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
-#if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
-#else
-       SET_INFO(mvar,stg_MVAR_DIRTY_info);
-#endif
     }
     
     RET_NP(1, val);
 }
 
 
-putMVarzh_fast
+stg_putMVarzh
 {
     W_ mvar, val, info, tso;
 
@@ -1856,7 +1332,7 @@ putMVarzh_fast
        /* actually perform the takeMVar */
        tso = StgMVar_head(mvar);
        PerformTake(tso, val);
-        if (TO_W_(StgTSO_flags(tso)) & TSO_DIRTY == 0) {
+        if (TO_W_(StgTSO_dirty(tso)) == 0) {
             foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
         }
       
@@ -1868,11 +1344,7 @@ putMVarzh_fast
            StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
        }
 
-#if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
-#else
-        SET_INFO(mvar,stg_MVAR_DIRTY_info);
-#endif
        jump %ENTRY_CODE(Sp(0));
     }
     else
@@ -1880,11 +1352,7 @@ putMVarzh_fast
        /* No further takes, the MVar is now full. */
        StgMVar_value(mvar) = val;
 
-#if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
-#else
-       SET_INFO(mvar,stg_MVAR_DIRTY_info);
-#endif
        jump %ENTRY_CODE(Sp(0));
     }
     
@@ -1892,7 +1360,7 @@ putMVarzh_fast
 }
 
 
-tryPutMVarzh_fast
+stg_tryPutMVarzh
 {
     W_ mvar, info, tso;
 
@@ -1925,7 +1393,7 @@ tryPutMVarzh_fast
        /* actually perform the takeMVar */
        tso = StgMVar_head(mvar);
        PerformTake(tso, R2);
-        if (TO_W_(StgTSO_flags(tso)) & TSO_DIRTY == 0) {
+        if (TO_W_(StgTSO_dirty(tso)) == 0) {
             foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
         }
       
@@ -1937,22 +1405,14 @@ tryPutMVarzh_fast
            StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
        }
 
-#if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
-#else
-        SET_INFO(mvar,stg_MVAR_DIRTY_info);
-#endif
     }
     else
     {
        /* No further takes, the MVar is now full. */
        StgMVar_value(mvar) = R2;
 
-#if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
-#else
-       SET_INFO(mvar,stg_MVAR_DIRTY_info);
-#endif
     }
     
     RET_N(1);
@@ -1964,11 +1424,11 @@ tryPutMVarzh_fast
    Stable pointer primitives
    -------------------------------------------------------------------------  */
 
-makeStableNamezh_fast
+stg_makeStableNamezh
 {
     W_ index, sn_obj;
 
-    ALLOC_PRIM( SIZEOF_StgStableName, R1_PTR, makeStableNamezh_fast );
+    ALLOC_PRIM( SIZEOF_StgStableName, R1_PTR, stg_makeStableNamezh );
   
     (index) = foreign "C" lookupStableName(R1 "ptr") [];
 
@@ -1988,16 +1448,16 @@ makeStableNamezh_fast
 }
 
 
-makeStablePtrzh_fast
+stg_makeStablePtrzh
 {
     /* Args: R1 = a */
     W_ sp;
-    MAYBE_GC(R1_PTR, makeStablePtrzh_fast);
+    MAYBE_GC(R1_PTR, stg_makeStablePtrzh);
     ("ptr" sp) = foreign "C" getStablePtr(R1 "ptr") [];
     RET_N(sp);
 }
 
-deRefStablePtrzh_fast
+stg_deRefStablePtrzh
 {
     /* Args: R1 = the stable ptr */
     W_ r, sp;
@@ -2010,7 +1470,7 @@ deRefStablePtrzh_fast
    Bytecode object primitives
    -------------------------------------------------------------------------  */
 
-newBCOzh_fast
+stg_newBCOzh
 {
     /* R1 = instrs
        R2 = literals
@@ -2025,7 +1485,7 @@ newBCOzh_fast
     words = BYTES_TO_WDS(SIZEOF_StgBCO) + StgArrWords_words(bitmap_arr);
     bytes = WDS(words);
 
-    ALLOC_PRIM( bytes, R1_PTR&R2_PTR&R3_PTR&R5_PTR, newBCOzh_fast );
+    ALLOC_PRIM( bytes, R1_PTR&R2_PTR&R3_PTR&R5_PTR, stg_newBCOzh );
 
     bco = Hp - bytes + WDS(1);
     SET_HDR(bco, stg_BCO_info, W_[CCCS]);
@@ -2050,7 +1510,7 @@ for:
 }
 
 
-mkApUpd0zh_fast
+stg_mkApUpd0zh
 {
     // R1 = the BCO# for the AP
     // 
@@ -2062,7 +1522,7 @@ mkApUpd0zh_fast
     ASSERT(%INFO_TYPE(%GET_STD_INFO(R1)) == HALF_W_(BCO) &&
           StgBCO_arity(R1) == HALF_W_(0));
 
-    HP_CHK_GEN_TICKY(SIZEOF_StgAP, R1_PTR, mkApUpd0zh_fast);
+    HP_CHK_GEN_TICKY(SIZEOF_StgAP, R1_PTR, stg_mkApUpd0zh);
     TICK_ALLOC_UP_THK(0, 0);
     CCCS_ALLOC(SIZEOF_StgAP);
 
@@ -2075,7 +1535,7 @@ mkApUpd0zh_fast
     RET_P(ap);
 }
 
-unpackClosurezh_fast
+stg_unpackClosurezh
 {
 /* args: R1 = closure to analyze */
 // TODO: Consider the absence of ptrs or nonptrs as a special case ?
@@ -2105,11 +1565,12 @@ unpackClosurezh_fast
     }}
 out:
 
-    W_ ptrs_arr_sz, nptrs_arr_sz;
+    W_ ptrs_arr_sz, ptrs_arr_cards, nptrs_arr_sz;
     nptrs_arr_sz = SIZEOF_StgArrWords   + WDS(nptrs);
-    ptrs_arr_sz  = SIZEOF_StgMutArrPtrs + WDS(ptrs);
+    ptrs_arr_cards = mutArrPtrsCardWords(ptrs);
+    ptrs_arr_sz  = SIZEOF_StgMutArrPtrs + WDS(ptrs) + WDS(ptrs_arr_cards);
 
-    ALLOC_PRIM (ptrs_arr_sz + nptrs_arr_sz, R1_PTR, unpackClosurezh_fast);
+    ALLOC_PRIM (ptrs_arr_sz + nptrs_arr_sz, R1_PTR, stg_unpackClosurezh);
 
     W_ clos;
     clos = UNTAG(R1);
@@ -2119,6 +1580,8 @@ out:
 
     SET_HDR(ptrs_arr, stg_MUT_ARR_PTRS_FROZEN_info, W_[CCCS]);
     StgMutArrPtrs_ptrs(ptrs_arr) = ptrs;
+    StgMutArrPtrs_size(ptrs_arr) = ptrs + ptrs_arr_cards;
+
     p = 0;
 for:
     if(p < ptrs) {
@@ -2126,6 +1589,9 @@ for:
         p = p + 1;
         goto for;
     }
+    /* We can leave the card table uninitialised, since the array is
+       allocated in the nursery.  The GC will fill it in if/when the array
+       is promoted. */
     
     SET_HDR(nptrs_arr, stg_ARR_WORDS_info, W_[CCCS]);
     StgArrWords_words(nptrs_arr) = nptrs;
@@ -2155,7 +1621,7 @@ for2:
     }                                                  \
     W_[blocked_queue_tl] = tso;
 
-waitReadzh_fast
+stg_waitReadzh
 {
     /* args: R1 */
 #ifdef THREADED_RTS
@@ -2172,7 +1638,7 @@ waitReadzh_fast
 #endif
 }
 
-waitWritezh_fast
+stg_waitWritezh
 {
     /* args: R1 */
 #ifdef THREADED_RTS
@@ -2190,8 +1656,8 @@ waitWritezh_fast
 }
 
 
-STRING(stg_delayzh_malloc_str, "delayzh_fast")
-delayzh_fast
+STRING(stg_delayzh_malloc_str, "stg_delayzh")
+stg_delayzh
 {
 #ifdef mingw32_HOST_OS
     W_ ares;
@@ -2264,8 +1730,8 @@ while:
 
 
 #ifdef mingw32_HOST_OS
-STRING(stg_asyncReadzh_malloc_str, "asyncReadzh_fast")
-asyncReadzh_fast
+STRING(stg_asyncReadzh_malloc_str, "stg_asyncReadzh")
+stg_asyncReadzh
 {
     W_ ares;
     CInt reqID;
@@ -2292,8 +1758,8 @@ asyncReadzh_fast
 #endif
 }
 
-STRING(stg_asyncWritezh_malloc_str, "asyncWritezh_fast")
-asyncWritezh_fast
+STRING(stg_asyncWritezh_malloc_str, "stg_asyncWritezh")
+stg_asyncWritezh
 {
     W_ ares;
     CInt reqID;
@@ -2320,8 +1786,8 @@ asyncWritezh_fast
 #endif
 }
 
-STRING(stg_asyncDoProczh_malloc_str, "asyncDoProczh_fast")
-asyncDoProczh_fast
+STRING(stg_asyncDoProczh_malloc_str, "stg_asyncDoProczh")
+stg_asyncDoProczh
 {
     W_ ares;
     CInt reqID;
@@ -2349,12 +1815,71 @@ asyncDoProczh_fast
 }
 #endif
 
-// noDuplicate# tries to ensure that none of the thunks under
-// evaluation by the current thread are also under evaluation by
-// another thread.  It relies on *both* threads doing noDuplicate#;
-// the second one will get blocked if they are duplicating some work.
-noDuplicatezh_fast
+/* -----------------------------------------------------------------------------
+ * noDuplicate#
+ *
+ * noDuplicate# tries to ensure that none of the thunks under
+ * evaluation by the current thread are also under evaluation by
+ * another thread.  It relies on *both* threads doing noDuplicate#;
+ * the second one will get blocked if they are duplicating some work.
+ *
+ * The idea is that noDuplicate# is used within unsafePerformIO to
+ * ensure that the IO operation is performed at most once.
+ * noDuplicate# calls threadPaused which acquires an exclusive lock on
+ * all the thunks currently under evaluation by the current thread.
+ *
+ * Consider the following scenario.  There is a thunk A, whose
+ * evaluation requires evaluating thunk B, where thunk B is an
+ * unsafePerformIO.  Two threads, 1 and 2, bother enter A.  Thread 2
+ * is pre-empted before it enters B, and claims A by blackholing it
+ * (in threadPaused).  Thread 1 now enters B, and calls noDuplicate#.
+ *
+ *      thread 1                      thread 2
+ *   +-----------+                 +---------------+
+ *   |    -------+-----> A <-------+-------        |
+ *   |  update   |   BLACKHOLE     | marked_update |
+ *   +-----------+                 +---------------+
+ *   |           |                 |               | 
+ *        ...                             ...
+ *   |           |                 +---------------+
+ *   +-----------+
+ *   |     ------+-----> B
+ *   |  update   |   BLACKHOLE
+ *   +-----------+
+ *
+ * At this point: A is a blackhole, owned by thread 2.  noDuplicate#
+ * calls threadPaused, which walks up the stack and
+ *  - claims B on behalf of thread 1
+ *  - then it reaches the update frame for A, which it sees is already
+ *    a BLACKHOLE and is therefore owned by another thread.  Since
+ *    thread 1 is duplicating work, the computation up to the update
+ *    frame for A is suspended, including thunk B.
+ *  - thunk B, which is an unsafePerformIO, has now been reverted to
+ *    an AP_STACK which could be duplicated - BAD!
+ *  - The solution is as follows: before calling threadPaused, we
+ *    leave a frame on the stack (stg_noDuplicate_info) that will call
+ *    noDuplicate# again if the current computation is suspended and
+ *    restarted.
+ *
+ * See the test program in concurrent/prog003 for a way to demonstrate
+ * this.  It needs to be run with +RTS -N3 or greater, and the bug
+ * only manifests occasionally (once very 10 runs or so).
+ * -------------------------------------------------------------------------- */
+
+INFO_TABLE_RET(stg_noDuplicate, RET_SMALL)
 {
+    Sp_adj(1);
+    jump stg_noDuplicatezh;
+}
+
+stg_noDuplicatezh
+{
+    STK_CHK_GEN( WDS(1), NO_PTRS, stg_noDuplicatezh );
+    // leave noDuplicate frame in case the current
+    // computation is suspended and restarted (see above).
+    Sp_adj(-1);
+    Sp(0) = stg_noDuplicate_info;
+
     SAVE_THREAD_STATE();
     ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
     foreign "C" threadPaused (MyCapability() "ptr", CurrentTSO "ptr") [];
@@ -2364,11 +1889,19 @@ noDuplicatezh_fast
     } else {
         LOAD_THREAD_STATE();
         ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
+        // remove the stg_noDuplicate frame if it is still there.
+        if (Sp(0) == stg_noDuplicate_info) {
+            Sp_adj(1);
+        }
         jump %ENTRY_CODE(Sp(0));
     }
 }
 
-getApStackValzh_fast
+/* -----------------------------------------------------------------------------
+   Misc. primitives
+   -------------------------------------------------------------------------- */
+
+stg_getApStackValzh
 {
    W_ ap_stack, offset, val, ok;
 
@@ -2386,14 +1919,10 @@ getApStackValzh_fast
    RET_NP(ok,val);
 }
 
-/* -----------------------------------------------------------------------------
-   Misc. primitives
-   -------------------------------------------------------------------------- */
-
 // Write the cost center stack of the first argument on stderr; return
 // the second.  Possibly only makes sense for already evaluated
 // things?
-traceCcszh_fast
+stg_traceCcszh
 {
     W_ ccs;
 
@@ -2406,7 +1935,7 @@ traceCcszh_fast
     ENTER();
 }
 
-getSparkzh_fast
+stg_getSparkzh
 {
    W_ spark;
 
@@ -2421,3 +1950,28 @@ getSparkzh_fast
    }
 #endif
 }
+
+stg_traceEventzh
+{
+   W_ msg;
+   msg = R1;
+
+#if defined(TRACING) || defined(DEBUG)
+
+   foreign "C" traceUserMsg(MyCapability() "ptr", msg "ptr") [];
+
+#elif defined(DTRACE)
+
+   W_ enabled;
+
+   // We should go through the macro HASKELLEVENT_USER_MSG_ENABLED from
+   // RtsProbes.h, but that header file includes unistd.h, which doesn't
+   // work in Cmm
+   (enabled) = foreign "C" __dtrace_isenabled$HaskellEvent$user__msg$v1() [];
+   if (enabled != 0) {
+     foreign "C" dtraceUserMsgWrapper(MyCapability() "ptr", msg "ptr") [];
+   }
+
+#endif
+   jump %ENTRY_CODE(Sp(0));
+}