New syntax for GADT-style record declarations, and associated refactoring
[ghc-hetmet.git] / rts / PrimOps.cmm
index a6e221b..9efc9f1 100644 (file)
 #include "Cmm.h"
 
 #ifdef __PIC__
-#ifndef mingw32_HOST_OS
-import __gmpz_init;
-import __gmpz_add;
-import __gmpz_sub;
-import __gmpz_mul;
-import __gmpz_gcd;
-import __gmpn_gcd_1;
-import __gmpn_cmp;
-import __gmpz_tdiv_q;
-import __gmpz_tdiv_r;
-import __gmpz_tdiv_qr;
-import __gmpz_fdiv_qr;
-import __gmpz_divexact;
-import __gmpz_and;
-import __gmpz_xor;
-import __gmpz_ior;
-import __gmpz_com;
-#endif
 import pthread_mutex_lock;
 import pthread_mutex_unlock;
 #endif
@@ -83,32 +65,65 @@ newByteArrayzh_fast
     RET_P(p);
 }
 
+#define BA_ALIGN 16
+#define BA_MASK  (BA_ALIGN-1)
+
 newPinnedByteArrayzh_fast
 {
-    W_ words, payload_words, n, p;
+    W_ words, bytes, payload_words, p;
 
     MAYBE_GC(NO_PTRS,newPinnedByteArrayzh_fast);
-    n = R1;
-    payload_words = ROUNDUP_BYTES_TO_WDS(n);
+    bytes = R1;
+    /* payload_words: the request rounded up to whole words; recorded in the array and reported to the profiler */
+    payload_words = ROUNDUP_BYTES_TO_WDS(bytes);
+    /* The raw allocation must also hold the StgArrWords header, so add
+       its size: */
+    bytes = bytes + SIZEOF_StgArrWords;
+    /* ...plus up to BA_ALIGN - 1 bytes of slack, since p may have to be
+       shifted forward by that much to align the payload: */
+    bytes = bytes + BA_ALIGN - 1;
+    /* Round the padded byte count up to whole words for allocatePinned(): */
+    words = ROUNDUP_BYTES_TO_WDS(bytes);
 
-    // We want an 8-byte aligned array.  allocatePinned() gives us
-    // 8-byte aligned memory by default, but we want to align the
-    // *goods* inside the ArrWords object, so we have to check the
-    // size of the ArrWords header and adjust our size accordingly.
-    words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
-    if ((SIZEOF_StgArrWords & 7) != 0) {
-       words = words + 1;
-    }
+    ("ptr" p) = foreign "C" allocatePinned(words) [];
+    TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
+
+    /* Advance p by the smallest offset that makes p + SIZEOF_StgArrWords
+       (the payload address) a multiple of BA_ALIGN: */
+    p = p + ((-p - SIZEOF_StgArrWords) & BA_MASK);
+
+    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+    StgArrWords_words(p) = payload_words;
+    RET_P(p);
+}
+
+newAlignedPinnedByteArrayzh_fast
+{
+    W_ words, bytes, payload_words, p, alignment;
+
+    MAYBE_GC(NO_PTRS,newAlignedPinnedByteArrayzh_fast);
+    bytes = R1;
+    alignment = R2;
+
+    /* payload_words is what we will tell the profiler we had to allocate */
+    payload_words = ROUNDUP_BYTES_TO_WDS(bytes);
+
+    /* When we actually allocate memory, we need to allow space for the
+       header: */
+    bytes = bytes + SIZEOF_StgArrWords;
+    /* And we want to align to <alignment> bytes, so we need to allow space
+       to shift up to <alignment - 1> bytes: */
+    bytes = bytes + alignment - 1;
+    /* Now we convert to a number of words: */
+    words = ROUNDUP_BYTES_TO_WDS(bytes);
 
     ("ptr" p) = foreign "C" allocatePinned(words) [];
     TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
 
-    // Again, if the ArrWords header isn't a multiple of 8 bytes, we
-    // have to push the object forward one word so that the goods
-    // fall on an 8-byte boundary.
-    if ((SIZEOF_StgArrWords & 7) != 0) {
-       p = p + WDS(1);
-    }
+    /* Now we need to move p forward so that the payload is aligned
+       to <alignment> bytes. Note that the mask below is only correct
+       if <alignment> is a power of 2, which is not checked here */
+    p = p + ((-p - SIZEOF_StgArrWords) & (alignment - 1));
 
     SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
     StgArrWords_words(p) = payload_words;
@@ -334,7 +349,7 @@ mkWeakForeignEnvzh_fast
   flag = R5;
   eptr = R6;
 
-  ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR & R3_PTR, mkWeakForeignEnvzh_fast );
+  ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR, mkWeakForeignEnvzh_fast );
 
   w = Hp - SIZEOF_StgWeak + WDS(1);
   SET_HDR(w, stg_WEAK_info, W_[CCCS]);
@@ -437,522 +452,20 @@ deRefWeakzh_fast
 }
 
 /* -----------------------------------------------------------------------------
-   Arbitrary-precision Integer operations.
-
-   There are some assumptions in this code that mp_limb_t == W_.  This is
-   the case for all the platforms that GHC supports, currently.
+   Floating point operations.
    -------------------------------------------------------------------------- */
 
-int2Integerzh_fast
-{
-   /* arguments: R1 = Int# */
-
-   W_ val, s, p;       /* to avoid aliasing */
-
-   val = R1;
-   ALLOC_PRIM( SIZEOF_StgArrWords + WDS(1), NO_PTRS, int2Integerzh_fast );
-
-   p = Hp - SIZEOF_StgArrWords;
-   SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
-   StgArrWords_words(p) = 1;
-
-   /* mpz_set_si is inlined here, makes things simpler */
-   if (%lt(val,0)) { 
-       s  = -1;
-       Hp(0) = -val;
-   } else { 
-     if (%gt(val,0)) {
-       s = 1;
-       Hp(0) = val;
-     } else {
-       s = 0;
-     }
-  }
-
-   /* returns (# size  :: Int#, 
-                data  :: ByteArray# 
-              #)
-   */
-   RET_NP(s,p);
-}
-
-word2Integerzh_fast
-{
-   /* arguments: R1 = Word# */
-
-   W_ val, s, p;       /* to avoid aliasing */
-
-   val = R1;
-
-   ALLOC_PRIM( SIZEOF_StgArrWords + WDS(1), NO_PTRS, word2Integerzh_fast);
-
-   p = Hp - SIZEOF_StgArrWords;
-   SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
-   StgArrWords_words(p) = 1;
-
-   if (val != 0) {
-       s = 1;
-       W_[Hp] = val;
-   } else {
-       s = 0;
-   }
-
-   /* returns (# size  :: Int#, 
-                data  :: ByteArray# #)
-   */
-   RET_NP(s,p);
-}
-
-
-/*
- * 'long long' primops for converting to/from Integers.
- */
-
-#ifdef SUPPORT_LONG_LONGS
-
-int64ToIntegerzh_fast
-{
-   /* arguments: L1 = Int64# */
-
-   L_ val;
-   W_ hi, lo, s, neg, words_needed, p;
-
-   val = L1;
-   neg = 0;
-
-   hi = TO_W_(val >> 32);
-   lo = TO_W_(val);
-
-   if ( hi == 0 || (hi == 0xFFFFFFFF && lo != 0) )  {
-       // minimum is one word
-       words_needed = 1;
-   } else { 
-       words_needed = 2;
-   }
-
-   ALLOC_PRIM( SIZEOF_StgArrWords + WDS(words_needed),
-              NO_PTRS, int64ToIntegerzh_fast );
-
-   p = Hp - SIZEOF_StgArrWords - WDS(words_needed) + WDS(1);
-   SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
-   StgArrWords_words(p) = words_needed;
-
-   if ( %lt(hi,0) ) {
-     neg = 1;
-     lo = -lo;
-     if(lo == 0) {
-       hi = -hi;
-     } else {
-       hi = -hi - 1;
-     }
-   }
-
-   if ( words_needed == 2 )  { 
-      s = 2;
-      Hp(-1) = lo;
-      Hp(0) = hi;
-   } else { 
-       if ( lo != 0 ) {
-          s = 1;
-          Hp(0) = lo;
-       } else /* val==0 */  {
-          s = 0;
-       }
-   }
-   if ( neg != 0 ) {
-       s = -s;
-   }
-
-   /* returns (# size  :: Int#, 
-                data  :: ByteArray# #)
-   */
-   RET_NP(s,p);
-}
-word64ToIntegerzh_fast
-{
-   /* arguments: L1 = Word64# */
-
-   L_ val;
-   W_ hi, lo, s, words_needed, p;
-
-   val = L1;
-   hi = TO_W_(val >> 32);
-   lo = TO_W_(val);
-
-   if ( hi != 0 ) {
-      words_needed = 2;
-   } else {
-      words_needed = 1;
-   }
-
-   ALLOC_PRIM( SIZEOF_StgArrWords + WDS(words_needed),
-              NO_PTRS, word64ToIntegerzh_fast );
-
-   p = Hp - SIZEOF_StgArrWords - WDS(words_needed) + WDS(1);
-   SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
-   StgArrWords_words(p) = words_needed;
-
-   if ( hi != 0 ) { 
-     s = 2;
-     Hp(-1) = lo;
-     Hp(0)  = hi;
-   } else {
-      if ( lo != 0 ) {
-        s = 1;
-        Hp(0) = lo;
-     } else /* val==0 */  {
-      s = 0;
-     }
-  }
-
-   /* returns (# size  :: Int#, 
-                data  :: ByteArray# #)
-   */
-   RET_NP(s,p);
-}
-
-
-
-#endif /* SUPPORT_LONG_LONGS */
-
-/* ToDo: this is shockingly inefficient */
-
-#ifndef THREADED_RTS
-section "bss" {
-  mp_tmp1:
-    bits8 [SIZEOF_MP_INT];
-}
-
-section "bss" {
-  mp_tmp2:
-    bits8 [SIZEOF_MP_INT];
-}
-
-section "bss" {
-  mp_result1:
-    bits8 [SIZEOF_MP_INT];
-}
-
-section "bss" {
-  mp_result2:
-    bits8 [SIZEOF_MP_INT];
-}
-#endif
-
-#ifdef THREADED_RTS
-#define FETCH_MP_TEMP(X) \
-W_ X; \
-X = BaseReg + (OFFSET_StgRegTable_r ## X);
-#else
-#define FETCH_MP_TEMP(X) /* Nothing */
-#endif
-
-#define GMP_TAKE2_RET1(name,mp_fun)                                     \
-name                                                                    \
-{                                                                       \
-  CInt s1, s2;                                                          \
-  W_ d1, d2;                                                            \
-  FETCH_MP_TEMP(mp_tmp1);                                               \
-  FETCH_MP_TEMP(mp_tmp2);                                               \
-  FETCH_MP_TEMP(mp_result1)                                             \
-  FETCH_MP_TEMP(mp_result2);                                            \
-                                                                        \
-  /* call doYouWantToGC() */                                            \
-  MAYBE_GC(R2_PTR & R4_PTR, name);                                      \
-                                                                        \
-  s1 = W_TO_INT(R1);                                                    \
-  d1 = R2;                                                              \
-  s2 = W_TO_INT(R3);                                                    \
-  d2 = R4;                                                              \
-                                                                        \
-  MP_INT__mp_alloc(mp_tmp1) = W_TO_INT(StgArrWords_words(d1));          \
-  MP_INT__mp_size(mp_tmp1)  = (s1);                                     \
-  MP_INT__mp_d(mp_tmp1)            = BYTE_ARR_CTS(d1);                         \
-  MP_INT__mp_alloc(mp_tmp2) = W_TO_INT(StgArrWords_words(d2));          \
-  MP_INT__mp_size(mp_tmp2)  = (s2);                                     \
-  MP_INT__mp_d(mp_tmp2)            = BYTE_ARR_CTS(d2);                         \
-                                                                        \
-  foreign "C" __gmpz_init(mp_result1 "ptr") [];                            \
-                                                                        \
-  /* Perform the operation */                                           \
-  foreign "C" mp_fun(mp_result1 "ptr",mp_tmp1  "ptr",mp_tmp2  "ptr") []; \
-                                                                        \
-  RET_NP(TO_W_(MP_INT__mp_size(mp_result1)),                            \
-         MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords);                \
-}
-
-#define GMP_TAKE1_RET1(name,mp_fun)                                     \
-name                                                                    \
-{                                                                       \
-  CInt s1;                                                              \
-  W_ d1;                                                                \
-  FETCH_MP_TEMP(mp_tmp1);                                               \
-  FETCH_MP_TEMP(mp_result1)                                             \
-                                                                        \
-  /* call doYouWantToGC() */                                            \
-  MAYBE_GC(R2_PTR, name);                                               \
-                                                                        \
-  d1 = R2;                                                              \
-  s1 = W_TO_INT(R1);                                                    \
-                                                                        \
-  MP_INT__mp_alloc(mp_tmp1)    = W_TO_INT(StgArrWords_words(d1));      \
-  MP_INT__mp_size(mp_tmp1)     = (s1);                                 \
-  MP_INT__mp_d(mp_tmp1)                = BYTE_ARR_CTS(d1);                     \
-                                                                        \
-  foreign "C" __gmpz_init(mp_result1 "ptr") [];                            \
-                                                                        \
-  /* Perform the operation */                                           \
-  foreign "C" mp_fun(mp_result1 "ptr",mp_tmp1 "ptr") [];                \
-                                                                        \
-  RET_NP(TO_W_(MP_INT__mp_size(mp_result1)),                            \
-         MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords);                \
-}
-
-#define GMP_TAKE2_RET2(name,mp_fun)                                                     \
-name                                                                                    \
-{                                                                                       \
-  CInt s1, s2;                                                                          \
-  W_ d1, d2;                                                                            \
-  FETCH_MP_TEMP(mp_tmp1);                                                               \
-  FETCH_MP_TEMP(mp_tmp2);                                                               \
-  FETCH_MP_TEMP(mp_result1)                                                             \
-  FETCH_MP_TEMP(mp_result2)                                                             \
-                                                                                        \
-  /* call doYouWantToGC() */                                                            \
-  MAYBE_GC(R2_PTR & R4_PTR, name);                                                      \
-                                                                                        \
-  s1 = W_TO_INT(R1);                                                                    \
-  d1 = R2;                                                                              \
-  s2 = W_TO_INT(R3);                                                                    \
-  d2 = R4;                                                                              \
-                                                                                        \
-  MP_INT__mp_alloc(mp_tmp1)    = W_TO_INT(StgArrWords_words(d1));                      \
-  MP_INT__mp_size(mp_tmp1)     = (s1);                                                 \
-  MP_INT__mp_d(mp_tmp1)                = BYTE_ARR_CTS(d1);                                     \
-  MP_INT__mp_alloc(mp_tmp2)    = W_TO_INT(StgArrWords_words(d2));                      \
-  MP_INT__mp_size(mp_tmp2)     = (s2);                                                 \
-  MP_INT__mp_d(mp_tmp2)                = BYTE_ARR_CTS(d2);                                     \
-                                                                                        \
-  foreign "C" __gmpz_init(mp_result1 "ptr") [];                                               \
-  foreign "C" __gmpz_init(mp_result2 "ptr") [];                                               \
-                                                                                        \
-  /* Perform the operation */                                                           \
-  foreign "C" mp_fun(mp_result1 "ptr",mp_result2 "ptr",mp_tmp1 "ptr",mp_tmp2 "ptr") [];    \
-                                                                                        \
-  RET_NPNP(TO_W_(MP_INT__mp_size(mp_result1)),                                          \
-           MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords,                               \
-          TO_W_(MP_INT__mp_size(mp_result2)),                                          \
-           MP_INT__mp_d(mp_result2) - SIZEOF_StgArrWords);                              \
-}
-
-GMP_TAKE2_RET1(plusIntegerzh_fast,     __gmpz_add)
-GMP_TAKE2_RET1(minusIntegerzh_fast,    __gmpz_sub)
-GMP_TAKE2_RET1(timesIntegerzh_fast,    __gmpz_mul)
-GMP_TAKE2_RET1(gcdIntegerzh_fast,      __gmpz_gcd)
-GMP_TAKE2_RET1(quotIntegerzh_fast,     __gmpz_tdiv_q)
-GMP_TAKE2_RET1(remIntegerzh_fast,      __gmpz_tdiv_r)
-GMP_TAKE2_RET1(divExactIntegerzh_fast, __gmpz_divexact)
-GMP_TAKE2_RET1(andIntegerzh_fast,      __gmpz_and)
-GMP_TAKE2_RET1(orIntegerzh_fast,       __gmpz_ior)
-GMP_TAKE2_RET1(xorIntegerzh_fast,      __gmpz_xor)
-GMP_TAKE1_RET1(complementIntegerzh_fast, __gmpz_com)
-
-GMP_TAKE2_RET2(quotRemIntegerzh_fast, __gmpz_tdiv_qr)
-GMP_TAKE2_RET2(divModIntegerzh_fast,  __gmpz_fdiv_qr)
-
-#ifndef THREADED_RTS
-section "bss" {
-  mp_tmp_w:  W_; // NB. mp_tmp_w is really an here mp_limb_t
-}
-#endif
-
-gcdIntzh_fast
-{
-    /* R1 = the first Int#; R2 = the second Int# */
-    W_ r; 
-    FETCH_MP_TEMP(mp_tmp_w);
-
-    W_[mp_tmp_w] = R1;
-    (r) = foreign "C" __gmpn_gcd_1(mp_tmp_w "ptr", 1, R2) [];
-
-    R1 = r;
-    /* Result parked in R1, return via info-pointer at TOS */
-    jump %ENTRY_CODE(Sp(0));
-}
-
-
-gcdIntegerIntzh_fast
-{
-    /* R1 = s1; R2 = d1; R3 = the int */
-    W_ s1;
-    (s1) = foreign "C" __gmpn_gcd_1( BYTE_ARR_CTS(R2) "ptr", R1, R3) [];
-    R1 = s1;
-    
-    /* Result parked in R1, return via info-pointer at TOS */
-    jump %ENTRY_CODE(Sp(0));
-}
-
-
-cmpIntegerIntzh_fast
-{
-    /* R1 = s1; R2 = d1; R3 = the int */
-    W_ usize, vsize, v_digit, u_digit;
-
-    usize = R1;
-    vsize = 0;
-    v_digit = R3;
-
-    // paraphrased from __gmpz_cmp_si() in the GMP sources
-    if (%gt(v_digit,0)) {
-       vsize = 1;
-    } else { 
-       if (%lt(v_digit,0)) {
-           vsize = -1;
-           v_digit = -v_digit;
-       }
-    }
-
-    if (usize != vsize) {
-       R1 = usize - vsize; 
-       jump %ENTRY_CODE(Sp(0));
-    }
-
-    if (usize == 0) {
-       R1 = 0; 
-       jump %ENTRY_CODE(Sp(0));
-    }
-
-    u_digit = W_[BYTE_ARR_CTS(R2)];
-
-    if (u_digit == v_digit) {
-       R1 = 0; 
-       jump %ENTRY_CODE(Sp(0));
-    }
-
-    if (%gtu(u_digit,v_digit)) { // NB. unsigned: these are mp_limb_t's
-       R1 = usize; 
-    } else {
-       R1 = -usize; 
-    }
-
-    jump %ENTRY_CODE(Sp(0));
-}
-
-cmpIntegerzh_fast
-{
-    /* R1 = s1; R2 = d1; R3 = s2; R4 = d2 */
-    W_ usize, vsize, size, up, vp;
-    CInt cmp;
-
-    // paraphrased from __gmpz_cmp() in the GMP sources
-    usize = R1;
-    vsize = R3;
-
-    if (usize != vsize) {
-       R1 = usize - vsize; 
-       jump %ENTRY_CODE(Sp(0));
-    }
-
-    if (usize == 0) {
-       R1 = 0; 
-       jump %ENTRY_CODE(Sp(0));
-    }
-
-    if (%lt(usize,0)) { // NB. not <, which is unsigned
-       size = -usize;
-    } else {
-       size = usize;
-    }
-
-    up = BYTE_ARR_CTS(R2);
-    vp = BYTE_ARR_CTS(R4);
-
-    (cmp) = foreign "C" __gmpn_cmp(up "ptr", vp "ptr", size) [];
-
-    if (cmp == 0 :: CInt) {
-       R1 = 0; 
-       jump %ENTRY_CODE(Sp(0));
-    }
-
-    if (%lt(cmp,0 :: CInt) == %lt(usize,0)) {
-       R1 = 1;
-    } else {
-       R1 = (-1); 
-    }
-    /* Result parked in R1, return via info-pointer at TOS */
-    jump %ENTRY_CODE(Sp(0));
-}
-
-integer2Intzh_fast
-{
-    /* R1 = s; R2 = d */
-    W_ r, s;
-
-    s = R1;
-    if (s == 0) {
-       r = 0;
-    } else {
-       r = W_[R2 + SIZEOF_StgArrWords];
-       if (%lt(s,0)) {
-           r = -r;
-       }
-    }
-    /* Result parked in R1, return via info-pointer at TOS */
-    R1 = r;
-    jump %ENTRY_CODE(Sp(0));
-}
-
-integer2Wordzh_fast
-{
-  /* R1 = s; R2 = d */
-  W_ r, s;
-
-  s = R1;
-  if (s == 0) {
-    r = 0;
-  } else {
-    r = W_[R2 + SIZEOF_StgArrWords];
-    if (%lt(s,0)) {
-       r = -r;
-    }
-  }
-  /* Result parked in R1, return via info-pointer at TOS */
-  R1 = r;
-  jump %ENTRY_CODE(Sp(0));
-}
-
-decodeFloatzh_fast
-{ 
-    W_ p;
-    F_ arg;
-    FETCH_MP_TEMP(mp_tmp1);
-    FETCH_MP_TEMP(mp_tmp_w);
-    
-    /* arguments: F1 = Float# */
-    arg = F1;
-    
-    ALLOC_PRIM( SIZEOF_StgArrWords + WDS(1), NO_PTRS, decodeFloatzh_fast );
-    
-    /* Be prepared to tell Lennart-coded __decodeFloat
-       where mantissa._mp_d can be put (it does not care about the rest) */
-    p = Hp - SIZEOF_StgArrWords;
-    SET_HDR(p,stg_ARR_WORDS_info,W_[CCCS]);
-    StgArrWords_words(p) = 1;
-    MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(p);
-    
-    /* Perform the operation */
-    foreign "C" __decodeFloat(mp_tmp1 "ptr",mp_tmp_w "ptr" ,arg) [];
-    
-    /* returns: (Int# (expn), Int#, ByteArray#) */
-    RET_NNP(W_[mp_tmp_w], TO_W_(MP_INT__mp_size(mp_tmp1)), p);
-}
-
 decodeFloatzuIntzh_fast
 { 
     W_ p;
     F_ arg;
-    FETCH_MP_TEMP(mp_tmp1);
-    FETCH_MP_TEMP(mp_tmp_w);
+    W_ mp_tmp1;
+    W_ mp_tmp_w;
+
+    STK_CHK_GEN( WDS(2), NO_PTRS, decodeFloatzuIntzh_fast );
+
+    mp_tmp1  = Sp - WDS(1);
+    mp_tmp_w = Sp - WDS(2);
     
     /* arguments: F1 = Float# */
     arg = F1;
@@ -964,43 +477,21 @@ decodeFloatzuIntzh_fast
     RET_NN(W_[mp_tmp1], W_[mp_tmp_w]);
 }
 
-#define DOUBLE_MANTISSA_SIZE SIZEOF_DOUBLE
-#define ARR_SIZE (SIZEOF_StgArrWords + DOUBLE_MANTISSA_SIZE)
-
-decodeDoublezh_fast
+decodeDoublezu2Intzh_fast
 { 
     D_ arg;
     W_ p;
-    FETCH_MP_TEMP(mp_tmp1);
-    FETCH_MP_TEMP(mp_tmp_w);
-
-    /* arguments: D1 = Double# */
-    arg = D1;
-
-    ALLOC_PRIM( ARR_SIZE, NO_PTRS, decodeDoublezh_fast );
-    
-    /* Be prepared to tell Lennart-coded __decodeDouble
-       where mantissa.d can be put (it does not care about the rest) */
-    p = Hp - ARR_SIZE + WDS(1);
-    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
-    StgArrWords_words(p) = BYTES_TO_WDS(DOUBLE_MANTISSA_SIZE);
-    MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(p);
+    W_ mp_tmp1;
+    W_ mp_tmp2;
+    W_ mp_result1;
+    W_ mp_result2;
 
-    /* Perform the operation */
-    foreign "C" __decodeDouble(mp_tmp1 "ptr", mp_tmp_w "ptr",arg) [];
-    
-    /* returns: (Int# (expn), Int#, ByteArray#) */
-    RET_NNP(W_[mp_tmp_w], TO_W_(MP_INT__mp_size(mp_tmp1)), p);
-}
+    STK_CHK_GEN( WDS(4), NO_PTRS, decodeDoublezu2Intzh_fast );
 
-decodeDoublezu2Intzh_fast
-{ 
-    D_ arg;
-    W_ p;
-    FETCH_MP_TEMP(mp_tmp1);
-    FETCH_MP_TEMP(mp_tmp2);
-    FETCH_MP_TEMP(mp_result1);
-    FETCH_MP_TEMP(mp_result2);
+    mp_tmp1    = Sp - WDS(1);
+    mp_tmp2    = Sp - WDS(2);
+    mp_result1 = Sp - WDS(3);
+    mp_result2 = Sp - WDS(4);
 
     /* arguments: D1 = Double# */
     arg = D1;
@@ -1040,7 +531,8 @@ forkzh_fast
 
   foreign "C" scheduleThread(MyCapability() "ptr", threadid "ptr") [];
 
-  // switch at the earliest opportunity
+  // context switch soon, but not immediately: we don't want every
+  // forkIO to force a context-switch.
   Capability_context_switch(MyCapability()) = 1 :: CInt;
   
   RET_P(threadid);
@@ -1069,7 +561,8 @@ forkOnzh_fast
 
   foreign "C" scheduleThreadOn(MyCapability() "ptr", cpu, threadid "ptr") [];
 
-  // switch at the earliest opportunity
+  // context switch soon, but not immediately: we don't want every
+  // forkIO to force a context-switch.
   Capability_context_switch(MyCapability()) = 1 :: CInt;
   
   RET_P(threadid);
@@ -1185,18 +678,20 @@ INFO_TABLE_RET(stg_atomically_frame, ATOMICALLY_FRAME,
 #if defined(PROFILING)
   W_ unused1, W_ unused2,
 #endif
-  P_ unused3, P_ unused4)
+  P_ code, P_ next_invariant_to_check, P_ result)
 {
   W_ frame, trec, valid, next_invariant, q, outer;
 
-  frame = Sp;
-  trec = StgTSO_trec(CurrentTSO);
+  frame  = Sp;
+  trec   = StgTSO_trec(CurrentTSO);
+  result = R1;
   ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
 
   if (outer == NO_TREC) {
     /* First time back at the atomically frame -- pick up invariants */
     ("ptr" q) = foreign "C" stmGetInvariantsToCheck(MyCapability() "ptr", trec "ptr") [];
     StgAtomicallyFrame_next_invariant_to_check(frame) = q;
+    StgAtomicallyFrame_result(frame) = result;
 
   } else {
     /* Second/subsequent time back at the atomically frame -- abort the
@@ -1230,6 +725,7 @@ INFO_TABLE_RET(stg_atomically_frame, ATOMICALLY_FRAME,
     if (valid != 0) {
       /* Transaction was valid: commit succeeded */
       StgTSO_trec(CurrentTSO) = NO_TREC;
+      R1 = StgAtomicallyFrame_result(frame);
       Sp = Sp + SIZEOF_StgAtomicallyFrame;
       jump %ENTRY_CODE(Sp(SP_OFF));
     } else {
@@ -1247,7 +743,7 @@ INFO_TABLE_RET(stg_atomically_waiting_frame, ATOMICALLY_FRAME,
 #if defined(PROFILING)
   W_ unused1, W_ unused2,
 #endif
-  P_ unused3, P_ unused4)
+  P_ code, P_ next_invariant_to_check, P_ result)
 {
   W_ frame, trec, valid;
 
@@ -1332,6 +828,7 @@ atomicallyzh_fast
 
   SET_HDR(frame,stg_atomically_frame_info, W_[CCCS]);
   StgAtomicallyFrame_code(frame) = R1;
+  StgAtomicallyFrame_result(frame) = NO_TREC;
   StgAtomicallyFrame_next_invariant_to_check(frame) = END_INVARIANT_CHECK_QUEUE;
 
   /* Start the memory transaction */