Convert the gmp cmm primops to use local stack allocation
[ghc-hetmet.git] / rts / PrimOps.cmm
index 06628b9..9ebe8fb 100644 (file)
@@ -46,12 +46,13 @@ import __gmpz_xor;
 import __gmpz_ior;
 import __gmpz_com;
 #endif
-import base_GHCziIOBase_NestedAtomically_closure;
 import pthread_mutex_lock;
 import pthread_mutex_unlock;
 #endif
+import base_ControlziExceptionziBase_nestedAtomically_closure;
 import EnterCriticalSection;
 import LeaveCriticalSection;
+import ghczmprim_GHCziBool_False_closure;
 
 /*-----------------------------------------------------------------------------
   Array Primitives
@@ -82,32 +83,65 @@ newByteArrayzh_fast
     RET_P(p);
 }
 
+#define BA_ALIGN 16
+#define BA_MASK  (BA_ALIGN-1)
+
 newPinnedByteArrayzh_fast
 {
-    W_ words, payload_words, n, p;
+    W_ words, bytes, payload_words, p;
 
     MAYBE_GC(NO_PTRS,newPinnedByteArrayzh_fast);
-    n = R1;
-    payload_words = ROUNDUP_BYTES_TO_WDS(n);
+    bytes = R1;
+    /* payload_words is what we will tell the profiler we had to allocate */
+    payload_words = ROUNDUP_BYTES_TO_WDS(bytes);
+    /* When we actually allocate memory, we need to allow space for the
+       header: */
+    bytes = bytes + SIZEOF_StgArrWords;
+    /* And we want to align to BA_ALIGN bytes, so we need to allow space
+       to shift up to BA_ALIGN - 1 bytes: */
+    bytes = bytes + BA_ALIGN - 1;
+    /* Now we convert to a number of words: */
+    words = ROUNDUP_BYTES_TO_WDS(bytes);
 
-    // We want an 8-byte aligned array.  allocatePinned() gives us
-    // 8-byte aligned memory by default, but we want to align the
-    // *goods* inside the ArrWords object, so we have to check the
-    // size of the ArrWords header and adjust our size accordingly.
-    words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
-    if ((SIZEOF_StgArrWords & 7) != 0) {
-       words = words + 1;
-    }
+    ("ptr" p) = foreign "C" allocatePinned(words) [];
+    TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
+
+    /* Now we need to move p forward so that the payload is aligned
+       to BA_ALIGN bytes: */
+    p = p + ((-p - SIZEOF_StgArrWords) & BA_MASK);
+
+    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+    StgArrWords_words(p) = payload_words;
+    RET_P(p);
+}
+
+newAlignedPinnedByteArrayzh_fast
+{
+    W_ words, bytes, payload_words, p, alignment;
+
+    MAYBE_GC(NO_PTRS,newAlignedPinnedByteArrayzh_fast);
+    bytes = R1;
+    alignment = R2;
+
+    /* payload_words is what we will tell the profiler we had to allocate */
+    payload_words = ROUNDUP_BYTES_TO_WDS(bytes);
+
+    /* When we actually allocate memory, we need to allow space for the
+       header: */
+    bytes = bytes + SIZEOF_StgArrWords;
+    /* And we want to align to <alignment> bytes, so we need to allow space
+       to shift up to <alignment - 1> bytes: */
+    bytes = bytes + alignment - 1;
+    /* Now we convert to a number of words: */
+    words = ROUNDUP_BYTES_TO_WDS(bytes);
 
     ("ptr" p) = foreign "C" allocatePinned(words) [];
     TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
 
-    // Again, if the ArrWords header isn't a multiple of 8 bytes, we
-    // have to push the object forward one word so that the goods
-    // fall on an 8-byte boundary.
-    if ((SIZEOF_StgArrWords & 7) != 0) {
-       p = p + WDS(1);
-    }
+    /* Now we need to move p forward so that the payload is aligned
+       to <alignment> bytes. Note that we are assuming that
+       <alignment> is a power of 2, which is technically not guaranteed */
+    p = p + ((-p - SIZEOF_StgArrWords) & (alignment - 1));
 
     SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
     StgArrWords_words(p) = payload_words;
@@ -193,7 +227,7 @@ newMutVarzh_fast
 
 atomicModifyMutVarzh_fast
 {
-    W_ mv, z, x, y, r;
+    W_ mv, f, z, x, y, r, h;
     /* Args: R1 :: MutVar#,  R2 :: a -> (a,b) */
 
     /* If x is the current contents of the MutVar#, then 
@@ -232,19 +266,15 @@ atomicModifyMutVarzh_fast
 
    HP_CHK_GEN_TICKY(SIZE, R1_PTR & R2_PTR, atomicModifyMutVarzh_fast);
 
-#if defined(THREADED_RTS)
-    ACQUIRE_LOCK(atomic_modify_mutvar_mutex "ptr") [R1,R2];
-#endif
-
-   x = StgMutVar_var(R1);
+   mv = R1;
+   f = R2;
 
    TICK_ALLOC_THUNK_2();
    CCCS_ALLOC(THUNK_2_SIZE);
    z = Hp - THUNK_2_SIZE + WDS(1);
    SET_HDR(z, stg_ap_2_upd_info, W_[CCCS]);
    LDV_RECORD_CREATE(z);
-   StgThunk_payload(z,0) = R2;
-   StgThunk_payload(z,1) = x;
+   StgThunk_payload(z,0) = f;
 
    TICK_ALLOC_THUNK_1();
    CCCS_ALLOC(THUNK_1_SIZE);
@@ -253,9 +283,6 @@ atomicModifyMutVarzh_fast
    LDV_RECORD_CREATE(y);
    StgThunk_payload(y,0) = z;
 
-   StgMutVar_var(R1) = y;
-   foreign "C" dirty_MUT_VAR(BaseReg "ptr", R1 "ptr") [R1];
-
    TICK_ALLOC_THUNK_1();
    CCCS_ALLOC(THUNK_1_SIZE);
    r = y - THUNK_1_SIZE;
@@ -263,10 +290,20 @@ atomicModifyMutVarzh_fast
    LDV_RECORD_CREATE(r);
    StgThunk_payload(r,0) = z;
 
-#if defined(THREADED_RTS)
-    RELEASE_LOCK(atomic_modify_mutvar_mutex "ptr") [];
+ retry:
+   x = StgMutVar_var(mv);
+   StgThunk_payload(z,1) = x;
+#ifdef THREADED_RTS
+   (h) = foreign "C" cas(mv + SIZEOF_StgHeader + OFFSET_StgMutVar_var, x, y) [];
+   if (h != x) { goto retry; }
+#else
+   StgMutVar_var(mv) = y;
 #endif
 
+   if (GET_INFO(mv) == stg_MUT_VAR_CLEAN_info) {
+     foreign "C" dirty_MUT_VAR(BaseReg "ptr", mv "ptr") [];
+   }
+
    RET_P(r);
 }
 
@@ -293,9 +330,14 @@ mkWeakzh_fast
   w = Hp - SIZEOF_StgWeak + WDS(1);
   SET_HDR(w, stg_WEAK_info, W_[CCCS]);
 
-  StgWeak_key(w)       = R1;
-  StgWeak_value(w)     = R2;
-  StgWeak_finalizer(w) = R3;
+  // We don't care about cfinalizer here.
+  // Should StgWeak_cfinalizer(w) be stg_NO_FINALIZER_closure or
+  // something else?
+
+  StgWeak_key(w)        = R1;
+  StgWeak_value(w)      = R2;
+  StgWeak_finalizer(w)  = R3;
+  StgWeak_cfinalizer(w) = stg_NO_FINALIZER_closure;
 
   StgWeak_link(w)      = W_[weak_ptr_list];
   W_[weak_ptr_list]    = w;
@@ -305,12 +347,65 @@ mkWeakzh_fast
   RET_P(w);
 }
 
+mkWeakForeignEnvzh_fast
+{
+  /* R1 = key
+     R2 = value
+     R3 = finalizer
+     R4 = pointer
+     R5 = has environment (0 or 1)
+     R6 = environment
+  */
+  W_ w, payload_words, words, p;
+
+  W_ key, val, fptr, ptr, flag, eptr;
+
+  key  = R1;
+  val  = R2;
+  fptr = R3;
+  ptr  = R4;
+  flag = R5;
+  eptr = R6;
+
+  ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR, mkWeakForeignEnvzh_fast );
+
+  w = Hp - SIZEOF_StgWeak + WDS(1);
+  SET_HDR(w, stg_WEAK_info, W_[CCCS]);
+
+  payload_words = 4;
+  words         = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
+  ("ptr" p)     = foreign "C" allocateLocal(MyCapability() "ptr", words) [];
+
+  TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
+  SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+
+  StgArrWords_words(p)     = payload_words;
+  StgArrWords_payload(p,0) = fptr;
+  StgArrWords_payload(p,1) = ptr;
+  StgArrWords_payload(p,2) = eptr;
+  StgArrWords_payload(p,3) = flag;
+
+  // We don't care about the value here.
+  // Should StgWeak_value(w) be stg_NO_FINALIZER_closure or something else?
+
+  StgWeak_key(w)        = key;
+  StgWeak_value(w)      = val;
+  StgWeak_finalizer(w)  = stg_NO_FINALIZER_closure;
+  StgWeak_cfinalizer(w) = p;
+
+  StgWeak_link(w)   = W_[weak_ptr_list];
+  W_[weak_ptr_list] = w;
+
+  IF_DEBUG(weak, foreign "C" debugBelch(stg_weak_msg,w) []);
+
+  RET_P(w);
+}
 
 finalizzeWeakzh_fast
 {
   /* R1 = weak ptr
    */
-  W_ w, f;
+  W_ w, f, arr;
 
   w = R1;
 
@@ -338,9 +433,18 @@ finalizzeWeakzh_fast
   SET_INFO(w,stg_DEAD_WEAK_info);
   LDV_RECORD_CREATE(w);
 
-  f = StgWeak_finalizer(w);
+  f   = StgWeak_finalizer(w);
+  arr = StgWeak_cfinalizer(w);
+
   StgDeadWeak_link(w) = StgWeak_link(w);
 
+  if (arr != stg_NO_FINALIZER_closure) {
+    foreign "C" runCFinalizer(StgArrWords_payload(arr,0),
+                              StgArrWords_payload(arr,1),
+                              StgArrWords_payload(arr,2),
+                              StgArrWords_payload(arr,3)) [];
+  }
+
   /* return the finalizer */
   if (f == stg_NO_FINALIZER_closure) {
       RET_NP(0,stg_NO_FINALIZER_closure);
@@ -452,11 +556,11 @@ int64ToIntegerzh_fast
    hi = TO_W_(val >> 32);
    lo = TO_W_(val);
 
-   if ( hi != 0 && hi != 0xFFFFFFFF )  { 
-       words_needed = 2;
-   } else { 
+   if ( hi == 0 || (hi == 0xFFFFFFFF && lo != 0) )  {
        // minimum is one word
        words_needed = 1;
+   } else { 
+       words_needed = 2;
    }
 
    ALLOC_PRIM( SIZEOF_StgArrWords + WDS(words_needed),
@@ -540,60 +644,32 @@ word64ToIntegerzh_fast
    RET_NP(s,p);
 }
 
-
-
 #endif /* SUPPORT_LONG_LONGS */
 
-/* ToDo: this is shockingly inefficient */
-
-#ifndef THREADED_RTS
-section "bss" {
-  mp_tmp1:
-    bits8 [SIZEOF_MP_INT];
-}
-
-section "bss" {
-  mp_tmp2:
-    bits8 [SIZEOF_MP_INT];
-}
-
-section "bss" {
-  mp_result1:
-    bits8 [SIZEOF_MP_INT];
-}
-
-section "bss" {
-  mp_result2:
-    bits8 [SIZEOF_MP_INT];
-}
-#endif
-
-#ifdef THREADED_RTS
-#define FETCH_MP_TEMP(X) \
-W_ X; \
-X = BaseReg + (OFFSET_StgRegTable_r ## X);
-#else
-#define FETCH_MP_TEMP(X) /* Nothing */
-#endif
-
 #define GMP_TAKE2_RET1(name,mp_fun)                                     \
 name                                                                    \
 {                                                                       \
   CInt s1, s2;                                                          \
   W_ d1, d2;                                                            \
-  FETCH_MP_TEMP(mp_tmp1);                                               \
-  FETCH_MP_TEMP(mp_tmp2);                                               \
-  FETCH_MP_TEMP(mp_result1)                                             \
-  FETCH_MP_TEMP(mp_result2);                                            \
+  W_ mp_tmp1;                                                           \
+  W_ mp_tmp2;                                                           \
+  W_ mp_result1;                                                        \
+  W_ mp_result2;                                                        \
                                                                         \
   /* call doYouWantToGC() */                                            \
   MAYBE_GC(R2_PTR & R4_PTR, name);                                      \
                                                                         \
+  STK_CHK_GEN( 4 * SIZEOF_MP_INT, R2_PTR & R4_PTR, name );              \
+                                                                        \
   s1 = W_TO_INT(R1);                                                    \
   d1 = R2;                                                              \
   s2 = W_TO_INT(R3);                                                    \
   d2 = R4;                                                              \
                                                                         \
+  mp_tmp1    = Sp - 1 * SIZEOF_MP_INT;                                  \
+  mp_tmp2    = Sp - 2 * SIZEOF_MP_INT;                                  \
+  mp_result1 = Sp - 3 * SIZEOF_MP_INT;                                  \
+  mp_result2 = Sp - 4 * SIZEOF_MP_INT;                                  \
   MP_INT__mp_alloc(mp_tmp1) = W_TO_INT(StgArrWords_words(d1));          \
   MP_INT__mp_size(mp_tmp1)  = (s1);                                     \
   MP_INT__mp_d(mp_tmp1)            = BYTE_ARR_CTS(d1);                         \
@@ -615,15 +691,19 @@ name                                                                    \
 {                                                                       \
   CInt s1;                                                              \
   W_ d1;                                                                \
-  FETCH_MP_TEMP(mp_tmp1);                                               \
-  FETCH_MP_TEMP(mp_result1)                                             \
+  W_ mp_tmp1;                                                           \
+  W_ mp_result1;                                                        \
                                                                         \
   /* call doYouWantToGC() */                                            \
   MAYBE_GC(R2_PTR, name);                                               \
                                                                         \
+  STK_CHK_GEN( 2 * SIZEOF_MP_INT, R2_PTR, name );                       \
+                                                                        \
   d1 = R2;                                                              \
   s1 = W_TO_INT(R1);                                                    \
                                                                         \
+  mp_tmp1    = Sp - 1 * SIZEOF_MP_INT;                                  \
+  mp_result1 = Sp - 2 * SIZEOF_MP_INT;                                  \
   MP_INT__mp_alloc(mp_tmp1)    = W_TO_INT(StgArrWords_words(d1));      \
   MP_INT__mp_size(mp_tmp1)     = (s1);                                 \
   MP_INT__mp_d(mp_tmp1)                = BYTE_ARR_CTS(d1);                     \
@@ -642,19 +722,25 @@ name
 {                                                                                       \
   CInt s1, s2;                                                                          \
   W_ d1, d2;                                                                            \
-  FETCH_MP_TEMP(mp_tmp1);                                                               \
-  FETCH_MP_TEMP(mp_tmp2);                                                               \
-  FETCH_MP_TEMP(mp_result1)                                                             \
-  FETCH_MP_TEMP(mp_result2)                                                             \
+  W_ mp_tmp1;                                                                           \
+  W_ mp_tmp2;                                                                           \
+  W_ mp_result1;                                                                        \
+  W_ mp_result2;                                                                        \
                                                                                         \
   /* call doYouWantToGC() */                                                            \
   MAYBE_GC(R2_PTR & R4_PTR, name);                                                      \
                                                                                         \
+  STK_CHK_GEN( 4 * SIZEOF_MP_INT, R2_PTR & R4_PTR, name );                              \
+                                                                                        \
   s1 = W_TO_INT(R1);                                                                    \
   d1 = R2;                                                                              \
   s2 = W_TO_INT(R3);                                                                    \
   d2 = R4;                                                                              \
                                                                                         \
+  mp_tmp1    = Sp - 1 * SIZEOF_MP_INT;                                                  \
+  mp_tmp2    = Sp - 2 * SIZEOF_MP_INT;                                                  \
+  mp_result1 = Sp - 3 * SIZEOF_MP_INT;                                                  \
+  mp_result2 = Sp - 4 * SIZEOF_MP_INT;                                                  \
   MP_INT__mp_alloc(mp_tmp1)    = W_TO_INT(StgArrWords_words(d1));                      \
   MP_INT__mp_size(mp_tmp1)     = (s1);                                                 \
   MP_INT__mp_d(mp_tmp1)                = BYTE_ARR_CTS(d1);                                     \
@@ -689,17 +775,15 @@ GMP_TAKE1_RET1(complementIntegerzh_fast, __gmpz_com)
 GMP_TAKE2_RET2(quotRemIntegerzh_fast, __gmpz_tdiv_qr)
 GMP_TAKE2_RET2(divModIntegerzh_fast,  __gmpz_fdiv_qr)
 
-#ifndef THREADED_RTS
-section "bss" {
-  mp_tmp_w:  W_; // NB. mp_tmp_w is really an here mp_limb_t
-}
-#endif
-
 gcdIntzh_fast
 {
     /* R1 = the first Int#; R2 = the second Int# */
     W_ r; 
-    FETCH_MP_TEMP(mp_tmp_w);
+    W_ mp_tmp_w;
+
+    STK_CHK_GEN( 1 * SIZEOF_MP_INT, NO_PTRS, gcdIntzh_fast );
+
+    mp_tmp_w = Sp - 1 * SIZEOF_MP_INT;
 
     W_[mp_tmp_w] = R1;
     (r) = foreign "C" __gmpn_gcd_1(mp_tmp_w "ptr", 1, R2) [];
@@ -850,30 +934,26 @@ integer2Wordzh_fast
   jump %ENTRY_CODE(Sp(0));
 }
 
-decodeFloatzh_fast
+decodeFloatzuIntzh_fast
 { 
     W_ p;
     F_ arg;
-    FETCH_MP_TEMP(mp_tmp1);
-    FETCH_MP_TEMP(mp_tmp_w);
+    W_ mp_tmp1;
+    W_ mp_tmp_w;
+
+    STK_CHK_GEN( WDS(2), NO_PTRS, decodeFloatzuIntzh_fast );
+
+    mp_tmp1  = Sp - WDS(1);
+    mp_tmp_w = Sp - WDS(2);
     
     /* arguments: F1 = Float# */
     arg = F1;
     
-    ALLOC_PRIM( SIZEOF_StgArrWords + WDS(1), NO_PTRS, decodeFloatzh_fast );
-    
-    /* Be prepared to tell Lennart-coded __decodeFloat
-       where mantissa._mp_d can be put (it does not care about the rest) */
-    p = Hp - SIZEOF_StgArrWords;
-    SET_HDR(p,stg_ARR_WORDS_info,W_[CCCS]);
-    StgArrWords_words(p) = 1;
-    MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(p);
-    
     /* Perform the operation */
-    foreign "C" __decodeFloat(mp_tmp1 "ptr",mp_tmp_w "ptr" ,arg) [];
+    foreign "C" __decodeFloat_Int(mp_tmp1 "ptr", mp_tmp_w "ptr", arg) [];
     
-    /* returns: (Int# (expn), Int#, ByteArray#) */
-    RET_NNP(W_[mp_tmp_w], TO_W_(MP_INT__mp_size(mp_tmp1)), p);
+    /* returns: (Int# (mantissa), Int# (exponent)) */
+    RET_NN(W_[mp_tmp1], W_[mp_tmp_w]);
 }
 
 #define DOUBLE_MANTISSA_SIZE SIZEOF_DOUBLE
@@ -883,8 +963,13 @@ decodeDoublezh_fast
 { 
     D_ arg;
     W_ p;
-    FETCH_MP_TEMP(mp_tmp1);
-    FETCH_MP_TEMP(mp_tmp_w);
+    W_ mp_tmp1;
+    W_ mp_tmp_w;
+
+    STK_CHK_GEN( 2 * SIZEOF_MP_INT, NO_PTRS, decodeDoublezh_fast );
+
+    mp_tmp1  = Sp - 1 * SIZEOF_MP_INT;
+    mp_tmp_w = Sp - 2 * SIZEOF_MP_INT;
 
     /* arguments: D1 = Double# */
     arg = D1;
@@ -905,6 +990,35 @@ decodeDoublezh_fast
     RET_NNP(W_[mp_tmp_w], TO_W_(MP_INT__mp_size(mp_tmp1)), p);
 }
 
+decodeDoublezu2Intzh_fast
+{ 
+    D_ arg;
+    W_ p;
+    W_ mp_tmp1;
+    W_ mp_tmp2;
+    W_ mp_result1;
+    W_ mp_result2;
+
+    STK_CHK_GEN( WDS(4), NO_PTRS, decodeDoublezu2Intzh_fast );
+
+    mp_tmp1    = Sp - WDS(1);
+    mp_tmp2    = Sp - WDS(2);
+    mp_result1 = Sp - WDS(3);
+    mp_result2 = Sp - WDS(4);
+
+    /* arguments: D1 = Double# */
+    arg = D1;
+
+    /* Perform the operation */
+    foreign "C" __decodeDouble_2Int(mp_tmp1 "ptr", mp_tmp2 "ptr",
+                                    mp_result1 "ptr", mp_result2 "ptr",
+                                    arg) [];
+
+    /* returns:
+       (Int# (mant sign), Word# (mant high), Word# (mant low), Int# (expn)) */
+    RET_NNNN(W_[mp_tmp1], W_[mp_tmp2], W_[mp_result1], W_[mp_result2]);
+}
+
 /* -----------------------------------------------------------------------------
  * Concurrency primitives
  * -------------------------------------------------------------------------- */
@@ -930,8 +1044,9 @@ forkzh_fast
 
   foreign "C" scheduleThread(MyCapability() "ptr", threadid "ptr") [];
 
-  // switch at the earliest opportunity
-  CInt[context_switch] = 1 :: CInt;
+  // context switch soon, but not immediately: we don't want every
+  // forkIO to force a context-switch.
+  Capability_context_switch(MyCapability()) = 1 :: CInt;
   
   RET_P(threadid);
 }
@@ -959,8 +1074,9 @@ forkOnzh_fast
 
   foreign "C" scheduleThreadOn(MyCapability() "ptr", cpu, threadid "ptr") [];
 
-  // switch at the earliest opportunity
-  CInt[context_switch] = 1 :: CInt;
+  // context switch soon, but not immediately: we don't want every
+  // forkIO to force a context-switch.
+  Capability_context_switch(MyCapability()) = 1 :: CInt;
   
   RET_P(threadid);
 }
@@ -995,18 +1111,45 @@ isCurrentThreadBoundzh_fast
   RET_N(r);
 }
 
+threadStatuszh_fast
+{
+    /* args: R1 :: ThreadId# */
+    W_ tso;
+    W_ why_blocked;
+    W_ what_next;
+    W_ ret;
+
+    tso = R1;
+    loop:
+      if (TO_W_(StgTSO_what_next(tso)) == ThreadRelocated) {
+          tso = StgTSO__link(tso);
+          goto loop;
+      }
+
+    what_next   = TO_W_(StgTSO_what_next(tso));
+    why_blocked = TO_W_(StgTSO_why_blocked(tso));
+    // Note: these two reads are not atomic, so they might end up
+    // being inconsistent.  It doesn't matter, since we
+    // only return one or the other.  If we wanted to return the
+    // contents of block_info too, then we'd have to do some synchronisation.
+
+    if (what_next == ThreadComplete) {
+        ret = 16;  // NB. magic, matches up with GHC.Conc.threadStatus
+    } else {
+        if (what_next == ThreadKilled) {
+            ret = 17;
+        } else {
+            ret = why_blocked;
+        }
+    }
+    RET_N(ret);
+}
 
 /* -----------------------------------------------------------------------------
  * TVar primitives
  * -------------------------------------------------------------------------- */
 
-#ifdef REG_R1
 #define SP_OFF 0
-#define IF_NOT_REG_R1(x) 
-#else
-#define SP_OFF 1
-#define IF_NOT_REG_R1(x) x
-#endif
 
 // Catch retry frame ------------------------------------------------------------
 
@@ -1014,10 +1157,9 @@ INFO_TABLE_RET(stg_catch_retry_frame, CATCH_RETRY_FRAME,
 #if defined(PROFILING)
   W_ unused1, W_ unused2,
 #endif
-  W_ unused3, "ptr" W_ unused4, "ptr" W_ unused5)
+  W_ unused3, P_ unused4, P_ unused5)
 {
    W_ r, frame, trec, outer;
-   IF_NOT_REG_R1(W_ rval;  rval = Sp(0);  Sp_adj(1); )
 
    frame = Sp;
    trec = StgTSO_trec(CurrentTSO);
@@ -1027,7 +1169,6 @@ INFO_TABLE_RET(stg_catch_retry_frame, CATCH_RETRY_FRAME,
      /* Succeeded (either first branch or second branch) */
      StgTSO_trec(CurrentTSO) = outer;
      Sp = Sp + SIZEOF_StgCatchRetryFrame;
-     IF_NOT_REG_R1(Sp_adj(-1); Sp(0) = rval;)
      jump %ENTRY_CODE(Sp(SP_OFF));
    } else {
      /* Did not commit: re-execute */
@@ -1050,10 +1191,9 @@ INFO_TABLE_RET(stg_atomically_frame, ATOMICALLY_FRAME,
 #if defined(PROFILING)
   W_ unused1, W_ unused2,
 #endif
-  "ptr" W_ unused3, "ptr" W_ unused4)
+  P_ unused3, P_ unused4)
 {
   W_ frame, trec, valid, next_invariant, q, outer;
-  IF_NOT_REG_R1(W_ rval;  rval = Sp(0);  Sp_adj(1); )
 
   frame = Sp;
   trec = StgTSO_trec(CurrentTSO);
@@ -1097,7 +1237,6 @@ INFO_TABLE_RET(stg_atomically_frame, ATOMICALLY_FRAME,
       /* Transaction was valid: commit succeeded */
       StgTSO_trec(CurrentTSO) = NO_TREC;
       Sp = Sp + SIZEOF_StgAtomicallyFrame;
-      IF_NOT_REG_R1(Sp_adj(-1); Sp(0) = rval;)
       jump %ENTRY_CODE(Sp(SP_OFF));
     } else {
       /* Transaction was not valid: try again */
@@ -1114,10 +1253,9 @@ INFO_TABLE_RET(stg_atomically_waiting_frame, ATOMICALLY_FRAME,
 #if defined(PROFILING)
   W_ unused1, W_ unused2,
 #endif
-  "ptr" W_ unused3, "ptr" W_ unused4)
+  P_ unused3, P_ unused4)
 {
   W_ frame, trec, valid;
-  IF_NOT_REG_R1(W_ rval;  rval = Sp(0);  Sp_adj(1); )
 
   frame = Sp;
 
@@ -1125,9 +1263,6 @@ INFO_TABLE_RET(stg_atomically_waiting_frame, ATOMICALLY_FRAME,
   (valid) = foreign "C" stmReWait(MyCapability() "ptr", CurrentTSO "ptr") [];
   if (valid != 0) {
     /* Previous attempt is still valid: no point trying again yet */
-         IF_NOT_REG_R1(Sp_adj(-2);
-                       Sp(1) = stg_NO_FINALIZER_closure;
-                       Sp(0) = stg_ut_1_0_unreg_info;)
     jump stg_block_noregs;
   } else {
     /* Previous attempt is no longer valid: try again */
@@ -1141,11 +1276,7 @@ INFO_TABLE_RET(stg_atomically_waiting_frame, ATOMICALLY_FRAME,
 
 // STM catch frame --------------------------------------------------------------
 
-#ifdef REG_R1
 #define SP_OFF 0
-#else
-#define SP_OFF 1
-#endif
 
 /* Catch frames are very similar to update frames, but when entering
  * one we just pop the frame off the stack and perform the correct
@@ -1156,9 +1287,8 @@ INFO_TABLE_RET(stg_catch_stm_frame, CATCH_STM_FRAME,
 #if defined(PROFILING)
   W_ unused1, W_ unused2,
 #endif
-  "ptr" W_ unused3, "ptr" W_ unused4)
+  P_ unused3, P_ unused4)
    {
-      IF_NOT_REG_R1(W_ rval;  rval = Sp(0);  Sp_adj(1); )
       W_ r, frame, trec, outer;
       frame = Sp;
       trec = StgTSO_trec(CurrentTSO);
@@ -1168,7 +1298,6 @@ INFO_TABLE_RET(stg_catch_stm_frame, CATCH_STM_FRAME,
         /* Commit succeeded */
         StgTSO_trec(CurrentTSO) = outer;
         Sp = Sp + SIZEOF_StgCatchSTMFrame;
-        IF_NOT_REG_R1(Sp_adj(-1); Sp(0) = rval;)
         jump Sp(SP_OFF);
       } else {
         /* Commit failed */
@@ -1199,7 +1328,7 @@ atomicallyzh_fast
 
   /* Nested transactions are not allowed; raise an exception */
   if (old_trec != NO_TREC) {
-     R1 = base_GHCziIOBase_NestedAtomically_closure;
+     R1 = base_ControlziExceptionziBase_nestedAtomically_closure;
      jump raisezh_fast;
   }
 
@@ -1340,9 +1469,6 @@ retry_pop_stack:
     StgHeader_info(frame) = stg_atomically_waiting_frame_info;
     Sp = frame;
     // Fix up the stack in the unregisterised case: the return convention is different.
-    IF_NOT_REG_R1(Sp_adj(-2); 
-                 Sp(1) = stg_NO_FINALIZER_closure;
-                 Sp(0) = stg_ut_1_0_unreg_info;)
     R3 = trec; // passing to stmWaitUnblock()
     jump stg_block_stmwait;
   } else {
@@ -1403,6 +1529,17 @@ readTVarzh_fast
   RET_P(result);
 }
 
+readTVarIOzh_fast
+{
+    W_ result;
+
+again:
+    result = StgTVar_current_value(R1);
+    if (%INFO_PTR(result) == stg_TREC_HEADER_info) {
+        goto again;
+    }
+    RET_P(result);
+}
 
 writeTVarzh_fast
 {
@@ -1483,16 +1620,9 @@ newMVarzh_fast
 }
 
 
-/* If R1 isn't available, pass it on the stack */
-#ifdef REG_R1
 #define PerformTake(tso, value)                                \
     W_[StgTSO_sp(tso) + WDS(1)] = value;               \
     W_[StgTSO_sp(tso) + WDS(0)] = stg_gc_unpt_r1_info;
-#else
-#define PerformTake(tso, value)                                        \
-    W_[StgTSO_sp(tso) + WDS(1)] = value;                       \
-    W_[StgTSO_sp(tso) + WDS(0)] = stg_ut_1_0_unreg_info;
-#endif
 
 #define PerformPut(tso,lval)                   \
     StgTSO_sp(tso) = StgTSO_sp(tso) + WDS(3);  \
@@ -1512,7 +1642,7 @@ takeMVarzh_fast
 #endif
         
     if (info == stg_MVAR_CLEAN_info) {
-        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr");
+        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr") [];
     }
 
     /* If the MVar is empty, put ourselves on its blocking queue,
@@ -1522,13 +1652,19 @@ takeMVarzh_fast
        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_head(mvar) = CurrentTSO;
        } else {
-           StgTSO_link(StgMVar_tail(mvar)) = CurrentTSO;
+            foreign "C" setTSOLink(MyCapability() "ptr", 
+                                   StgMVar_tail(mvar) "ptr",
+                                   CurrentTSO) [];
        }
-       StgTSO_link(CurrentTSO)        = stg_END_TSO_QUEUE_closure;
-       StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
+       StgTSO__link(CurrentTSO)       = stg_END_TSO_QUEUE_closure;
        StgTSO_block_info(CurrentTSO)  = mvar;
+        // write barrier for throwTo(), which looks at block_info
+        // if why_blocked==BlockedOnMVar.
+        prim %write_barrier() [];
+       StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
        StgMVar_tail(mvar) = CurrentTSO;
        
+        R1 = mvar;
        jump stg_block_takemvar;
   }
 
@@ -1545,17 +1681,14 @@ takeMVarzh_fast
       /* actually perform the putMVar for the thread that we just woke up */
       tso = StgMVar_head(mvar);
       PerformPut(tso,StgMVar_value(mvar));
-      dirtyTSO(tso);
 
-#if defined(GRAN) || defined(PAR)
-      /* ToDo: check 2nd arg (mvar) is right */
-      ("ptr" tso) = foreign "C" unblockOne(StgMVar_head(mvar),mvar) [];
-      StgMVar_head(mvar) = tso;
-#else
-      ("ptr" tso) = foreign "C" unblockOne(MyCapability() "ptr", 
-                                        StgMVar_head(mvar) "ptr") [];
+      if (TO_W_(StgTSO_flags(tso)) & TSO_DIRTY == 0) {
+          foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
+      }
+
+      ("ptr" tso) = foreign "C" unblockOne_(MyCapability() "ptr", 
+                                            StgMVar_head(mvar) "ptr", 1) [];
       StgMVar_head(mvar) = tso;
-#endif
 
       if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
          StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
@@ -1625,17 +1758,13 @@ tryTakeMVarzh_fast
        /* actually perform the putMVar for the thread that we just woke up */
        tso = StgMVar_head(mvar);
        PerformPut(tso,StgMVar_value(mvar));
-        dirtyTSO(tso);
+        if (TO_W_(StgTSO_flags(tso)) & TSO_DIRTY == 0) {
+            foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
+        }
 
-#if defined(GRAN) || defined(PAR)
-       /* ToDo: check 2nd arg (mvar) is right */
-       ("ptr" tso) = foreign "C" unblockOne(StgMVar_head(mvar) "ptr", mvar "ptr") [];
+        ("ptr" tso) = foreign "C" unblockOne_(MyCapability() "ptr", 
+                                              StgMVar_head(mvar) "ptr", 1) [];
        StgMVar_head(mvar) = tso;
-#else
-       ("ptr" tso) = foreign "C" unblockOne(MyCapability() "ptr",
-                                          StgMVar_head(mvar) "ptr") [];
-       StgMVar_head(mvar) = tso;
-#endif
 
        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
@@ -1663,13 +1792,14 @@ tryTakeMVarzh_fast
 
 putMVarzh_fast
 {
-    W_ mvar, info, tso;
+    W_ mvar, val, info, tso;
 
     /* args: R1 = MVar, R2 = value */
     mvar = R1;
+    val  = R2;
 
 #if defined(THREADED_RTS)
-    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [R2];
+    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [];
 #else
     info = GET_INFO(mvar);
 #endif
@@ -1682,13 +1812,20 @@ putMVarzh_fast
        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_head(mvar) = CurrentTSO;
        } else {
-           StgTSO_link(StgMVar_tail(mvar)) = CurrentTSO;
+            foreign "C" setTSOLink(MyCapability() "ptr", 
+                                   StgMVar_tail(mvar) "ptr",
+                                   CurrentTSO) [];
        }
-       StgTSO_link(CurrentTSO)        = stg_END_TSO_QUEUE_closure;
-       StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
+       StgTSO__link(CurrentTSO)       = stg_END_TSO_QUEUE_closure;
        StgTSO_block_info(CurrentTSO)  = mvar;
+        // write barrier for throwTo(), which looks at block_info
+        // if why_blocked==BlockedOnMVar.
+        prim %write_barrier() [];
+       StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
        StgMVar_tail(mvar) = CurrentTSO;
        
+        R1 = mvar;
+        R2 = val;
        jump stg_block_putmvar;
     }
   
@@ -1700,17 +1837,14 @@ putMVarzh_fast
 
        /* actually perform the takeMVar */
        tso = StgMVar_head(mvar);
-       PerformTake(tso, R2);
-        dirtyTSO(tso);
+       PerformTake(tso, val);
+        if (TO_W_(StgTSO_flags(tso)) & TSO_DIRTY == 0) {
+            foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
+        }
       
-#if defined(GRAN) || defined(PAR)
-       /* ToDo: check 2nd arg (mvar) is right */
-       ("ptr" tso) = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr",mvar "ptr") [];
-       StgMVar_head(mvar) = tso;
-#else
-       ("ptr" tso) = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr") [];
+        ("ptr" tso) = foreign "C" unblockOne_(MyCapability() "ptr", 
+                                              StgMVar_head(mvar) "ptr", 1) [];
        StgMVar_head(mvar) = tso;
-#endif
 
        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
@@ -1726,7 +1860,7 @@ putMVarzh_fast
     else
     {
        /* No further takes, the MVar is now full. */
-       StgMVar_value(mvar) = R2;
+       StgMVar_value(mvar) = val;
 
 #if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
@@ -1773,16 +1907,13 @@ tryPutMVarzh_fast
        /* actually perform the takeMVar */
        tso = StgMVar_head(mvar);
        PerformTake(tso, R2);
-        dirtyTSO(tso);
+        if (TO_W_(StgTSO_flags(tso)) & TSO_DIRTY == 0) {
+            foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
+        }
       
-#if defined(GRAN) || defined(PAR)
-       /* ToDo: check 2nd arg (mvar) is right */
-       ("ptr" tso) = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr",mvar "ptr") [];
-       StgMVar_head(mvar) = tso;
-#else
-       ("ptr" tso) = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr") [];
+        ("ptr" tso) = foreign "C" unblockOne_(MyCapability() "ptr", 
+                                              StgMVar_head(mvar) "ptr", 1) [];
        StgMVar_head(mvar) = tso;
-#endif
 
        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
@@ -1998,11 +2129,11 @@ for2:
  * macro in Schedule.h).
  */
 #define APPEND_TO_BLOCKED_QUEUE(tso)                   \
-    ASSERT(StgTSO_link(tso) == END_TSO_QUEUE);         \
+    ASSERT(StgTSO__link(tso) == END_TSO_QUEUE);                \
     if (W_[blocked_queue_hd] == END_TSO_QUEUE) {       \
       W_[blocked_queue_hd] = tso;                      \
     } else {                                           \
-      StgTSO_link(W_[blocked_queue_tl]) = tso;         \
+      foreign "C" setTSOLink(MyCapability() "ptr", W_[blocked_queue_tl] "ptr", tso) []; \
     }                                                  \
     W_[blocked_queue_tl] = tso;
 
@@ -2083,7 +2214,11 @@ delayzh_fast
     W_ time;
     W_ divisor;
     (time) = foreign "C" getourtimeofday() [R1];
-    divisor = TO_W_(RtsFlags_MiscFlags_tickInterval(RtsFlags))*1000;
+    divisor = TO_W_(RtsFlags_MiscFlags_tickInterval(RtsFlags));
+    if (divisor == 0) {
+        divisor = 50;
+    }
+    divisor = divisor * 1000;
     target = ((R1 + divisor - 1) / divisor) /* divide rounding up */
            + time + 1; /* Add 1 as getourtimeofday rounds down */
     StgTSO_block_info(CurrentTSO) = target;
@@ -2094,15 +2229,15 @@ delayzh_fast
 while:
     if (t != END_TSO_QUEUE && StgTSO_block_info(t) < target) {
        prev = t;
-       t = StgTSO_link(t);
+       t = StgTSO__link(t);
        goto while;
     }
 
-    StgTSO_link(CurrentTSO) = t;
+    StgTSO__link(CurrentTSO) = t;
     if (prev == NULL) {
        W_[sleeping_queue] = CurrentTSO;
     } else {
-       StgTSO_link(prev) = CurrentTSO;
+        foreign "C" setTSOLink(MyCapability() "ptr", prev "ptr", CurrentTSO) [];
     }
     jump stg_block_noregs;
 #endif
@@ -2232,3 +2367,39 @@ getApStackValzh_fast
    }
    RET_NP(ok,val);
 }
+
+/* -----------------------------------------------------------------------------
+   Misc. primitives
+   -------------------------------------------------------------------------- */
+
+// Write the cost center stack of the first argument on stderr; return
+// the second.  Possibly only makes sense for already evaluated
+// things?
+traceCcszh_fast
+{
+    W_ ccs;
+
+#ifdef PROFILING
+    ccs = StgHeader_ccs(UNTAG(R1));
+    foreign "C" fprintCCS_stderr(ccs "ptr") [R2];
+#endif
+
+    R1 = R2;
+    ENTER();
+}
+
+getSparkzh_fast
+{
+   W_ spark;
+
+#ifndef THREADED_RTS
+   RET_NP(0,ghczmprim_GHCziBool_False_closure);
+#else
+   (spark) = foreign "C" findSpark(MyCapability());
+   if (spark != 0) {
+      RET_NP(1,spark);
+   } else {
+      RET_NP(0,ghczmprim_GHCziBool_False_closure);
+   }
+#endif
+}