takeMVar/putMVar were missing some write barriers when modifying a TSO
[ghc-hetmet.git] / ghc / rts / PrimOps.cmm
index d4a54c6..23bc22e 100644 (file)
@@ -49,7 +49,7 @@ newByteArrayzh_fast
     n = R1;
     payload_words = ROUNDUP_BYTES_TO_WDS(n);
     words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
-    "ptr" p = foreign "C" allocate(words);
+    "ptr" p = foreign "C" allocateLocal(MyCapability() "ptr",words) [];
     TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
     SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
     StgArrWords_words(p) = payload_words;
@@ -73,7 +73,7 @@ newPinnedByteArrayzh_fast
        words = words + 1;
     }
 
-    "ptr" p = foreign "C" allocatePinned(words);
+    "ptr" p = foreign "C" allocatePinned(words) [];
     TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
 
     // Again, if the ArrWords header isn't a multiple of 8 bytes, we
@@ -97,10 +97,10 @@ newArrayzh_fast
     MAYBE_GC(R2_PTR,newArrayzh_fast);
 
     words = BYTES_TO_WDS(SIZEOF_StgMutArrPtrs) + n;
-    "ptr" arr = foreign "C" allocate(words);
+    "ptr" arr = foreign "C" allocateLocal(MyCapability() "ptr",words) [R2];
     TICK_ALLOC_PRIM(SIZEOF_StgMutArrPtrs, WDS(n), 0);
 
-    SET_HDR(arr, stg_MUT_ARR_PTRS_info, W_[CCCS]);
+    SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, W_[CCCS]);
     StgMutArrPtrs_ptrs(arr) = n;
 
     // Initialise all elements of the the array with the value in R2
@@ -118,8 +118,6 @@ newArrayzh_fast
 
 unsafeThawArrayzh_fast
 {
-  SET_INFO(R1,stg_MUT_ARR_PTRS_info);
-
   // SUBTLETY TO DO WITH THE OLD GEN MUTABLE LIST
   //
   // A MUT_ARR_PTRS lives on the mutable list, but a MUT_ARR_PTRS_FROZEN 
@@ -127,16 +125,26 @@ unsafeThawArrayzh_fast
   // it on the mutable list for the GC to remove (removing something from
   // the mutable list is not easy, because the mut_list is only singly-linked).
   // 
+  // So that we can tell whether a MUT_ARR_PTRS_FROZEN is on the mutable list,
+  // when we freeze it we set the info ptr to be MUT_ARR_PTRS_FROZEN0
+  // to indicate that it is still on the mutable list.
+  //
   // So, when we thaw a MUT_ARR_PTRS_FROZEN, we must cope with two cases:
   // either it is on a mut_list, or it isn't.  We adopt the convention that
-  // the mut_link field is NULL if it isn't on a mut_list, and the GC
-  // maintains this invariant.
+  // the closure type is MUT_ARR_PTRS_FROZEN0 if it is on the mutable list,
+  // and MUT_ARR_PTRS_FROZEN otherwise.  In fact it wouldn't matter if
+  // we put it on the mutable list more than once, but it would get scavenged
+  // multiple times during GC, which would be unnecessarily slow.
   //
-  if (StgMutClosure_mut_link(R1) == NULL) {
-       foreign "C" recordMutable(R1 "ptr");
+  if (StgHeader_info(R1) != stg_MUT_ARR_PTRS_FROZEN0_info) {
+       SET_INFO(R1,stg_MUT_ARR_PTRS_DIRTY_info);
+       foreign "C" recordMutableLock(R1 "ptr") [R1];
+       // must be done after SET_INFO, because it ASSERTs closure_MUTABLE()
+       RET_P(R1);
+  } else {
+       SET_INFO(R1,stg_MUT_ARR_PTRS_DIRTY_info);
+       RET_P(R1);
   }
-
-  RET_P(R1);
 }
 
 /* -----------------------------------------------------------------------------
@@ -151,7 +159,7 @@ newMutVarzh_fast
     ALLOC_PRIM( SIZEOF_StgMutVar, R1_PTR, newMutVarzh_fast);
 
     mv = Hp - SIZEOF_StgMutVar + WDS(1);
-    SET_HDR(mv,stg_MUT_VAR_info,W_[CCCS]);
+    SET_HDR(mv,stg_MUT_VAR_DIRTY_info,W_[CCCS]);
     StgMutVar_var(mv) = R1;
     
     RET_P(mv);
@@ -179,18 +187,18 @@ atomicModifyMutVarzh_fast
     */
 
 #if MIN_UPD_SIZE > 1
-#define THUNK_1_SIZE (SIZEOF_StgHeader + WDS(MIN_UPD_SIZE))
+#define THUNK_1_SIZE (SIZEOF_StgThunkHeader + WDS(MIN_UPD_SIZE))
 #define TICK_ALLOC_THUNK_1() TICK_ALLOC_UP_THK(WDS(1),WDS(MIN_UPD_SIZE-1))
 #else
-#define THUNK_1_SIZE (SIZEOF_StgHeader + WDS(1))
+#define THUNK_1_SIZE (SIZEOF_StgThunkHeader + WDS(1))
 #define TICK_ALLOC_THUNK_1() TICK_ALLOC_UP_THK(WDS(1),0)
 #endif
 
 #if MIN_UPD_SIZE > 2
-#define THUNK_2_SIZE (SIZEOF_StgHeader + WDS(MIN_UPD_SIZE))
+#define THUNK_2_SIZE (SIZEOF_StgThunkHeader + WDS(MIN_UPD_SIZE))
 #define TICK_ALLOC_THUNK_2() TICK_ALLOC_UP_THK(WDS(2),WDS(MIN_UPD_SIZE-2))
 #else
-#define THUNK_2_SIZE (SIZEOF_StgHeader + WDS(2))
+#define THUNK_2_SIZE (SIZEOF_StgThunkHeader + WDS(2))
 #define TICK_ALLOC_THUNK_2() TICK_ALLOC_UP_THK(WDS(2),0)
 #endif
 
@@ -198,6 +206,10 @@ atomicModifyMutVarzh_fast
 
    HP_CHK_GEN_TICKY(SIZE, R1_PTR & R2_PTR, atomicModifyMutVarzh_fast);
 
+#if defined(THREADED_RTS)
+    foreign "C" ACQUIRE_LOCK(atomic_modify_mutvar_mutex "ptr") [R1,R2];
+#endif
+
    x = StgMutVar_var(R1);
 
    TICK_ALLOC_THUNK_2();
@@ -205,46 +217,31 @@ atomicModifyMutVarzh_fast
    z = Hp - THUNK_2_SIZE + WDS(1);
    SET_HDR(z, stg_ap_2_upd_info, W_[CCCS]);
    LDV_RECORD_CREATE(z);
-   StgClosure_payload(z,0) = R2;
-   StgClosure_payload(z,1) = x;
+   StgThunk_payload(z,0) = R2;
+   StgThunk_payload(z,1) = x;
 
    TICK_ALLOC_THUNK_1();
    CCCS_ALLOC(THUNK_1_SIZE);
    y = z - THUNK_1_SIZE;
    SET_HDR(y, stg_sel_0_upd_info, W_[CCCS]);
    LDV_RECORD_CREATE(y);
-   StgClosure_payload(y,0) = z;
+   StgThunk_payload(y,0) = z;
 
    StgMutVar_var(R1) = y;
+   foreign "C" dirty_MUT_VAR(BaseReg "ptr", R1 "ptr") [R1];
 
    TICK_ALLOC_THUNK_1();
    CCCS_ALLOC(THUNK_1_SIZE);
    r = y - THUNK_1_SIZE;
    SET_HDR(r, stg_sel_1_upd_info, W_[CCCS]);
    LDV_RECORD_CREATE(r);
-   StgClosure_payload(r,0) = z;
-
-   RET_P(r);
-}
-
-/* -----------------------------------------------------------------------------
-   Foreign Object Primitives
-   -------------------------------------------------------------------------- */
-
-mkForeignObjzh_fast
-{
-  /* R1 = ptr to foreign object,
-  */
-  W_ result;
-
-  ALLOC_PRIM( SIZEOF_StgForeignObj, NO_PTRS, mkForeignObjzh_fast);
+   StgThunk_payload(r,0) = z;
 
-  result = Hp - SIZEOF_StgForeignObj + WDS(1);
-  SET_HDR(result,stg_FOREIGN_info,W_[CCCS]);
-  StgForeignObj_data(result) = R1;
+#if defined(THREADED_RTS)
+    foreign "C" RELEASE_LOCK(atomic_modify_mutvar_mutex "ptr") [];
+#endif
 
-  /* returns (# s#, ForeignObj# #) */
-  RET_P(result);
+   RET_P(r);
 }
 
 /* -----------------------------------------------------------------------------
@@ -277,7 +274,7 @@ mkWeakzh_fast
   StgWeak_link(w)      = W_[weak_ptr_list];
   W_[weak_ptr_list]    = w;
 
-  IF_DEBUG(weak, foreign "C" fprintf(stderr,stg_weak_msg,w));
+  IF_DEBUG(weak, foreign "C" debugBelch(stg_weak_msg,w) []);
 
   RET_P(w);
 }
@@ -316,6 +313,7 @@ finalizzeWeakzh_fast
   LDV_RECORD_CREATE(w);
 
   f = StgWeak_finalizer(w);
+  StgDeadWeak_link(w) = StgWeak_link(w);
 
   /* return the finalizer */
   if (f == stg_NO_FINALIZER_closure) {
@@ -514,6 +512,7 @@ word64ToIntegerzh_fast
 
 /* ToDo: this is shockingly inefficient */
 
+#ifndef THREADED_RTS
 section "bss" {
   mp_tmp1:
     bits8 [SIZEOF_MP_INT];
@@ -525,98 +524,120 @@ section "bss" {
 }
 
 section "bss" {
-  result1:
+  mp_result1:
     bits8 [SIZEOF_MP_INT];
 }
 
 section "bss" {
-  result2:
+  mp_result2:
     bits8 [SIZEOF_MP_INT];
 }
+#endif
+
+#ifdef THREADED_RTS
+#define FETCH_MP_TEMP(X) \
+W_ X; \
+X = BaseReg + (OFFSET_StgRegTable_r ## X);
+#else
+#define FETCH_MP_TEMP(X) /* Nothing */
+#endif
 
-#define GMP_TAKE2_RET1(name,mp_fun)                    \
-name                                                   \
-{                                                      \
-  W_ s1, s2, d1, d2;                                   \
-                                                       \
-  /* call doYouWantToGC() */                           \
-  MAYBE_GC(R2_PTR & R4_PTR, name);                     \
-                                                       \
-  s1 = R1;                                             \
-  d1 = R2;                                             \
-  s2 = R3;                                             \
-  d2 = R4;                                             \
-                                                       \
-  MP_INT__mp_alloc(mp_tmp1) = StgArrWords_words(d1);   \
-  MP_INT__mp_size(mp_tmp1)  = (s1);                    \
-  MP_INT__mp_d(mp_tmp1)            = BYTE_ARR_CTS(d1);         \
-  MP_INT__mp_alloc(mp_tmp2) = StgArrWords_words(d2);   \
-  MP_INT__mp_size(mp_tmp2)  = (s2);                    \
-  MP_INT__mp_d(mp_tmp2)            = BYTE_ARR_CTS(d2);         \
-                                                       \
-  foreign "C" mpz_init(result1);                       \
-                                                       \
-  /* Perform the operation */                          \
-  foreign "C" mp_fun(result1,mp_tmp1,mp_tmp2);         \
-                                                       \
-  RET_NP(MP_INT__mp_size(result1),                     \
-         MP_INT__mp_d(result1) - SIZEOF_StgArrWords);  \
+#define GMP_TAKE2_RET1(name,mp_fun)                                     \
+name                                                                    \
+{                                                                       \
+  CInt s1, s2;                                                          \
+  W_ d1, d2;                                                            \
+  FETCH_MP_TEMP(mp_tmp1);                                               \
+  FETCH_MP_TEMP(mp_tmp2);                                               \
+  FETCH_MP_TEMP(mp_result1)                                             \
+  FETCH_MP_TEMP(mp_result2);                                            \
+                                                                        \
+  /* call doYouWantToGC() */                                            \
+  MAYBE_GC(R2_PTR & R4_PTR, name);                                      \
+                                                                        \
+  s1 = W_TO_INT(R1);                                                    \
+  d1 = R2;                                                              \
+  s2 = W_TO_INT(R3);                                                    \
+  d2 = R4;                                                              \
+                                                                        \
+  MP_INT__mp_alloc(mp_tmp1) = W_TO_INT(StgArrWords_words(d1));          \
+  MP_INT__mp_size(mp_tmp1)  = (s1);                                     \
+  MP_INT__mp_d(mp_tmp1)            = BYTE_ARR_CTS(d1);                         \
+  MP_INT__mp_alloc(mp_tmp2) = W_TO_INT(StgArrWords_words(d2));          \
+  MP_INT__mp_size(mp_tmp2)  = (s2);                                     \
+  MP_INT__mp_d(mp_tmp2)            = BYTE_ARR_CTS(d2);                         \
+                                                                        \
+  foreign "C" mpz_init(mp_result1 "ptr") [];                            \
+                                                                        \
+  /* Perform the operation */                                           \
+  foreign "C" mp_fun(mp_result1 "ptr",mp_tmp1  "ptr",mp_tmp2  "ptr") []; \
+                                                                        \
+  RET_NP(TO_W_(MP_INT__mp_size(mp_result1)),                            \
+         MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords);                \
 }
 
-#define GMP_TAKE1_RET1(name,mp_fun)                            \
-name                                                           \
-{                                                              \
-  W_ s1, d1;                                                   \
-                                                               \
-  /* call doYouWantToGC() */                                   \
-  MAYBE_GC(R2_PTR, name);                                      \
-                                                               \
-  d1 = R2;                                                     \
-  s1 = R1;                                                     \
-                                                               \
-  MP_INT__mp_alloc(mp_tmp1)    = StgArrWords_words(d1);        \
-  MP_INT__mp_size(mp_tmp1)     = (s1);                         \
-  MP_INT__mp_d(mp_tmp1)                = BYTE_ARR_CTS(d1);             \
-                                                               \
-  foreign "C" mpz_init(result1);                               \
-                                                               \
-  /* Perform the operation */                                  \
-  foreign "C" mp_fun(result1,mp_tmp1);                         \
-                                                               \
-  RET_NP(MP_INT__mp_size(result1),                             \
-         MP_INT__mp_d(result1) - SIZEOF_StgArrWords);          \
+#define GMP_TAKE1_RET1(name,mp_fun)                                     \
+name                                                                    \
+{                                                                       \
+  CInt s1;                                                              \
+  W_ d1;                                                                \
+  FETCH_MP_TEMP(mp_tmp1);                                               \
+  FETCH_MP_TEMP(mp_result1)                                             \
+                                                                        \
+  /* call doYouWantToGC() */                                            \
+  MAYBE_GC(R2_PTR, name);                                               \
+                                                                        \
+  d1 = R2;                                                              \
+  s1 = W_TO_INT(R1);                                                    \
+                                                                        \
+  MP_INT__mp_alloc(mp_tmp1)    = W_TO_INT(StgArrWords_words(d1));      \
+  MP_INT__mp_size(mp_tmp1)     = (s1);                                 \
+  MP_INT__mp_d(mp_tmp1)                = BYTE_ARR_CTS(d1);                     \
+                                                                        \
+  foreign "C" mpz_init(mp_result1 "ptr") [];                            \
+                                                                        \
+  /* Perform the operation */                                           \
+  foreign "C" mp_fun(mp_result1 "ptr",mp_tmp1 "ptr") [];                \
+                                                                        \
+  RET_NP(TO_W_(MP_INT__mp_size(mp_result1)),                            \
+         MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords);                \
 }
 
-#define GMP_TAKE2_RET2(name,mp_fun)                            \
-name                                                           \
-{                                                              \
-  W_ s1, s2, d1, d2;                                           \
-                                                               \
-  /* call doYouWantToGC() */                                   \
-  MAYBE_GC(R2_PTR & R4_PTR, name);                             \
-                                                               \
-  s1 = R1;                                                     \
-  d1 = R2;                                                     \
-  s2 = R3;                                                     \
-  d2 = R4;                                                     \
-                                                               \
-  MP_INT__mp_alloc(mp_tmp1)    = StgArrWords_words(d1);        \
-  MP_INT__mp_size(mp_tmp1)     = (s1);                         \
-  MP_INT__mp_d(mp_tmp1)                = BYTE_ARR_CTS(d1);             \
-  MP_INT__mp_alloc(mp_tmp2)    = StgArrWords_words(d2);        \
-  MP_INT__mp_size(mp_tmp2)     = (s2);                         \
-  MP_INT__mp_d(mp_tmp2)                = BYTE_ARR_CTS(d2);             \
-                                                               \
-  foreign "C" mpz_init(result1);                               \
-  foreign "C" mpz_init(result2);                               \
-                                                               \
-  /* Perform the operation */                                  \
-  foreign "C" mp_fun(result1,result2,mp_tmp1,mp_tmp2);         \
-                                                               \
-  RET_NPNP(MP_INT__mp_size(result1),                           \
-           MP_INT__mp_d(result1) - SIZEOF_StgArrWords,         \
-          MP_INT__mp_size(result2),                            \
-           MP_INT__mp_d(result2) - SIZEOF_StgArrWords);                \
+#define GMP_TAKE2_RET2(name,mp_fun)                                                     \
+name                                                                                    \
+{                                                                                       \
+  CInt s1, s2;                                                                          \
+  W_ d1, d2;                                                                            \
+  FETCH_MP_TEMP(mp_tmp1);                                                               \
+  FETCH_MP_TEMP(mp_tmp2);                                                               \
+  FETCH_MP_TEMP(mp_result1)                                                             \
+  FETCH_MP_TEMP(mp_result2)                                                             \
+                                                                                        \
+  /* call doYouWantToGC() */                                                            \
+  MAYBE_GC(R2_PTR & R4_PTR, name);                                                      \
+                                                                                        \
+  s1 = W_TO_INT(R1);                                                                    \
+  d1 = R2;                                                                              \
+  s2 = W_TO_INT(R3);                                                                    \
+  d2 = R4;                                                                              \
+                                                                                        \
+  MP_INT__mp_alloc(mp_tmp1)    = W_TO_INT(StgArrWords_words(d1));                      \
+  MP_INT__mp_size(mp_tmp1)     = (s1);                                                 \
+  MP_INT__mp_d(mp_tmp1)                = BYTE_ARR_CTS(d1);                                     \
+  MP_INT__mp_alloc(mp_tmp2)    = W_TO_INT(StgArrWords_words(d2));                      \
+  MP_INT__mp_size(mp_tmp2)     = (s2);                                                 \
+  MP_INT__mp_d(mp_tmp2)                = BYTE_ARR_CTS(d2);                                     \
+                                                                                        \
+  foreign "C" mpz_init(mp_result1 "ptr") [];                                               \
+  foreign "C" mpz_init(mp_result2 "ptr") [];                                               \
+                                                                                        \
+  /* Perform the operation */                                                           \
+  foreign "C" mp_fun(mp_result1 "ptr",mp_result2 "ptr",mp_tmp1 "ptr",mp_tmp2 "ptr") [];    \
+                                                                                        \
+  RET_NPNP(TO_W_(MP_INT__mp_size(mp_result1)),                                          \
+           MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords,                               \
+          TO_W_(MP_INT__mp_size(mp_result2)),                                          \
+           MP_INT__mp_d(mp_result2) - SIZEOF_StgArrWords);                              \
 }
 
 GMP_TAKE2_RET1(plusIntegerzh_fast,     mpz_add)
@@ -634,17 +655,20 @@ GMP_TAKE1_RET1(complementIntegerzh_fast, mpz_com)
 GMP_TAKE2_RET2(quotRemIntegerzh_fast, mpz_tdiv_qr)
 GMP_TAKE2_RET2(divModIntegerzh_fast,  mpz_fdiv_qr)
 
+#ifndef THREADED_RTS
 section "bss" {
-  aa:  W_; // NB. aa is really an mp_limb_t
+  mp_tmp_w:  W_; // NB. mp_tmp_w is really an mp_limb_t here
 }
+#endif
 
 gcdIntzh_fast
 {
     /* R1 = the first Int#; R2 = the second Int# */
     W_ r; 
+    FETCH_MP_TEMP(mp_tmp_w);
 
-    W_[aa] = R1;
-    r = foreign "C" mpn_gcd_1(aa, 1, R2);
+    W_[mp_tmp_w] = R1;
+    r = foreign "C" mpn_gcd_1(mp_tmp_w "ptr", 1, R2) [];
 
     R1 = r;
     /* Result parked in R1, return via info-pointer at TOS */
@@ -655,7 +679,7 @@ gcdIntzh_fast
 gcdIntegerIntzh_fast
 {
     /* R1 = s1; R2 = d1; R3 = the int */
-    R1 = foreign "C" mpn_gcd_1( BYTE_ARR_CTS(R2) "ptr", R1, R3);
+    R1 = foreign "C" mpn_gcd_1( BYTE_ARR_CTS(R2) "ptr", R1, R3) [];
     
     /* Result parked in R1, return via info-pointer at TOS */
     jump %ENTRY_CODE(Sp(0));
@@ -736,14 +760,14 @@ cmpIntegerzh_fast
     up = BYTE_ARR_CTS(R2);
     vp = BYTE_ARR_CTS(R4);
 
-    cmp = foreign "C" mpn_cmp(up "ptr", vp "ptr", size);
+    cmp = foreign "C" mpn_cmp(up "ptr", vp "ptr", size) [];
 
-    if (cmp == 0) {
+    if (cmp == 0 :: CInt) {
        R1 = 0; 
        jump %ENTRY_CODE(Sp(0));
     }
 
-    if (%lt(cmp,0) == %lt(usize,0)) {
+    if (%lt(cmp,0 :: CInt) == %lt(usize,0)) {
        R1 = 1;
     } else {
        R1 = (-1); 
@@ -790,14 +814,12 @@ integer2Wordzh_fast
   jump %ENTRY_CODE(Sp(0));
 }
 
-section "bss" {
-  exponent:  W_;
-}
-
 decodeFloatzh_fast
 { 
     W_ p;
     F_ arg;
+    FETCH_MP_TEMP(mp_tmp1);
+    FETCH_MP_TEMP(mp_tmp_w);
     
     /* arguments: F1 = Float# */
     arg = F1;
@@ -812,10 +834,10 @@ decodeFloatzh_fast
     MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(p);
     
     /* Perform the operation */
-    foreign "C" __decodeFloat(mp_tmp1,exponent,arg);
+    foreign "C" __decodeFloat(mp_tmp1 "ptr",mp_tmp_w "ptr" ,arg) [];
     
     /* returns: (Int# (expn), Int#, ByteArray#) */
-    RET_NNP(W_[exponent], MP_INT__mp_size(mp_tmp1), p);
+    RET_NNP(W_[mp_tmp_w], TO_W_(MP_INT__mp_size(mp_tmp1)), p);
 }
 
 #define DOUBLE_MANTISSA_SIZE SIZEOF_DOUBLE
@@ -825,6 +847,8 @@ decodeDoublezh_fast
 { 
     D_ arg;
     W_ p;
+    FETCH_MP_TEMP(mp_tmp1);
+    FETCH_MP_TEMP(mp_tmp_w);
 
     /* arguments: D1 = Double# */
     arg = D1;
@@ -839,10 +863,10 @@ decodeDoublezh_fast
     MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(p);
 
     /* Perform the operation */
-    foreign "C" __decodeDouble(mp_tmp1,exponent,arg);
+    foreign "C" __decodeDouble(mp_tmp1 "ptr", mp_tmp_w "ptr",arg) [];
     
     /* returns: (Int# (expn), Int#, ByteArray#) */
-    RET_NNP(W_[exponent], MP_INT__mp_size(mp_tmp1), p);
+    RET_NNP(W_[mp_tmp_w], TO_W_(MP_INT__mp_size(mp_tmp1)), p);
 }
 
 /* -----------------------------------------------------------------------------
@@ -856,12 +880,13 @@ forkzh_fast
   MAYBE_GC(R1_PTR, forkzh_fast);
 
   // create it right now, return ThreadID in R1
-  "ptr" R1 = foreign "C" createIOThread( RtsFlags_GcFlags_initialStkSize(RtsFlags), 
-                                  R1 "ptr");
-  foreign "C" scheduleThread(R1 "ptr");
+  "ptr" R1 = foreign "C" createIOThread( MyCapability() "ptr", 
+                               RtsFlags_GcFlags_initialStkSize(RtsFlags), 
+                               R1 "ptr") [R1];
+  foreign "C" scheduleThread(MyCapability() "ptr", R1 "ptr") [R1];
 
   // switch at the earliest opportunity
-  CInt[context_switch] = 1;
+  CInt[context_switch] = 1 :: CInt;
   
   RET_P(R1);
 }
@@ -883,7 +908,7 @@ labelThreadzh_fast
        R1 = ThreadId#
        R2 = Addr# */
 #ifdef DEBUG
-  foreign "C" labelThread(R1 "ptr", R2 "ptr");
+  foreign "C" labelThread(R1 "ptr", R2 "ptr") [];
 #endif
   jump %ENTRY_CODE(Sp(0));
 }
@@ -892,10 +917,468 @@ isCurrentThreadBoundzh_fast
 {
   /* no args */
   W_ r;
-  r = foreign "C" isThreadBound(CurrentTSO);
+  r = foreign "C" isThreadBound(CurrentTSO) [];
   RET_N(r);
 }
 
+
+/* -----------------------------------------------------------------------------
+ * TVar primitives
+ * -------------------------------------------------------------------------- */
+
+#ifdef REG_R1
+#define SP_OFF 0
+#define IF_NOT_REG_R1(x) 
+#else
+#define SP_OFF 1
+#define IF_NOT_REG_R1(x) x
+#endif
+
+// Catch retry frame ------------------------------------------------------------
+
+#define CATCH_RETRY_FRAME_ERROR(label) \
+  label { foreign "C" barf("catch_retry_frame incorrectly entered!"); }
+
+CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_0_ret)
+CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_1_ret)
+CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_2_ret)
+CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_3_ret)
+CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_4_ret)
+CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_5_ret)
+CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_6_ret)
+CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_7_ret)
+
+#if MAX_VECTORED_RTN > 8
+#error MAX_VECTORED_RTN has changed: please modify stg_catch_retry_frame too.
+#endif
+
+#if defined(PROFILING)
+#define CATCH_RETRY_FRAME_BITMAP 7
+#define CATCH_RETRY_FRAME_WORDS  6
+#else
+#define CATCH_RETRY_FRAME_BITMAP 1
+#define CATCH_RETRY_FRAME_WORDS  4
+#endif
+
+INFO_TABLE_RET(stg_catch_retry_frame,
+              CATCH_RETRY_FRAME_WORDS, CATCH_RETRY_FRAME_BITMAP,
+              CATCH_RETRY_FRAME,
+              stg_catch_retry_frame_0_ret,
+              stg_catch_retry_frame_1_ret,
+              stg_catch_retry_frame_2_ret,
+              stg_catch_retry_frame_3_ret,
+              stg_catch_retry_frame_4_ret,
+              stg_catch_retry_frame_5_ret,
+              stg_catch_retry_frame_6_ret,
+              stg_catch_retry_frame_7_ret)
+{  // Return code for a catch-retry frame: commit the nested transaction, or restart one of the frame's two branches.
+   W_ r, frame, trec, outer;
+   IF_NOT_REG_R1(W_ rval;  rval = Sp(0);  Sp_adj(1); )  // no R1 register: save the word at Sp(0) (presumably the return value) and pop it
+
+   frame = Sp;  // Sp now addresses the catch-retry frame whose fields are read below
+   trec = StgTSO_trec(CurrentTSO);
+   "ptr" outer = foreign "C" stmGetEnclosingTRec(trec "ptr") [];  // looked up before the commit attempt; needed on both paths
+   r = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr") [];
+   if (r) {
+     /* Succeeded (either first branch or second branch) */
+     StgTSO_trec(CurrentTSO) = outer;  // the thread continues in the enclosing transaction
+     Sp = Sp + SIZEOF_StgCatchRetryFrame;  // pop this frame
+     IF_NOT_REG_R1(Sp_adj(-1); Sp(0) = rval;)  // no R1 register: push the saved word back
+     jump %ENTRY_CODE(Sp(SP_OFF));  // return to the frame underneath
+   } else {
+     /* Did not commit: retry */
+     W_ new_trec;
+     "ptr" new_trec = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];  // fresh transaction started with outer as its second argument
+     StgTSO_trec(CurrentTSO) = new_trec;
+     if (StgCatchRetryFrame_running_alt_code(frame)) {  // re-run whichever branch the frame says was running
+       R1 = StgCatchRetryFrame_alt_code(frame);
+     } else {
+       R1 = StgCatchRetryFrame_first_code(frame);
+       StgCatchRetryFrame_first_code_trec(frame) = new_trec;  // remember the trec the first branch is running in
+     }
+     jump stg_ap_v_fast;  // run the code selected in R1; the frame stays on the stack
+   }
+}
+
+
+// Atomically frame -------------------------------------------------------------
+
+
+#define ATOMICALLY_FRAME_ERROR(label) \
+  label { foreign "C" barf("atomically_frame incorrectly entered!"); }
+
+ATOMICALLY_FRAME_ERROR(stg_atomically_frame_0_ret)
+ATOMICALLY_FRAME_ERROR(stg_atomically_frame_1_ret)
+ATOMICALLY_FRAME_ERROR(stg_atomically_frame_2_ret)
+ATOMICALLY_FRAME_ERROR(stg_atomically_frame_3_ret)
+ATOMICALLY_FRAME_ERROR(stg_atomically_frame_4_ret)
+ATOMICALLY_FRAME_ERROR(stg_atomically_frame_5_ret)
+ATOMICALLY_FRAME_ERROR(stg_atomically_frame_6_ret)
+ATOMICALLY_FRAME_ERROR(stg_atomically_frame_7_ret)
+
+#if MAX_VECTORED_RTN > 8
+#error MAX_VECTORED_RTN has changed: please modify stg_atomically_frame too.
+#endif
+
+#if defined(PROFILING)
+#define ATOMICALLY_FRAME_BITMAP 3
+#define ATOMICALLY_FRAME_WORDS  3
+#else
+#define ATOMICALLY_FRAME_BITMAP 0
+#define ATOMICALLY_FRAME_WORDS  1
+#endif
+
+
+INFO_TABLE_RET(stg_atomically_frame,
+              ATOMICALLY_FRAME_WORDS, ATOMICALLY_FRAME_BITMAP,
+              ATOMICALLY_FRAME,
+              stg_atomically_frame_0_ret,
+              stg_atomically_frame_1_ret,
+              stg_atomically_frame_2_ret,
+              stg_atomically_frame_3_ret,
+              stg_atomically_frame_4_ret,
+              stg_atomically_frame_5_ret,
+              stg_atomically_frame_6_ret,
+              stg_atomically_frame_7_ret)
+{  // Return code for an atomically frame: commit the thread's transaction, or restart the frame's code in a new one.
+  W_ frame, trec, valid;
+  IF_NOT_REG_R1(W_ rval;  rval = Sp(0);  Sp_adj(1); )  // no R1 register: save the word at Sp(0) (presumably the return value) and pop it
+
+  frame = Sp;  // Sp now addresses the atomically frame
+  trec = StgTSO_trec(CurrentTSO);
+
+  /* The TSO is not currently waiting: try to commit the transaction */
+  valid = foreign "C" stmCommitTransaction(MyCapability() "ptr", trec "ptr") [];
+  if (valid) {
+    /* Transaction was valid: commit succeeded */
+    StgTSO_trec(CurrentTSO) = NO_TREC;  // the thread is no longer inside a transaction
+    Sp = Sp + SIZEOF_StgAtomicallyFrame;  // pop this frame
+    IF_NOT_REG_R1(Sp_adj(-1); Sp(0) = rval;)  // no R1 register: push the saved word back
+    jump %ENTRY_CODE(Sp(SP_OFF));  // return to the frame underneath
+  } else {
+    /* Transaction was not valid: try again */
+    "ptr" trec = foreign "C" stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr") [];  // new transaction, started with NO_TREC as its second argument
+    StgTSO_trec(CurrentTSO) = trec;
+    R1 = StgAtomicallyFrame_code(frame);  // the code saved in the frame
+    jump stg_ap_v_fast;  // run it again; the frame stays on the stack
+  }
+}
+
+INFO_TABLE_RET(stg_atomically_waiting_frame,
+              ATOMICALLY_FRAME_WORDS, ATOMICALLY_FRAME_BITMAP,
+              ATOMICALLY_FRAME,
+              stg_atomically_frame_0_ret,
+              stg_atomically_frame_1_ret,
+              stg_atomically_frame_2_ret,
+              stg_atomically_frame_3_ret,
+              stg_atomically_frame_4_ret,
+              stg_atomically_frame_5_ret,
+              stg_atomically_frame_6_ret,
+              stg_atomically_frame_7_ret)
+{  // Same layout and vectored entries as stg_atomically_frame; entered while the thread is waiting on its transaction.
+  W_ frame, trec, valid;
+  IF_NOT_REG_R1(W_ rval;  rval = Sp(0);  Sp_adj(1); )  // no R1 register: pop the word at Sp(0); rval is not used again in this block
+
+  frame = Sp;  // Sp now addresses the atomically frame
+
+  /* The TSO is currently waiting: should we stop waiting? */
+  valid = foreign "C" stmReWait(MyCapability() "ptr", CurrentTSO "ptr") [];
+  if (valid) {
+    /* Previous attempt is still valid: no point trying again yet */
+         IF_NOT_REG_R1(Sp_adj(-2);
+                       Sp(1) = stg_NO_FINALIZER_closure;
+                       Sp(0) = stg_ut_1_0_unreg_info;)  // no R1 register: push two words before blocking (their role is not visible in this file section)
+    jump stg_block_noregs;  // keep waiting
+  } else {
+    /* Previous attempt is no longer valid: try again */
+    "ptr" trec = foreign "C" stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr") [];  // new transaction, started with NO_TREC as its second argument
+    StgTSO_trec(CurrentTSO) = trec;
+    StgHeader_info(frame) = stg_atomically_frame_info;  // turn the frame back into an ordinary (non-waiting) atomically frame
+    R1 = StgAtomicallyFrame_code(frame);  // the code saved in the frame
+    jump stg_ap_v_fast;  // run it again; the frame stays on the stack
+  }
+}
+
+// STM catch frame --------------------------------------------------------------
+
+#define CATCH_STM_FRAME_ENTRY_TEMPLATE(label,ret)          \
+   label                                                   \
+   {                                                       \
+      IF_NOT_REG_R1(W_ rval;  rval = Sp(0);  Sp_adj(1); )  \
+      Sp = Sp + SIZEOF_StgCatchSTMFrame;                   \
+      IF_NOT_REG_R1(Sp_adj(-1); Sp(0) = rval;)             \
+      jump ret;                                            \
+   }
+
+#ifdef REG_R1
+#define SP_OFF 0
+#else
+#define SP_OFF 1
+#endif
+
+CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_0_ret,%RET_VEC(Sp(SP_OFF),0))
+CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_1_ret,%RET_VEC(Sp(SP_OFF),1))
+CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_2_ret,%RET_VEC(Sp(SP_OFF),2))
+CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_3_ret,%RET_VEC(Sp(SP_OFF),3))
+CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_4_ret,%RET_VEC(Sp(SP_OFF),4))
+CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_5_ret,%RET_VEC(Sp(SP_OFF),5))
+CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_6_ret,%RET_VEC(Sp(SP_OFF),6))
+CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_7_ret,%RET_VEC(Sp(SP_OFF),7))
+
+#if MAX_VECTORED_RTN > 8
+#error MAX_VECTORED_RTN has changed: please modify stg_catch_stm_frame too.
+#endif
+
+#if defined(PROFILING)
+#define CATCH_STM_FRAME_BITMAP 3
+#define CATCH_STM_FRAME_WORDS  3
+#else /* !PROFILING */
+#define CATCH_STM_FRAME_BITMAP 0
+#define CATCH_STM_FRAME_WORDS  1
+#endif /* PROFILING */
+
+/* Catch frames are very similar to update frames, but when entering
+ * one we just pop the frame off the stack and perform the correct
+ * kind of return to the activation record underneath us on the stack.
+ *
+ * The info table below is declared with CATCH_STM_FRAME_WORDS and
+ * CATCH_STM_FRAME_BITMAP (3 and 3 under PROFILING, otherwise 1 and 0),
+ * the frame type CATCH_STM_FRAME, and the eight vectored return entries
+ * stg_catch_stm_frame_0_ret .. stg_catch_stm_frame_7_ret.
+ * NOTE(review): the larger PROFILING values presumably account for extra
+ * header words in profiled frames -- confirm.
+ *
+ * The template instantiated with an empty label right after INFO_TABLE_RET
+ * supplies the code that immediately follows the info table: it pops the
+ * frame and jumps to %ENTRY_CODE(Sp(SP_OFF)).
+ */
+
+INFO_TABLE_RET(stg_catch_stm_frame,
+              CATCH_STM_FRAME_WORDS, CATCH_STM_FRAME_BITMAP,
+              CATCH_STM_FRAME,
+              stg_catch_stm_frame_0_ret,
+              stg_catch_stm_frame_1_ret,
+              stg_catch_stm_frame_2_ret,
+              stg_catch_stm_frame_3_ret,
+              stg_catch_stm_frame_4_ret,
+              stg_catch_stm_frame_5_ret,
+              stg_catch_stm_frame_6_ret,
+              stg_catch_stm_frame_7_ret)
+CATCH_STM_FRAME_ENTRY_TEMPLATE(,%ENTRY_CODE(Sp(SP_OFF)))
+
+
+// Primop definition ------------------------------------------------------------
+
+atomicallyzh_fast
+{
+  W_ frame;
+  W_ old_trec;
+  W_ new_trec;
+  /* Entry code for the atomically primop: runs the STM action in R1 inside
+   * a newly started transaction.
+   *
+   * Pushes an StgAtomicallyFrame that records the action, starts a
+   * transaction, installs it as the current TSO's trec, and applies R1.
+   * If the TSO already has a trec, the NestedAtomically exception closure
+   * is raised instead.
+   */
+  // stmStartTransaction may allocate
+  MAYBE_GC (R1_PTR, atomicallyzh_fast); 
+
+  /* Args: R1 = m :: STM a
+   *
+   * The stack check asks for the size of the frame pushed below plus one
+   * extra word.
+   * NOTE(review): presumably on failure the primop is re-entered with R1
+   * kept live as a pointer -- confirm against STK_CHK_GEN. */
+  STK_CHK_GEN(SIZEOF_StgAtomicallyFrame + WDS(1), R1_PTR, atomicallyzh_fast);
+
+  old_trec = StgTSO_trec(CurrentTSO);
+
+  /* Nested transactions are not allowed; raise an exception.
+   * A trec other than NO_TREC on the current TSO is what marks a
+   * transaction as already in progress. */
+  if (old_trec != NO_TREC) {
+     R1 = GHCziIOBase_NestedAtomically_closure;
+     jump raisezh_fast;
+  }
+
+  /* Set up the atomically frame.  The action is stored in the frame;
+   * retryzh_fast reloads it from StgAtomicallyFrame_code to run it again. */
+  Sp = Sp - SIZEOF_StgAtomicallyFrame;
+  frame = Sp;
+
+  SET_HDR(frame,stg_atomically_frame_info, W_[CCCS]);
+  StgAtomicallyFrame_code(frame) = R1;
+
+  /* Start the memory transaction.  old_trec is NO_TREC at this point (the
+   * nested case jumped away above), so the new trec has no enclosing one. */
+  "ptr" new_trec = foreign "C" stmStartTransaction(MyCapability() "ptr", old_trec "ptr") [R1];
+  StgTSO_trec(CurrentTSO) = new_trec;
+
+  /* Apply R1 to the realworld token */
+  jump stg_ap_v_fast;
+}
+
+
+catchSTMzh_fast
+{
+  W_ frame;
+  /* Entry code for the catchSTM primop: runs the STM action in R1 with the
+   * handler in R2 recorded in a catch-STM frame on the stack.
+   *
+   * Only the stack is changed here: an StgCatchSTMFrame holding the handler
+   * is pushed and R1 is applied.  No transaction record is created or
+   * modified by this code.
+   */
+  /* Args: R1 :: STM a */
+  /* Args: R2 :: Exception -> STM a
+   *
+   * The stack check asks for the size of the frame pushed below plus one
+   * extra word, and names both R1 and R2 as pointers. */
+  STK_CHK_GEN(SIZEOF_StgCatchSTMFrame + WDS(1), R1_PTR & R2_PTR, catchSTMzh_fast);
+
+  /* Set up the catch frame */
+  Sp = Sp - SIZEOF_StgCatchSTMFrame;
+  frame = Sp;
+
+  SET_HDR(frame, stg_catch_stm_frame_info, W_[CCCS]);
+  StgCatchSTMFrame_handler(frame) = R2;
+
+  /* Apply R1 to the realworld token */
+  jump stg_ap_v_fast;
+}
+
+
+catchRetryzh_fast
+{
+  W_ frame;
+  W_ new_trec;
+  W_ trec;
+  /* Entry code for the catchRetry primop: runs the first STM action (R1) in
+   * a nested transaction, leaving a catch-retry frame on the stack that
+   * records both actions and the nested trec.  retryzh_fast consults that
+   * frame to switch to the alternative (R2) when the first action retries.
+   */
+  // stmStartTransaction may allocate
+  MAYBE_GC (R1_PTR & R2_PTR, catchRetryzh_fast); 
+
+  /* Args: R1 :: STM a */
+  /* Args: R2 :: STM a
+   *
+   * The stack check asks for the size of the frame pushed below plus one
+   * extra word, and names both R1 and R2 as pointers. */
+  STK_CHK_GEN(SIZEOF_StgCatchRetryFrame + WDS(1), R1_PTR & R2_PTR, catchRetryzh_fast);
+
+  /* Start a nested transaction within which to run the first code.
+   * The TSO's current trec is passed as the enclosing transaction. */
+  trec = StgTSO_trec(CurrentTSO);
+  "ptr" new_trec = foreign "C" stmStartTransaction(MyCapability() "ptr", trec "ptr") [R1,R2];
+  StgTSO_trec(CurrentTSO) = new_trec;
+
+  /* Set up the catch-retry frame */
+  Sp = Sp - SIZEOF_StgCatchRetryFrame;
+  frame = Sp;
+  
+  SET_HDR(frame, stg_catch_retry_frame_info, W_[CCCS]);
+  StgCatchRetryFrame_running_alt_code(frame) = 0 :: CInt; // false;
+  StgCatchRetryFrame_first_code(frame) = R1;
+  StgCatchRetryFrame_alt_code(frame) = R2;
+  StgCatchRetryFrame_first_code_trec(frame) = new_trec; // read back by retryzh_fast
+
+  /* Apply R1 to the realworld token */
+  jump stg_ap_v_fast;
+}
+
+
+retryzh_fast
+{
+  W_ frame_type;
+  W_ frame;
+  W_ trec;
+  W_ outer;
+  W_ r;
+  /* Entry code for the retry primop.  Takes no arguments.
+   *
+   * Locates the nearest enclosing CATCH_RETRY_FRAME or ATOMICALLY_FRAME
+   * (via findRetryFrameHelper) and then does one of:
+   *   - switches a catch-retry from its first code to its alternative,
+   *   - merges both branches and propagates the retry further out,
+   *   - restarts the catch-retry from its first code if the merge fails,
+   *   - blocks the thread (stmWait succeeded), or
+   *   - re-runs the atomic block immediately (stmWait failed).
+   */
+  MAYBE_GC (NO_PTRS, retryzh_fast); // STM operations may allocate
+
+  /* Find the enclosing ATOMICALLY_FRAME or CATCH_RETRY_FRAME.
+   *
+   * findRetryFrameHelper is handed only the TSO, so Sp is stored into the
+   * TSO before the call and reloaded from it afterwards; the reloaded Sp is
+   * then used as the address of the frame that was found.
+   * `outer` is the trec enclosing the TSO's current one.
+   */
+retry_pop_stack:
+  trec = StgTSO_trec(CurrentTSO);
+  "ptr" outer = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
+  StgTSO_sp(CurrentTSO) = Sp;
+  frame_type = foreign "C" findRetryFrameHelper(CurrentTSO "ptr") [];
+  Sp = StgTSO_sp(CurrentTSO);
+  frame = Sp;
+
+  if (frame_type == CATCH_RETRY_FRAME) {
+    // The retry reaches a CATCH_RETRY_FRAME before the atomic frame
+    ASSERT(outer != NO_TREC);
+    if (!StgCatchRetryFrame_running_alt_code(frame)) {
+      /* Retry in the first code: try the alternative, in a new transaction
+       * nested in `outer`.  The first code's trec is neither committed nor
+       * aborted here; it stays recorded in the frame's first_code_trec
+       * (stored by catchRetryzh_fast) and is dealt with in the branch
+       * below if the alternative retries too. */
+      "ptr" trec = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
+      StgTSO_trec(CurrentTSO) = trec;
+      StgCatchRetryFrame_running_alt_code(frame) = 1 :: CInt; // true;
+      R1 = StgCatchRetryFrame_alt_code(frame);
+      jump stg_ap_v_fast;
+    } else {
+      /* Retry in the alternative code: propagate.
+       * First try to commit the first code's trec (other_trec); only if
+       * that succeeds, try to commit the alternative's trec as well.  If
+       * the first commit fails, the alternative's trec is aborted. */
+      W_ other_trec;
+      other_trec = StgCatchRetryFrame_first_code_trec(frame);
+      r = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", other_trec "ptr") [];
+      if (r) {
+        r = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr") [];
+      } else {
+        foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr") [];
+      }
+      if (r) {
+        // Merge between siblings succeeded: commit it back to enclosing transaction
+        // and then propagate the retry
+        StgTSO_trec(CurrentTSO) = outer;
+        Sp = Sp + SIZEOF_StgCatchRetryFrame; // pop this frame, then search again from below it
+        goto retry_pop_stack;
+      } else {
+        /* Merge failed: we mustn't propagate the retry.  Try both paths
+         * again, starting with the first code in a new transaction nested
+         * in `outer`; the frame is reset to match. */
+        "ptr" trec = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
+        StgCatchRetryFrame_first_code_trec(frame) = trec;
+        StgCatchRetryFrame_running_alt_code(frame) = 0 :: CInt; // false;
+        StgTSO_trec(CurrentTSO) = trec;
+        R1 = StgCatchRetryFrame_first_code(frame);
+        jump stg_ap_v_fast;
+      }
+    }
+  }
+
+  // We've reached the ATOMICALLY_FRAME: attempt to wait 
+  ASSERT(frame_type == ATOMICALLY_FRAME);
+  ASSERT(outer == NO_TREC);
+  r = foreign "C" stmWait(MyCapability() "ptr", CurrentTSO "ptr", trec "ptr") [];
+  if (r) {
+    // Transaction was valid: stmWait put us on the TVars' queues, we now block
+    StgHeader_info(frame) = stg_atomically_waiting_frame_info; // retag the frame as a waiting frame
+    Sp = frame;
+    // Fix up the stack in the unregisterised case: the return convention is different.
+    IF_NOT_REG_R1(Sp_adj(-2); 
+                 Sp(1) = stg_NO_FINALIZER_closure;
+                 Sp(0) = stg_ut_1_0_unreg_info;)
+    R3 = trec; // passing to stmWaitUnblock()
+    jump stg_block_stmwait;
+  } else {
+    /* Transaction was not valid: retry immediately.  `outer` is NO_TREC
+     * here (asserted above); the action stored in the atomically frame is
+     * run again under a newly started trec. */
+    "ptr" trec = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
+    StgTSO_trec(CurrentTSO) = trec;
+    R1 = StgAtomicallyFrame_code(frame);
+    Sp = frame;
+    jump stg_ap_v_fast;
+  }
+}
+
+
+newTVarzh_fast
+{
+  W_ tv;
+  W_ new_value;
+
+  /* Args: R1 = initialisation value
+   *
+   * Entry code for the newTVar primop: calls stmNewTVar with the current
+   * capability and the initial value, and returns the resulting pointer
+   * with RET_P.
+   *
+   * R1 is copied into a local before the foreign call, whose live-register
+   * list is empty.
+   * NOTE(review): the MAYBE_GC is presumably needed because stmNewTVar
+   * allocates -- confirm. */
+
+  MAYBE_GC (R1_PTR, newTVarzh_fast); 
+  new_value = R1;
+  "ptr" tv = foreign "C" stmNewTVar(MyCapability() "ptr", new_value "ptr") [];
+  RET_P(tv);
+}
+
+
+readTVarzh_fast
+{
+  W_ trec;
+  W_ tvar;
+  W_ result;
+
+  /* Args: R1 = TVar closure
+   *
+   * Entry code for the readTVar primop: calls stmReadTVar with the current
+   * capability, the current TSO's trec and the TVar, and returns the
+   * resulting pointer with RET_P.
+   *
+   * R1 is copied into a local before the foreign call, whose live-register
+   * list is empty. */
+
+  MAYBE_GC (R1_PTR, readTVarzh_fast); // Call to stmReadTVar may allocate
+  trec = StgTSO_trec(CurrentTSO);
+  tvar = R1;
+  "ptr" result = foreign "C" stmReadTVar(MyCapability() "ptr", trec "ptr", tvar "ptr") [];
+
+  RET_P(result);
+}
+
+
+writeTVarzh_fast
+{
+  W_ trec;
+  W_ tvar;
+  W_ new_value;
+  /* Entry code for the writeTVar primop: calls stmWriteTVar with the
+   * current capability, the current TSO's trec, the TVar and the new value.
+   * No result value is produced; control is passed to the entry code of the
+   * frame at Sp(0). */
+  /* Args: R1 = TVar closure */
+  /*       R2 = New value    */
+
+  MAYBE_GC (R1_PTR & R2_PTR, writeTVarzh_fast); // Call to stmWriteTVar may allocate
+  trec = StgTSO_trec(CurrentTSO);
+  /* Both arguments are copied into locals before the foreign call, whose
+   * live-register list is empty. */
+  tvar = R1;
+  new_value = R2;
+  foreign "C" stmWriteTVar(MyCapability() "ptr", trec "ptr", tvar "ptr", new_value "ptr") [];
+
+  jump %ENTRY_CODE(Sp(0));
+}
+
+
 /* -----------------------------------------------------------------------------
  * MVar primitives
  *
@@ -970,7 +1453,6 @@ newMVarzh_fast
     StgTSO_sp(tso) = StgTSO_sp(tso) + WDS(3);  \
     lval = W_[StgTSO_sp(tso) - WDS(1)];
 
-
 takeMVarzh_fast
 {
     W_ mvar, val, info, tso;
@@ -978,7 +1460,11 @@ takeMVarzh_fast
     /* args: R1 = MVar closure */
     mvar = R1;
 
+#if defined(THREADED_RTS)
+    "ptr" info = foreign "C" lockClosure(mvar "ptr") [];
+#else
     info = GET_INFO(mvar);
+#endif
 
     /* If the MVar is empty, put ourselves on its blocking queue,
      * and wait until we're woken up.
@@ -1010,29 +1496,38 @@ takeMVarzh_fast
       /* actually perform the putMVar for the thread that we just woke up */
       tso = StgMVar_head(mvar);
       PerformPut(tso,StgMVar_value(mvar));
+      foreign "C" dirtyTSO(tso "ptr") [];
 
 #if defined(GRAN) || defined(PAR)
       /* ToDo: check 2nd arg (mvar) is right */
-      "ptr" tso = foreign "C" unblockOne(StgMVar_head(mvar),mvar);
+      "ptr" tso = foreign "C" unblockOne(StgMVar_head(mvar),mvar) [];
       StgMVar_head(mvar) = tso;
 #else
-      "ptr" tso = foreign "C" unblockOne(StgMVar_head(mvar) "ptr");
+      "ptr" tso = foreign "C" unblockOne(MyCapability() "ptr", 
+                                        StgMVar_head(mvar) "ptr") [];
       StgMVar_head(mvar) = tso;
 #endif
+
       if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
          StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
       }
+
+#if defined(THREADED_RTS)
+      foreign "C" unlockClosure(mvar "ptr", stg_FULL_MVAR_info) [];
+#endif
       RET_P(val);
   } 
   else
   {
       /* No further putMVars, MVar is now empty */
-      
-      /* do this last... we might have locked the MVar in the SMP case,
-       * and writing the info pointer will unlock it.
-       */
-      SET_INFO(mvar,stg_EMPTY_MVAR_info);
       StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
+#if defined(THREADED_RTS)
+      foreign "C" unlockClosure(mvar "ptr", stg_EMPTY_MVAR_info) [];
+#else
+      SET_INFO(mvar,stg_EMPTY_MVAR_info);
+#endif
+
       RET_P(val);
   }
 }
@@ -1046,9 +1541,16 @@ tryTakeMVarzh_fast
 
     mvar = R1;
 
+#if defined(THREADED_RTS)
+    "ptr" info = foreign "C" lockClosure(mvar "ptr") [];
+#else
     info = GET_INFO(mvar);
+#endif
 
     if (info == stg_EMPTY_MVAR_info) {
+#if defined(THREADED_RTS)
+        foreign "C" unlockClosure(mvar "ptr", stg_EMPTY_MVAR_info) [];
+#endif
        /* HACK: we need a pointer to pass back, 
         * so we abuse NO_FINALIZER_closure
         */
@@ -1059,6 +1561,7 @@ tryTakeMVarzh_fast
     val = StgMVar_value(mvar);
 
     if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {
+
        /* There are putMVar(s) waiting... 
         * wake up the first thread on the queue
         */
@@ -1067,29 +1570,34 @@ tryTakeMVarzh_fast
        /* actually perform the putMVar for the thread that we just woke up */
        tso = StgMVar_head(mvar);
        PerformPut(tso,StgMVar_value(mvar));
+        foreign "C" dirtyTSO(tso "ptr") [];
 
 #if defined(GRAN) || defined(PAR)
        /* ToDo: check 2nd arg (mvar) is right */
-       "ptr" tso = foreign "C" unblockOne(StgMVar_head(mvar) "ptr", mvar "ptr");
+       "ptr" tso = foreign "C" unblockOne(StgMVar_head(mvar) "ptr", mvar "ptr") [];
        StgMVar_head(mvar) = tso;
 #else
-       "ptr" tso = foreign "C" unblockOne(StgMVar_head(mvar) "ptr");
+       "ptr" tso = foreign "C" unblockOne(MyCapability() "ptr",
+                                          StgMVar_head(mvar) "ptr") [];
        StgMVar_head(mvar) = tso;
 #endif
 
        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
        }
+#if defined(THREADED_RTS)
+        foreign "C" unlockClosure(mvar "ptr", stg_FULL_MVAR_info) [];
+#endif
     }
     else 
     {
        /* No further putMVars, MVar is now empty */
        StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
-       
-       /* do this last... we might have locked the MVar in the SMP case,
-        * and writing the info pointer will unlock it.
-        */
+#if defined(THREADED_RTS)
+       foreign "C" unlockClosure(mvar "ptr", stg_EMPTY_MVAR_info) [];
+#else
        SET_INFO(mvar,stg_EMPTY_MVAR_info);
+#endif
     }
     
     RET_NP(1, val);
@@ -1103,7 +1611,11 @@ putMVarzh_fast
     /* args: R1 = MVar, R2 = value */
     mvar = R1;
 
+#if defined(THREADED_RTS)
+    "ptr" info = foreign "C" lockClosure(mvar "ptr") [R2];
+#else
     info = GET_INFO(mvar);
+#endif
 
     if (info == stg_FULL_MVAR_info) {
        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
@@ -1120,6 +1632,7 @@ putMVarzh_fast
     }
   
     if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {
+
        /* There are takeMVar(s) waiting: wake up the first one
         */
        ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);
@@ -1127,13 +1640,14 @@ putMVarzh_fast
        /* actually perform the takeMVar */
        tso = StgMVar_head(mvar);
        PerformTake(tso, R2);
+        foreign "C" dirtyTSO(tso "ptr") [];
       
 #if defined(GRAN) || defined(PAR)
        /* ToDo: check 2nd arg (mvar) is right */
-       "ptr" tso = foreign "C" unblockOne(StgMVar_head(mvar) "ptr",mvar "ptr");
+       "ptr" tso = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr",mvar "ptr") [];
        StgMVar_head(mvar) = tso;
 #else
-       "ptr" tso = foreign "C" unblockOne(StgMVar_head(mvar) "ptr");
+       "ptr" tso = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr") [];
        StgMVar_head(mvar) = tso;
 #endif
 
@@ -1141,14 +1655,21 @@ putMVarzh_fast
            StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
        }
 
+#if defined(THREADED_RTS)
+       foreign "C" unlockClosure(mvar "ptr", stg_EMPTY_MVAR_info) [];
+#endif
        jump %ENTRY_CODE(Sp(0));
     }
     else
     {
        /* No further takes, the MVar is now full. */
        StgMVar_value(mvar) = R2;
-       /* unlocks the MVar in the SMP case */
+
+#if defined(THREADED_RTS)
+       foreign "C" unlockClosure(mvar "ptr", stg_FULL_MVAR_info) [];
+#else
        SET_INFO(mvar,stg_FULL_MVAR_info);
+#endif
        jump %ENTRY_CODE(Sp(0));
     }
     
@@ -1163,13 +1684,21 @@ tryPutMVarzh_fast
     /* args: R1 = MVar, R2 = value */
     mvar = R1;
 
+#if defined(THREADED_RTS)
+    "ptr" info = foreign "C" lockClosure(mvar "ptr") [R2];
+#else
     info = GET_INFO(mvar);
+#endif
 
     if (info == stg_FULL_MVAR_info) {
+#if defined(THREADED_RTS)
+       foreign "C" unlockClosure(mvar "ptr", stg_FULL_MVAR_info) [];
+#endif
        RET_N(0);
     }
   
     if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {
+
        /* There are takeMVar(s) waiting: wake up the first one
         */
        ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);
@@ -1177,13 +1706,14 @@ tryPutMVarzh_fast
        /* actually perform the takeMVar */
        tso = StgMVar_head(mvar);
        PerformTake(tso, R2);
+        foreign "C" dirtyTSO(tso "ptr") [];
       
 #if defined(GRAN) || defined(PAR)
        /* ToDo: check 2nd arg (mvar) is right */
-       "ptr" tso = foreign "C" unblockOne(StgMVar_head(mvar) "ptr",mvar "ptr");
+       "ptr" tso = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr",mvar "ptr") [];
        StgMVar_head(mvar) = tso;
 #else
-       "ptr" tso = foreign "C" unblockOne(StgMVar_head(mvar) "ptr");
+       "ptr" tso = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr") [];
        StgMVar_head(mvar) = tso;
 #endif
 
@@ -1191,17 +1721,23 @@ tryPutMVarzh_fast
            StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
        }
 
-       jump %ENTRY_CODE(Sp(0));
+#if defined(THREADED_RTS)
+       foreign "C" unlockClosure(mvar "ptr", stg_EMPTY_MVAR_info) [];
+#endif
     }
     else
     {
        /* No further takes, the MVar is now full. */
        StgMVar_value(mvar) = R2;
-       /* unlocks the MVar in the SMP case */
+
+#if defined(THREADED_RTS)
+       foreign "C" unlockClosure(mvar "ptr", stg_FULL_MVAR_info) [];
+#else
        SET_INFO(mvar,stg_FULL_MVAR_info);
-       jump %ENTRY_CODE(Sp(0));
+#endif
     }
     
+    RET_N(1);
     /* ToDo: yield afterward for better communication performance? */
 }
 
@@ -1216,18 +1752,18 @@ makeStableNamezh_fast
 
     ALLOC_PRIM( SIZEOF_StgStableName, R1_PTR, makeStableNamezh_fast );
   
-    index = foreign "C" lookupStableName(R1 "ptr");
+    index = foreign "C" lookupStableName(R1 "ptr") [];
 
     /* Is there already a StableName for this heap object?
-     *  stable_ptr_table is an array of snEntry structs.
+     *  stable_ptr_table is a pointer to an array of snEntry structs.
      */
-    if ( snEntry_sn_obj(stable_ptr_table + index*SIZEOF_snEntry) == NULL ) {
+    if ( snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry) == NULL ) {
        sn_obj = Hp - SIZEOF_StgStableName + WDS(1);
        SET_HDR(sn_obj, stg_STABLE_NAME_info, W_[CCCS]);
        StgStableName_sn(sn_obj) = index;
-       snEntry_sn_obj(stable_ptr_table + index*SIZEOF_snEntry) = sn_obj;
+       snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry) = sn_obj;
     } else {
-       sn_obj = snEntry_sn_obj(stable_ptr_table + index*SIZEOF_snEntry);
+       sn_obj = snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry);
     }
     
     RET_P(sn_obj);
@@ -1239,7 +1775,7 @@ makeStablePtrzh_fast
     /* Args: R1 = a */
     W_ sp;
     MAYBE_GC(R1_PTR, makeStablePtrzh_fast);
-    "ptr" sp = foreign "C" getStablePtr(R1 "ptr");
+    "ptr" sp = foreign "C" getStablePtr(R1 "ptr") [];
     RET_N(sp);
 }
 
@@ -1248,7 +1784,7 @@ deRefStablePtrzh_fast
     /* Args: R1 = the stable ptr */
     W_ r, sp;
     sp = R1;
-    r = snEntry_addr(stable_ptr_table + sp*SIZEOF_snEntry);
+    r = snEntry_addr(W_[stable_ptr_table] + sp*SIZEOF_snEntry);
     RET_P(r);
 }
 
@@ -1306,8 +1842,8 @@ mkApUpd0zh_fast
     // This function is *only* used to wrap zero-arity BCOs in an
     // updatable wrapper (see ByteCodeLink.lhs).  An AP thunk is always
     // saturated and always points directly to a FUN or BCO.
-    ASSERT(%INFO_TYPE(%GET_STD_INFO(R1)) == BCO::I16 &&
-          StgBCO_arity(R1) == 0::I16);
+    ASSERT(%INFO_TYPE(%GET_STD_INFO(R1)) == HALF_W_(BCO) &&
+          StgBCO_arity(R1) == HALF_W_(0));
 
     HP_CHK_GEN_TICKY(SIZEOF_StgAP, R1_PTR, mkApUpd0zh_fast);
     TICK_ALLOC_UP_THK(0, 0);
@@ -1316,7 +1852,7 @@ mkApUpd0zh_fast
     ap = Hp - SIZEOF_StgAP + WDS(1);
     SET_HDR(ap, stg_AP_info, W_[CCCS]);
     
-    StgAP_n_args(ap) = 0::I16;
+    StgAP_n_args(ap) = HALF_W_(0);
     StgAP_fun(ap) = R1;
     
     RET_P(ap);
@@ -1341,6 +1877,10 @@ mkApUpd0zh_fast
 waitReadzh_fast
 {
     /* args: R1 */
+#ifdef THREADED_RTS
+    foreign "C" barf("waitRead# on threaded RTS");
+#else
+
     ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
     StgTSO_why_blocked(CurrentTSO) = BlockedOnRead::I16;
     StgTSO_block_info(CurrentTSO) = R1;
@@ -1348,11 +1888,16 @@ waitReadzh_fast
     // threaded RTS anyway.
     APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
     jump stg_block_noregs;
+#endif
 }
 
 waitWritezh_fast
 {
     /* args: R1 */
+#ifdef THREADED_RTS
+    foreign "C" barf("waitWrite# on threaded RTS");
+#else
+
     ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
     StgTSO_why_blocked(CurrentTSO) = BlockedOnWrite::I16;
     StgTSO_block_info(CurrentTSO) = R1;
@@ -1360,24 +1905,29 @@ waitWritezh_fast
     // threaded RTS anyway.
     APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
     jump stg_block_noregs;
+#endif
 }
 
 
 STRING(stg_delayzh_malloc_str, "delayzh_fast")
 delayzh_fast
 {
-#ifdef mingw32_TARGET_OS
+#ifdef mingw32_HOST_OS
     W_ ares;
     CInt reqID;
 #else
     W_ t, prev, target;
 #endif
 
+#ifdef THREADED_RTS
+    foreign "C" barf("delay# on threaded RTS");
+#else
+
     /* args: R1 (microsecond delay amount) */
     ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
     StgTSO_why_blocked(CurrentTSO) = BlockedOnDelay::I16;
 
-#ifdef mingw32_TARGET_OS
+#ifdef mingw32_HOST_OS
 
     /* could probably allocate this on the heap instead */
     "ptr" ares = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
@@ -1394,12 +1944,13 @@ delayzh_fast
      */
     StgTSO_why_blocked(CurrentTSO) = BlockedOnDoProc::I16;
     APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
+    jump stg_block_async_void;
 
 #else
 
-    CInt time;
+    W_ time;
     time = foreign "C" getourtimeofday();
-    target = (R1 / (TICK_MILLISECS*1000)) + TO_W_(time);
+    target = (R1 / (TICK_MILLISECS*1000)) + time;
     StgTSO_block_info(CurrentTSO) = target;
 
     /* Insert the new thread in the sleeping queue. */
@@ -1418,33 +1969,39 @@ while:
     } else {
        StgTSO_link(prev) = CurrentTSO;
     }
-#endif
-
     jump stg_block_noregs;
+#endif
+#endif /* !THREADED_RTS */
 }
 
 
-#ifdef mingw32_TARGET_OS
+#ifdef mingw32_HOST_OS
 STRING(stg_asyncReadzh_malloc_str, "asyncReadzh_fast")
 asyncReadzh_fast
 {
     W_ ares;
     CInt reqID;
 
+#ifdef THREADED_RTS
+    foreign "C" barf("asyncRead# on threaded RTS");
+#else
+
     /* args: R1 = fd, R2 = isSock, R3 = len, R4 = buf */
     ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
     StgTSO_why_blocked(CurrentTSO) = BlockedOnRead::I16;
 
     /* could probably allocate this on the heap instead */
     "ptr" ares = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
-                                           stg_asyncReadzh_malloc_str);
-    reqID = foreign "C" addIORequest(R1, 0/*FALSE*/,R2,R3,R4 "ptr");
+                                           stg_asyncReadzh_malloc_str)
+                       [R1,R2,R3,R4];
+    reqID = foreign "C" addIORequest(R1, 0/*FALSE*/,R2,R3,R4 "ptr") [];
     StgAsyncIOResult_reqID(ares)   = reqID;
     StgAsyncIOResult_len(ares)     = 0;
     StgAsyncIOResult_errCode(ares) = 0;
     StgTSO_block_info(CurrentTSO)  = ares;
     APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
     jump stg_block_async;
+#endif
 }
 
 STRING(stg_asyncWritezh_malloc_str, "asyncWritezh_fast")
@@ -1453,13 +2010,18 @@ asyncWritezh_fast
     W_ ares;
     CInt reqID;
 
+#ifdef THREADED_RTS
+    foreign "C" barf("asyncWrite# on threaded RTS");
+#else
+
     /* args: R1 = fd, R2 = isSock, R3 = len, R4 = buf */
     ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
     StgTSO_why_blocked(CurrentTSO) = BlockedOnWrite::I16;
 
     "ptr" ares = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
-                                           stg_asyncWritezh_malloc_str);
-    reqID = foreign "C" addIORequest(R1, 1/*TRUE*/,R2,R3,R4 "ptr");
+                                           stg_asyncWritezh_malloc_str)
+                       [R1,R2,R3,R4];
+    reqID = foreign "C" addIORequest(R1, 1/*TRUE*/,R2,R3,R4 "ptr") [];
 
     StgAsyncIOResult_reqID(ares)   = reqID;
     StgAsyncIOResult_len(ares)     = 0;
@@ -1467,6 +2029,7 @@ asyncWritezh_fast
     StgTSO_block_info(CurrentTSO)  = ares;
     APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
     jump stg_block_async;
+#endif
 }
 
 STRING(stg_asyncDoProczh_malloc_str, "asyncDoProczh_fast")
@@ -1475,20 +2038,26 @@ asyncDoProczh_fast
     W_ ares;
     CInt reqID;
 
+#ifdef THREADED_RTS
+    foreign "C" barf("asyncDoProc# on threaded RTS");
+#else
+
     /* args: R1 = proc, R2 = param */
     ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
     StgTSO_why_blocked(CurrentTSO) = BlockedOnDoProc::I16;
 
     /* could probably allocate this on the heap instead */
     "ptr" ares = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
-                                           stg_asyncDoProczh_malloc_str);
-    reqID = foreign "C" addDoProcRequest(R1 "ptr",R2 "ptr");
+                                           stg_asyncDoProczh_malloc_str) 
+                               [R1,R2];
+    reqID = foreign "C" addDoProcRequest(R1 "ptr",R2 "ptr") [];
     StgAsyncIOResult_reqID(ares)   = reqID;
     StgAsyncIOResult_len(ares)     = 0;
     StgAsyncIOResult_errCode(ares) = 0;
     StgTSO_block_info(CurrentTSO) = ares;
     APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
     jump stg_block_async;
+#endif
 }
 #endif