[project @ 2003-07-23 13:39:11 by simonmar]
[ghc-hetmet.git] / ghc / rts / PrimOps.hc
index 8a2db25..45717e5 100644 (file)
@@ -1,12 +1,13 @@
 /* -----------------------------------------------------------------------------
- * $Id: PrimOps.hc,v 1.39 2000/01/13 14:34:03 hwloidl Exp $
+ * $Id: PrimOps.hc,v 1.110 2003/07/23 13:39:11 simonmar Exp $
  *
- * (c) The GHC Team, 1998-1999
+ * (c) The GHC Team, 1998-2002
  *
  * Primitive functions / data
  *
  * ---------------------------------------------------------------------------*/
 
+#include "Stg.h"
 #include "Rts.h"
 
 #include "RtsFlags.h"
 #include "Storage.h"
 #include "BlockAlloc.h" /* tmp */
 #include "StablePriv.h"
-#include "HeapStackCheck.h"
 #include "StgRun.h"
+#include "Timer.h"      /* TICK_MILLISECS */
+#include "Prelude.h"
+#ifndef mingw32_TARGET_OS
+#include "Itimer.h"    /* getourtimeofday() */
+#endif
+
+#ifdef HAVE_SYS_TYPES_H
+# include <sys/types.h>
+#endif
+
+#include <stdlib.h>
+
+#ifdef mingw32_TARGET_OS
+#include <windows.h>
+#include "win32/AsyncIO.h"
+#endif
 
 /* ** temporary **
 
    classes CCallable and CReturnable don't really exist, but the
    compiler insists on generating dictionaries containing references
    to GHC_ZcCCallable_static_info etc., so we provide dummy symbols
-   for these.
+   for these.  Some C compilers can't cope with zero-length static arrays,
+   so we have to make these one element long.
 */
 
-W_ GHC_ZCCCallable_static_info[0];
-W_ GHC_ZCCReturnable_static_info[0];
-
-
+StgWord GHC_ZCCCallable_static_info[1];
+StgWord GHC_ZCCReturnable_static_info[1];
+  
 /* -----------------------------------------------------------------------------
    Macros for Hand-written primitives.
    -------------------------------------------------------------------------- */
@@ -50,10 +66,17 @@ W_ GHC_ZCCReturnable_static_info[0];
  * We only define the cases actually used, to avoid having too much
  * garbage in this section.  Warning: any bugs in here will be hard to
  * track down.
+ *
+ * The return convention for an unboxed tuple is as follows:
+ *   - fit as many fields as possible in registers (as per the
+ *     function fast-entry point calling convention).
+ *   - sort the rest of the fields into pointers and non-pointers.
+ *     push the pointers on the stack, followed by the non-pointers.
+ *     (so the pointers have higher addresses).
  */
 
 /*------ All Regs available */
-#if defined(REG_R8)
+#if MAX_REAL_VANILLA_REG == 8
 # define RET_P(a)     R1.w = (W_)(a); JMP_(ENTRY_CODE(Sp[0]));
 # define RET_N(a)     RET_P(a)
 
@@ -73,17 +96,11 @@ W_ GHC_ZCCReturnable_static_info[0];
         R1.w = (W_)(a); R2.w = (W_)(b); R3.w = (W_)(c); R4.w = (W_)(d); \
        JMP_(ENTRY_CODE(Sp[0]));
 
-# define RET_NNPNNP(a,b,c,d,e,f) \
-        R1.w = (W_)(a); R2.w = (W_)(b); R3.w = (W_)(c); \
-        R4.w = (W_)(d); R5.w = (W_)(e); R6.w = (W_)(f); \
-       JMP_(ENTRY_CODE(Sp[0]));
-
-#elif defined(REG_R7) || defined(REG_R6) || defined(REG_R5) || \
-      defined(REG_R4) || defined(REG_R3)
+#elif MAX_REAL_VANILLA_REG > 2 && MAX_REAL_VANILLA_REG < 8
 # error RET_n macros not defined for this setup.
 
 /*------ 2 Registers available */
-#elif defined(REG_R2)
+#elif MAX_REAL_VANILLA_REG == 2
 
 # define RET_P(a)     R1.w = (W_)(a); JMP_(ENTRY_CODE(Sp[0]));
 # define RET_N(a)     RET_P(a)
@@ -93,45 +110,38 @@ W_ GHC_ZCCReturnable_static_info[0];
 # define RET_NN(a,b)   RET_PP(a,b)
 # define RET_NP(a,b)   RET_PP(a,b)
 
-# define RET_PPP(a,b,c) \
-       R1.w = (W_)(a); R2.w = (W_)(b); Sp[-1] = (W_)(c); Sp -= 1; \
+# define RET_PPP(a,b,c)                                \
+       R1.w = (W_)(a);                         \
+       R2.w = (W_)(b);                         \
+       Sp[-1] = (W_)(c);                       \
+       Sp -= 1;                                \
        JMP_(ENTRY_CODE(Sp[1]));
-# define RET_NNP(a,b,c) \
-       R1.w = (W_)(a); R2.w = (W_)(b); Sp[-1] = (W_)(c); Sp -= 1; \
+
+# define RET_NNP(a,b,c)                                \
+       R1.w = (W_)(a);                         \
+       R2.w = (W_)(b);                         \
+       Sp[-1] = (W_)(c);                       \
+       Sp -= 1;                                \
        JMP_(ENTRY_CODE(Sp[1]));
 
 # define RET_NNNP(a,b,c,d)                     \
        R1.w = (W_)(a);                         \
         R2.w = (W_)(b);                        \
-    /*  Sp[-3] = ARGTAG(1); */                 \
         Sp[-2] = (W_)(c);                      \
         Sp[-1] = (W_)(d);                      \
-        Sp -= 3;                               \
-        JMP_(ENTRY_CODE(Sp[3]));
+        Sp -= 2;                               \
+        JMP_(ENTRY_CODE(Sp[2]));
 
 # define RET_NPNP(a,b,c,d)                     \
        R1.w = (W_)(a);                         \
         R2.w = (W_)(b);                        \
-    /*  Sp[-3] = ARGTAG(1); */                 \
         Sp[-2] = (W_)(c);                      \
         Sp[-1] = (W_)(d);                      \
-        Sp -= 3;                               \
-        JMP_(ENTRY_CODE(Sp[3]));
-
-# define RET_NNPNNP(a,b,c,d,e,f)               \
-        R1.w = (W_)(a);                                \
-       R2.w = (W_)(b);                         \
-       Sp[-6] = (W_)(c);                       \
-       /* Sp[-5] = ARGTAG(1); */               \
-       Sp[-4] = (W_)(d);                       \
-       /* Sp[-3] = ARGTAG(1); */               \
-       Sp[-2] = (W_)(e);                       \
-       Sp[-1] = (W_)(f);                       \
-       Sp -= 6;                                \
-       JMP_(ENTRY_CODE(Sp[6]));
+        Sp -= 2;                               \
+        JMP_(ENTRY_CODE(Sp[2]));
 
 /*------ 1 Register available */
-#elif defined(REG_R1)
+#elif MAX_REAL_VANILLA_REG == 1
 # define RET_P(a)     R1.w = (W_)(a); JMP_(ENTRY_CODE(Sp[0]));
 # define RET_N(a)     RET_P(a)
 
@@ -141,82 +151,54 @@ W_ GHC_ZCCReturnable_static_info[0];
                       JMP_(ENTRY_CODE(Sp[2]));
 # define RET_NP(a,b)   RET_PP(a,b)
 
-# define RET_PPP(a,b,c) \
-       R1.w = (W_)(a); Sp[-2] = (W_)(b); Sp[-1] = (W_)(c); Sp -= 2; \
+# define RET_PPP(a,b,c)                                \
+       R1.w = (W_)(a);                         \
+       Sp[-2] = (W_)(b);                       \
+       Sp[-1] = (W_)(c);                       \
+       Sp -= 2;                                \
+       JMP_(ENTRY_CODE(Sp[2]));
+
+# define RET_NNP(a,b,c)                                \
+       R1.w = (W_)(a);                         \
+       Sp[-2] = (W_)(b);                       \
+       Sp[-1] = (W_)(c);                       \
+       Sp -= 2;                                \
        JMP_(ENTRY_CODE(Sp[2]));
-# define RET_NNP(a,b,c) \
-       R1.w = (W_)(a); Sp[-2] = (W_)(b); Sp[-1] = (W_)(c); Sp -= 3; \
-       JMP_(ENTRY_CODE(Sp[3]));
 
 # define RET_NNNP(a,b,c,d)                     \
        R1.w = (W_)(a);                         \
-    /*  Sp[-5] = ARGTAG(1); */                 \
-        Sp[-4] = (W_)(b);                      \
-    /*  Sp[-3] = ARGTAG(1); */                 \
+        Sp[-3] = (W_)(b);                      \
         Sp[-2] = (W_)(c);                      \
         Sp[-1] = (W_)(d);                      \
-        Sp -= 5;                               \
-        JMP_(ENTRY_CODE(Sp[5]));
+        Sp -= 3;                               \
+        JMP_(ENTRY_CODE(Sp[3]));
 
 # define RET_NPNP(a,b,c,d)                     \
        R1.w = (W_)(a);                         \
-        Sp[-4] = (W_)(b);                      \
-    /*  Sp[-3] = ARGTAG(1); */                 \
-        Sp[-2] = (W_)(c);                      \
+        Sp[-3] = (W_)(c);                      \
+        Sp[-2] = (W_)(b);                      \
         Sp[-1] = (W_)(d);                      \
-        Sp -= 4;                               \
-        JMP_(ENTRY_CODE(Sp[4]));
-
-# define RET_NNPNNP(a,b,c,d,e,f)               \
-        R1.w = (W_)(a);                                \
-       Sp[-1] = (W_)(f);                       \
-       Sp[-2] = (W_)(e);                       \
-       /* Sp[-3] = ARGTAG(1); */               \
-       Sp[-4] = (W_)(d);                       \
-       /* Sp[-5] = ARGTAG(1); */               \
-       Sp[-6] = (W_)(c);                       \
-       Sp[-7] = (W_)(b);                       \
-       /* Sp[-8] = ARGTAG(1); */               \
-       Sp -= 8;                                \
-       JMP_(ENTRY_CODE(Sp[8]));
+        Sp -= 3;                               \
+        JMP_(ENTRY_CODE(Sp[3]));
 
 #else /* 0 Regs available */
 
-#define PUSH_P(o,x) Sp[-o] = (W_)(x)
-#define PUSH_N(o,x) Sp[1-o] = (W_)(x); /* Sp[-o] = ARGTAG(1) */
-#define PUSHED(m)   Sp -= (m); JMP_(ENTRY_CODE(Sp[m]));
+#define PUSH(o,x) Sp[-o] = (W_)(x)
 
-/* Here's how to construct these macros:
- *
- *   N = number of N's in the name;
- *   P = number of P's in the name;
- *   s = N * 2 + P;
- *   while (nonNull(name)) {
- *     if (nextChar == 'P') {
- *       PUSH_P(s,_);
- *       s -= 1;
- *     } else {
- *       PUSH_N(s,_);
- *       s -= 2
- *     }
- *   }
- *   PUSHED(N * 2 + P);
- */
-
-# define RET_P(a)     PUSH_P(1,a); PUSHED(1)
-# define RET_N(a)     PUSH_N(2,a); PUSHED(2)
+#define PUSHED(m)   Sp -= (m); JMP_(ENTRY_CODE(Sp[m]));
 
-# define RET_PP(a,b)   PUSH_P(2,a); PUSH_P(1,b); PUSHED(2)
-# define RET_NN(a,b)   PUSH_N(4,a); PUSH_N(2,b); PUSHED(4)
-# define RET_NP(a,b)   PUSH_N(3,a); PUSH_P(1,b); PUSHED(3)
+# define RET_P(a)     PUSH(1,a); PUSHED(1)
+# define RET_N(a)     PUSH(1,a); PUSHED(1)
 
-# define RET_PPP(a,b,c) PUSH_P(3,a); PUSH_P(2,b); PUSH_P(1,c); PUSHED(3)
-# define RET_NNP(a,b,c) PUSH_N(5,a); PUSH_N(3,b); PUSH_P(1,c); PUSHED(5)
+# define RET_PP(a,b)   PUSH(2,a); PUSH(1,b); PUSHED(2)
+# define RET_NN(a,b)   PUSH(2,a); PUSH(1,b); PUSHED(2)
+# define RET_NP(a,b)   PUSH(2,a); PUSH(1,b); PUSHED(2)
 
-# define RET_NNNP(a,b,c,d) PUSH_N(7,a); PUSH_N(5,b); PUSH_N(3,c); PUSH_P(1,d); PUSHED(7)       
-# define RET_NPNP(a,b,c,d) PUSH_N(6,a); PUSH_P(4,b); PUSH_N(3,c); PUSH_P(1,d); PUSHED(6)       
-# define RET_NNPNNP(a,b,c,d,e,f) PUSH_N(10,a); PUSH_N(8,b); PUSH_P(6,c); PUSH_N(5,d); PUSH_N(3,e); PUSH_P(1,f); PUSHED(10)
+# define RET_PPP(a,b,c) PUSH(3,a); PUSH(2,b); PUSH(1,c); PUSHED(3)
+# define RET_NNP(a,b,c) PUSH(3,a); PUSH(2,b); PUSH(1,c); PUSHED(3)
 
+# define RET_NNNP(a,b,c,d) PUSH(4,a); PUSH(3,b); PUSH(2,c); PUSH(1,d); PUSHED(4)       
+# define RET_NPNP(a,b,c,d) PUSH(4,a); PUSH(3,c); PUSH(2,b); PUSH(1,d); PUSHED(4)       
 #endif
 
 /*-----------------------------------------------------------------------------
@@ -236,32 +218,58 @@ W_ GHC_ZCCReturnable_static_info[0];
 
 #define BYTES_TO_STGWORDS(n) ((n) + sizeof(W_) - 1)/sizeof(W_)
 
-#define newByteArray(ty,scale)                         \
- FN_(new##ty##Arrayzh_fast)                            \
- {                                                     \
-   W_ stuff_size, size, n;                             \
-   StgArrWords* p;                                     \
-   FB_                                                 \
-     MAYBE_GC(NO_PTRS,new##ty##Arrayzh_fast);          \
-     n = R1.w;                                         \
-     stuff_size = BYTES_TO_STGWORDS(n*scale);          \
-     size = sizeofW(StgArrWords)+ stuff_size;          \
-     p = (StgArrWords *)RET_STGCALL1(P_,allocate,size);        \
-     TICK_ALLOC_PRIM(sizeofW(StgArrWords),stuff_size,0); \
-     SET_HDR(p, &ARR_WORDS_info, CCCS);                \
-     p->words = stuff_size;                            \
-     TICK_RET_UNBOXED_TUP(1)                           \
-     RET_P(p);                                         \
-   FE_                                                 \
+FN_(newByteArrayzh_fast)
+ {
+   W_ size, stuff_size, n;
+   StgArrWords* p;
+   FB_
+     MAYBE_GC(NO_PTRS,newByteArrayzh_fast);
+     n = R1.w;
+     stuff_size = BYTES_TO_STGWORDS(n);
+     size = sizeofW(StgArrWords)+ stuff_size;
+     p = (StgArrWords *)RET_STGCALL1(P_,allocate,size);
+     TICK_ALLOC_PRIM(sizeofW(StgArrWords),stuff_size,0);
+     SET_HDR(p, &stg_ARR_WORDS_info, CCCS);
+     p->words = stuff_size;
+     TICK_RET_UNBOXED_TUP(1)
+     RET_P(p);
+   FE_
  }
 
-newByteArray(Char,   sizeof(C_))
-newByteArray(Int,    sizeof(I_));
-newByteArray(Word,   sizeof(W_));
-newByteArray(Addr,   sizeof(P_));
-newByteArray(Float,  sizeof(StgFloat));
-newByteArray(Double, sizeof(StgDouble));
-newByteArray(StablePtr, sizeof(StgStablePtr));
+FN_(newPinnedByteArrayzh_fast)
+ {
+   W_ size, stuff_size, n;
+   StgArrWords* p;
+   FB_
+     MAYBE_GC(NO_PTRS,newPinnedByteArrayzh_fast);
+     n = R1.w;
+     stuff_size = BYTES_TO_STGWORDS(n);
+
+     // We want an 8-byte aligned array.  allocatePinned() gives us
+     // 8-byte aligned memory by default, but we want to align the
+     // *goods* inside the ArrWords object, so we have to check the
+     // size of the ArrWords header and adjust our size accordingly.
+     size = sizeofW(StgArrWords)+ stuff_size;
+     if ((sizeof(StgArrWords) & 7) != 0) {
+        size++;
+     }
+
+     p = (StgArrWords *)RET_STGCALL1(P_,allocatePinned,size);
+     TICK_ALLOC_PRIM(sizeofW(StgArrWords),stuff_size,0);
+
+     // Again, if the ArrWords header isn't a multiple of 8 bytes, we
+     // have to push the object forward one word so that the goods
+     // fall on an 8-byte boundary.
+     if ((sizeof(StgArrWords) & 7) != 0) {
+        ((StgPtr)p)++;
+     }
+
+     SET_HDR(p, &stg_ARR_WORDS_info, CCCS);
+     p->words = stuff_size;
+     TICK_RET_UNBOXED_TUP(1)
+     RET_P(p);
+   FE_
+ }
 
 FN_(newArrayzh_fast)
 {
@@ -277,7 +285,7 @@ FN_(newArrayzh_fast)
     arr = (StgMutArrPtrs *)RET_STGCALL1(P_, allocate, size);
     TICK_ALLOC_PRIM(sizeofW(StgMutArrPtrs), n, 0);
 
-    SET_HDR(arr,&MUT_ARR_PTRS_info,CCCS);
+    SET_HDR(arr,&stg_MUT_ARR_PTRS_info,CCCS);
     arr->ptrs = n;
 
     init = R2.w;
@@ -297,12 +305,12 @@ FN_(newMutVarzh_fast)
   /* Args: R1.p = initialisation value */
   FB_
 
-  HP_CHK_GEN_TICKY(sizeofW(StgMutVar), R1_PTR, newMutVarzh_fast,);
+  HP_CHK_GEN_TICKY(sizeofW(StgMutVar), R1_PTR, newMutVarzh_fast);
   TICK_ALLOC_PRIM(sizeofW(StgHeader)+1,1, 0); /* hack, dependent on rep. */
   CCS_ALLOC(CCCS,sizeofW(StgMutVar));
 
   mv = (StgMutVar *)(Hp-sizeofW(StgMutVar)+1);
-  SET_HDR(mv,&MUT_VAR_info,CCCS);
+  SET_HDR(mv,&stg_MUT_VAR_info,CCCS);
   mv->var = R1.cl;
 
   TICK_RET_UNBOXED_TUP(1);
@@ -310,26 +318,77 @@ FN_(newMutVarzh_fast)
   FE_
 }
 
+FN_(atomicModifyMutVarzh_fast)
+{
+   StgMutVar* mv;
+   StgClosure *z, *x, *y, *r;
+   FB_
+   /* Args: R1.p :: MutVar#,  R2.p :: a -> (a,b) */
+
+   /* If x is the current contents of the MutVar#, then 
+      We want to make the new contents point to
+
+         (sel_0 (f x))
+      and the return value is
+
+        (sel_1 (f x))
+
+      obviously we can share (f x).
+
+         z = [stg_ap_2 f x]  (max (HS + 2) MIN_UPD_SIZE)
+        y = [stg_sel_0 z]   (max (HS + 1) MIN_UPD_SIZE)
+         r = [stg_sel_1 z]   (max (HS + 1) MIN_UPD_SIZE)
+   */
+
+#define THUNK_SIZE(n) (sizeofW(StgHeader) + stg_max((n), MIN_UPD_SIZE))
+#define SIZE (THUNK_SIZE(2) + THUNK_SIZE(1) + THUNK_SIZE(1))
+
+   HP_CHK_GEN_TICKY(SIZE, R1_PTR|R2_PTR, atomicModifyMutVarzh_fast);
+   CCS_ALLOC(CCCS,SIZE);
+
+   x = ((StgMutVar *)R1.cl)->var;
+
+   TICK_ALLOC_UP_THK(2,0); // XXX
+   z = (StgClosure *) Hp - THUNK_SIZE(2) + 1;
+   SET_HDR(z, (StgInfoTable *)&stg_ap_2_upd_info, CCCS);
+   z->payload[0] = R2.cl;
+   z->payload[1] = x;
+
+   TICK_ALLOC_UP_THK(1,1); // XXX
+   y = (StgClosure *) (StgPtr)z - THUNK_SIZE(1);
+   SET_HDR(y, &stg_sel_0_upd_info, CCCS);
+   y->payload[0] = z;
+
+   ((StgMutVar *)R1.cl)->var = y;
+
+   TICK_ALLOC_UP_THK(1,1); // XXX
+   r = (StgClosure *) (StgPtr)y - THUNK_SIZE(1);
+   SET_HDR(r, &stg_sel_1_upd_info, CCCS);
+   r->payload[0] = z;
+
+   RET_P(r);
+   FE_
+}
+
 /* -----------------------------------------------------------------------------
    Foreign Object Primitives
-
    -------------------------------------------------------------------------- */
 
-#ifndef PAR
-FN_(makeForeignObjzh_fast)
+FN_(mkForeignObjzh_fast)
 {
   /* R1.p = ptr to foreign object,
   */
   StgForeignObj *result;
   FB_
 
-  HP_CHK_GEN_TICKY(sizeofW(StgForeignObj), NO_PTRS, makeForeignObjzh_fast,);
+  HP_CHK_GEN_TICKY(sizeofW(StgForeignObj), NO_PTRS, mkForeignObjzh_fast);
   TICK_ALLOC_PRIM(sizeofW(StgHeader),
                  sizeofW(StgForeignObj)-sizeofW(StgHeader), 0);
   CCS_ALLOC(CCCS,sizeofW(StgForeignObj)); /* ccs prof */
 
   result = (StgForeignObj *) (Hp + 1 - sizeofW(StgForeignObj));
-  SET_HDR(result,&FOREIGN_info,CCCS);
+  SET_HDR(result,&stg_FOREIGN_info,CCCS);
   result->data = R1.p;
 
   /* returns (# s#, ForeignObj# #) */
@@ -337,13 +396,12 @@ FN_(makeForeignObjzh_fast)
   RET_P(result);
   FE_
 }
-#endif
 
 /* These two are out-of-line for the benefit of the NCG */
 FN_(unsafeThawArrayzh_fast)
 {
   FB_
-  SET_INFO((StgClosure *)R1.cl,&MUT_ARR_PTRS_info);
+  SET_INFO((StgClosure *)R1.cl,&stg_MUT_ARR_PTRS_info);
   recordMutable((StgMutClosure*)R1.cl);
 
   TICK_RET_UNBOXED_TUP(1);
@@ -355,32 +413,30 @@ FN_(unsafeThawArrayzh_fast)
    Weak Pointer Primitives
    -------------------------------------------------------------------------- */
 
-#ifndef PAR
-
 FN_(mkWeakzh_fast)
 {
   /* R1.p = key
      R2.p = value
-     R3.p = finalizer
+     R3.p = finalizer (or NULL)
   */
   StgWeak *w;
   FB_
 
-  HP_CHK_GEN_TICKY(sizeofW(StgWeak), R1_PTR|R2_PTR|R3_PTR, mkWeakzh_fast,);
+  if (R3.cl == NULL) {
+    R3.cl = &stg_NO_FINALIZER_closure;
+  }
+
+  HP_CHK_GEN_TICKY(sizeofW(StgWeak),R1_PTR|R2_PTR|R3_PTR, mkWeakzh_fast);
   TICK_ALLOC_PRIM(sizeofW(StgHeader)+1,  // +1 is for the link field
                  sizeofW(StgWeak)-sizeofW(StgHeader)-1, 0);
   CCS_ALLOC(CCCS,sizeofW(StgWeak)); /* ccs prof */
 
   w = (StgWeak *) (Hp + 1 - sizeofW(StgWeak));
-  SET_HDR(w, &WEAK_info, CCCS);
+  SET_HDR(w, &stg_WEAK_info, CCCS);
 
   w->key        = R1.cl;
   w->value      = R2.cl;
-  if (R3.cl) {
-     w->finalizer  = R3.cl;
-  } else {
-     w->finalizer  = &NO_FINALIZER_closure;
-  }
+  w->finalizer  = R3.cl;
 
   w->link       = weak_ptr_list;
   weak_ptr_list = w;
@@ -402,25 +458,60 @@ FN_(finalizzeWeakzh_fast)
   w = (StgDeadWeak *)R1.p;
 
   /* already dead? */
-  if (w->header.info == &DEAD_WEAK_info) {
-      RET_NP(0,&NO_FINALIZER_closure);
+  if (w->header.info == &stg_DEAD_WEAK_info) {
+      RET_NP(0,&stg_NO_FINALIZER_closure);
   }
 
   /* kill it */
-  w->header.info = &DEAD_WEAK_info;
+#ifdef PROFILING
+  // @LDV profiling
+  // A weak pointer is inherently used, so we do not need to call
+  // LDV_recordDead_FILL_SLOP_DYNAMIC():
+  //    LDV_recordDead_FILL_SLOP_DYNAMIC((StgClosure *)w);
+  // or, LDV_recordDead():
+  //    LDV_recordDead((StgClosure *)w, sizeofW(StgWeak) - sizeofW(StgProfHeader));
+  // Furthermore, when PROFILING is turned on, dead weak pointers are exactly as 
+  // large as weak pointers, so there is no need to fill the slop, either.
+  // See stg_DEAD_WEAK_info in StgMiscClosures.hc.
+#endif
+  //
+  // Todo: maybe use SET_HDR() and remove LDV_recordCreate()?
+  //
+  w->header.info = &stg_DEAD_WEAK_info;
+#ifdef PROFILING
+  // @LDV profiling
+  LDV_recordCreate((StgClosure *)w);
+#endif
   f = ((StgWeak *)w)->finalizer;
   w->link = ((StgWeak *)w)->link;
 
   /* return the finalizer */
-  if (f == &NO_FINALIZER_closure) {
-      RET_NP(0,&NO_FINALIZER_closure);
+  if (f == &stg_NO_FINALIZER_closure) {
+      RET_NP(0,&stg_NO_FINALIZER_closure);
   } else {
       RET_NP(1,f);
   }
   FE_
 }
 
-#endif /* !PAR */
+FN_(deRefWeakzh_fast)
+{
+  /* R1.p = weak ptr */
+  StgWeak* w;
+  I_       code;
+  P_       val;
+  FB_
+  w = (StgWeak*)R1.p;
+  if (w->header.info == &stg_WEAK_info) {
+    code = 1;
+    val = (P_)((StgWeak *)w)->value;
+  } else {
+    code = 0;
+    val = (P_)w;
+  }
+  RET_NP(code,val);
+  FE_
+}
 
 /* -----------------------------------------------------------------------------
    Arbitrary-precision Integer operations.
@@ -435,12 +526,12 @@ FN_(int2Integerzh_fast)
    FB_
 
    val = R1.i;
-   HP_CHK_GEN_TICKY(sizeofW(StgArrWords)+1, NO_PTRS, int2Integerzh_fast,);
+   HP_CHK_GEN_TICKY(sizeofW(StgArrWords)+1, NO_PTRS, int2Integerzh_fast);
    TICK_ALLOC_PRIM(sizeofW(StgArrWords),1,0);
    CCS_ALLOC(CCCS,sizeofW(StgArrWords)+1); /* ccs prof */
 
    p = (StgArrWords *)Hp - 1;
-   SET_ARR_HDR(p, &ARR_WORDS_info, CCCS, 1);
+   SET_ARR_HDR(p, &stg_ARR_WORDS_info, CCCS, 1);
 
    /* mpz_set_si is inlined here, makes things simpler */
    if (val < 0) { 
@@ -472,12 +563,12 @@ FN_(word2Integerzh_fast)
    FB_
 
    val = R1.w;
-   HP_CHK_GEN_TICKY(sizeofW(StgArrWords)+1, NO_PTRS, word2Integerzh_fast,)
+   HP_CHK_GEN_TICKY(sizeofW(StgArrWords)+1, NO_PTRS, word2Integerzh_fast)
    TICK_ALLOC_PRIM(sizeofW(StgArrWords),1,0);
    CCS_ALLOC(CCCS,sizeofW(StgArrWords)+1); /* ccs prof */
 
    p = (StgArrWords *)Hp - 1;
-   SET_ARR_HDR(p, &ARR_WORDS_info, CCCS, 1);
+   SET_ARR_HDR(p, &stg_ARR_WORDS_info, CCCS, 1);
 
    if (val != 0) {
        s = 1;
@@ -495,30 +586,6 @@ FN_(word2Integerzh_fast)
    FE_
 }
 
-FN_(addr2Integerzh_fast)
-{
-  MP_INT result;
-  char *str;
-  FB_
-
-  MAYBE_GC(NO_PTRS,addr2Integerzh_fast);
-
-  /* args:   R1 :: Addr# */
-  str = R1.a;
-
-  /* Perform the operation */
-  if (RET_STGCALL3(int, mpz_init_set_str,&result,(str),/*base*/10))
-      abort();
-
-   /* returns (# size  :: Int#, 
-                data  :: ByteArray# 
-              #)
-   */
-  TICK_RET_UNBOXED_TUP(2);
-  RET_NP(result._mp_size, 
-         result._mp_d - sizeofW(StgArrWords));
-  FE_
-}
 
 /*
  * 'long long' primops for converting to/from Integers.
@@ -545,22 +612,22 @@ FN_(int64ToIntegerzh_fast)
        /* minimum is one word */
        words_needed = 1;
    }
-   HP_CHK_GEN_TICKY(sizeofW(StgArrWords)+words_needed, NO_PTRS, int64ToIntegerzh_fast,)
+   HP_CHK_GEN_TICKY(sizeofW(StgArrWords)+words_needed, NO_PTRS, int64ToIntegerzh_fast)
    TICK_ALLOC_PRIM(sizeofW(StgArrWords),words_needed,0);
    CCS_ALLOC(CCCS,sizeofW(StgArrWords)+words_needed); /* ccs prof */
 
    p = (StgArrWords *)(Hp-words_needed+1) - 1;
-   SET_ARR_HDR(p, &ARR_WORDS_info, CCCS, words_needed);
+   SET_ARR_HDR(p, &stg_ARR_WORDS_info, CCCS, words_needed);
 
    if ( val < 0LL ) {
      neg = 1;
      val = -val;
-   } 
+   }
 
    hi = (W_)((LW_)val / 0x100000000ULL);
 
    if ( words_needed == 2 )  { 
-      s = 2; 
+      s = 2;
       Hp[-1] = (W_)val;
       Hp[0] = hi;
    } else if ( val != 0 ) {
@@ -596,12 +663,12 @@ FN_(word64ToIntegerzh_fast)
    } else {
       words_needed = 1;
    }
-   HP_CHK_GEN_TICKY(sizeofW(StgArrWords)+words_needed, NO_PTRS, word64ToIntegerzh_fast,)
+   HP_CHK_GEN_TICKY(sizeofW(StgArrWords)+words_needed, NO_PTRS, word64ToIntegerzh_fast)
    TICK_ALLOC_PRIM(sizeofW(StgArrWords),words_needed,0);
    CCS_ALLOC(CCCS,sizeofW(StgArrWords)+words_needed); /* ccs prof */
 
    p = (StgArrWords *)(Hp-words_needed+1) - 1;
-   SET_ARR_HDR(p, &ARR_WORDS_info, CCCS, words_needed);
+   SET_ARR_HDR(p, &stg_ARR_WORDS_info, CCCS, words_needed);
 
    hi = (W_)((LW_)val / 0x100000000ULL);
    if ( val >= 0x100000000ULL ) { 
@@ -625,7 +692,7 @@ FN_(word64ToIntegerzh_fast)
 }
 
 
-#endif /* HAVE_LONG_LONG */
+#endif /* SUPPORT_LONG_LONGS */
 
 /* ToDo: this is shockingly inefficient */
 
@@ -633,7 +700,7 @@ FN_(word64ToIntegerzh_fast)
 FN_(name)                                                              \
 {                                                                      \
   MP_INT arg1, arg2, result;                                           \
-  I_ s1, s2;                                                   \
+  I_ s1, s2;                                                           \
   StgArrWords* d1;                                                     \
   StgArrWords* d2;                                                     \
   FB_                                                                  \
@@ -664,11 +731,40 @@ FN_(name)                                                         \
   FE_                                                                  \
 }
 
+#define GMP_TAKE1_RET1(name,mp_fun)                                    \
+FN_(name)                                                              \
+{                                                                      \
+  MP_INT arg1, result;                                                 \
+  I_ s1;                                                               \
+  StgArrWords* d1;                                                     \
+  FB_                                                                  \
+                                                                       \
+  /* call doYouWantToGC() */                                           \
+  MAYBE_GC(R2_PTR, name);                                              \
+                                                                       \
+  d1 = (StgArrWords *)R2.p;                                            \
+  s1 = R1.i;                                                           \
+                                                                       \
+  arg1._mp_alloc       = d1->words;                                    \
+  arg1._mp_size                = (s1);                                         \
+  arg1._mp_d           = (unsigned long int *) (BYTE_ARR_CTS(d1));     \
+                                                                       \
+  STGCALL1(mpz_init,&result);                                          \
+                                                                       \
+  /* Perform the operation */                                          \
+  STGCALL2(mp_fun,&result,&arg1);                                      \
+                                                                       \
+  TICK_RET_UNBOXED_TUP(2);                                             \
+  RET_NP(result._mp_size,                                              \
+         result._mp_d-sizeofW(StgArrWords));                           \
+  FE_                                                                  \
+}
+
 #define GMP_TAKE2_RET2(name,mp_fun)                                    \
 FN_(name)                                                              \
 {                                                                      \
   MP_INT arg1, arg2, result1, result2;                                 \
-  I_ s1, s2;                                                   \
+  I_ s1, s2;                                                           \
   StgArrWords* d1;                                                     \
   StgArrWords* d2;                                                     \
   FB_                                                                  \
@@ -709,11 +805,169 @@ GMP_TAKE2_RET1(gcdIntegerzh_fast,      mpz_gcd);
 GMP_TAKE2_RET1(quotIntegerzh_fast,     mpz_tdiv_q);
 GMP_TAKE2_RET1(remIntegerzh_fast,      mpz_tdiv_r);
 GMP_TAKE2_RET1(divExactIntegerzh_fast, mpz_divexact);
+GMP_TAKE2_RET1(andIntegerzh_fast,      mpz_and);
+GMP_TAKE2_RET1(orIntegerzh_fast,       mpz_ior);
+GMP_TAKE2_RET1(xorIntegerzh_fast,      mpz_xor);
+GMP_TAKE1_RET1(complementIntegerzh_fast, mpz_com);
 
 GMP_TAKE2_RET2(quotRemIntegerzh_fast, mpz_tdiv_qr);
 GMP_TAKE2_RET2(divModIntegerzh_fast,  mpz_fdiv_qr);
 
-#ifndef FLOATS_AS_DOUBLES
+
+FN_(gcdIntzh_fast)
+{
+  /* R1 = the first Int#; R2 = the second Int# */
+  mp_limb_t aa;
+  I_ r;
+  FB_
+  aa = (mp_limb_t)(R1.i);
+  r = RET_STGCALL3(StgInt, mpn_gcd_1, (mp_limb_t *)(&aa), 1, (mp_limb_t)(R2.i));
+
+  R1.i = r;
+  /* Result parked in R1, return via info-pointer at TOS */
+  JMP_(ENTRY_CODE(Sp[0]));
+  FE_
+}
+
+FN_(gcdIntegerIntzh_fast)
+{
+  /* R1 = s1; R2 = d1; R3 = the int */
+  I_ r;
+  FB_
+  r = RET_STGCALL3(StgInt,mpn_gcd_1,(mp_limb_t *)(BYTE_ARR_CTS(R2.p)), R1.i, R3.i);
+
+  R1.i = r;
+  /* Result parked in R1, return via info-pointer at TOS */
+  JMP_(ENTRY_CODE(Sp[0]));
+  FE_
+}
+
+FN_(cmpIntegerIntzh_fast)
+{
+  /* R1 = s1; R2 = d1; R3 = the int */
+  I_ usize;
+  I_ vsize;
+  I_ v_digit;
+  mp_limb_t u_digit;
+  FB_
+
+  usize = R1.i;
+  vsize = 0;
+  v_digit = R3.i;
+
+  // paraphrased from mpz_cmp_si() in the GMP sources
+  if (v_digit > 0) {
+      vsize = 1;
+  } else if (v_digit < 0) {
+      vsize = -1;
+      v_digit = -v_digit;
+  }
+
+  if (usize != vsize) {
+    R1.i = usize - vsize; JMP_(ENTRY_CODE(Sp[0]));
+  }
+
+  if (usize == 0) {
+    R1.i = 0; JMP_(ENTRY_CODE(Sp[0]));
+  }
+
+  u_digit = *(mp_limb_t *)(BYTE_ARR_CTS(R2.p));
+
+  if (u_digit == (mp_limb_t) (unsigned long) v_digit) {
+    R1.i = 0; JMP_(ENTRY_CODE(Sp[0]));
+  }
+
+  if (u_digit > (mp_limb_t) (unsigned long) v_digit) {
+    R1.i = usize; 
+  } else {
+    R1.i = -usize; 
+  }
+
+  JMP_(ENTRY_CODE(Sp[0]));
+  FE_
+}
+
+FN_(cmpIntegerzh_fast)
+{
+  /* R1 = s1; R2 = d1; R3 = s2; R4 = d2 */
+  I_ usize;
+  I_ vsize;
+  I_ size;
+  StgPtr up, vp;
+  int cmp;
+  FB_
+
+  // paraphrased from mpz_cmp() in the GMP sources
+  usize = R1.i;
+  vsize = R3.i;
+
+  if (usize != vsize) {
+    R1.i = usize - vsize; JMP_(ENTRY_CODE(Sp[0]));
+  }
+
+  if (usize == 0) {
+    R1.i = 0; JMP_(ENTRY_CODE(Sp[0]));
+  }
+
+  size = abs(usize);
+
+  up = BYTE_ARR_CTS(R2.p);
+  vp = BYTE_ARR_CTS(R4.p);
+
+  cmp = RET_STGCALL3(I_, mpn_cmp, (mp_limb_t *)up, (mp_limb_t *)vp, size);
+
+  if (cmp == 0) {
+    R1.i = 0; JMP_(ENTRY_CODE(Sp[0]));
+  }
+
+  if ((cmp < 0) == (usize < 0)) {
+    R1.i = 1;
+  } else {
+    R1.i = (-1); 
+  }
+  /* Result parked in R1, return via info-pointer at TOS */
+  JMP_(ENTRY_CODE(Sp[0]));
+  FE_
+}
+
+FN_(integer2Intzh_fast)
+{
+  /* R1 = s; R2 = d */
+  I_ r, s;
+  FB_
+  s = R1.i;
+  if (s == 0)
+    r = 0;
+  else {
+    r = ((mp_limb_t *) (BYTE_ARR_CTS(R2.p)))[0];
+    if (s < 0) r = -r;
+  }
+  /* Result parked in R1, return via info-pointer at TOS */
+  R1.i = r;
+  JMP_(ENTRY_CODE(Sp[0]));
+  FE_
+}
+
+FN_(integer2Wordzh_fast)
+{
+  /* R1 = s; R2 = d */
+  I_ s;
+  W_ r;
+  FB_
+  s = R1.i;
+  if (s == 0)
+    r = 0;
+  else {
+    r = ((mp_limb_t *) (BYTE_ARR_CTS(R2.p)))[0];
+    if (s < 0) r = -r;
+  }
+  /* Result parked in R1, return via info-pointer at TOS */
+  R1.w = r;
+  JMP_(ENTRY_CODE(Sp[0]));
+  FE_
+}
+
+
 FN_(decodeFloatzh_fast)
 { 
   MP_INT mantissa;
@@ -725,14 +979,14 @@ FN_(decodeFloatzh_fast)
   /* arguments: F1 = Float# */
   arg = F1;
 
-  HP_CHK_GEN_TICKY(sizeofW(StgArrWords)+1, NO_PTRS, decodeFloatzh_fast,);
+  HP_CHK_GEN_TICKY(sizeofW(StgArrWords)+1, NO_PTRS, decodeFloatzh_fast);
   TICK_ALLOC_PRIM(sizeofW(StgArrWords),1,0);
   CCS_ALLOC(CCCS,sizeofW(StgArrWords)+1); /* ccs prof */
 
   /* Be prepared to tell Lennart-coded __decodeFloat   */
   /* where mantissa._mp_d can be put (it does not care about the rest) */
   p = (StgArrWords *)Hp - 1;
-  SET_ARR_HDR(p,&ARR_WORDS_info,CCCS,1)
+  SET_ARR_HDR(p,&stg_ARR_WORDS_info,CCCS,1)
   mantissa._mp_d = (void *)BYTE_ARR_CTS(p);
 
   /* Perform the operation */
@@ -743,7 +997,6 @@ FN_(decodeFloatzh_fast)
   RET_NNP(exponent,mantissa._mp_size,p);
   FE_
 }
-#endif /* !FLOATS_AS_DOUBLES */
 
 #define DOUBLE_MANTISSA_SIZE (sizeofW(StgDouble))
 #define ARR_SIZE (sizeofW(StgArrWords) + DOUBLE_MANTISSA_SIZE)
@@ -758,14 +1011,14 @@ FN_(decodeDoublezh_fast)
   /* arguments: D1 = Double# */
   arg = D1;
 
-  HP_CHK_GEN_TICKY(ARR_SIZE, NO_PTRS, decodeDoublezh_fast,);
+  HP_CHK_GEN_TICKY(ARR_SIZE, NO_PTRS, decodeDoublezh_fast);
   TICK_ALLOC_PRIM(sizeofW(StgArrWords),DOUBLE_MANTISSA_SIZE,0);
   CCS_ALLOC(CCCS,ARR_SIZE); /* ccs prof */
 
   /* Be prepared to tell Lennart-coded __decodeDouble  */
   /* where mantissa.d can be put (it does not care about the rest) */
   p = (StgArrWords *)(Hp-ARR_SIZE+1);
-  SET_ARR_HDR(p, &ARR_WORDS_info, CCCS, DOUBLE_MANTISSA_SIZE);
+  SET_ARR_HDR(p, &stg_ARR_WORDS_info, CCCS, DOUBLE_MANTISSA_SIZE);
   mantissa._mp_d = (void *)BYTE_ARR_CTS(p);
 
   /* Perform the operation */
@@ -790,13 +1043,27 @@ FN_(forkzh_fast)
 
   /* create it right now, return ThreadID in R1 */
   R1.t = RET_STGCALL2(StgTSO *, createIOThread, 
-                     RtsFlags.GcFlags.initialStkSize, R1.cl);
+                    RtsFlags.GcFlags.initialStkSize, R1.cl);
   STGCALL1(scheduleThread, R1.t);
       
   /* switch at the earliest opportunity */ 
   context_switch = 1;
   
-  JMP_(ENTRY_CODE(Sp[0]));
+  RET_P(R1.t);
+  FE_
+}
+
+FN_(forkProcesszh_fast)
+{
+  pid_t pid;
+
+  FB_
+  /* args: none */
+  /* result: Pid */
+
+  R1.i = RET_STGCALL1(StgInt, forkProcess, CurrentTSO);
+
+  RET_N(R1.i);
   FE_
 }
 
@@ -807,6 +1074,71 @@ FN_(yieldzh_fast)
   FE_
 }
 
+FN_(myThreadIdzh_fast)
+{
+  /* no args. */
+  FB_
+  RET_P((P_)CurrentTSO);
+  FE_
+}
+
+FN_(labelThreadzh_fast)
+{
+  FB_
+  /* args: 
+       R1.p = ThreadId#
+       R2.p = Addr# */
+#ifdef DEBUG
+  STGCALL2(labelThread,R1.p,(char *)R2.p);
+#endif
+  JMP_(ENTRY_CODE(Sp[0]));
+  FE_
+}
+
+
+/* -----------------------------------------------------------------------------
+ * MVar primitives
+ *
+ * take & putMVar work as follows.  Firstly, an important invariant:
+ *
+ *    If the MVar is full, then the blocking queue contains only
+ *    threads blocked on putMVar, and if the MVar is empty then the
+ *    blocking queue contains only threads blocked on takeMVar.
+ *
+ * takeMvar:
+ *    MVar empty : then add ourselves to the blocking queue
+ *    MVar full  : remove the value from the MVar, and
+ *                 blocking queue empty     : return
+ *                 blocking queue non-empty : perform the first blocked putMVar
+ *                                            from the queue, and wake up the
+ *                                            thread (MVar is now full again)
+ *
+ * putMVar is just the dual of the above algorithm.
+ *
+ * How do we "perform a putMVar"?  Well, we have to fiddle around with
+ * the stack of the thread waiting to do the putMVar.  See
+ * stg_block_putmvar and stg_block_takemvar in HeapStackCheck.c for
+ * the stack layout, and the PerformPut and PerformTake macros below.
+ *
+ * It is important that a blocked take or put is woken up with the
+ * take/put already performed, because otherwise there would be a
+ * small window of vulnerability where the thread could receive an
+ * exception and never perform its take or put, and we'd end up with a
+ * deadlock.
+ *
+ * -------------------------------------------------------------------------- */
+
+FN_(isEmptyMVarzh_fast)
+{
+  /* args: R1 = MVar closure */
+  I_ r;
+  FB_
+  r = (I_)((GET_INFO((StgMVar*)(R1.p))) == &stg_EMPTY_MVAR_info);
+  RET_N(r);
+  FE_
+}
+
+
 FN_(newMVarzh_fast)
 {
   StgMVar *mvar;
@@ -814,21 +1146,41 @@ FN_(newMVarzh_fast)
   FB_
   /* args: none */
 
-  HP_CHK_GEN_TICKY(sizeofW(StgMVar), NO_PTRS, newMVarzh_fast,);
+  HP_CHK_GEN_TICKY(sizeofW(StgMVar), NO_PTRS, newMVarzh_fast);
   TICK_ALLOC_PRIM(sizeofW(StgMutVar)-1, // consider head,tail,link as admin wds
                  1, 0);
   CCS_ALLOC(CCCS,sizeofW(StgMVar)); /* ccs prof */
   
   mvar = (StgMVar *) (Hp - sizeofW(StgMVar) + 1);
-  SET_HDR(mvar,&EMPTY_MVAR_info,CCCS);
-  mvar->head = mvar->tail = (StgTSO *)&END_TSO_QUEUE_closure;
-  mvar->value = (StgClosure *)&END_TSO_QUEUE_closure;
+  SET_HDR(mvar,&stg_EMPTY_MVAR_info,CCCS);
+  mvar->head = mvar->tail = (StgTSO *)&stg_END_TSO_QUEUE_closure;
+  mvar->value = (StgClosure *)&stg_END_TSO_QUEUE_closure;
 
   TICK_RET_UNBOXED_TUP(1);
   RET_P(mvar);
   FE_
 }
 
+/* If R1 isn't available, pass it on the stack */
+#ifdef REG_R1
+#define PerformTake(tso, value) ({             \
+    (tso)->sp[1] = (W_)value;                  \
+    (tso)->sp[0] = (W_)&stg_gc_unpt_r1_info;   \
+  })
+#else
+#define PerformTake(tso, value) ({             \
+    (tso)->sp[1] = (W_)value;                  \
+    (tso)->sp[0] = (W_)&stg_ut_1_0_unreg_info; \
+  })
+#endif
+
+
+#define PerformPut(tso) ({                             \
+    StgClosure *val = (StgClosure *)(tso)->sp[2];      \
+    (tso)->sp += 3;                                    \
+    val;                                               \
+  })
+
 FN_(takeMVarzh_fast)
 {
   StgMVar *mvar;
@@ -849,34 +1201,132 @@ FN_(takeMVarzh_fast)
   /* If the MVar is empty, put ourselves on its blocking queue,
    * and wait until we're woken up.
    */
-  if (info == &EMPTY_MVAR_info) {
-    if (mvar->head == (StgTSO *)&END_TSO_QUEUE_closure) {
+  if (info == &stg_EMPTY_MVAR_info) {
+    if (mvar->head == (StgTSO *)&stg_END_TSO_QUEUE_closure) {
       mvar->head = CurrentTSO;
     } else {
       mvar->tail->link = CurrentTSO;
     }
-    CurrentTSO->link = (StgTSO *)&END_TSO_QUEUE_closure;
+    CurrentTSO->link = (StgTSO *)&stg_END_TSO_QUEUE_closure;
     CurrentTSO->why_blocked = BlockedOnMVar;
     CurrentTSO->block_info.closure = (StgClosure *)mvar;
     mvar->tail = CurrentTSO;
 
 #ifdef SMP
     /* unlock the MVar */
-    mvar->header.info = &EMPTY_MVAR_info;
+    mvar->header.info = &stg_EMPTY_MVAR_info;
 #endif
-    BLOCK(R1_PTR, takeMVarzh_fast);
+    JMP_(stg_block_takemvar);
   }
 
+  /* we got the value... */
   val = mvar->value;
-  mvar->value = (StgClosure *)&END_TSO_QUEUE_closure;
 
-  /* do this last... we might have locked the MVar in the SMP case,
-   * and writing the info pointer will unlock it.
-   */
-  SET_INFO(mvar,&EMPTY_MVAR_info);
+  if (mvar->head != (StgTSO *)&stg_END_TSO_QUEUE_closure) {
+      /* There are putMVar(s) waiting... 
+       * wake up the first thread on the queue
+       */
+      ASSERT(mvar->head->why_blocked == BlockedOnMVar);
+
+      /* actually perform the putMVar for the thread that we just woke up */
+      mvar->value = PerformPut(mvar->head);
+
+#if defined(GRAN) || defined(PAR)
+      /* ToDo: check 2nd arg (mvar) is right */
+      mvar->head = RET_STGCALL2(StgTSO *,unblockOne,mvar->head,mvar);
+#else
+      mvar->head = RET_STGCALL1(StgTSO *,unblockOne,mvar->head);
+#endif
+      if (mvar->head == (StgTSO *)&stg_END_TSO_QUEUE_closure) {
+         mvar->tail = (StgTSO *)&stg_END_TSO_QUEUE_closure;
+      }
+#ifdef SMP
+      /* unlock in the SMP case */
+      SET_INFO(mvar,&stg_FULL_MVAR_info);
+#endif
+      TICK_RET_UNBOXED_TUP(1);
+      RET_P(val);
+  } else {
+      /* No further putMVars, MVar is now empty */
+
+      /* do this last... we might have locked the MVar in the SMP case,
+       * and writing the info pointer will unlock it.
+       */
+      SET_INFO(mvar,&stg_EMPTY_MVAR_info);
+      mvar->value = (StgClosure *)&stg_END_TSO_QUEUE_closure;
+      TICK_RET_UNBOXED_TUP(1);
+      RET_P(val);
+  }
+  FE_
+}
+
+FN_(tryTakeMVarzh_fast)
+{
+  StgMVar *mvar;
+  StgClosure *val;
+  const StgInfoTable *info;
+
+  FB_
+  /* args: R1 = MVar closure */
+
+  mvar = (StgMVar *)R1.p;
+
+#ifdef SMP
+  info = LOCK_CLOSURE(mvar);
+#else
+  info = GET_INFO(mvar);
+#endif
+
+  if (info == &stg_EMPTY_MVAR_info) {
+
+#ifdef SMP
+      /* unlock the MVar */
+      SET_INFO(mvar,&stg_EMPTY_MVAR_info);
+#endif
+
+      /* HACK: we need a pointer to pass back, 
+       * so we abuse NO_FINALIZER_closure
+       */
+      RET_NP(0, &stg_NO_FINALIZER_closure);
+  }
+
+  /* we got the value... */
+  val = mvar->value;
+
+  if (mvar->head != (StgTSO *)&stg_END_TSO_QUEUE_closure) {
+      /* There are putMVar(s) waiting... 
+       * wake up the first thread on the queue
+       */
+      ASSERT(mvar->head->why_blocked == BlockedOnMVar);
+
+      /* actually perform the putMVar for the thread that we just woke up */
+      mvar->value = PerformPut(mvar->head);
+
+#if defined(GRAN) || defined(PAR)
+      /* ToDo: check 2nd arg (mvar) is right */
+      mvar->head = RET_STGCALL2(StgTSO *,unblockOne,mvar->head,mvar);
+#else
+      mvar->head = RET_STGCALL1(StgTSO *,unblockOne,mvar->head);
+#endif
+      if (mvar->head == (StgTSO *)&stg_END_TSO_QUEUE_closure) {
+         mvar->tail = (StgTSO *)&stg_END_TSO_QUEUE_closure;
+      }
+#ifdef SMP
+      /* unlock in the SMP case */
+      SET_INFO(mvar,&stg_FULL_MVAR_info);
+#endif
+  } else {
+      /* No further putMVars, MVar is now empty */
+      mvar->value = (StgClosure *)&stg_END_TSO_QUEUE_closure;
+
+      /* do this last... we might have locked the MVar in the SMP case,
+       * and writing the info pointer will unlock it.
+       */
+      SET_INFO(mvar,&stg_EMPTY_MVAR_info);
+  }
 
   TICK_RET_UNBOXED_TUP(1);
-  RET_P(val);
+  RET_NP((I_)1, val);
   FE_
 }
 
@@ -896,35 +1346,115 @@ FN_(putMVarzh_fast)
   info = GET_INFO(mvar);
 #endif
 
-  if (info == &FULL_MVAR_info) {
-    barf("putMVar#: MVar already full");
+  if (info == &stg_FULL_MVAR_info) {
+    if (mvar->head == (StgTSO *)&stg_END_TSO_QUEUE_closure) {
+      mvar->head = CurrentTSO;
+    } else {
+      mvar->tail->link = CurrentTSO;
+    }
+    CurrentTSO->link = (StgTSO *)&stg_END_TSO_QUEUE_closure;
+    CurrentTSO->why_blocked = BlockedOnMVar;
+    CurrentTSO->block_info.closure = (StgClosure *)mvar;
+    mvar->tail = CurrentTSO;
+
+#ifdef SMP
+    /* unlock the MVar */
+    SET_INFO(mvar,&stg_FULL_MVAR_info);
+#endif
+    JMP_(stg_block_putmvar);
   }
   
-  mvar->value = R2.cl;
+  if (mvar->head != (StgTSO *)&stg_END_TSO_QUEUE_closure) {
+      /* There are takeMVar(s) waiting: wake up the first one
+       */
+      ASSERT(mvar->head->why_blocked == BlockedOnMVar);
 
-  /* wake up the first thread on the queue, it will continue with the
-   * takeMVar operation and mark the MVar empty again.
-   */
-  if (mvar->head != (StgTSO *)&END_TSO_QUEUE_closure) {
-    ASSERT(mvar->head->why_blocked == BlockedOnMVar);
-#if defined(GRAN)
-# error FixME
-#elif defined(PAR)
-    // ToDo: check 2nd arg (mvar) is right
-    mvar->head = RET_STGCALL2(StgTSO *,unblockOne,mvar->head,mvar);
+      /* actually perform the takeMVar */
+      PerformTake(mvar->head, R2.cl);
+      
+#if defined(GRAN) || defined(PAR)
+      /* ToDo: check 2nd arg (mvar) is right */
+      mvar->head = RET_STGCALL2(StgTSO *,unblockOne,mvar->head,mvar);
 #else
-    mvar->head = RET_STGCALL1(StgTSO *,unblockOne,mvar->head);
+      mvar->head = RET_STGCALL1(StgTSO *,unblockOne,mvar->head);
 #endif
-    if (mvar->head == (StgTSO *)&END_TSO_QUEUE_closure) {
-      mvar->tail = (StgTSO *)&END_TSO_QUEUE_closure;
-    }
+      if (mvar->head == (StgTSO *)&stg_END_TSO_QUEUE_closure) {
+         mvar->tail = (StgTSO *)&stg_END_TSO_QUEUE_closure;
+      }
+#ifdef SMP
+      /* unlocks the MVar in the SMP case */
+      SET_INFO(mvar,&stg_EMPTY_MVAR_info);
+#endif
+      JMP_(ENTRY_CODE(Sp[0]));
+  } else {
+      /* No further takes, the MVar is now full. */
+      mvar->value = R2.cl;
+      /* unlocks the MVar in the SMP case */
+      SET_INFO(mvar,&stg_FULL_MVAR_info);
+      JMP_(ENTRY_CODE(Sp[0]));
   }
 
-  /* unlocks the MVar in the SMP case */
-  SET_INFO(mvar,&FULL_MVAR_info);
+  /* ToDo: yield afterward for better communication performance? */
+  FE_
+}
 
-  /* ToDo: yield here for better communication performance? */
-  JMP_(ENTRY_CODE(Sp[0]));
+FN_(tryPutMVarzh_fast)
+{
+  StgMVar *mvar;
+  const StgInfoTable *info;
+
+  FB_
+  /* args: R1 = MVar, R2 = value */
+
+  mvar = (StgMVar *)R1.p;
+
+#ifdef SMP
+  info = LOCK_CLOSURE(mvar);
+#else
+  info = GET_INFO(mvar);
+#endif
+
+  if (info == &stg_FULL_MVAR_info) {
+
+#ifdef SMP
+    /* unlock the MVar */
+    mvar->header.info = &stg_FULL_MVAR_info;
+#endif
+
+    RET_N(0);
+  }
+  
+  if (mvar->head != (StgTSO *)&stg_END_TSO_QUEUE_closure) {
+      /* There are takeMVar(s) waiting: wake up the first one
+       */
+      ASSERT(mvar->head->why_blocked == BlockedOnMVar);
+
+      /* actually perform the takeMVar */
+      PerformTake(mvar->head, R2.cl);
+      
+#if defined(GRAN) || defined(PAR)
+      /* ToDo: check 2nd arg (mvar) is right */
+      mvar->head = RET_STGCALL2(StgTSO *,unblockOne,mvar->head,mvar);
+#else
+      mvar->head = RET_STGCALL1(StgTSO *,unblockOne,mvar->head);
+#endif
+      if (mvar->head == (StgTSO *)&stg_END_TSO_QUEUE_closure) {
+         mvar->tail = (StgTSO *)&stg_END_TSO_QUEUE_closure;
+      }
+#ifdef SMP
+      /* unlocks the MVar in the SMP case */
+      SET_INFO(mvar,&stg_EMPTY_MVAR_info);
+#endif
+      JMP_(ENTRY_CODE(Sp[0]));
+  } else {
+      /* No further takes, the MVar is now full. */
+      mvar->value = R2.cl;
+      /* unlocks the MVar in the SMP case */
+      SET_INFO(mvar,&stg_FULL_MVAR_info);
+      JMP_(ENTRY_CODE(Sp[0]));
+  }
+
+  /* ToDo: yield afterward for better communication performance? */
   FE_
 }
 
@@ -938,7 +1468,7 @@ FN_(makeStableNamezh_fast)
   StgStableName *sn_obj;
   FB_
 
-  HP_CHK_GEN_TICKY(sizeofW(StgStableName), R1_PTR, makeStableNamezh_fast,);
+  HP_CHK_GEN_TICKY(sizeofW(StgStableName), R1_PTR, makeStableNamezh_fast);
   TICK_ALLOC_PRIM(sizeofW(StgHeader), 
                  sizeofW(StgStableName)-sizeofW(StgHeader), 0);
   CCS_ALLOC(CCCS,sizeofW(StgStableName)); /* ccs prof */
@@ -948,7 +1478,7 @@ FN_(makeStableNamezh_fast)
   /* Is there already a StableName for this heap object? */
   if (stable_ptr_table[index].sn_obj == NULL) {
     sn_obj = (StgStableName *) (Hp - sizeofW(StgStableName) + 1);
-    sn_obj->header.info = &STABLE_NAME_info;
+    SET_HDR(sn_obj,&stg_STABLE_NAME_info,CCCS);
     sn_obj->sn = index;
     stable_ptr_table[index].sn_obj = (StgClosure *)sn_obj;
   } else {
@@ -959,6 +1489,102 @@ FN_(makeStableNamezh_fast)
   RET_P(sn_obj);
 }
 
+
+FN_(makeStablePtrzh_fast)
+{
+  /* Args: R1 = a */
+  StgStablePtr sp;
+  FB_
+  MAYBE_GC(R1_PTR, makeStablePtrzh_fast);
+  sp = RET_STGCALL1(StgStablePtr,getStablePtr,R1.p);
+  RET_N(sp);
+  FE_
+}
+
+FN_(deRefStablePtrzh_fast)
+{
+  /* Args: R1 = the stable ptr */
+  P_ r;
+  StgStablePtr sp;
+  FB_
+  sp = (StgStablePtr)R1.w;
+  r = stable_ptr_table[(StgWord)sp].addr;
+  RET_P(r);
+  FE_
+}
+
+/* -----------------------------------------------------------------------------
+   Bytecode object primitives
+   -------------------------------------------------------------------------  */
+
+FN_(newBCOzh_fast)
+{
+  /* R1.p = instrs
+     R2.p = literals
+     R3.p = ptrs
+     R4.p = itbls
+     R5.i = arity
+     R6.p = bitmap array
+  */
+  StgBCO *bco;
+  nat size;
+  StgArrWords *bitmap_arr;
+  FB_
+
+  bitmap_arr = (StgArrWords *)R6.cl;
+  size = sizeofW(StgBCO) + bitmap_arr->words;
+  HP_CHK_GEN_TICKY(size,R1_PTR|R2_PTR|R3_PTR|R4_PTR|R6_PTR, newBCOzh_fast);
+  TICK_ALLOC_PRIM(size, size-sizeofW(StgHeader), 0);
+  CCS_ALLOC(CCCS,size); /* ccs prof */
+  bco = (StgBCO *) (Hp + 1 - size);
+  SET_HDR(bco, (const StgInfoTable *)&stg_BCO_info, CCCS);
+
+  bco->instrs     = (StgArrWords*)R1.cl;
+  bco->literals   = (StgArrWords*)R2.cl;
+  bco->ptrs       = (StgMutArrPtrs*)R3.cl;
+  bco->itbls      = (StgArrWords*)R4.cl;
+  bco->arity      = R5.w;
+  bco->size       = size;
+
+  // Copy the arity/bitmap info into the BCO
+  { 
+    int i;
+    for (i = 0; i < bitmap_arr->words; i++) {
+       bco->bitmap[i] = bitmap_arr->payload[i];
+    }
+  }
+
+  TICK_RET_UNBOXED_TUP(1);
+  RET_P(bco);
+  FE_
+}
+
+FN_(mkApUpd0zh_fast)
+{
+  // R1.p = the BCO# for the AP
+  //
+  StgPAP* ap;
+  FB_
+
+  // This function is *only* used to wrap zero-arity BCOs in an
+  // updatable wrapper (see ByteCodeLink.lhs).  An AP thunk is always
+  // saturated and always points directly to a FUN or BCO.
+  ASSERT(get_itbl(R1.cl)->type == BCO && ((StgBCO *)R1.p)->arity == 0);
+
+  HP_CHK_GEN_TICKY(PAP_sizeW(0), R1_PTR, mkApUpd0zh_fast);
+  TICK_ALLOC_PRIM(sizeofW(StgHeader), PAP_sizeW(0)-sizeofW(StgHeader), 0);
+  CCS_ALLOC(CCCS,PAP_sizeW(0)); /* ccs prof */
+  ap = (StgPAP *) (Hp + 1 - PAP_sizeW(0));
+  SET_HDR(ap, &stg_AP_info, CCCS);
+
+  ap->n_args = 0;
+  ap->fun = R1.cl;
+
+  TICK_RET_UNBOXED_TUP(1);
+  RET_P(ap);
+  FE_
+}
+
 /* -----------------------------------------------------------------------------
    Thread I/O blocking primitives
    -------------------------------------------------------------------------- */
@@ -993,23 +1619,120 @@ FN_(waitWritezh_fast)
 
 FN_(delayzh_fast)
 {
+#ifdef mingw32_TARGET_OS
+  StgAsyncIOResult* ares;
+  unsigned int reqID;
+#else
+  StgTSO *t, *prev;
+  nat target;
+#endif
   FB_
     /* args: R1.i */
     ASSERT(CurrentTSO->why_blocked == NotBlocked);
     CurrentTSO->why_blocked = BlockedOnDelay;
 
     ACQUIRE_LOCK(&sched_mutex);
-
-    /* Add on ticks_since_select, since these will be subtracted at
-     * the next awaitEvent call.
+#ifdef mingw32_TARGET_OS
+    /* could probably allocate this on the heap instead */
+    ares = (StgAsyncIOResult*)RET_STGCALL2(P_,stgMallocBytes,sizeof(StgAsyncIOResult), "asyncWritezh_fast");
+    reqID = RET_STGCALL1(W_,addDelayRequest,R1.i);
+    ares->reqID   = reqID;
+    ares->len     = 0;
+    ares->errCode = 0;
+    CurrentTSO->block_info.async_result = ares;
+    /* Having all async-blocked threads reside on the blocked_queue simplifies matters, so
+     * change the status to OnDoProc & put the delayed thread on the blocked_queue.
      */
-    CurrentTSO->block_info.delay = R1.i + ticks_since_select;
-
+    CurrentTSO->why_blocked = BlockedOnDoProc;
     APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
+#else
+    target = (R1.i / (TICK_MILLISECS*1000)) + getourtimeofday();
+    CurrentTSO->block_info.target = target;
+
+    /* Insert the new thread in the sleeping queue. */
+    prev = NULL;
+    t = sleeping_queue;
+    while (t != END_TSO_QUEUE && t->block_info.target < target) {
+       prev = t;
+       t = t->link;
+    }
 
+    CurrentTSO->link = t;
+    if (prev == NULL) {
+       sleeping_queue = CurrentTSO;
+    } else {
+       prev->link = CurrentTSO;
+    }
+#endif
     RELEASE_LOCK(&sched_mutex);
     JMP_(stg_block_noregs);
   FE_
 }
 
+#ifdef mingw32_TARGET_OS
+FN_(asyncReadzh_fast)
+{
+  StgAsyncIOResult* ares;
+  unsigned int reqID;
+  FB_
+    /* args: R1.i = fd, R2.i = isSock, R3.i = len, R4.p = buf */
+    ASSERT(CurrentTSO->why_blocked == NotBlocked);
+    CurrentTSO->why_blocked = BlockedOnRead;
+    ACQUIRE_LOCK(&sched_mutex);
+    /* could probably allocate this on the heap instead */
+    ares = (StgAsyncIOResult*)RET_STGCALL2(P_,stgMallocBytes,sizeof(StgAsyncIOResult), "asyncReadzh_fast");
+    reqID = RET_STGCALL5(W_,addIORequest,R1.i,FALSE,R2.i,R3.i,(char*)R4.p);
+    ares->reqID   = reqID;
+    ares->len     = 0;
+    ares->errCode = 0;
+    CurrentTSO->block_info.async_result = ares;
+    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
+    RELEASE_LOCK(&sched_mutex);
+    JMP_(stg_block_async);
+  FE_
+}
+
+FN_(asyncWritezh_fast)
+{
+  StgAsyncIOResult* ares;
+  unsigned int reqID;
+  FB_
+    /* args: R1.i */
+    /* args: R1.i = fd, R2.i = isSock, R3.i = len, R4.p = buf */
+    ASSERT(CurrentTSO->why_blocked == NotBlocked);
+    CurrentTSO->why_blocked = BlockedOnWrite;
+    ACQUIRE_LOCK(&sched_mutex);
+    ares = (StgAsyncIOResult*)RET_STGCALL2(P_,stgMallocBytes,sizeof(StgAsyncIOResult), "asyncWritezh_fast");
+    reqID = RET_STGCALL5(W_,addIORequest,R1.i,TRUE,R2.i,R3.i,(char*)R4.p);
+    ares->reqID   = reqID;
+    ares->len     = 0;
+    ares->errCode = 0;
+    CurrentTSO->block_info.async_result = ares;
+    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
+    RELEASE_LOCK(&sched_mutex);
+    JMP_(stg_block_async);
+  FE_
+}
 
+FN_(asyncDoProczh_fast)
+{
+  StgAsyncIOResult* ares;
+  unsigned int reqID;
+  FB_
+    /* args: R1.i = proc, R2.i = param */
+    ASSERT(CurrentTSO->why_blocked == NotBlocked);
+    CurrentTSO->why_blocked = BlockedOnDoProc;
+    ACQUIRE_LOCK(&sched_mutex);
+    /* could probably allocate this on the heap instead */
+    ares = (StgAsyncIOResult*)RET_STGCALL2(P_,stgMallocBytes,sizeof(StgAsyncIOResult), "asyncDoProczh_fast");
+    reqID = RET_STGCALL2(W_,addDoProcRequest,R1.p,R2.p);
+    ares->reqID   = reqID;
+    ares->len     = 0;
+    ares->errCode = 0;
+    CurrentTSO->block_info.async_result = ares;
+    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
+    RELEASE_LOCK(&sched_mutex);
+    JMP_(stg_block_async);
+  FE_
+}
+#endif