[project @ 2001-03-23 16:36:20 by simonmar]
[ghc-hetmet.git] / ghc / rts / PrimOps.hc
index 0d16ae6..8c5c55e 100644 (file)
@@ -1,14 +1,15 @@
 /* -----------------------------------------------------------------------------
- * $Id: PrimOps.hc,v 1.2 1998/12/02 13:28:32 simonm Exp $
+ * $Id: PrimOps.hc,v 1.75 2001/03/23 16:36:21 simonmar Exp $
+ *
+ * (c) The GHC Team, 1998-2000
  *
  * Primitive functions / data
  *
  * ---------------------------------------------------------------------------*/
 
+#include "Stg.h"
 #include "Rts.h"
 
-#ifdef COMPILER
-
 #include "RtsFlags.h"
 #include "StgStartup.h"
 #include "SchedAPI.h"
 #include "RtsUtils.h"
 #include "Storage.h"
 #include "BlockAlloc.h" /* tmp */
-#include "StablePtr.h"
+#include "StablePriv.h"
+#include "HeapStackCheck.h"
+#include "StgRun.h"
+#include "Itimer.h"
+#include "Prelude.h"
 
 /* ** temporary **
 
    classes CCallable and CReturnable don't really exist, but the
    compiler insists on generating dictionaries containing references
    to GHC_ZcCCallable_static_info etc., so we provide dummy symbols
-   for these.
+   for these.  Some C compilers can't cope with zero-length static arrays,
+   so we have to make these one element long.
 */
 
-W_ GHC_ZcCCallable_static_info[0];
-W_ GHC_ZcCReturnable_static_info[0];
-
-#ifndef aix_TARGET_OS /* AIX gives link errors with this as a const (RO assembler section) */
-const 
-#endif 
-      StgClosure *PrelBase_Bool_closure_tbl[] = {
-    &False_closure,
-    &True_closure
-};
-
+StgWord GHC_ZCCCallable_static_info[1];
+StgWord GHC_ZCCReturnable_static_info[1];
+  
 /* -----------------------------------------------------------------------------
    Macros for Hand-written primitives.
    -------------------------------------------------------------------------- */
@@ -58,7 +56,7 @@ const
  */
 
 /*------ All Regs available */
-#ifdef REG_R8
+#if defined(REG_R8)
 # define RET_P(a)     R1.w = (W_)(a); JMP_(ENTRY_CODE(Sp[0]));
 # define RET_N(a)     RET_P(a)
 
@@ -74,20 +72,69 @@ const
         R1.w = (W_)(a); R2.w = (W_)(b); R3.w = (W_)(c); R4.w = (W_)d; \
         JMP_(ENTRY_CODE(Sp[0]));
 
+# define RET_NPNP(a,b,c,d) \
+        R1.w = (W_)(a); R2.w = (W_)(b); R3.w = (W_)(c); R4.w = (W_)(d); \
+       JMP_(ENTRY_CODE(Sp[0]));
+
 # define RET_NNPNNP(a,b,c,d,e,f) \
         R1.w = (W_)(a); R2.w = (W_)(b); R3.w = (W_)(c); \
         R4.w = (W_)(d); R5.w = (W_)(e); R6.w = (W_)(f); \
        JMP_(ENTRY_CODE(Sp[0]));
 
-#else
-
-#if defined(REG_R7) || defined(REG_R6) || defined(REG_R5) || \
-    defined(REG_R4) || defined(REG_R3) || defined(REG_R2)
+#elif defined(REG_R7) || defined(REG_R6) || defined(REG_R5) || \
+      defined(REG_R4) || defined(REG_R3)
 # error RET_n macros not defined for this setup.
-#else
+
+/*------ 2 Registers available */
+#elif defined(REG_R2)
+
+# define RET_P(a)     R1.w = (W_)(a); JMP_(ENTRY_CODE(Sp[0]));
+# define RET_N(a)     RET_P(a)
+
+# define RET_PP(a,b)   R1.w = (W_)(a); R2.w = (W_)(b); \
+                      JMP_(ENTRY_CODE(Sp[0]));
+# define RET_NN(a,b)   RET_PP(a,b)
+# define RET_NP(a,b)   RET_PP(a,b)
+
+# define RET_PPP(a,b,c) \
+       R1.w = (W_)(a); R2.w = (W_)(b); Sp[-1] = (W_)(c); Sp -= 1; \
+       JMP_(ENTRY_CODE(Sp[1]));
+# define RET_NNP(a,b,c) \
+       R1.w = (W_)(a); R2.w = (W_)(b); Sp[-1] = (W_)(c); Sp -= 1; \
+       JMP_(ENTRY_CODE(Sp[1]));
+
+# define RET_NNNP(a,b,c,d)                     \
+       R1.w = (W_)(a);                         \
+        R2.w = (W_)(b);                        \
+    /*  Sp[-3] = ARGTAG(1); */                 \
+        Sp[-2] = (W_)(c);                      \
+        Sp[-1] = (W_)(d);                      \
+        Sp -= 3;                               \
+        JMP_(ENTRY_CODE(Sp[3]));
+
+# define RET_NPNP(a,b,c,d)                     \
+       R1.w = (W_)(a);                         \
+        R2.w = (W_)(b);                        \
+    /*  Sp[-3] = ARGTAG(1); */                 \
+        Sp[-2] = (W_)(c);                      \
+        Sp[-1] = (W_)(d);                      \
+        Sp -= 3;                               \
+        JMP_(ENTRY_CODE(Sp[3]));
+
+# define RET_NNPNNP(a,b,c,d,e,f)               \
+        R1.w = (W_)(a);                                \
+       R2.w = (W_)(b);                         \
+       Sp[-6] = (W_)(c);                       \
+       /* Sp[-5] = ARGTAG(1); */               \
+       Sp[-4] = (W_)(d);                       \
+       /* Sp[-3] = ARGTAG(1); */               \
+       Sp[-2] = (W_)(e);                       \
+       Sp[-1] = (W_)(f);                       \
+       Sp -= 6;                                \
+       JMP_(ENTRY_CODE(Sp[6]));
 
 /*------ 1 Register available */
-#ifdef REG_R1
+#elif defined(REG_R1)
 # define RET_P(a)     R1.w = (W_)(a); JMP_(ENTRY_CODE(Sp[0]));
 # define RET_N(a)     RET_P(a)
 
@@ -114,6 +161,15 @@ const
         Sp -= 5;                               \
         JMP_(ENTRY_CODE(Sp[5]));
 
+# define RET_NPNP(a,b,c,d)                     \
+       R1.w = (W_)(a);                         \
+        Sp[-4] = (W_)(b);                      \
+    /*  Sp[-3] = ARGTAG(1); */                 \
+        Sp[-2] = (W_)(c);                      \
+        Sp[-1] = (W_)(d);                      \
+        Sp -= 4;                               \
+        JMP_(ENTRY_CODE(Sp[4]));
+
 # define RET_NNPNNP(a,b,c,d,e,f)               \
         R1.w = (W_)(a);                                \
        Sp[-1] = (W_)(f);                       \
@@ -130,7 +186,13 @@ const
 #else /* 0 Regs available */
 
 #define PUSH_P(o,x) Sp[-o] = (W_)(x)
-#define PUSH_N(o,x) Sp[1-o] = (W_)(x); /* Sp[-o] = ARGTAG(1) */
+
+#ifdef DEBUG
+#define PUSH_N(o,x) Sp[1-o] = (W_)(x);  Sp[-o] = ARG_TAG(1);
+#else
+#define PUSH_N(o,x) Sp[1-o] = (W_)(x);
+#endif
+
 #define PUSHED(m)   Sp -= (m); JMP_(ENTRY_CODE(Sp[m]));
 
 /* Here's how to construct these macros:
@@ -158,16 +220,14 @@ const
 # define RET_NP(a,b)   PUSH_N(3,a); PUSH_P(1,b); PUSHED(3)
 
 # define RET_PPP(a,b,c) PUSH_P(3,a); PUSH_P(2,b); PUSH_P(1,c); PUSHED(3)
-# define RET_NNP(a,b,c) PUSH_N(6,a); PUSH_N(4,b); PUSH_N(2,c); PUSHED(6)
+# define RET_NNP(a,b,c) PUSH_N(5,a); PUSH_N(3,b); PUSH_P(1,c); PUSHED(5)
 
 # define RET_NNNP(a,b,c,d) PUSH_N(7,a); PUSH_N(5,b); PUSH_N(3,c); PUSH_P(1,d); PUSHED(7)       
+# define RET_NPNP(a,b,c,d) PUSH_N(6,a); PUSH_P(4,b); PUSH_N(3,c); PUSH_P(1,d); PUSHED(6)       
 # define RET_NNPNNP(a,b,c,d,e,f) PUSH_N(10,a); PUSH_N(8,b); PUSH_P(6,c); PUSH_N(5,d); PUSH_N(3,e); PUSH_P(1,f); PUSHED(10)
 
 #endif
 
-#endif
-#endif
-
 /*-----------------------------------------------------------------------------
   Array Primitives
 
@@ -185,73 +245,68 @@ const
 
 #define BYTES_TO_STGWORDS(n) ((n) + sizeof(W_) - 1)/sizeof(W_)
 
-#define newByteArray(ty,scale)                         \
- FN_(new##ty##ArrayZh_fast)                            \
+FN_(newByteArrayzh_fast)                               \
  {                                                     \
-   W_ stuff_size, size, n;                             \
+   W_ size, stuff_size, n;                             \
    StgArrWords* p;                                     \
    FB_                                                 \
-     MAYBE_GC(NO_PTRS,new##ty##ArrayZh_fast);          \
+     MAYBE_GC(NO_PTRS,newByteArrayzh_fast);            \
      n = R1.w;                                         \
-     stuff_size = BYTES_TO_STGWORDS(n*scale);          \
+     stuff_size = BYTES_TO_STGWORDS(n);                        \
      size = sizeofW(StgArrWords)+ stuff_size;          \
-     p = (StgArrWords *)allocate(size);                        \
-     SET_HDR(p, &MUT_ARR_WORDS_info, CCCS);            \
+     p = (StgArrWords *)RET_STGCALL1(P_,allocate,size);        \
+     TICK_ALLOC_PRIM(sizeofW(StgArrWords),stuff_size,0); \
+     SET_HDR(p, &stg_ARR_WORDS_info, CCCS);            \
      p->words = stuff_size;                            \
+     TICK_RET_UNBOXED_TUP(1)                           \
      RET_P(p);                                         \
    FE_                                                 \
  }
 
-newByteArray(Char,   sizeof(C_))
-newByteArray(Int,    sizeof(I_));
-newByteArray(Word,   sizeof(W_));
-newByteArray(Addr,   sizeof(P_));
-newByteArray(Float,  sizeof(StgFloat));
-newByteArray(Double, sizeof(StgDouble));
-newByteArray(StablePtr, sizeof(StgStablePtr));
-
-FN_(newArrayZh_fast)
+FN_(newArrayzh_fast)
 {
   W_ size, n, init;
-  StgArrPtrs* arr;
+  StgMutArrPtrs* arr;
   StgPtr p;
   FB_
     n = R1.w;
 
-    MAYBE_GC(R2_PTR,newArrayZh_fast);
+    MAYBE_GC(R2_PTR,newArrayzh_fast);
 
-    size = sizeofW(StgArrPtrs) + n;
-    arr = (StgArrPtrs *)allocate(size);
+    size = sizeofW(StgMutArrPtrs) + n;
+    arr = (StgMutArrPtrs *)RET_STGCALL1(P_, allocate, size);
+    TICK_ALLOC_PRIM(sizeofW(StgMutArrPtrs), n, 0);
 
-    SET_HDR(arr,&MUT_ARR_PTRS_info,CCCS);
+    SET_HDR(arr,&stg_MUT_ARR_PTRS_info,CCCS);
     arr->ptrs = n;
 
     init = R2.w;
-    for (p = (P_)arr + sizeofW(StgArrPtrs); 
+    for (p = (P_)arr + sizeofW(StgMutArrPtrs); 
         p < (P_)arr + size; p++) {
        *p = (W_)init;
     }
 
+    TICK_RET_UNBOXED_TUP(1);
     RET_P(arr);
   FE_
 }
 
-FN_(newMutVarZh_fast)
+FN_(newMutVarzh_fast)
 {
   StgMutVar* mv;
   /* Args: R1.p = initialisation value */
   FB_
 
-  HP_CHK_GEN(sizeofW(StgMutVar), R1_PTR, newMutVarZh_fast,);
-  TICK_ALLOC_PRIM(sizeofW(StgMutVar),wibble,wibble,wibble)
+  HP_CHK_GEN_TICKY(sizeofW(StgMutVar), R1_PTR, newMutVarzh_fast,);
+  TICK_ALLOC_PRIM(sizeofW(StgHeader)+1,1, 0); /* hack, dependent on rep. */
   CCS_ALLOC(CCCS,sizeofW(StgMutVar));
 
-  mv = stgCast(StgMutVar*,Hp-sizeofW(StgMutVar)+1);
-  SET_HDR(mv,&MUT_VAR_info,CCCS);
+  mv = (StgMutVar *)(Hp-sizeofW(StgMutVar)+1);
+  SET_HDR(mv,&stg_MUT_VAR_info,CCCS);
   mv->var = R1.cl;
 
+  TICK_RET_UNBOXED_TUP(1);
   RET_P(mv);
-
   FE_
 }
 
@@ -260,85 +315,112 @@ FN_(newMutVarZh_fast)
 
    -------------------------------------------------------------------------- */
 
-#ifndef PAR
-FN_(makeForeignObjZh_fast)
+FN_(mkForeignObjzh_fast)
 {
   /* R1.p = ptr to foreign object,
   */
   StgForeignObj *result;
   FB_
 
-  HP_CHK_GEN(sizeofW(StgForeignObj), NO_PTRS, makeForeignObjZh_fast,);
-  TICK_ALLOC_PRIM(sizeofW(StgForeignObj),wibble,wibble,wibble)
+  HP_CHK_GEN_TICKY(sizeofW(StgForeignObj), NO_PTRS, mkForeignObjzh_fast,);
+  TICK_ALLOC_PRIM(sizeofW(StgHeader),
+                 sizeofW(StgForeignObj)-sizeofW(StgHeader), 0);
   CCS_ALLOC(CCCS,sizeofW(StgForeignObj)); /* ccs prof */
 
   result = (StgForeignObj *) (Hp + 1 - sizeofW(StgForeignObj));
-  SET_HDR(result,&FOREIGN_info,CCCS);
+  SET_HDR(result,&stg_FOREIGN_info,CCCS);
   result->data = R1.p;
 
   /* returns (# s#, ForeignObj# #) */
+  TICK_RET_UNBOXED_TUP(1);
   RET_P(result);
   FE_
 }
-#endif
+
+/* These two are out-of-line for the benefit of the NCG */
+FN_(unsafeThawArrayzh_fast)
+{
+  FB_
+  SET_INFO((StgClosure *)R1.cl,&stg_MUT_ARR_PTRS_info);
+  recordMutable((StgMutClosure*)R1.cl);
+
+  TICK_RET_UNBOXED_TUP(1);
+  RET_P(R1.p);
+  FE_
+}
 
 /* -----------------------------------------------------------------------------
    Weak Pointer Primitives
    -------------------------------------------------------------------------- */
 
-#ifndef PAR
-
-FN_(mkWeakZh_fast)
+FN_(mkWeakzh_fast)
 {
   /* R1.p = key
      R2.p = value
-     R3.p = finaliser
+     R3.p = finalizer (or NULL)
   */
   StgWeak *w;
   FB_
 
-  HP_CHK_GEN(sizeofW(StgWeak), R1_PTR|R2_PTR|R3_PTR, mkWeakZh_fast,);
-  TICK_ALLOC_PRIM(sizeofW(StgWeak),wibble,wibble,wibble);
+  if (R3.cl == NULL) {
+    R3.cl = &stg_NO_FINALIZER_closure;
+  }
+
+  HP_CHK_GEN_TICKY(sizeofW(StgWeak),R1_PTR|R2_PTR|R3_PTR, mkWeakzh_fast,);
+  TICK_ALLOC_PRIM(sizeofW(StgHeader)+1,  // +1 is for the link field
+                 sizeofW(StgWeak)-sizeofW(StgHeader)-1, 0);
   CCS_ALLOC(CCCS,sizeofW(StgWeak)); /* ccs prof */
 
   w = (StgWeak *) (Hp + 1 - sizeofW(StgWeak));
-  SET_HDR(w, &WEAK_info, CCCS);
+  SET_HDR(w, &stg_WEAK_info, CCCS);
 
   w->key        = R1.cl;
   w->value      = R2.cl;
-  w->finaliser  = R3.cl;
+  w->finalizer  = R3.cl;
 
   w->link       = weak_ptr_list;
   weak_ptr_list = w;
   IF_DEBUG(weak, fprintf(stderr,"New weak pointer at %p\n",w));
 
+  TICK_RET_UNBOXED_TUP(1);
   RET_P(w);
   FE_
 }
 
-FN_(deRefWeakZh_fast)
+FN_(finalizzeWeakzh_fast)
 {
   /* R1.p = weak ptr
    */
-  StgWeak *w;
+  StgDeadWeak *w;
+  StgClosure *f;
   FB_
-  
-  w = (StgWeak *)R1.p;
-  if (w->header.info == &WEAK_info) {
-       RET_NP(1, w->value);
+  TICK_RET_UNBOXED_TUP(0);
+  w = (StgDeadWeak *)R1.p;
+
+  /* already dead? */
+  if (w->header.info == &stg_DEAD_WEAK_info) {
+      RET_NP(0,&stg_NO_FINALIZER_closure);
+  }
+
+  /* kill it */
+  w->header.info = &stg_DEAD_WEAK_info;
+  f = ((StgWeak *)w)->finalizer;
+  w->link = ((StgWeak *)w)->link;
+
+  /* return the finalizer */
+  if (f == &stg_NO_FINALIZER_closure) {
+      RET_NP(0,&stg_NO_FINALIZER_closure);
   } else {
-       RET_NP(0, w);
+      RET_NP(1,f);
   }
   FE_
 }
 
-#endif /* !PAR */
-
 /* -----------------------------------------------------------------------------
    Arbitrary-precision Integer operations.
    -------------------------------------------------------------------------- */
 
-FN_(int2IntegerZh_fast)
+FN_(int2Integerzh_fast)
 {
    /* arguments: R1 = Int# */
 
@@ -347,12 +429,12 @@ FN_(int2IntegerZh_fast)
    FB_
 
    val = R1.i;
-   HP_CHK_GEN(sizeofW(StgArrWords)+1, NO_PTRS, int2IntegerZh_fast,)
-   TICK_ALLOC_PRIM(sizeofW(StgArrWords)+1,wibble,wibble,wibble)
+   HP_CHK_GEN_TICKY(sizeofW(StgArrWords)+1, NO_PTRS, int2Integerzh_fast,);
+   TICK_ALLOC_PRIM(sizeofW(StgArrWords),1,0);
    CCS_ALLOC(CCCS,sizeofW(StgArrWords)+1); /* ccs prof */
 
-   p = stgCast(StgArrWords*,Hp)-1;
-   SET_ARR_HDR(p, &ARR_WORDS_info, CCCS, 1);
+   p = (StgArrWords *)Hp - 1;
+   SET_ARR_HDR(p, &stg_ARR_WORDS_info, CCCS, 1);
 
    /* mpz_set_si is inlined here, makes things simpler */
    if (val < 0) { 
@@ -365,16 +447,16 @@ FN_(int2IntegerZh_fast)
        s = 0;
    }
 
-   /* returns (# alloc :: Int#, 
-                size  :: Int#, 
+   /* returns (# size  :: Int#, 
                 data  :: ByteArray# 
               #)
    */
-   RET_NNP(1,s,p);
+   TICK_RET_UNBOXED_TUP(2);
+   RET_NP(s,p);
    FE_
 }
 
-FN_(word2IntegerZh_fast)
+FN_(word2Integerzh_fast)
 {
    /* arguments: R1 = Word# */
 
@@ -384,12 +466,12 @@ FN_(word2IntegerZh_fast)
    FB_
 
    val = R1.w;
-   HP_CHK_GEN(sizeofW(StgArrWords)+1, NO_PTRS, word2IntegerZh_fast,)
-   TICK_ALLOC_PRIM(sizeofW(StgArrWords)+1,wibble,wibble,wibble)
+   HP_CHK_GEN_TICKY(sizeofW(StgArrWords)+1, NO_PTRS, word2Integerzh_fast,)
+   TICK_ALLOC_PRIM(sizeofW(StgArrWords),1,0);
    CCS_ALLOC(CCCS,sizeofW(StgArrWords)+1); /* ccs prof */
 
-   p = stgCast(StgArrWords*,Hp)-1;
-   SET_ARR_HDR(p, &ARR_WORDS_info, CCCS, 1);
+   p = (StgArrWords *)Hp - 1;
+   SET_ARR_HDR(p, &stg_ARR_WORDS_info, CCCS, 1);
 
    if (val != 0) {
        s = 1;
@@ -398,34 +480,15 @@ FN_(word2IntegerZh_fast)
        s = 0;
    }
 
-   /* returns (# alloc :: Int#, 
-                size  :: Int#, 
+   /* returns (# size  :: Int#, 
                 data  :: ByteArray# 
               #)
    */
-   RET_NNP(1,s,p);
+   TICK_RET_UNBOXED_TUP(2);
+   RET_NP(s,p);
    FE_
 }
 
-FN_(addr2IntegerZh_fast)
-{
-  MP_INT result;
-  char *str;
-  FB_
-
-  MAYBE_GC(NO_PTRS,addr2IntegerZh_fast);
-
-  /* args:   R1 :: Addr# */
-  str = R1.a;
-
-  /* Perform the operation */
-  if (RET_STGCALL3(int, mpz_init_set_str,&result,(str),/*base*/10))
-      abort();
-
-  RET_NNP(result._mp_alloc, result._mp_size, 
-         result._mp_d - sizeofW(StgArrWords));
-  FE_
-}
 
 /*
  * 'long long' primops for converting to/from Integers.
@@ -433,68 +496,67 @@ FN_(addr2IntegerZh_fast)
 
 #ifdef SUPPORT_LONG_LONGS
 
-FN_(int64ToIntegerZh_fast)
+FN_(int64ToIntegerzh_fast)
 {
    /* arguments: L1 = Int64# */
 
-   StgInt64 val; /* to avoid aliasing */
+   StgInt64  val; /* to avoid aliasing */
    W_ hi;
-   I_  s,a, neg, words_needed;
+   I_  s, neg, words_needed;
    StgArrWords* p;     /* address of array result */
    FB_
 
-     /* ToDo: extend StgUnion?? */
    val = (LI_)L1;
    neg = 0;
-   if ((LW_)(val) >= 0x100000000ULL)  { 
+
+   if ( val >= 0x100000000LL || val <= -0x100000000LL )  { 
        words_needed = 2;
    } else { 
        /* minimum is one word */
        words_needed = 1;
    }
-   HP_CHK_GEN(sizeofW(StgArrWords)+words_needed, NO_PTRS, int64ToIntegerZh_fast,)
-   TICK_ALLOC_PRIM(sizeofW(StgArrWords)+words_needed,wibble,wibble,wibble)
+   HP_CHK_GEN_TICKY(sizeofW(StgArrWords)+words_needed, NO_PTRS, int64ToIntegerzh_fast,)
+   TICK_ALLOC_PRIM(sizeofW(StgArrWords),words_needed,0);
    CCS_ALLOC(CCCS,sizeofW(StgArrWords)+words_needed); /* ccs prof */
 
-   p = stgCast(StgArrWords*,Hp)-1;
-   SET_ARR_HDR(p, &ARR_WORDS_info, CCCS, words_needed);
+   p = (StgArrWords *)(Hp-words_needed+1) - 1;
+   SET_ARR_HDR(p, &stg_ARR_WORDS_info, CCCS, words_needed);
 
    if ( val < 0LL ) {
      neg = 1;
      val = -val;
    }
+
    hi = (W_)((LW_)val / 0x100000000ULL);
-   if ((LW_)(val) >= 0x100000000ULL)  { 
-      s = 2; 
-      a = 2;
-      Hp[0] = (W_)val;
-      Hp[1] = hi;
+
+   if ( words_needed == 2 )  { 
+      s = 2;
+      Hp[-1] = (W_)val;
+      Hp[0] = hi;
    } else if ( val != 0 ) {
       s = 1;
-      a = 1;
-     Hp[0] =  (W_)val;
+      Hp[0] = (W_)val;
    }  else /* val==0 */   {
       s = 0;
-      a = 1;
    }
-  s = ( neg ? -s : s );
+   s = ( neg ? -s : s );
 
-   /* returns (# alloc :: Int#, 
-                size  :: Int#, 
+   /* returns (# size  :: Int#, 
                 data  :: ByteArray# 
               #)
    */
-   RET_NNP(a,s,p);
+   TICK_RET_UNBOXED_TUP(2);
+   RET_NP(s,p);
    FE_
 }
 
-FN_(word64ToIntegerZh_fast)
+FN_(word64ToIntegerzh_fast)
 {
    /* arguments: L1 = Word64# */
 
-   StgNat64 val; /* to avoid aliasing */
+   StgWord64 val; /* to avoid aliasing */
    StgWord hi;
-   I_  s,a,words_needed;
+   I_  s, words_needed;
    StgArrWords* p;     /* address of array result */
    FB_
 
@@ -504,34 +566,31 @@ FN_(word64ToIntegerZh_fast)
    } else {
       words_needed = 1;
    }
-   HP_CHK_GEN(sizeofW(StgArrWords)+words_needed, NO_PTRS, word64ToIntegerZh_fast,)
-   TICK_ALLOC_PRIM(sizeofW(StgArrWords)+words_needed,wibble,wibble,wibble)
+   HP_CHK_GEN_TICKY(sizeofW(StgArrWords)+words_needed, NO_PTRS, word64ToIntegerzh_fast,)
+   TICK_ALLOC_PRIM(sizeofW(StgArrWords),words_needed,0);
    CCS_ALLOC(CCCS,sizeofW(StgArrWords)+words_needed); /* ccs prof */
 
-   p = stgCast(StgArrWords*,Hp)-1;
-   SET_ARR_HDR(p, &ARR_WORDS_info, CCCS, words_needed);
+   p = (StgArrWords *)(Hp-words_needed+1) - 1;
+   SET_ARR_HDR(p, &stg_ARR_WORDS_info, CCCS, words_needed);
 
    hi = (W_)((LW_)val / 0x100000000ULL);
    if ( val >= 0x100000000ULL ) { 
      s = 2;
-     a = 2;
-     Hp[0] = ((W_)val);
-     Hp[1] = (hi);
+     Hp[-1] = ((W_)val);
+     Hp[0]  = (hi);
    } else if ( val != 0 )      {
       s = 1;
-      a = 1;
       Hp[0] = ((W_)val);
    } else /* val==0 */         {
       s = 0;
-      a = 1;
    }
 
-   /* returns (# alloc :: Int#, 
-                size  :: Int#, 
+   /* returns (# size  :: Int#, 
                 data  :: ByteArray# 
               #)
    */
-   RET_NNP(a,s,p);
+   TICK_RET_UNBOXED_TUP(2);
+   RET_NP(s,p);
    FE_
 }
 
@@ -544,25 +603,23 @@ FN_(word64ToIntegerZh_fast)
 FN_(name)                                                              \
 {                                                                      \
   MP_INT arg1, arg2, result;                                           \
-  I_ a1, s1, a2, s2;                                                   \
+  I_ s1, s2;                                                           \
   StgArrWords* d1;                                                     \
   StgArrWords* d2;                                                     \
   FB_                                                                  \
                                                                        \
   /* call doYouWantToGC() */                                           \
-  MAYBE_GC(R3_PTR | R6_PTR, name);                                     \
+  MAYBE_GC(R2_PTR | R4_PTR, name);                                     \
                                                                        \
-  a1 = R1.i;                                                           \
-  s1 = R2.i;                                                           \
-  d1 = stgCast(StgArrWords*,R3.p);                                     \
-  a2 = R4.i;                                                           \
-  s2 = R5.i;                                                           \
-  d2 = stgCast(StgArrWords*,R6.p);                                     \
+  d1 = (StgArrWords *)R2.p;                                            \
+  s1 = R1.i;                                                           \
+  d2 = (StgArrWords *)R4.p;                                            \
+  s2 = R3.i;                                                           \
                                                                        \
-  arg1._mp_alloc       = (a1);                                         \
+  arg1._mp_alloc       = d1->words;                                    \
   arg1._mp_size                = (s1);                                         \
   arg1._mp_d           = (unsigned long int *) (BYTE_ARR_CTS(d1));     \
-  arg2._mp_alloc       = (a2);                                         \
+  arg2._mp_alloc       = d2->words;                                    \
   arg2._mp_size                = (s2);                                         \
   arg2._mp_d           = (unsigned long int *) (BYTE_ARR_CTS(d2));     \
                                                                        \
@@ -571,9 +628,38 @@ FN_(name)                                                          \
   /* Perform the operation */                                          \
   STGCALL3(mp_fun,&result,&arg1,&arg2);                                        \
                                                                        \
-  RET_NNP(result._mp_alloc,                                            \
-         result._mp_size,                                              \
-          result._mp_d-sizeofW(StgArrWords));                          \
+  TICK_RET_UNBOXED_TUP(2);                                             \
+  RET_NP(result._mp_size,                                              \
+         result._mp_d-sizeofW(StgArrWords));                           \
+  FE_                                                                  \
+}
+
+#define GMP_TAKE1_RET1(name,mp_fun)                                    \
+FN_(name)                                                              \
+{                                                                      \
+  MP_INT arg1, result;                                                 \
+  I_ s1;                                                               \
+  StgArrWords* d1;                                                     \
+  FB_                                                                  \
+                                                                       \
+  /* call doYouWantToGC() */                                           \
+  MAYBE_GC(R2_PTR, name);                                              \
+                                                                       \
+  d1 = (StgArrWords *)R2.p;                                            \
+  s1 = R1.i;                                                           \
+                                                                       \
+  arg1._mp_alloc       = d1->words;                                    \
+  arg1._mp_size                = (s1);                                         \
+  arg1._mp_d           = (unsigned long int *) (BYTE_ARR_CTS(d1));     \
+                                                                       \
+  STGCALL1(mpz_init,&result);                                          \
+                                                                       \
+  /* Perform the operation */                                          \
+  STGCALL2(mp_fun,&result,&arg1);                                      \
+                                                                       \
+  TICK_RET_UNBOXED_TUP(2);                                             \
+  RET_NP(result._mp_size,                                              \
+         result._mp_d-sizeofW(StgArrWords));                           \
   FE_                                                                  \
 }
 
@@ -581,25 +667,23 @@ FN_(name)                                                         \
 FN_(name)                                                              \
 {                                                                      \
   MP_INT arg1, arg2, result1, result2;                                 \
-  I_ a1, s1, a2, s2;                                                   \
+  I_ s1, s2;                                                           \
   StgArrWords* d1;                                                     \
   StgArrWords* d2;                                                     \
   FB_                                                                  \
                                                                        \
   /* call doYouWantToGC() */                                           \
-  MAYBE_GC(R3_PTR | R6_PTR, name);                                     \
+  MAYBE_GC(R2_PTR | R4_PTR, name);                                     \
                                                                        \
-  a1 = R1.i;                                                           \
-  s1 = R2.i;                                                           \
-  d1 = stgCast(StgArrWords*,R3.p);                                     \
-  a2 = R4.i;                                                           \
-  s2 = R5.i;                                                           \
-  d2 = stgCast(StgArrWords*,R6.p);                                     \
+  d1 = (StgArrWords *)R2.p;                                            \
+  s1 = R1.i;                                                           \
+  d2 = (StgArrWords *)R4.p;                                            \
+  s2 = R3.i;                                                           \
                                                                        \
-  arg1._mp_alloc       = (a1);                                         \
+  arg1._mp_alloc       = d1->words;                                    \
   arg1._mp_size                = (s1);                                         \
   arg1._mp_d           = (unsigned long int *) (BYTE_ARR_CTS(d1));     \
-  arg2._mp_alloc       = (a2);                                         \
+  arg2._mp_alloc       = d2->words;                                    \
   arg2._mp_size                = (s2);                                         \
   arg2._mp_d           = (unsigned long int *) (BYTE_ARR_CTS(d2));     \
                                                                        \
@@ -609,25 +693,30 @@ FN_(name)                                                         \
   /* Perform the operation */                                          \
   STGCALL4(mp_fun,&result1,&result2,&arg1,&arg2);                      \
                                                                        \
-  RET_NNPNNP(result1._mp_alloc,                                                \
-            result1._mp_size,                                          \
-             result1._mp_d-sizeofW(StgArrWords),                       \
-            result2._mp_alloc,                                         \
-            result2._mp_size,                                          \
-             result2._mp_d-sizeofW(StgArrWords));                      \
+  TICK_RET_UNBOXED_TUP(4);                                             \
+  RET_NPNP(result1._mp_size,                                           \
+           result1._mp_d-sizeofW(StgArrWords),                         \
+          result2._mp_size,                                            \
+           result2._mp_d-sizeofW(StgArrWords));                                \
   FE_                                                                  \
 }
 
-GMP_TAKE2_RET1(plusIntegerZh_fast,  mpz_add);
-GMP_TAKE2_RET1(minusIntegerZh_fast, mpz_sub);
-GMP_TAKE2_RET1(timesIntegerZh_fast, mpz_mul);
-GMP_TAKE2_RET1(gcdIntegerZh_fast,   mpz_gcd);
-
-GMP_TAKE2_RET2(quotRemIntegerZh_fast, mpz_tdiv_qr);
-GMP_TAKE2_RET2(divModIntegerZh_fast,  mpz_fdiv_qr);
-
-#ifndef FLOATS_AS_DOUBLES
-FN_(decodeFloatZh_fast)
+GMP_TAKE2_RET1(plusIntegerzh_fast,     mpz_add);
+GMP_TAKE2_RET1(minusIntegerzh_fast,    mpz_sub);
+GMP_TAKE2_RET1(timesIntegerzh_fast,    mpz_mul);
+GMP_TAKE2_RET1(gcdIntegerzh_fast,      mpz_gcd);
+GMP_TAKE2_RET1(quotIntegerzh_fast,     mpz_tdiv_q);
+GMP_TAKE2_RET1(remIntegerzh_fast,      mpz_tdiv_r);
+GMP_TAKE2_RET1(divExactIntegerzh_fast, mpz_divexact);
+GMP_TAKE2_RET1(andIntegerzh_fast,      mpz_and);
+GMP_TAKE2_RET1(orIntegerzh_fast,       mpz_ior);
+GMP_TAKE2_RET1(xorIntegerzh_fast,      mpz_xor);
+GMP_TAKE1_RET1(complementIntegerzh_fast, mpz_com);
+
+GMP_TAKE2_RET2(quotRemIntegerzh_fast, mpz_tdiv_qr);
+GMP_TAKE2_RET2(divModIntegerzh_fast,  mpz_fdiv_qr);
+
+FN_(decodeFloatzh_fast)
 { 
   MP_INT mantissa;
   I_ exponent;
@@ -638,29 +727,29 @@ FN_(decodeFloatZh_fast)
   /* arguments: F1 = Float# */
   arg = F1;
 
-  HP_CHK_GEN(sizeof(StgArrWords)+1, NO_PTRS, decodeFloatZh_fast,);
-  TICK_ALLOC_PRIM(sizeofW(StgArrWords)+1,wibble,wibble,wibble)
+  HP_CHK_GEN_TICKY(sizeofW(StgArrWords)+1, NO_PTRS, decodeFloatzh_fast,);
+  TICK_ALLOC_PRIM(sizeofW(StgArrWords),1,0);
   CCS_ALLOC(CCCS,sizeofW(StgArrWords)+1); /* ccs prof */
 
   /* Be prepared to tell Lennart-coded __decodeFloat   */
   /* where mantissa._mp_d can be put (it does not care about the rest) */
-  p = stgCast(StgArrWords*,Hp)-1;
-  SET_ARR_HDR(p,&ARR_WORDS_info,CCCS,1)
+  p = (StgArrWords *)Hp - 1;
+  SET_ARR_HDR(p,&stg_ARR_WORDS_info,CCCS,1)
   mantissa._mp_d = (void *)BYTE_ARR_CTS(p);
 
   /* Perform the operation */
   STGCALL3(__decodeFloat,&mantissa,&exponent,arg);
 
-  /* returns: (R1 = Int# (expn), R2 = Int#, R3 = Int#, R4 = ByteArray#) */
-  RET_NNNP(exponent,mantissa._mp_alloc,mantissa._mp_size,p);
+  /* returns: (Int# (expn), Int#, ByteArray#) */
+  TICK_RET_UNBOXED_TUP(3);
+  RET_NNP(exponent,mantissa._mp_size,p);
   FE_
 }
-#endif /* !FLOATS_AS_DOUBLES */
 
-#define DOUBLE_MANTISSA_SIZE (sizeof(StgDouble)/sizeof(W_))
-#define ARR_SIZE (sizeof(StgArrWords) + DOUBLE_MANTISSA_SIZE)
+#define DOUBLE_MANTISSA_SIZE (sizeofW(StgDouble))
+#define ARR_SIZE (sizeofW(StgArrWords) + DOUBLE_MANTISSA_SIZE)
 
-FN_(decodeDoubleZh_fast)
+FN_(decodeDoublezh_fast)
 { MP_INT mantissa;
   I_ exponent;
   StgDouble arg;
@@ -670,21 +759,22 @@ FN_(decodeDoubleZh_fast)
   /* arguments: D1 = Double# */
   arg = D1;
 
-  HP_CHK_GEN(ARR_SIZE, NO_PTRS, decodeDoubleZh_fast,);
-  TICK_ALLOC_PRIM(ARR_SIZE,wibble,wibble,wibble)
+  HP_CHK_GEN_TICKY(ARR_SIZE, NO_PTRS, decodeDoublezh_fast,);
+  TICK_ALLOC_PRIM(sizeofW(StgArrWords),DOUBLE_MANTISSA_SIZE,0);
   CCS_ALLOC(CCCS,ARR_SIZE); /* ccs prof */
 
   /* Be prepared to tell Lennart-coded __decodeDouble  */
   /* where mantissa.d can be put (it does not care about the rest) */
-  p = stgCast(StgArrWords*,Hp-ARR_SIZE+1);
-  SET_ARR_HDR(p, &ARR_WORDS_info, CCCS, DOUBLE_MANTISSA_SIZE);
+  p = (StgArrWords *)(Hp-ARR_SIZE+1);
+  SET_ARR_HDR(p, &stg_ARR_WORDS_info, CCCS, DOUBLE_MANTISSA_SIZE);
   mantissa._mp_d = (void *)BYTE_ARR_CTS(p);
 
   /* Perform the operation */
   STGCALL3(__decodeDouble,&mantissa,&exponent,arg);
 
-  /* returns: (R1 = Int# (expn), R2 = Int#, R3 = Int#, R4 = ByteArray#) */
-  RET_NNNP(exponent,mantissa._mp_alloc,mantissa._mp_size,p);
+  /* returns: (Int# (expn), Int#, ByteArray#) */
+  TICK_RET_UNBOXED_TUP(3);
+  RET_NNP(exponent,mantissa._mp_size,p);
   FE_
 }
 
@@ -692,137 +782,286 @@ FN_(decodeDoubleZh_fast)
  * Concurrency primitives
  * -------------------------------------------------------------------------- */
 
-FN_(forkZh_fast)
+FN_(forkzh_fast)
 {
   FB_
   /* args: R1 = closure to spark */
   
-  if (closure_SHOULD_SPARK(stgCast(StgClosure*,R1.p))) {
+  MAYBE_GC(R1_PTR, forkzh_fast);
 
-    MAYBE_GC(R1_PTR, forkZh_fast);
-
-    /* create it right now, return ThreadID in R1 */
-    R1.t = RET_STGCALL2(StgTSO *, createIOThread, 
-                       RtsFlags.GcFlags.initialStkSize, R1.cl);
+  /* create it right now, return ThreadID in R1 */
+  R1.t = RET_STGCALL2(StgTSO *, createIOThread, 
+                     RtsFlags.GcFlags.initialStkSize, R1.cl);
+  STGCALL1(scheduleThread, R1.t);
       
-    /* switch at the earliest opportunity */ 
-    context_switch = 1;
-  }
+  /* switch at the earliest opportunity */ 
+  context_switch = 1;
   
-  JMP_(*Sp);
-
+  JMP_(ENTRY_CODE(Sp[0]));
   FE_
 }
 
-FN_(killThreadZh_fast)
+FN_(yieldzh_fast)
 {
   FB_
-  /* args: R1.p = TSO to kill */
-
-  /* The thread is dead, but the TSO sticks around for a while.  That's why
-   * we don't have to explicitly remove it from any queues it might be on.
-   */
-  STGCALL1(deleteThread, (StgTSO *)R1.p);
-
-  /* We might have killed ourselves.  In which case, better return to the
-   * scheduler...
-   */
-  if ((StgTSO *)R1.p == CurrentTSO) {
-       JMP_(stg_stop_thread_entry); /* leave semi-gracefully */
-  }
-
-  JMP_(ENTRY_CODE(Sp[0]));
+  JMP_(stg_yield_noregs);
   FE_
 }
 
-FN_(newMVarZh_fast)
+FN_(newMVarzh_fast)
 {
   StgMVar *mvar;
 
   FB_
   /* args: none */
 
-  HP_CHK_GEN(sizeofW(StgMVar), NO_PTRS, newMVarZh_fast,);
-  TICK_ALLOC_PRIM(sizeofW(StgMVar),wibble,wibble,wibble)
+  HP_CHK_GEN_TICKY(sizeofW(StgMVar), NO_PTRS, newMVarzh_fast,);
+  TICK_ALLOC_PRIM(sizeofW(StgMutVar)-1, // consider head,tail,link as admin wds
+                 1, 0);
   CCS_ALLOC(CCCS,sizeofW(StgMVar)); /* ccs prof */
   
   mvar = (StgMVar *) (Hp - sizeofW(StgMVar) + 1);
-  SET_INFO(mvar,&EMPTY_MVAR_info);
-  mvar->head = mvar->tail = (StgTSO *)&END_TSO_QUEUE_closure;
-  mvar->value = (StgClosure *)&END_TSO_QUEUE_closure;
+  SET_HDR(mvar,&stg_EMPTY_MVAR_info,CCCS);
+  mvar->head = mvar->tail = (StgTSO *)&stg_END_TSO_QUEUE_closure;
+  mvar->value = (StgClosure *)&stg_END_TSO_QUEUE_closure;
 
-  R1.p = (P_)mvar;
-
-  JMP_(ENTRY_CODE(Sp[0]));
+  TICK_RET_UNBOXED_TUP(1);
+  RET_P(mvar);
   FE_
 }
 
-FN_(takeMVarZh_fast)
+FN_(takeMVarzh_fast)
 {
   StgMVar *mvar;
+  StgClosure *val;
+  const StgInfoTable *info;
 
   FB_
   /* args: R1 = MVar closure */
 
   mvar = (StgMVar *)R1.p;
 
+#ifdef SMP
+  info = LOCK_CLOSURE(mvar);
+#else
+  info = GET_INFO(mvar);
+#endif
+
   /* If the MVar is empty, put ourselves on its blocking queue,
    * and wait until we're woken up.
    */
-  if (GET_INFO(mvar) != &FULL_MVAR_info) {
-    if (mvar->head == (StgTSO *)&END_TSO_QUEUE_closure) {
+  if (info == &stg_EMPTY_MVAR_info) {
+    if (mvar->head == (StgTSO *)&stg_END_TSO_QUEUE_closure) {
+      mvar->head = CurrentTSO;
+    } else {
+      mvar->tail->link = CurrentTSO;
+    }
+    CurrentTSO->link = (StgTSO *)&stg_END_TSO_QUEUE_closure;
+    CurrentTSO->why_blocked = BlockedOnMVar;
+    CurrentTSO->block_info.closure = (StgClosure *)mvar;
+    mvar->tail = CurrentTSO;
+
+#ifdef SMP
+    /* unlock the MVar */
+    mvar->header.info = &stg_EMPTY_MVAR_info;
+#endif
+    BLOCK(R1_PTR, takeMVarzh_fast);
+  }
+
+  val = mvar->value;
+  mvar->value = (StgClosure *)&stg_END_TSO_QUEUE_closure;
+
+  /* wake up the first thread on the queue
+   */
+  if (mvar->head != (StgTSO *)&stg_END_TSO_QUEUE_closure) {
+      ASSERT(mvar->head->why_blocked == BlockedOnMVar);
+#if defined(GRAN) || defined(PAR)
+      /* ToDo: check 2nd arg (mvar) is right */
+      mvar->head = RET_STGCALL2(StgTSO *,unblockOne,mvar->head,mvar);
+#else
+      mvar->head = RET_STGCALL1(StgTSO *,unblockOne,mvar->head);
+#endif
+      if (mvar->head == (StgTSO *)&stg_END_TSO_QUEUE_closure) {
+         mvar->tail = (StgTSO *)&stg_END_TSO_QUEUE_closure;
+      }
+  }
+
+  /* do this last... we might have locked the MVar in the SMP case,
+   * and writing the info pointer will unlock it.
+   */
+  SET_INFO(mvar,&stg_EMPTY_MVAR_info);
+
+  TICK_RET_UNBOXED_TUP(1);
+  RET_P(val);
+  FE_
+}
+
+FN_(tryTakeMVarzh_fast)
+{
+  StgMVar *mvar;
+  StgClosure *val;
+  const StgInfoTable *info;
+
+  FB_
+  /* args: R1 = MVar closure */
+
+  mvar = (StgMVar *)R1.p;
+
+#ifdef SMP
+  info = LOCK_CLOSURE(mvar);
+#else
+  info = GET_INFO(mvar);
+#endif
+
+  if (info == &stg_EMPTY_MVAR_info) {
+
+#ifdef SMP
+    /* unlock the MVar */
+    mvar->header.info = &stg_EMPTY_MVAR_info;
+#endif
+
+    /* HACK: we need a pointer to pass back, so we abuse NO_FINALIZER_closure */
+    RET_NP(0, &stg_NO_FINALIZER_closure);
+  }
+
+  val = mvar->value;
+  mvar->value = (StgClosure *)&stg_END_TSO_QUEUE_closure;
+
+  /* wake up the first thread on the queue
+   */
+  if (mvar->head != (StgTSO *)&stg_END_TSO_QUEUE_closure) {
+      ASSERT(mvar->head->why_blocked == BlockedOnMVar);
+#if defined(GRAN) || defined(PAR)
+      /* ToDo: check 2nd arg (mvar) is right */
+      mvar->head = RET_STGCALL2(StgTSO *,unblockOne,mvar->head,mvar);
+#else
+      mvar->head = RET_STGCALL1(StgTSO *,unblockOne,mvar->head);
+#endif
+      if (mvar->head == (StgTSO *)&stg_END_TSO_QUEUE_closure) {
+         mvar->tail = (StgTSO *)&stg_END_TSO_QUEUE_closure;
+      }
+  }
+
+  /* do this last... we might have locked the MVar in the SMP case,
+   * and writing the info pointer will unlock it.
+   */
+  SET_INFO(mvar,&stg_EMPTY_MVAR_info);
+
+  TICK_RET_UNBOXED_TUP(1);
+  RET_NP(1,val);
+  FE_
+}
+
+FN_(putMVarzh_fast)
+{
+  StgMVar *mvar;
+  const StgInfoTable *info;
+
+  FB_
+  /* args: R1 = MVar, R2 = value */
+
+  mvar = (StgMVar *)R1.p;
+
+#ifdef SMP
+  info = LOCK_CLOSURE(mvar);
+#else
+  info = GET_INFO(mvar);
+#endif
+
+  if (info == &stg_FULL_MVAR_info) {
+    if (mvar->head == (StgTSO *)&stg_END_TSO_QUEUE_closure) {
       mvar->head = CurrentTSO;
     } else {
       mvar->tail->link = CurrentTSO;
     }
-    CurrentTSO->link = (StgTSO *)&END_TSO_QUEUE_closure;
+    CurrentTSO->link = (StgTSO *)&stg_END_TSO_QUEUE_closure;
+    CurrentTSO->why_blocked = BlockedOnMVar;
+    CurrentTSO->block_info.closure = (StgClosure *)mvar;
     mvar->tail = CurrentTSO;
 
-    BLOCK(R1_PTR, takeMVarZh_fast);
+#ifdef SMP
+    /* unlock the MVar */
+    mvar->header.info = &stg_FULL_MVAR_info;
+#endif
+    BLOCK( R1_PTR | R2_PTR, putMVarzh_fast );
   }
+  
+  mvar->value = R2.cl;
 
-  SET_INFO(mvar,&EMPTY_MVAR_info);
-  R1.cl = mvar->value;
-  mvar->value = (StgClosure *)&END_TSO_QUEUE_closure;
+  /* wake up the first thread on the queue, it will continue with the
+   * takeMVar operation and mark the MVar empty again.
+   */
+  if (mvar->head != (StgTSO *)&stg_END_TSO_QUEUE_closure) {
+    ASSERT(mvar->head->why_blocked == BlockedOnMVar);
+#if defined(GRAN) || defined(PAR)
+    /* ToDo: check 2nd arg (mvar) is right */
+    mvar->head = RET_STGCALL2(StgTSO *,unblockOne,mvar->head,mvar);
+#else
+    mvar->head = RET_STGCALL1(StgTSO *,unblockOne,mvar->head);
+#endif
+    if (mvar->head == (StgTSO *)&stg_END_TSO_QUEUE_closure) {
+      mvar->tail = (StgTSO *)&stg_END_TSO_QUEUE_closure;
+    }
+  }
 
+  /* unlocks the MVar in the SMP case */
+  SET_INFO(mvar,&stg_FULL_MVAR_info);
+
+  /* ToDo: yield here for better communication performance? */
   JMP_(ENTRY_CODE(Sp[0]));
   FE_
 }
 
-FN_(putMVarZh_fast)
+FN_(tryPutMVarzh_fast)
 {
   StgMVar *mvar;
-  StgTSO *tso;
+  const StgInfoTable *info;
 
   FB_
   /* args: R1 = MVar, R2 = value */
 
   mvar = (StgMVar *)R1.p;
-  if (GET_INFO(mvar) == &FULL_MVAR_info) {
-    fflush(stdout);
-    fprintf(stderr, "putMVar#: MVar already full.\n");
-    stg_exit(EXIT_FAILURE);
+
+#ifdef SMP
+  info = LOCK_CLOSURE(mvar);
+#else
+  info = GET_INFO(mvar);
+#endif
+
+  if (info == &stg_FULL_MVAR_info) {
+
+#ifdef SMP
+    /* unlock the MVar */
+    mvar->header.info = &stg_FULL_MVAR_info;
+#endif
+
+    /* HACK: we need a pointer to pass back, so we abuse NO_FINALIZER_closure */
+    RET_N(0);
   }
   
-  SET_INFO(mvar,&FULL_MVAR_info);
   mvar->value = R2.cl;
 
-  /* wake up the first thread on the queue,
-   * it will continue with the takeMVar operation and mark the MVar
-   * empty again.
+  /* wake up the first thread on the queue, it will continue with the
+   * takeMVar operation and mark the MVar empty again.
    */
-  tso = mvar->head;
-  if (tso != (StgTSO *)&END_TSO_QUEUE_closure) {
-    PUSH_ON_RUN_QUEUE(tso);
-    mvar->head = tso->link;
-    tso->link = (StgTSO *)&END_TSO_QUEUE_closure;
-    if (mvar->head == (StgTSO *)&END_TSO_QUEUE_closure) {
-      mvar->tail = (StgTSO *)&END_TSO_QUEUE_closure;
+  if (mvar->head != (StgTSO *)&stg_END_TSO_QUEUE_closure) {
+    ASSERT(mvar->head->why_blocked == BlockedOnMVar);
+#if defined(GRAN) || defined(PAR)
+    /* ToDo: check 2nd arg (mvar) is right */
+    mvar->head = RET_STGCALL2(StgTSO *,unblockOne,mvar->head,mvar);
+#else
+    mvar->head = RET_STGCALL1(StgTSO *,unblockOne,mvar->head);
+#endif
+    if (mvar->head == (StgTSO *)&stg_END_TSO_QUEUE_closure) {
+      mvar->tail = (StgTSO *)&stg_END_TSO_QUEUE_closure;
     }
   }
 
+  /* unlocks the MVar in the SMP case */
+  SET_INFO(mvar,&stg_FULL_MVAR_info);
+
   /* ToDo: yield here for better communication performance? */
-  JMP_(ENTRY_CODE(*Sp));
+  RET_N(1);
   FE_
 }
 
@@ -830,22 +1069,146 @@ FN_(putMVarZh_fast)
    Stable pointer primitives
    -------------------------------------------------------------------------  */
 
-FN_(makeStablePtrZh_fast)
+FN_(makeStableNamezh_fast)
 {
-  StgInt stable_ptr;
-  FB_ 
+  StgWord index;
+  StgStableName *sn_obj;
+  FB_
 
-    if (stable_ptr_free == NULL) {
-      enlargeStablePtrTable();
+  HP_CHK_GEN_TICKY(sizeofW(StgStableName), R1_PTR, makeStableNamezh_fast,);
+  TICK_ALLOC_PRIM(sizeofW(StgHeader), 
+                 sizeofW(StgStableName)-sizeofW(StgHeader), 0);
+  CCS_ALLOC(CCCS,sizeofW(StgStableName)); /* ccs prof */
+  
+  index = RET_STGCALL1(StgWord,lookupStableName,R1.p);
+
+  /* Is there already a StableName for this heap object? */
+  if (stable_ptr_table[index].sn_obj == NULL) {
+    sn_obj = (StgStableName *) (Hp - sizeofW(StgStableName) + 1);
+    sn_obj->header.info = &stg_STABLE_NAME_info;
+    sn_obj->sn = index;
+    stable_ptr_table[index].sn_obj = (StgClosure *)sn_obj;
+  } else {
+    (StgClosure *)sn_obj = stable_ptr_table[index].sn_obj;
+  }
+
+  TICK_RET_UNBOXED_TUP(1);
+  RET_P(sn_obj);
+}
+
+/* -----------------------------------------------------------------------------
+   Bytecode object primitives
+   -------------------------------------------------------------------------  */
+
+FN_(newBCOzh_fast)
+{
+  /* R1.p = instrs
+     R2.p = literals
+     R3.p = ptrs
+     R4.p = itbls
+  */
+  StgBCO *bco;
+  FB_
+
+  HP_CHK_GEN_TICKY(sizeofW(StgBCO),R1_PTR|R2_PTR|R3_PTR|R4_PTR, newBCOzh_fast,);
+  TICK_ALLOC_PRIM(sizeofW(StgHeader), sizeofW(StgBCO)-sizeofW(StgHeader), 0);
+  CCS_ALLOC(CCCS,sizeofW(StgBCO)); /* ccs prof */
+  bco = (StgBCO *) (Hp + 1 - sizeofW(StgBCO));
+  SET_HDR(bco, &stg_BCO_info, CCCS);
+
+  bco->instrs     = (StgArrWords*)R1.cl;
+  bco->literals   = (StgArrWords*)R2.cl;
+  bco->ptrs       = (StgMutArrPtrs*)R3.cl;
+  bco->itbls      = (StgArrWords*)R4.cl;
+
+  TICK_RET_UNBOXED_TUP(1);
+  RET_P(bco);
+  FE_
+}
+
+FN_(mkApUpd0zh_fast)
+{
+  /* R1.p = the fn for the AP_UPD
+  */
+  StgAP_UPD* ap;
+  FB_
+  HP_CHK_GEN_TICKY(AP_sizeW(0), R1_PTR, mkApUpd0zh_fast,);
+  TICK_ALLOC_PRIM(sizeofW(StgHeader), AP_sizeW(0)-sizeofW(StgHeader), 0);
+  CCS_ALLOC(CCCS,AP_sizeW(0)); /* ccs prof */
+  ap = (StgAP_UPD *) (Hp + 1 - AP_sizeW(0));
+  SET_HDR(ap, &stg_AP_UPD_info, CCCS);
+
+  ap->n_args = 0;
+  ap->fun = R1.cl;
+
+  TICK_RET_UNBOXED_TUP(1);
+  RET_P(ap);
+  FE_
+}
+
+/* -----------------------------------------------------------------------------
+   Thread I/O blocking primitives
+   -------------------------------------------------------------------------- */
+
+FN_(waitReadzh_fast)
+{
+  FB_
+    /* args: R1.i */
+    ASSERT(CurrentTSO->why_blocked == NotBlocked);
+    CurrentTSO->why_blocked = BlockedOnRead;
+    CurrentTSO->block_info.fd = R1.i;
+    ACQUIRE_LOCK(&sched_mutex);
+    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
+    RELEASE_LOCK(&sched_mutex);
+    JMP_(stg_block_noregs);
+  FE_
+}
+
+FN_(waitWritezh_fast)
+{
+  FB_
+    /* args: R1.i */
+    ASSERT(CurrentTSO->why_blocked == NotBlocked);
+    CurrentTSO->why_blocked = BlockedOnWrite;
+    CurrentTSO->block_info.fd = R1.i;
+    ACQUIRE_LOCK(&sched_mutex);
+    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
+    RELEASE_LOCK(&sched_mutex);
+    JMP_(stg_block_noregs);
+  FE_
+}
+
+FN_(delayzh_fast)
+{
+  StgTSO *t, *prev;
+  nat target;
+  FB_
+    /* args: R1.i */
+    ASSERT(CurrentTSO->why_blocked == NotBlocked);
+    CurrentTSO->why_blocked = BlockedOnDelay;
+
+    ACQUIRE_LOCK(&sched_mutex);
+
+    target = (R1.i / (TICK_MILLISECS*1000)) + getourtimeofday();
+    CurrentTSO->block_info.target = target;
+
+    /* Insert the new thread in the sleeping queue. */
+    prev = NULL;
+    t = sleeping_queue;
+    while (t != END_TSO_QUEUE && t->block_info.target < target) {
+       prev = t;
+       t = t->link;
     }
 
-    stable_ptr = stable_ptr_free - stable_ptr_table;
-    (P_)stable_ptr_free  = *stable_ptr_free;
-    stable_ptr_table[stable_ptr] = R1.p;
+    CurrentTSO->link = t;
+    if (prev == NULL) {
+       sleeping_queue = CurrentTSO;
+    } else {
+       prev->link = CurrentTSO;
+    }
 
-    R1.i = stable_ptr;
-    JMP_(ENTRY_CODE(Sp[0]));
+    RELEASE_LOCK(&sched_mutex);
+    JMP_(stg_block_noregs);
   FE_
 }
 
-#endif /* COMPILER */