[project @ 2005-05-10 13:25:41 by simonmar]
authorsimonmar <unknown>
Tue, 10 May 2005 13:25:43 +0000 (13:25 +0000)
committersimonmar <unknown>
Tue, 10 May 2005 13:25:43 +0000 (13:25 +0000)
Two SMP-related changes:

  - New storage manager interface:

    bdescr *allocateLocal(StgRegTable *reg, nat words)

    which allocates from the current thread's nursery (being careful
    not to clash with the heap pointer).  It can do this without
    taking any locks; the lock only has to be taken if a block needs
    to be allocated.  allocateLocal() is now used instead of allocate()
    in a few PrimOps.

    This removes locks from most Integer operations, cutting down
    the overhead for SMP a bit more.

    To make this work, we have to be able to grab the current thread's
    Capability out of thin air (i.e. when called from GMP), so the
    Capability subsystem needs to keep a hash from thread IDs to
    Capabilities.

  - Small MVar optimisation: instead of taking the global
    storage-manager lock, do our own locking of MVars with a bit of
    inline assembly (x86 only for now).

13 files changed:
ghc/includes/Cmm.h
ghc/includes/Regs.h
ghc/includes/Rts.h
ghc/includes/SMP.h
ghc/includes/StgMiscClosures.h
ghc/includes/Storage.h
ghc/rts/Capability.c
ghc/rts/Capability.h
ghc/rts/GC.c
ghc/rts/Makefile
ghc/rts/PrimOps.cmm
ghc/rts/StgMiscClosures.cmm
ghc/rts/Storage.c

index e989a00..415dc4c 100644 (file)
    HP_CHK_GEN(alloc,liveness,reentry);                 \
    TICK_ALLOC_HEAP_NOCTR(alloc);
 
+// allocateLocal() allocates from the nursery, so we check to see
+// whether the nursery is nearly empty in any function that uses
+// allocateLocal() - this includes many of the primops.
 #define MAYBE_GC(liveness,reentry)                     \
-   if (CInt[alloc_blocks] >= CInt[alloc_blocks_lim]) { \
+   if (bdescr_link(CurrentNursery) == NULL) {          \
        R9  = liveness;                                 \
         R10 = reentry;                                 \
         jump stg_gc_gen_hp;                            \
index 0203238..5374972 100644 (file)
@@ -87,7 +87,8 @@ typedef struct StgRegTable_ {
   StgPtr         rHpLim;
   struct StgTSO_ *rCurrentTSO;
   struct step_   *rNursery;
-  struct bdescr_ *rCurrentNursery;
+  struct bdescr_ *rCurrentNursery; /* Hp/HpLim point into this block */
+  struct bdescr_ *rCurrentAlloc;   /* for allocation using allocate() */
   StgWord         rHpAlloc;    /* number of *bytes* being allocated in heap */
 #if defined(SMP) || defined(PAR)
   StgSparkPool   rSparks;      /* per-task spark pool */
index fb2a70b..27331be 100644 (file)
@@ -110,13 +110,8 @@ extern void _assertFail (char *, unsigned int);
 #include "Parallel.h"
 
 /* STG/Optimised-C related stuff */
-#include "SMP.h"
 #include "Block.h"
 
-#ifdef SMP
-#include <pthread.h>
-#endif
-
 /* GNU mp library */
 #include "gmp.h"
 
index e35b95b..86930f9 100644 (file)
@@ -1,6 +1,6 @@
 /* ----------------------------------------------------------------------------
  *
- * (c) The GHC Team, 1999
+ * (c) The GHC Team, 2005
  *
  * Macros for SMP support
  *
 #error Build options incompatible with SMP.
 #endif
 
-/*
- * CMPXCHG - this instruction is the standard "test & set".  We use it
- * for locking closures in the thunk and blackhole entry code.  If the
- * closure is already locked, or has an unexpected info pointer
- * (because another thread is altering it in parallel), we just jump
- * to the new entry point.
- */
-#if defined(i386_HOST_ARCH) && defined(TABLES_NEXT_TO_CODE)
-#define CMPXCHG(p, cmp, new)                   \
-  __asm__ __volatile__ (                       \
-         "lock ; cmpxchg %1, %0\n"             \
-          "\tje 1f\n"                          \
-          "\tjmp *%%eax\n"                     \
-          "\t1:\n"                             \
-         : /* no outputs */                    \
-         : "m" (p), "r" (new), "r" (cmp)       \
-         )
-
 /* 
  * XCHG - the atomic exchange instruction.  Used for locking closures
  * during updates (see LOCK_CLOSURE below) and the MVar primops.
  */
-#define XCHG(reg, obj)                         \
-  __asm__ __volatile__ (                       \
-         "xchgl %1,%0"                         \
-          :"+r" (reg), "+m" (obj)              \
-          : /* no input-only operands */       \
-         )
-
+INLINE_HEADER StgWord
+xchg(StgPtr p, StgWord w)
+{
+    StgWord result;
+    result = w;
+    __asm__ __volatile__ (
+         "xchgl %1,%0"
+          :"+r" (result), "+m" (*p)
+          : /* no input-only operands */
+       );
+    return result;
+}
+
+INLINE_HEADER StgInfoTable *
+lockClosure(StgClosure *p)
+{
+    StgWord info;
+#if 0
+    do {
+       info = xchg((P_)&p->header.info, (W_)&stg_WHITEHOLE_info);
+       if (info != (W_)&stg_WHITEHOLE_info) return (StgInfoTable *)info;
+       yieldThread();
+    } while (1);
 #else
-#error SMP macros not defined for this architecture
+    info = p->header.info;
 #endif
-
-/*
- * LOCK_CLOSURE locks the specified closure, busy waiting for any
- * existing locks to be cleared.
- */
-#define LOCK_CLOSURE(c)                                        \
-  ({                                                   \
-    const StgInfoTable *__info;                                \
-    __info = &stg_WHITEHOLE_info;                      \
-    do {                                               \
-      XCHG(__info,((StgClosure *)(c))->header.info);   \
-    } while (__info == &stg_WHITEHOLE_info);           \
-    __info;                                            \
-  })
-
-#define LOCK_THUNK(__info)                             \
-  CMPXCHG(R1.cl->header.info, __info, &stg_WHITEHOLE_info);
-
-#else /* !SMP */
-
-#define LOCK_CLOSURE(c)     /* nothing */
-#define LOCK_THUNK(__info)  /* nothing */
+}
 
 #endif /* SMP */
 
index f8332aa..026c2cf 100644 (file)
@@ -95,6 +95,7 @@ RTS_INFO(stg_IND_OLDGEN_info);
 RTS_INFO(stg_IND_OLDGEN_PERM_info);
 RTS_INFO(stg_CAF_UNENTERED_info);
 RTS_INFO(stg_CAF_ENTERED_info);
+RTS_INFO(stg_WHITEHOLE_info);
 RTS_INFO(stg_BLACKHOLE_info);
 RTS_INFO(stg_CAF_BLACKHOLE_info);
 #ifdef TICKY_TICKY
@@ -155,6 +156,7 @@ RTS_ENTRY(stg_IND_OLDGEN_entry);
 RTS_ENTRY(stg_IND_OLDGEN_PERM_entry);
 RTS_ENTRY(stg_CAF_UNENTERED_entry);
 RTS_ENTRY(stg_CAF_ENTERED_entry);
+RTS_ENTRY(stg_WHITEHOLE_entry);
 RTS_ENTRY(stg_BLACKHOLE_entry);
 RTS_ENTRY(stg_CAF_BLACKHOLE_entry);
 #ifdef TICKY_TICKY
index a8a6d24..0ef5785 100644 (file)
@@ -139,6 +139,7 @@ extern void exitStorage(void);
    -------------------------------------------------------------------------- */
 
 extern StgPtr  allocate        ( nat n );
+extern StgPtr  allocateLocal   ( StgRegTable *reg, nat n );
 extern StgPtr  allocatePinned  ( nat n );
 extern lnat    allocated_bytes ( void );
 
@@ -193,6 +194,9 @@ extern void GarbageCollect(void (*get_roots)(evac_fn),rtsBool force_major_gc);
  */
 #if defined(SMP)
 extern Mutex sm_mutex;
+#endif
+
+#if defined(SMP)
 #define ACQUIRE_SM_LOCK   ACQUIRE_LOCK(&sm_mutex);
 #define RELEASE_SM_LOCK   RELEASE_LOCK(&sm_mutex);
 #else
index 1e2d3d6..0ae4688 100644 (file)
@@ -22,6 +22,9 @@
 #include "OSThreads.h"
 #include "Capability.h"
 #include "Schedule.h"  /* to get at EMPTY_RUN_QUEUE() */
+#if defined(SMP)
+#include "Hash.h"
+#endif
 
 #if !defined(SMP)
 Capability MainCapability;     /* for non-SMP, we have one global capability */
@@ -81,6 +84,11 @@ static rtsBool passingCapability = rtsFalse;
  * Free capability list. 
  */
 Capability *free_capabilities;
+
+/* 
+ * Maps OSThreadId to Capability *
+ */
+HashTable *capability_hash;
 #endif
 
 #ifdef SMP
@@ -133,6 +141,8 @@ initCapabilities( void )
     free_capabilities = &capabilities[0];
     rts_n_free_capabilities = n;
 
+    capability_hash = allocHashTable();
+
     IF_DEBUG(scheduler, sched_belch("allocated %d capabilities", n));
 #else
     capabilities = &MainCapability;
@@ -164,6 +174,7 @@ grabCapability( Capability** cap )
   *cap = free_capabilities;
   free_capabilities = (*cap)->link;
   rts_n_free_capabilities--;
+  insertHashTable(capability_hash, osThreadId(), *cap);
 #else
 # if defined(RTS_SUPPORTS_THREADS)
   ASSERT(rts_n_free_capabilities == 1);
@@ -177,6 +188,23 @@ grabCapability( Capability** cap )
 }
 
 /* ----------------------------------------------------------------------------
+ * Function:  myCapability(void)
+ *
+ * Purpose:   Return the capability owned by the current thread.
+ *            Should not be used if the current thread does not 
+ *            hold a Capability.
+ * ------------------------------------------------------------------------- */
+Capability *
+myCapability (void)
+{
+#if defined(SMP)
+    return lookupHashTable(capability_hash, osThreadId());
+#else
+    return &MainCapability;
+#endif
+}
+
+/* ----------------------------------------------------------------------------
  * Function:  releaseCapability(Capability*)
  *
  * Purpose:   Letting go of a capability. Causes a
@@ -195,6 +223,8 @@ releaseCapability( Capability* cap UNUSED_IF_NOT_SMP )
 #if defined(SMP)
     cap->link = free_capabilities;
     free_capabilities = cap;
+    ASSERT(myCapability() == cap);
+    removeHashTable(capability_hash, osThreadId(), NULL);
 #endif
     // Check to see whether a worker thread can be given
     // the go-ahead to return the result of an external call..
index 21d4ce4..f1615dc 100644 (file)
@@ -38,6 +38,10 @@ extern void releaseCapability( Capability* cap );
 //
 extern void threadRunnable ( void );
 
+// Return the capability that I own.
+// 
+extern Capability *myCapability (void);
+
 extern void prodWorker ( void );
 
 #ifdef RTS_SUPPORTS_THREADS
index db05ef5..fce011a 100644 (file)
@@ -739,7 +739,12 @@ GarbageCollect ( void (*get_roots)(evac_fn), rtsBool force_major_gc )
          if (stp->is_compacted) {
              collected += (oldgen_saved_blocks - stp->n_blocks) * BLOCK_SIZE_W;
          } else {
-             collected += stp->n_blocks * BLOCK_SIZE_W;
+             if (g == 0 && s == 0) {
+                 collected += countNurseryBlocks() * BLOCK_SIZE_W;
+                 collected += alloc_blocks;
+             } else {
+                 collected += stp->n_blocks * BLOCK_SIZE_W;
+             }
          }
 
        /* free old memory and shift to-space into from-space for all
index b564849..802ce61 100644 (file)
@@ -319,7 +319,8 @@ SRC_HC_OPTS += \
   -\#include LdvProfile.h \
   -\#include Profiling.h \
   -\#include OSThreads.h \
-  -\#include Apply.h
+  -\#include Apply.h \
+  -\#include SMP.h
 
 ifeq "$(Windows)" "YES"
 PrimOps_HC_OPTS += -\#include '<windows.h>' -\#include win32/AsyncIO.h
index ff1b442..cdca634 100644 (file)
@@ -49,7 +49,7 @@ newByteArrayzh_fast
     n = R1;
     payload_words = ROUNDUP_BYTES_TO_WDS(n);
     words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
-    "ptr" p = foreign "C" allocate(words);
+    "ptr" p = foreign "C" allocateLocal(BaseReg "ptr",words);
     TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
     SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
     StgArrWords_words(p) = payload_words;
@@ -97,7 +97,7 @@ newArrayzh_fast
     MAYBE_GC(R2_PTR,newArrayzh_fast);
 
     words = BYTES_TO_WDS(SIZEOF_StgMutArrPtrs) + n;
-    "ptr" arr = foreign "C" allocate(words);
+    "ptr" arr = foreign "C" allocateLocal(BaseReg "ptr",words);
     TICK_ALLOC_PRIM(SIZEOF_StgMutArrPtrs, WDS(n), 0);
 
     SET_HDR(arr, stg_MUT_ARR_PTRS_info, W_[CCCS]);
@@ -1429,14 +1429,14 @@ takeMVarzh_fast
 {
     W_ mvar, val, info, tso;
 
-#if defined(SMP)
-    foreign "C" ACQUIRE_LOCK(sm_mutex "ptr");
-#endif
-
     /* args: R1 = MVar closure */
     mvar = R1;
 
+#if defined(SMP)
+    "ptr" info = foreign "C" lockClosure(mvar "ptr");
+#else
     info = GET_INFO(mvar);
+#endif
 
     /* If the MVar is empty, put ourselves on its blocking queue,
      * and wait until we're woken up.
@@ -1453,7 +1453,7 @@ takeMVarzh_fast
        StgMVar_tail(mvar) = CurrentTSO;
        
 #if defined(SMP)
-       foreign "C" RELEASE_LOCK(sm_mutex "ptr");
+        SET_INFO(mvar,stg_EMPTY_MVAR_info);
 #endif
 
        jump stg_block_takemvar;
@@ -1486,7 +1486,7 @@ takeMVarzh_fast
       }
 
 #if defined(SMP)
-      foreign "C" RELEASE_LOCK(sm_mutex "ptr");
+      SET_INFO(mvar,stg_FULL_MVAR_info);
 #endif
 
       RET_P(val);
@@ -1494,16 +1494,10 @@ takeMVarzh_fast
   else
   {
       /* No further putMVars, MVar is now empty */
-      
-      /* do this last... we might have locked the MVar in the SMP case,
-       * and writing the info pointer will unlock it.
-       */
-      SET_INFO(mvar,stg_EMPTY_MVAR_info);
       StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
-
-#if defined(SMP)
-      foreign "C" RELEASE_LOCK(sm_mutex "ptr");
-#endif
+      /* unlocks the closure in the SMP case */
+      SET_INFO(mvar,stg_EMPTY_MVAR_info);
 
       RET_P(val);
   }
@@ -1514,23 +1508,23 @@ tryTakeMVarzh_fast
 {
     W_ mvar, val, info, tso;
 
-#if defined(SMP)
-    foreign "C" ACQUIRE_LOCK(sm_mutex "ptr");
-#endif
-
     /* args: R1 = MVar closure */
 
     mvar = R1;
 
+#if defined(SMP)
+    "ptr" info = foreign "C" lockClosure(mvar "ptr");
+#else
     info = GET_INFO(mvar);
+#endif
 
     if (info == stg_EMPTY_MVAR_info) {
+#if defined(SMP)
+        SET_INFO(mvar,stg_EMPTY_MVAR_info);
+#endif
        /* HACK: we need a pointer to pass back, 
         * so we abuse NO_FINALIZER_closure
         */
-#if defined(SMP)
-       foreign "C" RELEASE_LOCK(sm_mutex "ptr");
-#endif
        RET_NP(0, stg_NO_FINALIZER_closure);
     }
 
@@ -1559,6 +1553,9 @@ tryTakeMVarzh_fast
        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
        }
+#if defined(SMP)
+        SET_INFO(mvar,stg_FULL_MVAR_info);
+#endif
     }
     else 
     {
@@ -1567,10 +1564,6 @@ tryTakeMVarzh_fast
        SET_INFO(mvar,stg_EMPTY_MVAR_info);
     }
     
-#if defined(SMP)
-    foreign "C" RELEASE_LOCK(sm_mutex "ptr");
-#endif
-
     RET_NP(1, val);
 }
 
@@ -1579,14 +1572,14 @@ putMVarzh_fast
 {
     W_ mvar, info, tso;
 
-#if defined(SMP)
-    foreign "C" ACQUIRE_LOCK(sm_mutex "ptr");
-#endif
-
     /* args: R1 = MVar, R2 = value */
     mvar = R1;
 
+#if defined(SMP)
+    "ptr" info = foreign "C" lockClosure(mvar "ptr");
+#else
     info = GET_INFO(mvar);
+#endif
 
     if (info == stg_FULL_MVAR_info) {
        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
@@ -1600,7 +1593,7 @@ putMVarzh_fast
        StgMVar_tail(mvar) = CurrentTSO;
        
 #if defined(SMP)
-       foreign "C" RELEASE_LOCK(sm_mutex "ptr");
+        SET_INFO(mvar,stg_FULL_MVAR_info);
 #endif
        jump stg_block_putmvar;
     }
@@ -1628,7 +1621,7 @@ putMVarzh_fast
        }
 
 #if defined(SMP)
-       foreign "C" RELEASE_LOCK(sm_mutex "ptr");
+        SET_INFO(mvar,stg_EMPTY_MVAR_info);
 #endif
        jump %ENTRY_CODE(Sp(0));
     }
@@ -1639,9 +1632,6 @@ putMVarzh_fast
        /* unlocks the MVar in the SMP case */
        SET_INFO(mvar,stg_FULL_MVAR_info);
 
-#if defined(SMP)
-       foreign "C" RELEASE_LOCK(sm_mutex "ptr");
-#endif
        jump %ENTRY_CODE(Sp(0));
     }
     
@@ -1653,18 +1643,18 @@ tryPutMVarzh_fast
 {
     W_ mvar, info, tso;
 
-#if defined(SMP)
-    foreign "C" ACQUIRE_LOCK(sm_mutex "ptr");
-#endif
-
     /* args: R1 = MVar, R2 = value */
     mvar = R1;
 
+#if defined(SMP)
+    "ptr" info = foreign "C" lockClosure(mvar "ptr");
+#else
     info = GET_INFO(mvar);
+#endif
 
     if (info == stg_FULL_MVAR_info) {
 #if defined(SMP)
-       foreign "C" RELEASE_LOCK(sm_mutex "ptr");
+        SET_INFO(mvar,stg_FULL_MVAR_info);
 #endif
        RET_N(0);
     }
@@ -1692,7 +1682,7 @@ tryPutMVarzh_fast
        }
 
 #if defined(SMP)
-       foreign "C" RELEASE_LOCK(sm_mutex "ptr");
+        SET_INFO(mvar,stg_EMPTY_MVAR_info);
 #endif
        jump %ENTRY_CODE(Sp(0));
     }
@@ -1702,9 +1692,7 @@ tryPutMVarzh_fast
        StgMVar_value(mvar) = R2;
        /* unlocks the MVar in the SMP case */
        SET_INFO(mvar,stg_FULL_MVAR_info);
-#if defined(SMP)
-       foreign "C" RELEASE_LOCK(sm_mutex "ptr");
-#endif
+
        jump %ENTRY_CODE(Sp(0));
     }
     
index 4e2c0fb..15f27d6 100644 (file)
@@ -419,6 +419,12 @@ INFO_TABLE(stg_SE_CAF_BLACKHOLE,0,1,SE_CAF_BLACKHOLE,"SE_CAF_BLACKHOLE","SE_CAF_
 #endif
 
 /* ----------------------------------------------------------------------------
+   ------------------------------------------------------------------------- */
+
+INFO_TABLE(stg_WHITEHOLE, 0,0, INVALID_OBJECT, "WHITEHOLE", "WHITEHOLE")
+{ foreign "C" barf("WHITEHOLE object entered!"); }
+
+/* ----------------------------------------------------------------------------
    Some static info tables for things that don't get entered, and
    therefore don't need entry code (i.e. boxed but unpointed objects)
    NON_ENTERABLE_ENTRY_CODE now defined at the beginning of the file
index f466a58..7e07ff2 100644 (file)
@@ -26,6 +26,9 @@
 #include <stdlib.h>
 #include <string.h>
 
+/* 
+ * All these globals require sm_mutex to access in SMP mode.
+ */
 StgClosure    *caf_list         = NULL;
 StgClosure    *revertible_caf_list = NULL;
 rtsBool       keepCAFs;
@@ -405,10 +408,12 @@ assignNurseriesToCapabilities (void)
     for (i = 0; i < n_nurseries; i++) {
        capabilities[i].r.rNursery        = &nurseries[i];
        capabilities[i].r.rCurrentNursery = nurseries[i].blocks;
+       capabilities[i].r.rCurrentAlloc   = NULL;
     }
 #else /* SMP */
     MainCapability.r.rNursery        = &nurseries[0];
     MainCapability.r.rCurrentNursery = nurseries[0].blocks;
+    MainCapability.r.rCurrentAlloc   = NULL;
 #endif
 }
 
@@ -534,49 +539,49 @@ resizeNurseries (nat blocks)
 StgPtr
 allocate( nat n )
 {
-  bdescr *bd;
-  StgPtr p;
+    bdescr *bd;
+    StgPtr p;
 
-  ACQUIRE_SM_LOCK;
+    ACQUIRE_SM_LOCK;
 
-  TICK_ALLOC_HEAP_NOCTR(n);
-  CCS_ALLOC(CCCS,n);
-
-  /* big allocation (>LARGE_OBJECT_THRESHOLD) */
-  /* ToDo: allocate directly into generation 1 */
-  if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) {
-    nat req_blocks =  (lnat)BLOCK_ROUND_UP(n*sizeof(W_)) / BLOCK_SIZE;
-    bd = allocGroup(req_blocks);
-    dbl_link_onto(bd, &g0s0->large_objects);
-    g0s0->n_large_blocks += req_blocks;
-    bd->gen_no  = 0;
-    bd->step = g0s0;
-    bd->flags = BF_LARGE;
-    bd->free = bd->start + n;
-    alloc_blocks += req_blocks;
-    RELEASE_SM_LOCK;
-    return bd->start;
+    TICK_ALLOC_HEAP_NOCTR(n);
+    CCS_ALLOC(CCCS,n);
 
-  /* small allocation (<LARGE_OBJECT_THRESHOLD) */
-  } else if (small_alloc_list == NULL || alloc_Hp + n > alloc_HpLim) {
-    if (small_alloc_list) {
-      small_alloc_list->free = alloc_Hp;
+    /* big allocation (>LARGE_OBJECT_THRESHOLD) */
+    /* ToDo: allocate directly into generation 1 */
+    if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) {
+       nat req_blocks =  (lnat)BLOCK_ROUND_UP(n*sizeof(W_)) / BLOCK_SIZE;
+       bd = allocGroup(req_blocks);
+       dbl_link_onto(bd, &g0s0->large_objects);
+       g0s0->n_large_blocks += req_blocks;
+       bd->gen_no  = 0;
+       bd->step = g0s0;
+       bd->flags = BF_LARGE;
+       bd->free = bd->start + n;
+       alloc_blocks += req_blocks;
+       RELEASE_SM_LOCK;
+       return bd->start;
+       
+       /* small allocation (<LARGE_OBJECT_THRESHOLD) */
+    } else if (small_alloc_list == NULL || alloc_Hp + n > alloc_HpLim) {
+       if (small_alloc_list) {
+           small_alloc_list->free = alloc_Hp;
+       }
+       bd = allocBlock();
+       bd->link = small_alloc_list;
+       small_alloc_list = bd;
+       bd->gen_no = 0;
+       bd->step = g0s0;
+       bd->flags = 0;
+       alloc_Hp = bd->start;
+       alloc_HpLim = bd->start + BLOCK_SIZE_W;
+       alloc_blocks++;
     }
-    bd = allocBlock();
-    bd->link = small_alloc_list;
-    small_alloc_list = bd;
-    bd->gen_no = 0;
-    bd->step = g0s0;
-    bd->flags = 0;
-    alloc_Hp = bd->start;
-    alloc_HpLim = bd->start + BLOCK_SIZE_W;
-    alloc_blocks++;
-  }
-
-  p = alloc_Hp;
-  alloc_Hp += n;
-  RELEASE_SM_LOCK;
-  return p;
+    
+    p = alloc_Hp;
+    alloc_Hp += n;
+    RELEASE_SM_LOCK;
+    return p;
 }
 
 lnat
@@ -603,6 +608,82 @@ tidyAllocateLists (void)
     }
 }
 
+/* -----------------------------------------------------------------------------
+   allocateLocal()
+
+   This allocates memory in the current thread - it is intended for
+   use primarily from STG-land where we have a Capability.  It is
+   better than allocate() because it doesn't require taking the
+   sm_mutex lock in the common case.
+
+   Memory is allocated directly from the nursery if possible (but not
+   from the current nursery block, so as not to interfere with
+   Hp/HpLim).
+   -------------------------------------------------------------------------- */
+
+StgPtr
+allocateLocal( StgRegTable *reg, nat n )
+{
+    bdescr *bd;
+    StgPtr p;
+
+    TICK_ALLOC_HEAP_NOCTR(n);
+    CCS_ALLOC(CCCS,n);
+    
+    /* big allocation (>LARGE_OBJECT_THRESHOLD) */
+    /* ToDo: allocate directly into generation 1 */
+    if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) {
+       nat req_blocks =  (lnat)BLOCK_ROUND_UP(n*sizeof(W_)) / BLOCK_SIZE;
+       ACQUIRE_SM_LOCK;
+       bd = allocGroup(req_blocks);
+       dbl_link_onto(bd, &g0s0->large_objects);
+       g0s0->n_large_blocks += req_blocks;
+       bd->gen_no  = 0;
+       bd->step = g0s0;
+       bd->flags = BF_LARGE;
+       bd->free = bd->start + n;
+       alloc_blocks += req_blocks;
+       RELEASE_SM_LOCK;
+       return bd->start;
+       
+       /* small allocation (<LARGE_OBJECT_THRESHOLD) */
+    } else {
+
+       bd = reg->rCurrentAlloc;
+       if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) {
+
+           // The CurrentAlloc block is full, we need to find another
+           // one.  First, we try taking the next block from the
+           // nursery:
+           bd = reg->rCurrentNursery->link;
+
+           if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) {
+               // The nursery is empty, or the next block is already
+               // full: allocate a fresh block (we can't fail here).
+               ACQUIRE_SM_LOCK;
+               bd = allocBlock();
+               alloc_blocks++;
+               RELEASE_SM_LOCK;
+               bd->gen_no = 0;
+               bd->step = g0s0;
+               bd->flags = 0;
+           } else {
+               // we have a block in the nursery: take it and put
+               // it at the *front* of the nursery list, and use it
+               // to allocate() from.
+               reg->rCurrentNursery->link = bd->link;
+           }
+           bd->link = reg->rNursery->blocks;
+           reg->rNursery->blocks = bd;
+           bd->u.back = NULL;
+           reg->rCurrentAlloc = bd;
+       }
+    }
+    p = bd->free;
+    bd->free += n;
+    return p;
+}
+
 /* ---------------------------------------------------------------------------
    Allocate a fixed/pinned object.
 
@@ -690,7 +771,11 @@ stgAllocForGMP (size_t size_in_bytes)
   total_size_in_words = sizeofW(StgArrWords) + data_size_in_words;
   
   /* allocate and fill it in. */
-  arr = (StgArrWords *)allocate(total_size_in_words);
+#if defined(SMP)
+  arr = (StgArrWords *)allocateLocal(&(myCapability()->r), total_size_in_words);
+#else
+  arr = (StgArrWords *)allocateLocal(&MainCapability.r, total_size_in_words);
+#endif
   SET_ARR_HDR(arr, &stg_ARR_WORDS_info, CCCS, data_size_in_words);
   
   /* and return a ptr to the goods inside the array */
@@ -740,9 +825,7 @@ calcAllocated( void )
   nat i;
 
   allocated = allocated_bytes();
-  for (i = 0; i < n_nurseries; i++) {
-      allocated += nurseries[i].n_blocks * BLOCK_SIZE_W;
-  }
+  allocated += countNurseryBlocks() * BLOCK_SIZE_W;
   
 #ifdef SMP
   for (i = 0; i < n_nurseries; i++) {