From cf9650f2a1690c04051c716124bb0350adc74ae7 Mon Sep 17 00:00:00 2001 From: "berthold@mathematik.uni-marburg.de" Date: Mon, 15 Sep 2008 13:28:46 +0000 Subject: [PATCH] Work stealing for sparks Spark stealing support for PARALLEL_HASKELL and THREADED_RTS versions of the RTS. Spark pools are per capability, separately allocated and held in the Capability structure. The implementation uses Double-Ended Queues (deque) and cas-protected access. The write end of the queue (position bottom) can only be used with mutual exclusion, i.e. by exactly one caller at a time. Multiple readers can steal()/findSpark() from the read end (position top), and are synchronised without a lock, based on a cas of the top position. One reader wins, the others return NULL for a failure. Work stealing is called when Capabilities find no other work (inside yieldCapability), and tries all capabilities 0..n-1 twice, unless a theft succeeds. Inside schedulePushWork, all considered cap.s (those which were idle and could be grabbed) are woken up. Future versions should wake up capabilities immediately when putting a new spark in the local pool, from newSpark(). Patch has been re-recorded due to conflicting bugfixes in the sparks.c, also fixing a (strange) conflict in the scheduler. --- includes/Regs.h | 96 +--------- includes/RtsTypes.h | 34 ++++ rts/Capability.c | 57 +++++- rts/Capability.h | 4 + rts/Schedule.c | 110 +++++++---- rts/Sparks.c | 517 ++++++++++++++++++++++++++++++++++++++++----------- rts/Sparks.h | 69 ++++--- 7 files changed, 617 insertions(+), 270 deletions(-) diff --git a/includes/Regs.h b/includes/Regs.h index 0f974ec..cf083c9 100644 --- a/includes/Regs.h +++ b/includes/Regs.h @@ -22,31 +22,6 @@ #ifndef REGS_H #define REGS_H -/* - * Spark pools: used to store pending sparks - * (THREADED_RTS & PARALLEL_HASKELL only) - * This is a circular buffer. Invariants: - * - base <= hd < lim - * - base <= tl < lim - * - if hd==tl, then the pool is empty. - * - if hd == tl+1, then the pool is full. - * Adding to the pool is done by assigning to *tl++ (wrapping round as - * necessary). When adding to a full pool, we have the option of - * throwing away either the oldest (hd++) or the most recent (tl--) entry. - */ -typedef struct StgSparkPool_ { - StgClosure **base; - StgClosure **lim; - StgClosure **hd; - StgClosure **tl; -} StgSparkPool; - -#define ASSERT_SPARK_POOL_INVARIANTS(p) \ - ASSERT((p)->base <= (p)->hd); \ - ASSERT((p)->hd < (p)->lim); \ - ASSERT((p)->base <= (p)->tl); \ - ASSERT((p)->tl < (p)->lim); - typedef struct { StgFunPtr stgGCEnter1; StgFunPtr stgGCFun; @@ -120,7 +95,6 @@ typedef struct StgRegTable_ { StgWord rmp_result1[MP_INT_WORDS]; StgWord rmp_result2[MP_INT_WORDS]; StgWord rRet; // holds the return code of the thread - StgSparkPool rSparks; /* per-task spark pool */ } StgRegTable; #if IN_STG_CODE @@ -163,10 +137,6 @@ typedef struct StgRegTable_ { #define SAVE_CurrentTSO (BaseReg->rCurrentTSO) #define SAVE_CurrentNursery (BaseReg->rCurrentNursery) #define SAVE_HpAlloc (BaseReg->rHpAlloc) -#define SAVE_SparkHd (BaseReg->rSparks.hd) -#define SAVE_SparkTl (BaseReg->rSparks.tl) -#define SAVE_SparkBase (BaseReg->rSparks.base) -#define SAVE_SparkLim (BaseReg->rSparks.lim) /* We sometimes need to save registers across a C-call, eg. if they * are clobbered in the standard calling convention. We define the @@ -401,30 +371,6 @@ GLOBAL_REG_DECL(bdescr *,HpAlloc,REG_HpAlloc) #define HpAlloc (BaseReg->rHpAlloc) #endif -#if defined(REG_SparkHd) && !defined(NO_GLOBAL_REG_DECLS) -GLOBAL_REG_DECL(bdescr *,SparkHd,REG_SparkHd) -#else -#define SparkHd (BaseReg->rSparks.hd) -#endif - -#if defined(REG_SparkTl) && !defined(NO_GLOBAL_REG_DECLS) -GLOBAL_REG_DECL(bdescr *,SparkTl,REG_SparkTl) -#else -#define SparkTl (BaseReg->rSparks.tl) -#endif - -#if defined(REG_SparkBase) && !defined(NO_GLOBAL_REG_DECLS) -GLOBAL_REG_DECL(bdescr *,SparkBase,REG_SparkBase) -#else -#define SparkBase (BaseReg->rSparks.base) -#endif - -#if defined(REG_SparkLim) && !defined(NO_GLOBAL_REG_DECLS) -GLOBAL_REG_DECL(bdescr *,SparkLim,REG_SparkLim) -#else -#define SparkLim (BaseReg->rSparks.lim) -#endif - /* ----------------------------------------------------------------------------- Get absolute function pointers from the register table, to save code space. On x86, @@ -665,38 +611,6 @@ GLOBAL_REG_DECL(bdescr *,SparkLim,REG_SparkLim) #define CALLER_RESTORE_HpAlloc /* nothing */ #endif -#ifdef CALLER_SAVES_SparkHd -#define CALLER_SAVE_SparkHd SAVE_SparkHd = SparkHd; -#define CALLER_RESTORE_SparkHd SparkHd = SAVE_SparkHd; -#else -#define CALLER_SAVE_SparkHd /* nothing */ -#define CALLER_RESTORE_SparkHd /* nothing */ -#endif - -#ifdef CALLER_SAVES_SparkTl -#define CALLER_SAVE_SparkTl SAVE_SparkTl = SparkTl; -#define CALLER_RESTORE_SparkTl SparkTl = SAVE_SparkTl; -#else -#define CALLER_SAVE_SparkTl /* nothing */ -#define CALLER_RESTORE_SparkTl /* nothing */ -#endif - -#ifdef CALLER_SAVES_SparkBase -#define CALLER_SAVE_SparkBase SAVE_SparkBase = SparkBase; -#define CALLER_RESTORE_SparkBase SparkBase = SAVE_SparkBase; -#else -#define CALLER_SAVE_SparkBase /* nothing */ -#define CALLER_RESTORE_SparkBase /* nothing */ -#endif - -#ifdef CALLER_SAVES_SparkLim -#define CALLER_SAVE_SparkLim SAVE_SparkLim = SparkLim; -#define CALLER_RESTORE_SparkLim SparkLim = SAVE_SparkLim; -#else -#define CALLER_SAVE_SparkLim /* nothing */ -#define CALLER_RESTORE_SparkLim /* nothing */ -#endif - #endif /* IN_STG_CODE */ /* ---------------------------------------------------------------------------- @@ -731,10 +645,6 @@ GLOBAL_REG_DECL(bdescr *,SparkLim,REG_SparkLim) CALLER_SAVE_HpLim \ CALLER_SAVE_CurrentTSO \ CALLER_SAVE_CurrentNursery \ - CALLER_SAVE_SparkHd \ - CALLER_SAVE_SparkTl \ - CALLER_SAVE_SparkBase \ - CALLER_SAVE_SparkLim \ CALLER_SAVE_Base #define CALLER_RESTORE_USER \ @@ -763,11 +673,7 @@ GLOBAL_REG_DECL(bdescr *,SparkLim,REG_SparkLim) CALLER_RESTORE_Hp \ CALLER_RESTORE_HpLim \ CALLER_RESTORE_CurrentTSO \ - CALLER_RESTORE_CurrentNursery \ - CALLER_RESTORE_SparkHd \ - CALLER_RESTORE_SparkTl \ - CALLER_RESTORE_SparkBase \ - CALLER_RESTORE_SparkLim + CALLER_RESTORE_CurrentNursery #else /* not IN_STG_CODE */ diff --git a/includes/RtsTypes.h b/includes/RtsTypes.h index 9e8c7b8..3510ee7 100644 --- a/includes/RtsTypes.h +++ b/includes/RtsTypes.h @@ -37,6 +37,40 @@ typedef enum { Types specific to the parallel runtime system. */ + +/* Spark pools: used to store pending sparks + * (THREADED_RTS & PARALLEL_HASKELL only) + * Implementation uses a DeQue to enable concurrent read accesses at + * the top end. + */ +typedef struct SparkPool_ { + /* Size of elements array. Used for modulo calculation: we round up + to powers of 2 and use the dyadic log (modulo == bitwise &) */ + StgWord size; + StgWord moduloSize; /* bitmask for modulo */ + + /* top, index where multiple readers steal() (protected by a cas) */ + StgWord top; + + /* bottom, index of next free place where one writer can push + elements. This happens unsynchronised. */ + StgWord bottom; + /* both position indices are continuously incremented, and used as + an index modulo the current array size. */ + + /* lower bound on the current top value. This is an internal + optimisation to avoid unnecessarily accessing the top field + inside pushBottom */ + StgWord topBound; + + /* The elements array */ + StgClosurePtr* elements; + /* Please note: the dataspace cannot follow the admin fields + immediately, as it should be possible to enlarge it without + disposing the old one automatically (as realloc would)! */ + +} SparkPool; + typedef ullong rtsTime; #if defined(PAR) diff --git a/rts/Capability.c b/rts/Capability.c index 4d5748c..516aaa5 100644 --- a/rts/Capability.c +++ b/rts/Capability.c @@ -54,6 +54,55 @@ globalWorkToDo (void) #endif #if defined(THREADED_RTS) +rtsBool stealWork( Capability *cap) { + /* use the normal Sparks.h interface (internally modified to enable + concurrent stealing) + and immediately turn the spark into a thread when successful + */ + Capability *robbed; + SparkPool *pool; + StgClosurePtr spark; + rtsBool success = rtsFalse; + nat i = 0; + + debugTrace(DEBUG_sched, + "cap %d: Trying to steal work from other capabilities", + cap->no); + + if (n_capabilities == 1) { return rtsFalse; } // makes no sense... + + /* visit cap.s 0..n-1 in sequence until a theft succeeds. We could + start at a random place instead of 0 as well. */ + for ( i=0 ; i < n_capabilities ; i++ ) { + robbed = &capabilities[i]; + if (cap == robbed) // ourselves... + continue; + + if (emptySparkPoolCap(robbed)) // nothing to steal here + continue; + + spark = findSpark(robbed); + + if (spark == NULL && !emptySparkPoolCap(robbed)) { + spark = findSpark(robbed); // lost race in concurrent access, try again + } + if (spark != NULL) { + debugTrace(DEBUG_sched, + "cap %d: Stole a spark from capability %d", + cap->no, robbed->no); + + createSparkThread(cap,spark); + success = rtsTrue; + break; // got one, leave the loop + } + // otherwise: no success, try next one + } + debugTrace(DEBUG_sched, + "Leaving work stealing routine (%s)", + success?"one spark stolen":"thefts did not succeed"); + return success; +} + STATIC_INLINE rtsBool anyWorkForMe( Capability *cap, Task *task ) { @@ -73,9 +122,11 @@ anyWorkForMe( Capability *cap, Task *task ) if (emptyRunQueue(cap)) { return !emptySparkPoolCap(cap) || !emptyWakeupQueue(cap) - || globalWorkToDo(); - } else + || globalWorkToDo() + || stealWork(cap); /* if all false: try to steal work */ + } else { return cap->run_queue_hd->bound == NULL; + } } } #endif @@ -778,7 +829,7 @@ void freeCapability (Capability *cap) { stgFree(cap->mut_lists); #if defined(THREADED_RTS) || defined(PARALLEL_HASKELL) - freeSparkPool(&cap->r.rSparks); + freeSparkPool(cap->sparks); #endif } diff --git a/rts/Capability.h b/rts/Capability.h index 70d9ee9..5945895 100644 --- a/rts/Capability.h +++ b/rts/Capability.h @@ -23,6 +23,7 @@ #ifndef CAPABILITY_H #define CAPABILITY_H +#include "RtsTypes.h" #include "RtsFlags.h" #include "Task.h" @@ -98,6 +99,9 @@ struct Capability_ { StgTRecChunk *free_trec_chunks; StgTRecHeader *free_trec_headers; nat transaction_tokens; + + SparkPool *sparks; + }; // typedef Capability, defined in RtsAPI.h diff --git a/rts/Schedule.c b/rts/Schedule.c index 626c097..09150fd 100644 --- a/rts/Schedule.c +++ b/rts/Schedule.c @@ -137,17 +137,17 @@ static Capability *schedule (Capability *initialCapability, Task *task); // scheduler clearer. // static void schedulePreLoop (void); -#if defined(THREADED_RTS) -static void schedulePushWork(Capability *cap, Task *task); -#endif static void scheduleStartSignalHandlers (Capability *cap); static void scheduleCheckBlockedThreads (Capability *cap); static void scheduleCheckWakeupThreads(Capability *cap USED_IF_NOT_THREADS); static void scheduleCheckBlackHoles (Capability *cap); static void scheduleDetectDeadlock (Capability *cap, Task *task); -#if defined(PARALLEL_HASKELL) +#if defined(PARALLEL_HASKELL) || defined(THREADED_RTS) +static void schedulePushWork(Capability *cap, Task *task); static rtsBool scheduleGetRemoteWork(Capability *cap); +#if defined(PARALLEL_HASKELL) static void scheduleSendPendingMessages(void); +#endif static void scheduleActivateSpark(Capability *cap); #endif static void schedulePostRunThread(Capability *cap, StgTSO *t); @@ -291,13 +291,15 @@ schedule (Capability *initialCapability, Task *task) } else { // Yield the capability to higher-priority tasks if necessary. yieldCapability(&cap, task); + /* inside yieldCapability, attempts to steal work from other + capabilities, unless the capability has own work. + See (REMARK) below. + */ } #endif - -#if defined(THREADED_RTS) - schedulePushWork(cap,task); -#endif + /* THIS WAS THE PLACE FOR THREADED_RTS::schedulePushWork(cap,task) */ + // Check whether we have re-entered the RTS from Haskell without // going via suspendThread()/resumeThread (i.e. a 'safe' foreign // call). @@ -365,21 +367,7 @@ schedule (Capability *initialCapability, Task *task) barf("sched_state: %d", sched_state); } -#if defined(THREADED_RTS) - // If the run queue is empty, take a spark and turn it into a thread. - { - if (emptyRunQueue(cap)) { - StgClosure *spark; - spark = findSpark(cap); - if (spark != NULL) { - debugTrace(DEBUG_sched, - "turning spark of closure %p into a thread", - (StgClosure *)spark); - createSparkThread(cap,spark); - } - } - } -#endif // THREADED_RTS + /* this was the place to activate a spark, now below... */ scheduleStartSignalHandlers(cap); @@ -393,11 +381,19 @@ schedule (Capability *initialCapability, Task *task) scheduleCheckBlockedThreads(cap); -#if defined(PARALLEL_HASKELL) - /* message processing and work distribution goes here */ +#if defined(THREADED_RTS) || defined(PARALLEL_HASKELL) + /* work distribution in multithreaded and parallel systems + + REMARK: IMHO best location for work-stealing as well. + tests above might yield some new jobs, so no need to steal a + spark in some cases. I believe the yieldCapability.. above + should be moved here. + */ +#if defined(PARALLEL_HASKELL) /* if messages have been buffered... a NOOP in THREADED_RTS */ scheduleSendPendingMessages(); +#endif /* If the run queue is empty,...*/ if (emptyRunQueue(cap)) { @@ -406,6 +402,7 @@ schedule (Capability *initialCapability, Task *task) /* if this did not work, try to steal a spark from someone else */ if (emptyRunQueue(cap)) { +#if defined(PARALLEL_HASKELL) receivedFinish = scheduleGetRemoteWork(cap); continue; // a new round, (hopefully) with new work /* @@ -414,10 +411,20 @@ schedule (Capability *initialCapability, Task *task) b) (blocking) awaits and receives messages in Eden, this is only the blocking receive, as b) in GUM. + + in Threaded-RTS, this does plain nothing. Stealing routine + is inside Capability.c and called from + yieldCapability() at the very beginning, see REMARK. */ +#endif } - } + } else { /* i.e. run queue was (initially) not empty */ + schedulePushWork(cap,task); + /* work pushing, currently relevant only for THREADED_RTS: + (pushes threads, wakes up idle capabilities for stealing) */ + } +#if defined(PARALLEL_HASKELL) /* since we perform a blocking receive and continue otherwise, either we never reach here or we definitely have work! */ // from here: non-empty run queue @@ -430,7 +437,9 @@ schedule (Capability *initialCapability, Task *task) above, waits for messages as well! */ processMessages(cap, &receivedFinish); } -#endif // PARALLEL_HASKELL +#endif // PARALLEL_HASKELL: non-empty run queue! + +#endif /* THREADED_RTS || PARALLEL_HASKELL */ scheduleDetectDeadlock(cap,task); #if defined(THREADED_RTS) @@ -679,11 +688,15 @@ schedulePreLoop(void) * Push work to other Capabilities if we have some. * -------------------------------------------------------------------------- */ -#if defined(THREADED_RTS) +#if defined(THREADED_RTS) || defined(PARALLEL_HASKELL) static void schedulePushWork(Capability *cap USED_IF_THREADS, Task *task USED_IF_THREADS) { + /* following code not for PARALLEL_HASKELL. I kept the call general, + future GUM versions might use pushing in a distributed setup */ +#if defined(THREADED_RTS) + Capability *free_caps[n_capabilities], *cap0; nat i, n_free_caps; @@ -726,7 +739,12 @@ schedulePushWork(Capability *cap USED_IF_THREADS, StgTSO *prev, *t, *next; rtsBool pushed_to_all; - debugTrace(DEBUG_sched, "excess threads on run queue and %d free capabilities, sharing...", n_free_caps); + debugTrace(DEBUG_sched, + "cap %d: %s and %d free capabilities, sharing...", + cap->no, + (!emptyRunQueue(cap) && cap->run_queue_hd->_link != END_TSO_QUEUE)? + "excess threads on run queue":"sparks to share (>=2)", + n_free_caps); i = 0; pushed_to_all = rtsFalse; @@ -760,6 +778,9 @@ schedulePushWork(Capability *cap USED_IF_THREADS, cap->run_queue_tl = prev; } +#ifdef SPARK_PUSHING + /* JB I left this code in place, it would work but is not necessary */ + // If there are some free capabilities that we didn't push any // threads to, then try to push a spark to each one. if (!pushed_to_all) { @@ -775,16 +796,23 @@ schedulePushWork(Capability *cap USED_IF_THREADS, } } } +#endif /* SPARK_PUSHING */ // release the capabilities for (i = 0; i < n_free_caps; i++) { task->cap = free_caps[i]; releaseCapability(free_caps[i]); } + // now wake them all up, and they might steal sparks if + // the did not get a thread + prodAllCapabilities(); } task->cap = cap; // reset to point to our Capability. + +#endif /* THREADED_RTS */ + } -#endif +#endif /* THREADED_RTS || PARALLEL_HASKELL */ /* ---------------------------------------------------------------------------- * Start any pending signal handlers @@ -965,7 +993,7 @@ scheduleDetectDeadlock (Capability *cap, Task *task) * ------------------------------------------------------------------------- */ #if defined(PARALLEL_HASKELL) -static StgTSO * +static void scheduleSendPendingMessages(void) { @@ -984,10 +1012,10 @@ scheduleSendPendingMessages(void) #endif /* ---------------------------------------------------------------------------- - * Activate spark threads (PARALLEL_HASKELL only) + * Activate spark threads (PARALLEL_HASKELL and THREADED_RTS) * ------------------------------------------------------------------------- */ -#if defined(PARALLEL_HASKELL) +#if defined(PARALLEL_HASKELL) || defined(THREADED_RTS) static void scheduleActivateSpark(Capability *cap) { @@ -1012,14 +1040,14 @@ scheduleActivateSpark(Capability *cap) createSparkThread(cap,spark); // defined in Sparks.c } } -#endif // PARALLEL_HASKELL +#endif // PARALLEL_HASKELL || THREADED_RTS /* ---------------------------------------------------------------------------- * Get work from a remote node (PARALLEL_HASKELL only) * ------------------------------------------------------------------------- */ -#if defined(PARALLEL_HASKELL) -static rtsBool +#if defined(PARALLEL_HASKELL) || defined(THREADED_RTS) +static rtsBool /* return value used in PARALLEL_HASKELL only */ scheduleGetRemoteWork(Capability *cap) { #if defined(PARALLEL_HASKELL) @@ -1057,7 +1085,7 @@ scheduleGetRemoteWork(Capability *cap) #endif /* PARALLEL_HASKELL */ } -#endif // PARALLEL_HASKELL +#endif // PARALLEL_HASKELL || THREADED_RTS /* ---------------------------------------------------------------------------- * After running a thread... @@ -1483,6 +1511,14 @@ scheduleDoGC (Capability *cap, Task *task USED_IF_THREADS, rtsBool force_major) performHeapProfile = rtsFalse; } +#ifdef SPARKBALANCE + /* JB + Once we are all together... this would be the place to balance all + spark pools. No concurrent stealing or adding of new sparks can + occur. Should be defined in Sparks.c. */ + balanceSparkPoolsCaps(n_capabilities, capabilities); +#endif + #if defined(THREADED_RTS) // release our stash of capabilities. for (i = 0; i < n_capabilities; i++) { diff --git a/rts/Sparks.c b/rts/Sparks.c index 2e9e61c..ac11172 100644 --- a/rts/Sparks.c +++ b/rts/Sparks.c @@ -1,10 +1,39 @@ /* --------------------------------------------------------------------------- * - * (c) The GHC Team, 2000-2006 + * (c) The GHC Team, 2000-2008 * * Sparking support for PARALLEL_HASKELL and THREADED_RTS versions of the RTS. * - * -------------------------------------------------------------------------*/ + * The implementation uses Double-Ended Queues with lock-free access + * (thereby often called "deque") as described in + * + * D.Chase and Y.Lev, Dynamic Circular Work-Stealing Deque. + * SPAA'05, July 2005, Las Vegas, USA. + * ACM 1-58113-986-1/05/0007 + * + * Author: Jost Berthold MSRC 07-09/2008 + * + * The DeQue is held as a circular array with known length. Positions + * of top (read-end) and bottom (write-end) always increase, and the + * array is accessed with indices modulo array-size. While this bears + * the risk of overflow, we assume that (with 64 bit indices), a + * program must run very long to reach that point. + * + * The write end of the queue (position bottom) can only be used with + * mutual exclusion, i.e. by exactly one caller at a time. At this + * end, new items can be enqueued using pushBottom()/newSpark(), and + * removed using popBottom()/reclaimSpark() (the latter implying a cas + * synchronisation with potential concurrent readers for the case of + * just one element). + * + * Multiple readers can steal()/findSpark() from the read end + * (position top), and are synchronised without a lock, based on a cas + * of the top position. One reader wins, the others return NULL for a + * failure. + * + * Both popBottom and steal also return NULL when the queue is empty. + * + -------------------------------------------------------------------------*/ #include "PosixSource.h" #include "Rts.h" @@ -14,22 +43,52 @@ #include "RtsFlags.h" #include "RtsUtils.h" #include "ParTicky.h" -# if defined(PARALLEL_HASKELL) -# include "ParallelRts.h" -# include "GranSimRts.h" // for GR_... -# elif defined(GRAN) -# include "GranSimRts.h" -# endif -#include "Sparks.h" #include "Trace.h" +#include "SMP.h" // for cas + +#include "Sparks.h" + #if defined(THREADED_RTS) || defined(PARALLEL_HASKELL) -static INLINE_ME void bump_hd (StgSparkPool *p) -{ p->hd++; if (p->hd == p->lim) p->hd = p->base; } +/* internal helpers ... */ + +StgWord roundUp2(StgWord val); -static INLINE_ME void bump_tl (StgSparkPool *p) -{ p->tl++; if (p->tl == p->lim) p->tl = p->base; } +StgWord roundUp2(StgWord val) { + StgWord rounded = 1; + + /* StgWord is unsigned anyway, only catch 0 */ + if (val == 0) { + barf("DeQue,roundUp2: invalid size 0 requested"); + } + /* at least 1 bit set, shift up to its place */ + do { + rounded = rounded << 1; + } while (0 != (val = val>>1)); + return rounded; +} + +INLINE_HEADER +rtsBool casTop(StgPtr addr, StgWord old, StgWord new); + +#if !defined(THREADED_RTS) +/* missing def. in non THREADED RTS, and makes no sense anyway... */ +StgWord cas(StgPtr addr,StgWord old,StgWord new); +StgWord cas(StgPtr addr,StgWord old,StgWord new) { + barf("cas: not implemented without multithreading"); + old = new = *addr; /* to avoid gcc warnings */ +} +#endif + +INLINE_HEADER +rtsBool casTop(StgWord* addr, StgWord old, StgWord new) { + StgWord res = cas((StgPtr) addr, old, new); + return ((res == old)); +} + +/* or simply like this */ +#define CASTOP(addr,old,new) ((old) == cas(((StgPtr)addr),(old),(new))) /* ----------------------------------------------------------------------------- * @@ -37,15 +96,28 @@ static INLINE_ME void bump_tl (StgSparkPool *p) * * -------------------------------------------------------------------------- */ -static void -initSparkPool(StgSparkPool *pool) -{ - pool->base = stgMallocBytes(RtsFlags.ParFlags.maxLocalSparks - * sizeof(StgClosure *), - "initSparkPools"); - pool->lim = pool->base + RtsFlags.ParFlags.maxLocalSparks; - pool->hd = pool->base; - pool->tl = pool->base; +/* constructor */ +SparkPool* initPool(StgWord size) { + + StgWord realsize; + SparkPool *q; + + realsize = roundUp2(size); /* to compute modulo as a bitwise & */ + + q = (SparkPool*) stgMallocBytes(sizeof(SparkPool), /* admin fields */ + "newSparkPool"); + q->elements = (StgClosurePtr*) + stgMallocBytes(realsize * sizeof(StgClosurePtr), /* dataspace */ + "newSparkPool:data space"); + q->top=0; + q->bottom=0; + q->topBound=0; /* read by writer, updated each time top is read */ + + q->size = realsize; /* power of 2 */ + q->moduloSize = realsize - 1; /* n % size == n & moduloSize */ + + ASSERT_SPARK_POOL_INVARIANTS(q); + return q; } void @@ -55,17 +127,71 @@ initSparkPools( void ) /* walk over the capabilities, allocating a spark pool for each one */ nat i; for (i = 0; i < n_capabilities; i++) { - initSparkPool(&capabilities[i].r.rSparks); + capabilities[i].sparks = initPool(RtsFlags.ParFlags.maxLocalSparks); } #else /* allocate a single spark pool */ - initSparkPool(&MainCapability.r.rSparks); + MainCapability->sparks = initPool(RtsFlags.ParFlags.maxLocalSparks); #endif } void -freeSparkPool(StgSparkPool *pool) { - stgFree(pool->base); +freeSparkPool(SparkPool *pool) { + /* should not interfere with concurrent findSpark() calls! And + nobody should use the pointer any more. We cross our fingers...*/ + stgFree(pool->elements); + stgFree(pool); +} + +/* reclaimSpark(cap): remove a spark from the write end of the queue. + * Returns the removed spark, and NULL if a race is lost or the pool + * empty. + * + * If only one spark is left in the pool, we synchronise with + * concurrently stealing threads by using cas to modify the top field. + * This routine should NEVER be called by a task which does not own + * the capability. Can this be checked here? + */ +StgClosure* reclaimSpark(Capability *cap) { + SparkPool *deque = cap->sparks; + /* also a bit tricky, has to avoid concurrent steal() calls by + accessing top with cas, when there is only one element left */ + StgWord t, b; + StgClosurePtr* pos; + long currSize; + StgClosurePtr removed; + + ASSERT_SPARK_POOL_INVARIANTS(deque); + + b = deque->bottom; + /* "decrement b as a test, see what happens" */ + deque->bottom = --b; + pos = (deque->elements) + (b & (deque->moduloSize)); + t = deque->top; /* using topBound would give an *upper* bound, we + need a lower bound. We use the real top here, but + can update the topBound value */ + deque->topBound = t; + currSize = b - t; + if (currSize < 0) { /* was empty before decrementing b, set b + consistently and abort */ + deque->bottom = t; + return NULL; + } + removed = *pos; + if (currSize > 0) { /* no danger, still elements in buffer after b-- */ + return removed; + } + /* otherwise, has someone meanwhile stolen the same (last) element? + Check and increment top value to know */ + if ( !(CASTOP(&(deque->top),t,t+1)) ) { + removed = NULL; /* no success, but continue adjusting bottom */ + } + deque->bottom = t+1; /* anyway, empty now. Adjust bottom consistently. */ + deque->topBound = t+1; /* ...and cached top value as well */ + + ASSERT_SPARK_POOL_INVARIANTS(deque); + + return removed; } /* ----------------------------------------------------------------------------- @@ -73,32 +199,80 @@ freeSparkPool(StgSparkPool *pool) { * findSpark: find a spark on the current Capability that we can fork * into a thread. * - * -------------------------------------------------------------------------- */ + * May be called by concurrent threads, which synchronise on top + * variable. Returns a spark, or NULL if pool empty or race lost. + * + -------------------------------------------------------------------------- */ + +StgClosurePtr steal(SparkPool *deque); + +/* steal an element from the read end. Synchronises multiple callers + by failing with NULL return. Returns NULL when deque is empty. */ +StgClosurePtr steal(SparkPool *deque) { + StgClosurePtr* pos; + StgClosurePtr* arraybase; + StgWord sz; + StgClosurePtr stolen; + StgWord b,t; + + ASSERT_SPARK_POOL_INVARIANTS(deque); + + b = deque->bottom; + t = deque->top; + if (b - t <= 0 ) { + return NULL; /* already looks empty, abort */ + } + + /* now access array, see pushBottom() */ + arraybase = deque->elements; + sz = deque->moduloSize; + pos = arraybase + (t & sz); + stolen = *pos; + + /* now decide whether we have won */ + if ( !(CASTOP(&(deque->top),t,t+1)) ) { + /* lost the race, someon else has changed top in the meantime */ + stolen = NULL; + } /* else: OK, top has been incremented by the cas call */ + + + ASSERT_SPARK_POOL_INVARIANTS(deque); + /* return NULL or stolen element */ + return stolen; +} StgClosure * findSpark (Capability *cap) { - StgSparkPool *pool; - StgClosure *spark; + SparkPool *deque = (cap->sparks); + StgClosure *stolen; + + ASSERT_SPARK_POOL_INVARIANTS(deque); + + do { + /* keep trying until good spark found or pool looks empty. + TODO is this a good idea? */ + + stolen = steal(deque); - pool = &(cap->r.rSparks); - ASSERT_SPARK_POOL_INVARIANTS(pool); + } while ( ( !stolen /* nothing stolen*/ + || !closure_SHOULD_SPARK(stolen)) /* spark not OK */ + && !looksEmpty(deque)); /* run empty, give up */ - while (pool->hd != pool->tl) { - spark = *pool->hd; - bump_hd(pool); - if (closure_SHOULD_SPARK(spark)) { -#ifdef GRAN - if (RtsFlags.ParFlags.ParStats.Sparks) - DumpRawGranEvent(CURRENT_PROC, CURRENT_PROC, - GR_STEALING, ((StgTSO *)NULL), spark, - 0, 0 /* spark_queue_len(ADVISORY_POOL) */); -#endif - return spark; - } - } - // spark pool is now empty - return NULL; + /* return stolen element */ + return stolen; +} + + +/* "guesses" whether a deque is empty. Can return false negatives in + presence of concurrent steal() calls, and false positives in + presence of a concurrent pushBottom().*/ +rtsBool looksEmpty(SparkPool* deque) { + StgWord t = deque->top; + StgWord b = deque->bottom; + /* try to prefer false negatives by reading top first */ + return (b - t <= 0); + /* => array is *never* completely filled, always 1 place free! */ } /* ----------------------------------------------------------------------------- @@ -123,11 +297,64 @@ createSparkThread (Capability *cap, StgClosure *p) * -------------------------------------------------------------------------- */ #define DISCARD_NEW +void pushBottom(SparkPool* deque, StgClosurePtr elem); + +/* enqueue an element. Should always succeed by resizing the array + (not implemented yet, silently fails in that case). */ +void pushBottom(SparkPool* deque, StgClosurePtr elem) { + StgWord t; + StgClosurePtr* pos; + StgWord sz = deque->moduloSize; + StgWord b = deque->bottom; + + ASSERT_SPARK_POOL_INVARIANTS(deque); + + /* we try to avoid reading deque->top (accessed by all) and use + deque->topBound (accessed only by writer) instead. + This is why we do not just call empty(deque) here. + */ + t = deque->topBound; + if ( b - t >= sz ) { /* nota bene: sz == deque->size - 1, thus ">=" */ + /* could be full, check the real top value in this case */ + t = deque->top; + deque->topBound = t; + if (b - t >= sz) { /* really no space left :-( */ + /* reallocate the array, copying the values. Concurrent steal()s + will in the meantime use the old one and modify only top. + This means: we cannot safely free the old space! Can keep it + on a free list internally here... + + Potential bug in combination with steal(): if array is + replaced, it is unclear which one concurrent steal operations + use. Must read the array base address in advance in steal(). + */ +#if defined(DISCARD_NEW) + ASSERT_SPARK_POOL_INVARIANTS(deque); + return; /* for now, silently fail */ +#else + /* could make room by incrementing the top position here. In + * this case, should use CASTOP. If this fails, someone else has + * removed something, and new room will be available. + */ + ASSERT_SPARK_POOL_INVARIANTS(deque); +#endif + } + } + pos = (deque->elements) + (b & sz); + *pos = elem; + (deque->bottom)++; + + ASSERT_SPARK_POOL_INVARIANTS(deque); + return; +} + +/* this is called as a direct C-call from Stg => we need to keep the + pool in a register (???) */ StgInt newSpark (StgRegTable *reg, StgClosure *p) { - StgSparkPool *pool = &(reg->rSparks); + SparkPool *pool = (reg->rCurrentTSO->cap->sparks); /* I am not sure whether this is the right thing to do. * Maybe it is better to exploit the tag information @@ -138,82 +365,125 @@ newSpark (StgRegTable *reg, StgClosure *p) ASSERT_SPARK_POOL_INVARIANTS(pool); if (closure_SHOULD_SPARK(p)) { -#ifdef DISCARD_NEW - StgClosure **new_tl; - new_tl = pool->tl + 1; - if (new_tl == pool->lim) { new_tl = pool->base; } - if (new_tl != pool->hd) { - *pool->tl = p; - pool->tl = new_tl; - } else if (!closure_SHOULD_SPARK(*pool->hd)) { - // if the old closure is not sparkable, discard it and - // keep the new one. Otherwise, keep the old one. - *pool->tl = p; - bump_hd(pool); - } -#else /* DISCARD OLD */ - *pool->tl = p; - bump_tl(pool); - if (pool->tl == pool->hd) { bump_hd(pool); } -#endif + pushBottom(pool,p); } ASSERT_SPARK_POOL_INVARIANTS(pool); return 1; } -/* ----------------------------------------------------------------------------- - * Mark all nodes pointed to by sparks in the spark queues (for GC) Does an - * implicit slide i.e. after marking all sparks are at the beginning of the - * spark pool and the spark pool only contains sparkable closures + + +/* -------------------------------------------------------------------------- + * Remove all sparks from the spark queues which should not spark any + * more. Called after GC. We assume exclusive access to the structure + * and replace all sparks in the queue, see explanation below. At exit, + * the spark pool only contains sparkable closures. * -------------------------------------------------------------------------- */ static void pruneSparkQueue (Capability *cap) { - StgClosure *spark, **sparkp, **to_sparkp; + SparkPool *pool; + StgClosurePtr spark, evacspark, *elements; nat n, pruned_sparks; // stats only - StgSparkPool *pool; + StgWord botInd,oldBotInd,currInd; // indices in array (always < size) PAR_TICKY_MARK_SPARK_QUEUE_START(); n = 0; pruned_sparks = 0; - pool = &(cap->r.rSparks); + pool = cap->sparks; + debugTrace(DEBUG_sched, + "markSparkQueue: current spark queue len=%d; (hd=%ld; tl=%ld)", + sparkPoolSize(pool), pool->bottom, pool->top); ASSERT_SPARK_POOL_INVARIANTS(pool); - - sparkp = pool->hd; - to_sparkp = pool->hd; - while (sparkp != pool->tl) { - ASSERT(*sparkp!=NULL); - ASSERT(LOOKS_LIKE_CLOSURE_PTR(((StgClosure *)*sparkp))); - // ToDo?: statistics gathering here (also for GUM!) - spark = *sparkp; - if (!closure_SHOULD_SPARK(spark)) { - pruned_sparks++; - } else{ - *to_sparkp++ = spark; - if (to_sparkp == pool->lim) { - to_sparkp = pool->base; - } - n++; - } - sparkp++; - if (sparkp == pool->lim) { - sparkp = pool->base; - } - } - pool->tl = to_sparkp; - + + elements = pool->elements; + + /* We have exclusive access to the structure here, so we can reset + bottom and top counters, and prune invalid sparks. Contents are + copied in-place if they are valuable, otherwise discarded. The + routine uses "real" indices t and b, starts by computing them + as the modulus size of top and bottom, + + Copying: + + At the beginning, the pool structure can look like this: + ( bottom % size >= top % size , no wrap-around) + t b + ___________***********_________________ + + or like this ( bottom % size < top % size, wrap-around ) + b t + ***********__________****************** + As we need to remove useless sparks anyway, we make one pass + between t and b, moving valuable content to b and subsequent + cells (wrapping around when the size is reached). + + b t + ***********OOO_______XX_X__X?********** + ^____move?____/ + + After this movement, botInd becomes the new bottom, and old + bottom becomes the new top index, both as indices in the array + size range. + */ + // starting here + currInd = (pool->top) & (pool->moduloSize); // mod + + // copies of evacuated closures go to space from botInd on + // we keep oldBotInd to know when to stop + oldBotInd = botInd = (pool->bottom) & (pool->moduloSize); // mod + + // on entry to loop, we are within the bounds + ASSERT( currInd < pool->size && botInd < pool->size ); + + while (currInd != oldBotInd ) { + /* must use != here, wrap-around at size + subtle: loop not entered if queue empty + */ + + /* check element at currInd. if valuable, evacuate and move to + botInd, otherwise move on */ + spark = elements[currInd]; + + /* if valuable work: shift inside the pool */ + if ( closure_SHOULD_SPARK(spark) ) { + elements[botInd] = spark; // keep entry (new address) + botInd++; + n++; + } else { + pruned_sparks++; // discard spark + } + currInd++; + + // in the loop, we may reach the bounds, and instantly wrap around + ASSERT( currInd <= pool->size && botInd <= pool->size ); + if ( currInd == pool->size ) { currInd = 0; } + if ( botInd == pool->size ) { botInd = 0; } + + } // while-loop over spark pool elements + + ASSERT(currInd == oldBotInd); + + pool->top = oldBotInd; // where we started writing + pool->topBound = pool->top; + + pool->bottom = (oldBotInd <= botInd) ? botInd : (botInd + pool->size); + // first free place we did not use (corrected by wraparound) + PAR_TICKY_MARK_SPARK_QUEUE_END(n); - + debugTrace(DEBUG_sched, "pruned %d sparks", pruned_sparks); debugTrace(DEBUG_sched, - "new spark queue len=%d; (hd=%p; tl=%p)", - sparkPoolSize(pool), pool->hd, pool->tl); + "new spark queue len=%d; (hd=%ld; tl=%ld)", + sparkPoolSize(pool), pool->bottom, pool->top); + + ASSERT_SPARK_POOL_INVARIANTS(pool); } void @@ -225,21 +495,50 @@ pruneSparkQueues (void) } } +/* GC for the spark pool, called inside Capability.c for all + capabilities in turn. Blindly "evac"s complete spark pool. */ void traverseSparkQueue (evac_fn evac, void *user, Capability *cap) { StgClosure **sparkp; - StgSparkPool *pool; + SparkPool *pool; + StgWord top,bottom, modMask; - pool = &(cap->r.rSparks); - sparkp = pool->hd; - while (sparkp != pool->tl) { - evac(user, sparkp); - sparkp++; - if (sparkp == pool->lim) { - sparkp = pool->base; - } + pool = cap->sparks; + + ASSERT_SPARK_POOL_INVARIANTS(pool); + + top = pool->top; + bottom = pool->bottom; + sparkp = pool->elements; + modMask = pool->moduloSize; + + while (top < bottom) { + /* call evac for all closures in range (wrap-around via modulo) + * In GHC-6.10, evac takes an additional 1st argument to hold a + * GC-specific register, see rts/sm/GC.c::mark_root() + */ + evac( user , sparkp + (top & modMask) ); + top++; } + + debugTrace(DEBUG_sched, + "traversed spark queue, len=%d; (hd=%ld; tl=%ld)", + sparkPoolSize(pool), pool->bottom, pool->top); +} + +/* ---------------------------------------------------------------------------- + + * balanceSparkPoolsCaps: takes an array of capabilities (usually: all + * capabilities) and its size. Accesses all spark pools and equally + * distributes the sparks among them. + * + * Could be called after GC, before Cap. release, from scheduler. + * -------------------------------------------------------------------------- */ +void balanceSparkPoolsCaps(nat n_caps, Capability caps[]); + +void balanceSparkPoolsCaps(nat n_caps, Capability caps[]) { + barf("not implemented"); } #else @@ -259,6 +558,8 @@ newSpark (StgRegTable *reg STG_UNUSED, StgClosure *p STG_UNUSED) * * GRAN & PARALLEL_HASKELL stuff beyond here. * + * TODO "nuke" this! + * * -------------------------------------------------------------------------- */ #if defined(PARALLEL_HASKELL) || defined(GRAN) diff --git a/rts/Sparks.h b/rts/Sparks.h index 8e0ba90..dbbf268 100644 --- a/rts/Sparks.h +++ b/rts/Sparks.h @@ -9,17 +9,45 @@ #ifndef SPARKS_H #define SPARKS_H +#if defined(PARALLEL_HASKELL) +#error Sparks.c using new internal structure, needs major overhaul! +#endif + +/* typedef for SparkPool in RtsTypes.h */ + #if defined(THREADED_RTS) + +/* INVARIANTS, in this order: bottom/top consistent, reasonable size, + topBound consistent, space pointer, space accessible to us */ +#define ASSERT_SPARK_POOL_INVARIANTS(p) \ + ASSERT((p)->bottom >= (p)->top); \ + ASSERT((p)->size > 0); \ + ASSERT((p)->size > (p)->bottom - (p)->top); \ + ASSERT((p)->topBound <= (p)->top); \ + ASSERT((p)->elements != NULL); \ + ASSERT(*((p)->elements) || 1); \ + ASSERT(*((p)->elements - 1 + ((p)->size)) || 1); + +// missing in old interface. Currently called by initSparkPools +// internally. +SparkPool* initPool(StgWord size); + +// special case: accessing our own pool, at the write end +// otherwise, we can always steal from our pool as the others do... +StgClosure* reclaimSpark(Capability *cap); + +rtsBool looksEmpty(SparkPool* deque); + +// rest: same as old interface StgClosure * findSpark (Capability *cap); void initSparkPools (void); -void freeSparkPool (StgSparkPool *pool); +void freeSparkPool (SparkPool *pool); void createSparkThread (Capability *cap, StgClosure *p); void pruneSparkQueues (void); void traverseSparkQueue(evac_fn evac, void *user, Capability *cap); -INLINE_HEADER void discardSparks (StgSparkPool *pool); -INLINE_HEADER nat sparkPoolSize (StgSparkPool *pool); -INLINE_HEADER rtsBool emptySparkPool (StgSparkPool *pool); +INLINE_HEADER void discardSparks (SparkPool *pool); +INLINE_HEADER nat sparkPoolSize (SparkPool *pool); INLINE_HEADER void discardSparksCap (Capability *cap); INLINE_HEADER nat sparkPoolSizeCap (Capability *cap); @@ -32,46 +60,33 @@ INLINE_HEADER rtsBool emptySparkPoolCap (Capability *cap); #if defined(PARALLEL_HASKELL) || defined(THREADED_RTS) -INLINE_HEADER rtsBool -emptySparkPool (StgSparkPool *pool) -{ - return (pool->hd == pool->tl); -} +INLINE_HEADER rtsBool +emptySparkPool (SparkPool *pool) +{ return looksEmpty(pool); } INLINE_HEADER rtsBool emptySparkPoolCap (Capability *cap) -{ return emptySparkPool(&cap->r.rSparks); } +{ return looksEmpty(cap->sparks); } INLINE_HEADER nat -sparkPoolSize (StgSparkPool *pool) +sparkPoolSize (SparkPool *pool) { - if (pool->hd <= pool->tl) { - return (pool->tl - pool->hd); - } else { - return (pool->lim - pool->hd + pool->tl - pool->base); - } + return (pool->bottom - pool->top); } INLINE_HEADER nat sparkPoolSizeCap (Capability *cap) -{ return sparkPoolSize(&cap->r.rSparks); } +{ return sparkPoolSize(cap->sparks); } INLINE_HEADER void -discardSparks (StgSparkPool *pool) +discardSparks (SparkPool *pool) { - pool->hd = pool->tl; + pool->top = pool->bottom = 0; } INLINE_HEADER void discardSparksCap (Capability *cap) -{ return discardSparks(&cap->r.rSparks); } - - -#elif defined(THREADED_RTS) - -INLINE_HEADER rtsBool -emptySparkPoolCap (Capability *cap STG_UNUSED) -{ return rtsTrue; } +{ return discardSparks(cap->sparks); } #endif -- 1.7.10.4