From: hwloidl Date: Thu, 13 Jan 2000 14:34:09 +0000 (+0000) Subject: [project @ 2000-01-13 14:33:57 by hwloidl] X-Git-Tag: Approximately_9120_patches~5299 X-Git-Url: http://git.megacz.com/?a=commitdiff_plain;h=1b28d4e1f43185ad8c8e7407c66413e1b358402b;p=ghc-hetmet.git [project @ 2000-01-13 14:33:57 by hwloidl] Merged GUM-4-04 branch into the main trunk. In particular merged GUM and SMP code. Most of the GranSim code in GUM-4-04 still has to be carried over. --- diff --git a/ghc/compiler/absCSyn/Costs.lhs b/ghc/compiler/absCSyn/Costs.lhs index 24563c7..7bbadff 100644 --- a/ghc/compiler/absCSyn/Costs.lhs +++ b/ghc/compiler/absCSyn/Costs.lhs @@ -1,7 +1,9 @@ % -% (c) The GRASP/AQUA Project, Glasgow University, 1994-1998 -% Hans Wolfgang Loidl +% (c) The GRASP/AQUA Project, Glasgow University, 1992-1998 % +% $Id: Costs.lhs,v 1.20 2000/01/13 14:33:57 hwloidl Exp $ +% +% Only needed in a GranSim setup -- HWL % --------------------------------------------------------------------------- \section[Costs]{Evaluating the costs of computing some abstract C code} @@ -28,9 +30,11 @@ The meaning of the result tuple is: instructions. \end{itemize} -This function is needed in GrAnSim for parallelism. +This function is needed in GranSim for costing pieces of abstract C. -These are first suggestions for scaling the costs. But, this scaling should be done in the RTS rather than the compiler (this really should be tunable!): +These are first suggestions for scaling the costs. But, this scaling should +be done in the RTS rather than the compiler (this really should be +tunable!): \begin{pseudocode} @@ -82,6 +86,7 @@ instance Num CostRes where negate = mapOp negate abs = mapOp abs signum = mapOp signum + fromInteger _ = error "fromInteger not defined" mapOp :: (Int -> Int) -> CostRes -> CostRes mapOp g ( Cost (i, b, l, s, f) ) = Cost (g i, g b, g l, g s, g f) @@ -202,7 +207,10 @@ costs absC = CSimultaneous absC -> costs absC - CCheck _ amodes code -> Cost (2, 1, 0, 0, 0) + CCheck _ amodes code -> Cost (2, 1, 0, 0, 0) -- ToDo: refine this by + -- looking at the first arg + + CRetDirect _ _ _ _ -> nullCosts CMacroStmt macro modes -> stmtMacroCosts macro modes @@ -215,19 +223,28 @@ costs absC = -- *** the next three [or so...] are DATA (those above are CODE) *** -- as they are data rather than code they all have nullCosts -- HWL + CCallTypedef _ _ _ _ -> nullCosts + CStaticClosure _ _ _ _ -> nullCosts - CClosureInfoAndCode _ _ _ _ -> nullCosts + CSRT _ _ -> nullCosts - CRetDirect _ _ _ _ -> nullCosts + CBitmap _ _ -> nullCosts + + CClosureInfoAndCode _ _ _ _ -> nullCosts CRetVector _ _ _ _ -> nullCosts + CClosureTbl _ -> nullCosts + CCostCentreDecl _ _ -> nullCosts + CCostCentreStackDecl _ -> nullCosts CSplitMarker -> nullCosts + _ -> trace ("Costs.costs") nullCosts + -- --------------------------------------------------------------------------- addrModeCosts :: CAddrMode -> Side -> CostRes @@ -242,7 +259,11 @@ addrModeCosts addr_mode side = CVal _ _ -> if lhs then Cost (0, 0, 0, 1, 0) else Cost (0, 0, 1, 0, 0) - CReg _ -> nullCosts {- loading from, storing to reg is free ! -} + CAddr (CIndex _ n _ ) -> Cost (1, 0, 1, 0, 0) -- does pointer arithmetic + + CAddr _ -> nullCosts + + CReg _ -> nullCosts {- loading from, storing to reg is free ! 
-} {- for costing CReg->Creg ops see special -} {- case in costs fct -} @@ -277,6 +298,8 @@ addrModeCosts addr_mode side = CMacroExpr _ macro mode_list -> exprMacroCosts side macro mode_list + _ -> trace ("Costs.addrModeCosts") nullCosts + -- --------------------------------------------------------------------------- exprMacroCosts :: Side -> CExprMacro -> [CAddrMode] -> CostRes @@ -288,10 +311,11 @@ exprMacroCosts side macro mode_list = in arg_costs + case macro of - ENTRY_CODE -> nullCosts - ARG_TAG -> nullCosts -- XXX - GET_TAG -> nullCosts -- XXX - + ENTRY_CODE -> nullCosts -- nothing + ARG_TAG -> nullCosts -- nothing + GET_TAG -> Cost (0, 0, 1, 0, 0) -- indirect load + UPD_FRAME_UPDATEE -> Cost (0, 0, 1, 0, 0) -- indirect load + _ -> trace ("Costs.exprMacroCosts") nullCosts -- --------------------------------------------------------------------------- @@ -309,7 +333,9 @@ stmtMacroCosts macro modes = UPD_CAF -> Cost (7, 0, 1, 3, 0) {- SMupdate.lh -} UPD_BH_UPDATABLE -> Cost (3, 0, 0, 1, 0) {- SMupdate.lh -} UPD_BH_SINGLE_ENTRY -> Cost (3, 0, 0, 1, 0) {- SMupdate.lh -} - PUSH_UPD_FRAME -> Cost (3, 0, 0, 4, 0) {- SMupdate.lh -} + PUSH_UPD_FRAME -> Cost (3, 0, 0, 4, 0) {- Updates.h -} + PUSH_SEQ_FRAME -> Cost (2, 0, 0, 3, 0) {- StgMacros.h !-} + UPDATE_SU_FROM_UPD_FRAME -> Cost (1, 0, 1, 0, 0) {- StgMacros.h !-} SET_TAG -> nullCosts {- COptRegs.lh -} GRAN_FETCH -> nullCosts {- GrAnSim bookkeeping -} GRAN_RESCHEDULE -> nullCosts {- GrAnSim bookkeeping -} diff --git a/ghc/compiler/basicTypes/Module.lhs b/ghc/compiler/basicTypes/Module.lhs index 40c25f5..af634fd 100644 --- a/ghc/compiler/basicTypes/Module.lhs +++ b/ghc/compiler/basicTypes/Module.lhs @@ -285,7 +285,8 @@ getAllFilesMatching :: SearchPath -> (ModuleHiMap, ModuleHiMap) -> (FilePath, String) -> IO (ModuleHiMap, ModuleHiMap) -getAllFilesMatching dirs hims (dir_path, suffix) = ( do +getAllFilesMatching dirs hims (dir_path, suffix) = + do -- fpaths entries do not have dir_path prepended fpaths <- getDirectoryContents dir_path is_dll <- catch @@ -297,7 +298,7 @@ getAllFilesMatching dirs hims (dir_path, suffix) = ( do ) (\ _ {-don't care-} -> return NotDll) return (foldl (addModules is_dll) hims fpaths) - ) -- soft failure + -- soft failure `catch` (\ err -> do hPutStrLn stderr diff --git a/ghc/compiler/codeGen/CgCase.lhs b/ghc/compiler/codeGen/CgCase.lhs index 4e755ca..e358b9b 100644 --- a/ghc/compiler/codeGen/CgCase.lhs +++ b/ghc/compiler/codeGen/CgCase.lhs @@ -1,7 +1,7 @@ % % (c) The GRASP/AQUA Project, Glasgow University, 1992-1998 % -% $Id: CgCase.lhs,v 1.36 1999/11/01 17:10:06 simonpj Exp $ +% $Id: CgCase.lhs,v 1.37 2000/01/13 14:33:57 hwloidl Exp $ % %******************************************************** %* * @@ -602,9 +602,10 @@ cgAlgDefault gc_flag is_fun uniq cc_slot must_label_branch = -- We have arranged that Node points to the thing restoreCurrentCostCentre cc_slot `thenFC` \restore_cc -> getAbsC (absC restore_cc `thenC` - (if opt_GranMacros && emit_yield - then yield [node] False - else absC AbsCNop) `thenC` + -- HWL: maybe need yield here + --(if emit_yield + -- then yield [node] True + -- else absC AbsCNop) `thenC` possibleHeapCheck gc_flag is_fun [node] [] Nothing (cgExpr rhs) -- Node is live, but doesn't need to point at the thing itself; -- it's ok for Node to point to an indirection or FETCH_ME @@ -633,9 +634,10 @@ cgAlgAlt gc_flag uniq cc_slot must_label_branch = restoreCurrentCostCentre cc_slot `thenFC` \restore_cc -> getAbsC (absC restore_cc `thenC` - (if opt_GranMacros && emit_yield - then yield [node] True 
-- XXX live regs wrong - else absC AbsCNop) `thenC` + -- HWL: maybe need yield here + -- (if emit_yield + -- then yield [node] True -- XXX live regs wrong + -- else absC AbsCNop) `thenC` (case gc_flag of NoGC -> mapFCs bindNewToTemp args `thenFC` \_ -> nopC GCMayHappen -> bindConArgs con args @@ -667,9 +669,10 @@ cgUnboxedTupleAlt lbl cc_slot emit_yield (con,args,use_mask,rhs) restoreCurrentCostCentre cc_slot `thenFC` \restore_cc -> absC restore_cc `thenC` - (if opt_GranMacros && emit_yield - then yield live_regs True -- XXX live regs wrong? - else absC AbsCNop) `thenC` + -- HWL: maybe need yield here + -- (if emit_yield + -- then yield live_regs True -- XXX live regs wrong? + -- else absC AbsCNop) `thenC` let -- ToDo: could maybe use Nothing here if stack_res is False -- since the heap-check can just return to the top of the diff --git a/ghc/compiler/codeGen/CgClosure.lhs b/ghc/compiler/codeGen/CgClosure.lhs index c40320c..1b80bea 100644 --- a/ghc/compiler/codeGen/CgClosure.lhs +++ b/ghc/compiler/codeGen/CgClosure.lhs @@ -1,7 +1,7 @@ % % (c) The GRASP/AQUA Project, Glasgow University, 1992-1998 % -% $Id: CgClosure.lhs,v 1.38 1999/11/11 17:50:49 simonpj Exp $ +% $Id: CgClosure.lhs,v 1.39 2000/01/13 14:33:58 hwloidl Exp $ % \section[CgClosure]{Code generation for closures} @@ -40,7 +40,8 @@ import CgUsages ( adjustSpAndHp, setRealAndVirtualSp, getVirtSp, getSpRelOffset, getHpRelOffset ) import CLabel ( CLabel, mkClosureLabel, mkFastEntryLabel, - mkRednCountsLabel, mkInfoTableLabel + mkRednCountsLabel, mkInfoTableLabel, + pprCLabel ) import ClosureInfo -- lots and lots of stuff import CmdLineOpts ( opt_GranMacros, opt_SccProfilingOn, opt_DoTickyProfiling ) @@ -325,7 +326,12 @@ closureCodeBody binder_info closure_info cc all_args body -- arg_regs = case entry_conv of DirectEntry lbl arity regs -> regs - other -> panic "closureCodeBody:arg_regs" + other -> trace ("*** closureCodeBody:arg_regs " ++ (pprHWL entry_conv) ++ "(HWL ignored; no args passed in regs)") [] + + pprHWL :: EntryConvention -> String + pprHWL (ViaNode) = "ViaNode" + pprHWL (StdEntry cl) = "StdEntry" + pprHWL (DirectEntry cl i l) = "DirectEntry" num_arg_regs = length arg_regs @@ -350,7 +356,7 @@ closureCodeBody binder_info closure_info cc all_args body mapCs bindNewToStack arg_offsets `thenC` setRealAndVirtualSp sp_all_args `thenC` - argSatisfactionCheck closure_info `thenC` + argSatisfactionCheck closure_info arg_regs `thenC` -- OK, so there are enough args. Now we need to stuff as -- many of them in registers as the fast-entry code @@ -516,24 +522,24 @@ relative offset of this word tells how many words of arguments are expected. 
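The arg_regs parameter added to argSatisfactionCheck below threads the live argument registers through to the (currently commented-out) GranSim context-switch code; on the RTS side the GRAN_YIELD and GRAN_RESCHEDULE macros receive a register-liveness bitmask, which CgHeapery.lhs further down now derives with mkRegLiveness instead of passing a hard-wired 0. A minimal standalone C sketch of that bitmask convention follows; all names in it are hypothetical and not part of this patch:

#include <stdio.h>

typedef struct { int regno; int holds_pointer; } Reg;

/* Vanilla register i (1-based) sets bit i-1 iff it holds a heap pointer,
   mirroring what mkRegLiveness computes for followable reps. */
static unsigned long reg_liveness(const Reg *regs, int n)
{
    unsigned long mask = 0;
    int i;
    for (i = 0; i < n; i++)
        if (regs[i].holds_pointer)
            mask |= 1UL << (regs[i].regno - 1);
    return mask;
}

int main(void)
{
    Reg live[] = { {1, 1}, {2, 0}, {3, 1} };   /* R1 and R3 hold pointers */
    printf("liveness mask = 0x%lx\n", reg_liveness(live, 3));   /* 0x5 */
    return 0;
}

Bit i-1 is set exactly when vanilla register i carries a pointer, so the simulator knows which registers to treat as roots when a thread yields.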
\begin{code} -argSatisfactionCheck :: ClosureInfo -> Code +argSatisfactionCheck :: ClosureInfo -> [MagicId] {-GRAN-} -> Code -argSatisfactionCheck closure_info +argSatisfactionCheck closure_info arg_regs = nodeMustPointToIt (closureLFInfo closure_info) `thenFC` \ node_points -> - let - emit_gran_macros = opt_GranMacros - in +-- let +-- emit_gran_macros = opt_GranMacros +-- in -- HWL ngo' ngoq: -- absC (CMacroStmt GRAN_FETCH []) `thenC` -- forceHeapCheck [] node_points (absC AbsCNop) `thenC` - (if emit_gran_macros - then if node_points - then fetchAndReschedule [] node_points - else yield [] node_points - else absC AbsCNop) `thenC` + --(if opt_GranMacros + -- then if node_points + -- then fetchAndReschedule arg_regs node_points + -- else yield arg_regs node_points + -- else absC AbsCNop) `thenC` getSpRelOffset 0 `thenFC` \ (SpRel sp) -> let @@ -565,16 +571,13 @@ thunkWrapper closure_info lbl thunk_code = -- Stack and heap overflow checks nodeMustPointToIt (closureLFInfo closure_info) `thenFC` \ node_points -> - let - emit_gran_macros = opt_GranMacros - in - -- HWL: insert macros for GrAnSim; 2 versions depending on liveness of node - -- (we prefer fetchAndReschedule-style context switches to yield ones) - (if emit_gran_macros - then if node_points - then fetchAndReschedule [] node_points - else yield [] node_points - else absC AbsCNop) `thenC` + -- HWL: insert macros for GrAnSim; 2 versions depending on liveness of node + -- (we prefer fetchAndReschedule-style context switches to yield ones) + (if opt_GranMacros + then if node_points + then fetchAndReschedule [] node_points + else yield [] node_points + else absC AbsCNop) `thenC` -- stack and/or heap checks thunkChecks lbl node_points ( @@ -597,13 +600,10 @@ funWrapper :: ClosureInfo -- Closure whose code body this is funWrapper closure_info arg_regs stk_tags info_label fun_body = -- Stack overflow check nodeMustPointToIt (closureLFInfo closure_info) `thenFC` \ node_points -> - let - emit_gran_macros = opt_GranMacros - in -- HWL chu' ngoq: - (if emit_gran_macros - then yield arg_regs node_points - else absC AbsCNop) `thenC` + (if opt_GranMacros + then yield arg_regs node_points + else absC AbsCNop) `thenC` -- heap and/or stack checks fastEntryChecks arg_regs stk_tags info_label node_points ( diff --git a/ghc/compiler/codeGen/CgHeapery.lhs b/ghc/compiler/codeGen/CgHeapery.lhs index a4f6bc2..566cfcb 100644 --- a/ghc/compiler/codeGen/CgHeapery.lhs +++ b/ghc/compiler/codeGen/CgHeapery.lhs @@ -1,7 +1,7 @@ % % (c) The GRASP/AQUA Project, Glasgow University, 1992-1998 % -% $Id: CgHeapery.lhs,v 1.19 1999/10/13 16:39:15 simonmar Exp $ +% $Id: CgHeapery.lhs,v 1.20 2000/01/13 14:33:58 hwloidl Exp $ % \section[CgHeapery]{Heap management functions} @@ -32,7 +32,7 @@ import ClosureInfo ( closureSize, closureGoodStuffSize, ) import PrimRep ( PrimRep(..), isFollowableRep ) import Unique ( Unique ) -import CmdLineOpts ( opt_SccProfilingOn ) +import CmdLineOpts ( opt_SccProfilingOn, opt_GranMacros ) import GlaExts import Outputable @@ -78,6 +78,10 @@ fastEntryChecks regs tags ret node_points code getTickyCtrLabel `thenFC` \ ticky_ctr -> ( if all_pointers then -- heap checks are quite easy + -- HWL: gran-yield immediately before heap check proper + --(if node `elem` regs + -- then yield regs True + -- else absC AbsCNop ) `thenC` absC (checking_code stk_words hp_words tag_assts free_reg (length regs) ticky_ctr) @@ -382,22 +386,22 @@ mkRegLiveness (VanillaReg rep i : regs) | isFollowableRep rep = ((int2Word# 1#) `shiftL#` (i -# 1#)) `or#` mkRegLiveness regs 
mkRegLiveness (_ : regs) = mkRegLiveness regs +-- The two functions below are only used in a GranSim setup -- Emit macro for simulating a fetch and then reschedule fetchAndReschedule :: [MagicId] -- Live registers -> Bool -- Node reqd? -> Code -fetchAndReschedule regs node_reqd = +fetchAndReschedule regs node_reqd = if (node `elem` regs || node_reqd) then fetch_code `thenC` reschedule_code else absC AbsCNop where all_regs = if node_reqd then node:regs else regs - liveness_mask = 0 {-XXX: mkLiveRegsMask all_regs-} - + liveness_mask = mkRegLiveness regs reschedule_code = absC (CMacroStmt GRAN_RESCHEDULE [ - mkIntCLit liveness_mask, + mkIntCLit (IBOX(word2Int# liveness_mask)), mkIntCLit (if node_reqd then 1 else 0)]) --HWL: generate GRAN_FETCH macro for GrAnSim @@ -423,15 +427,16 @@ yield :: [MagicId] -- Live registers -> Bool -- Node reqd? -> Code -yield regs node_reqd = - -- NB: node is not alive; that's why we use DO_YIELD rather than - -- GRAN_RESCHEDULE - yield_code - where - all_regs = if node_reqd then node:regs else regs - liveness_mask = 0 {-XXX: mkLiveRegsMask all_regs-} - - yield_code = absC (CMacroStmt GRAN_YIELD [mkIntCLit liveness_mask]) +yield regs node_reqd = + if opt_GranMacros && node_reqd + then yield_code + else absC AbsCNop + where + -- all_regs = if node_reqd then node:regs else regs + liveness_mask = mkRegLiveness regs + yield_code = + absC (CMacroStmt GRAN_YIELD + [mkIntCLit (IBOX(word2Int# liveness_mask))]) \end{code} %************************************************************************ diff --git a/ghc/compiler/codeGen/CgStackery.lhs b/ghc/compiler/codeGen/CgStackery.lhs index d97476e..33a873a 100644 --- a/ghc/compiler/codeGen/CgStackery.lhs +++ b/ghc/compiler/codeGen/CgStackery.lhs @@ -1,7 +1,7 @@ % % (c) The GRASP/AQUA Project, Glasgow University, 1992-1998 % -% $Id: CgStackery.lhs,v 1.12 1999/06/24 13:04:20 simonmar Exp $ +% $Id: CgStackery.lhs,v 1.13 2000/01/13 14:33:58 hwloidl Exp $ % \section[CgStackery]{Stack management functions} @@ -25,9 +25,10 @@ import AbsCSyn import CgUsages ( getRealSp ) import AbsCUtils ( mkAbstractCs, mkAbsCStmts, getAmodeRep ) import PrimRep ( getPrimRepSize, PrimRep(..), isFollowableRep ) -import CmdLineOpts ( opt_SccProfilingOn ) +import CmdLineOpts ( opt_SccProfilingOn, opt_GranMacros ) import Panic ( panic ) -import Constants ( uF_SIZE, sCC_UF_SIZE, sEQ_FRAME_SIZE, sCC_SEQ_FRAME_SIZE ) +import Constants ( uF_SIZE, sCC_UF_SIZE, gRAN_UF_SIZE, + sEQ_FRAME_SIZE, sCC_SEQ_FRAME_SIZE, gRAN_SEQ_FRAME_SIZE ) import IOExts ( trace ) \end{code} @@ -224,11 +225,13 @@ getFinalStackHW fcode info_down (MkCgState absC binds usages) = state1 \end{code} \begin{code} -updateFrameSize | opt_SccProfilingOn = sCC_UF_SIZE - | otherwise = uF_SIZE +updateFrameSize | opt_SccProfilingOn = trace ("updateFrameSize = " ++ (show sCC_UF_SIZE)) sCC_UF_SIZE + | opt_GranMacros = trace ("updateFrameSize = " ++ (show gRAN_UF_SIZE))gRAN_UF_SIZE + | otherwise = trace ("updateFrameSize = " ++ (show uF_SIZE)) uF_SIZE seqFrameSize | opt_SccProfilingOn = sCC_SEQ_FRAME_SIZE - | otherwise = sEQ_FRAME_SIZE + | opt_GranMacros = gRAN_SEQ_FRAME_SIZE + | otherwise = sEQ_FRAME_SIZE \end{code} %************************************************************************ diff --git a/ghc/compiler/main/Constants.lhs b/ghc/compiler/main/Constants.lhs index ae358e2..14f4667 100644 --- a/ghc/compiler/main/Constants.lhs +++ b/ghc/compiler/main/Constants.lhs @@ -34,6 +34,7 @@ module Constants ( uF_SIZE, sCC_UF_SIZE, + gRAN_UF_SIZE, -- HWL uF_RET, uF_SU, uF_UPDATEE, @@ -41,6 +42,7 @@ 
module Constants ( sEQ_FRAME_SIZE, sCC_SEQ_FRAME_SIZE, + gRAN_SEQ_FRAME_SIZE, -- HWL mAX_Vanilla_REG, mAX_Float_REG, @@ -157,6 +159,9 @@ uF_SIZE = (NOSCC_UF_SIZE::Int) -- Same again, with profiling sCC_UF_SIZE = (SCC_UF_SIZE::Int) +-- Same again, with gransim +gRAN_UF_SIZE = (GRAN_UF_SIZE::Int) + -- Offsets in an update frame. They don't change with profiling! uF_RET = (UF_RET::Int) uF_SU = (UF_SU::Int) @@ -169,6 +174,7 @@ Seq frame sizes. \begin{code} sEQ_FRAME_SIZE = (NOSCC_SEQ_FRAME_SIZE::Int) sCC_SEQ_FRAME_SIZE = (SCC_SEQ_FRAME_SIZE::Int) +gRAN_SEQ_FRAME_SIZE = (GRAN_SEQ_FRAME_SIZE::Int) \end{code} \begin{code} diff --git a/ghc/compiler/nativeGen/StixMacro.lhs b/ghc/compiler/nativeGen/StixMacro.lhs index 747759e..5ff2ea1 100644 --- a/ghc/compiler/nativeGen/StixMacro.lhs +++ b/ghc/compiler/nativeGen/StixMacro.lhs @@ -122,6 +122,7 @@ macroCode PUSH_UPD_FRAME args frame n = StInd PtrRep (StIndex PtrRep stgSp (StInt (toInteger (n-uF_SIZE)))) + -- HWL: these values are *wrong* in a GranSim setup; ToDo: fix a1 = StAssign PtrRep (frame uF_RET) upd_frame_info a3 = StAssign PtrRep (frame uF_SU) stgSu a4 = StAssign PtrRep (frame uF_UPDATEE) bhptr diff --git a/ghc/driver/ghc-asm.lprl b/ghc/driver/ghc-asm.lprl index 590b3a1..d5adb3f 100644 --- a/ghc/driver/ghc-asm.lprl +++ b/ghc/driver/ghc-asm.lprl @@ -687,6 +687,9 @@ sub mangle_asm { print STDERR "$Pgm: unknown prologue mangling? $TargetPlatform\n"; } + # HWL HACK: dont die, just print a warning + #print stderr "HWL: this should die! Prologue junk?: $p\n" if $p =~ /^\t[^\.]/ + # && $TargetPlatform !~ /^powerpc-/; #ToDo: remove test die "Prologue junk?: $p\n" if $p =~ /^\t[^\.]/ && $TargetPlatform !~ /^powerpc-/; #ToDo: remove test @@ -727,6 +730,9 @@ sub mangle_asm { } else { print STDERR "$Pgm: unknown epilogue mangling? $TargetPlatform\n"; } + # HWL HACK: dont die, just print a warning + #print stderr "HWL: this should die! Epilogue junk?: $e\n" if $e =~ /^\t[^\.]/ + # && $TargetPlatform !~ /^powerpc-/; #ToDo: remove test die "Epilogue junk?: $e\n" if $e =~ /^\t[^\.]/ && $TargetPlatform !~ /^powerpc-/; #ToDo: remove test diff --git a/ghc/driver/ghc.lprl b/ghc/driver/ghc.lprl index dca6d70..bb80a14 100644 --- a/ghc/driver/ghc.lprl +++ b/ghc/driver/ghc.lprl @@ -1911,6 +1911,7 @@ eval 'exec perl -S \$0 \${1+"\$@"}' # =!=!=!=!=!=!=!=!=!=!=! # This script is automatically generated: DO NOT EDIT!!! # Generated by Glasgow Haskell, version ${ProjectVersion} +# ngoqvam choHbogh vaj' vIHoHnISbej !!!! # \$pvm_executable = '$pvm_executable'; \$pvm_executable_base = '$pvm_executable_base'; @@ -1942,7 +1943,9 @@ args: while ($a = shift(@ARGV)) { } if ( $a eq '-d' && $in_RTS_args ) { $debug = '-'; - } elsif ( $a =~ /^-N(\d+)/ && $in_RTS_args ) { + } elsif ( $a =~ /^-qN(\d+)/ && $in_RTS_args ) { + $nprocessors = $1; + } elsif ( $a =~ /^-qp(\d+)/ && $in_RTS_args ) { $nprocessors = $1; } else { push(@nonPVM_args, $a); @@ -2817,9 +2820,24 @@ sub saveIntermediate { local ($final,$suffix,$tmp)= @_ ; local ($to_do); + local ($new_suffix); + # $final -- root of where to park ${final}.${suffix} # $tmp -- temporary file where hsc put the intermediate file. + # HWL: use -odir for .hc and .s files, too + if ( $Specific_output_dir ne '' ) { + $final = "${Specific_output_dir}/${final}"; + } + # HWL: use the same suffix as for $Osuffix in generating intermediate file, + # replacing o with hc or s, respectively. 
+ if ( $Osuffix ne '' ) { + ($new_suffix = $Osuffix) =~ s/o$/hc/ if $suffix eq "hc"; + ($new_suffix = $Osuffix) =~ s/o$/s/ if $suffix eq "s"; + $suffix = $new_suffix; + print stderr "HWL says: suffix for intermediate file is $suffix; ${final}.${suffix} overall\n" if $Verbose; + } + # Delete the old file $to_do = "$Rm ${final}.${suffix}"; &run_something($to_do, "Removing old .${suffix} file"); diff --git a/ghc/driver/test_mangler b/ghc/driver/test_mangler index f24f0e4..96cf31c 100644 --- a/ghc/driver/test_mangler +++ b/ghc/driver/test_mangler @@ -1,7 +1,9 @@ -#! /usr/local/bin/perl +#! /usr/bin/perl # a simple wrapper to test a .s-file mangler # reads stdin, writes stdout +push(@INC,"/net/dazdak/BUILDS/gransim-4.04/i386-unknown-linux/ghc/driver"); + $TargetPlatform = $ARGV[0]; shift; # nice error checking, Will require("ghc-asm.prl") || die "require mangler failed!\n"; diff --git a/ghc/includes/CCall.h b/ghc/includes/CCall.h index 97ff9df..3040c17 100644 --- a/ghc/includes/CCall.h +++ b/ghc/includes/CCall.h @@ -1,5 +1,5 @@ /* ----------------------------------------------------------------------------- - * $Id: CCall.h,v 1.3 1999/02/05 16:02:19 simonm Exp $ + * $Id: CCall.h,v 1.4 2000/01/13 14:34:00 hwloidl Exp $ * * (c) The GHC Team, 1998-1999 * @@ -55,6 +55,9 @@ #define STGCALL5(f,a,b,c,d,e) \ CALLER_SAVE_ALL (void) f(a,b,c,d,e); CALLER_RESTORE_ALL +#define STGCALL6(f,a,b,c,d,e,z) \ + CALLER_SAVE_ALL (void) f(a,b,c,d,e,z); CALLER_RESTORE_ALL + #define RET_STGCALL0(t,f) \ ({ t _r; CALLER_SAVE_ALL _r = f(); CALLER_RESTORE_ALL; _r; }) @@ -74,6 +77,9 @@ #define RET_STGCALL5(t,f,a,b,c,d,e) \ ({ t _r; CALLER_SAVE_ALL _r = f(a,b,c,d,e); CALLER_RESTORE_ALL; _r; }) +#define RET_STGCALL6(t,f,a,b,c,d,e,z) \ + ({ t _r; CALLER_SAVE_ALL _r = f(a,b,c,d,e,z); CALLER_RESTORE_ALL; _r; }) + /* * A PRIM_STGCALL is used when we have arranged to save the R, @@ -101,6 +107,9 @@ #define PRIM_STGCALL5(f,a,b,c,d,e) \ CALLER_SAVE_SYSTEM (void) f(a,b,c,d,e); CALLER_RESTORE_SYSTEM +#define PRIM_STGCALL6(f,a,b,c,d,e,z) \ + CALLER_SAVE_SYSTEM (void) f(a,b,c,d,e,z); CALLER_RESTORE_SYSTEM + #define RET_PRIM_STGCALL0(t,f) \ ({ t _r; CALLER_SAVE_SYSTEM _r = f(); CALLER_RESTORE_SYSTEM; _r; }) @@ -120,6 +129,9 @@ #define RET_PRIM_STGCALL5(t,f,a,b,c,d,e) \ ({ t _r; CALLER_SAVE_SYSTEM _r = f(a,b,c,d,e); CALLER_RESTORE_SYSTEM; _r; }) +#define RET_PRIM_STGCALL6(t,f,a,b,c,d,e,z) \ + ({ t _r; CALLER_SAVE_SYSTEM _r = f(a,b,c,d,e,z); CALLER_RESTORE_SYSTEM; _r; }) + /* ToDo: ccalls that might garbage collect - do we need to return to * the scheduler to perform these? Similarly, ccalls that might want * to call Haskell right back, or start a new thread or something. 
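The new STGCALL6, RET_STGCALL6 and PRIM_STGCALL6 variants above extend the existing pattern to six-argument RTS calls (GranSim's newSpark, declared in GranSim.h below, takes six arguments). Here is a minimal standalone model of the save/call/restore idea behind these macros; the names are invented for illustration and do not match the real CALLER_SAVE/CALLER_RESTORE definitions:

#include <stdio.h>

static long reg_table_R1;          /* stands in for the saved-register table */
static long R1 = 42;               /* stands in for an STG register          */

#define SAVE_ALL    do { reg_table_R1 = R1; } while (0);
#define RESTORE_ALL do { R1 = reg_table_R1; } while (0);

/* Same shape as STGCALL6: save, make the C call, restore. */
#define MYCALL6(f,a,b,c,d,e,z) \
    SAVE_ALL (void) f(a,b,c,d,e,z); RESTORE_ALL

static long sum6(long a, long b, long c, long d, long e, long z)
{
    R1 = 0;                        /* a C call may clobber the STG registers */
    return a + b + c + d + e + z;
}

int main(void)
{
    MYCALL6(sum6, 1, 2, 3, 4, 5, 6);
    printf("R1 after the call = %ld\n", R1);    /* 42: restored */
    return 0;
}

The real macros expand to assignments between machine registers and the register table; the RET_ variants additionally wrap the call in a GCC statement expression so the result can be captured after the registers have been saved.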
diff --git a/ghc/includes/ClosureTypes.h b/ghc/includes/ClosureTypes.h index de58fac..e1a9f2c 100644 --- a/ghc/includes/ClosureTypes.h +++ b/ghc/includes/ClosureTypes.h @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- - * $Id: ClosureTypes.h,v 1.11 1999/05/11 16:47:40 keithw Exp $ + * $Id: ClosureTypes.h,v 1.12 2000/01/13 14:34:00 hwloidl Exp $ * * (c) The GHC Team, 1998-1999 * @@ -72,10 +72,15 @@ #define WEAK 56 #define FOREIGN 57 #define STABLE_NAME 58 + #define TSO 59 #define BLOCKED_FETCH 60 #define FETCH_ME 61 -#define EVACUATED 62 -#define N_CLOSURE_TYPES 63 +#define FETCH_ME_BQ 62 +#define RBH 63 + +#define EVACUATED 64 + +#define N_CLOSURE_TYPES 65 #endif CLOSURETYPES_H diff --git a/ghc/includes/Closures.h b/ghc/includes/Closures.h index 3ed2809..1de91ef 100644 --- a/ghc/includes/Closures.h +++ b/ghc/includes/Closures.h @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- - * $Id: Closures.h,v 1.14 1999/12/01 14:34:48 simonmar Exp $ + * $Id: Closures.h,v 1.15 2000/01/13 14:34:00 hwloidl Exp $ * * (c) The GHC Team, 1998-1999 * @@ -37,21 +37,39 @@ typedef struct { The parallel header -------------------------------------------------------------------------- */ -#ifdef GRAN +#ifdef PAR typedef struct { - W_ procs; -} StgGranHeader; + /* StgWord ga; */ /* nope! global addresses are managed via a hash table */ +} StgParHeader; #else /* !PAR */ typedef struct { /* empty */ -} StgGranHeader; +} StgParHeader; #endif /* PAR */ /* ----------------------------------------------------------------------------- + The GranSim header + -------------------------------------------------------------------------- */ + +#if defined(GRAN) + +typedef struct { + StgWord procs; /* bitmask indicating on which PEs this closure resides */ +} StgGranHeader; + +#else /* !GRAN */ + +typedef struct { + /* empty */ +} StgGranHeader; + +#endif /* GRAN */ + +/* ----------------------------------------------------------------------------- The ticky-ticky header Comment from old Ticky.h: @@ -96,8 +114,11 @@ typedef struct { #ifdef PROFILING StgProfHeader prof; #endif -#ifdef GRAN - StgGranHeader par; +#ifdef PAR + StgParHeader par; +#endif +#if defined(GRAN) + StgGranHeader gran; #endif #ifdef TICKY_TICKY StgTickyHeader ticky; @@ -189,12 +210,6 @@ typedef struct StgCAF_ { typedef struct { StgHeader header; - struct StgTSO_ *blocking_queue; - StgMutClosure *mut_link; -} StgBlockingQueue; - -typedef struct { - StgHeader header; StgWord words; StgWord payload[0]; } StgArrWords; @@ -294,12 +309,71 @@ typedef struct { StgClosure* value; } StgMVar; -/* Parallel FETCH_ME closures */ -#ifdef PAR -typedef struct { +#if defined(PAR) || defined(GRAN) +/* + StgBlockingQueueElement represents the types of closures that can be + found on a blocking queue: StgTSO, StgRBHSave, StgBlockedFetch. + (StgRBHSave can only appear at the end of a blocking queue). + Logically, this is a union type, but defining another struct with a common + layout is easier to handle in the code (same as for StgMutClosures). 
+*/ +typedef struct StgBlockingQueueElement_ { + StgHeader header; + struct StgBlockingQueueElement_ *link; + StgMutClosure *mut_link; + struct StgClosure_ *payload[0]; +} StgBlockingQueueElement; + +typedef struct StgBlockingQueue_ { + StgHeader header; + struct StgBlockingQueueElement_ *blocking_queue; + StgMutClosure *mut_link; +} StgBlockingQueue; + +/* this closure is hanging at the end of a blocking queue in (par setup only) */ +typedef struct StgRBHSave_ { StgHeader header; - void *ga; /* type globalAddr is abstract here */ + StgPtr payload[0]; +} StgRBHSave; + +typedef struct StgRBH_ { + StgHeader header; + struct StgBlockingQueueElement_ *blocking_queue; + StgMutClosure *mut_link; +} StgRBH; + +#else +/* old sequential version of a blocking queue, which can only hold TSOs */ +typedef struct StgBlockingQueue_ { + StgHeader header; + struct StgTSO_ *blocking_queue; + StgMutClosure *mut_link; +} StgBlockingQueue; +#endif + +#if defined(PAR) +/* global indirections aka FETCH_ME closures */ +typedef struct StgFetchMe_ { + StgHeader header; + globalAddr *ga; /* type globalAddr is abstract here */ + StgMutClosure *mut_link; } StgFetchMe; + +/* same contents as an ordinary StgBlockingQueue */ +typedef struct StgFetchMeBlockingQueue_ { + StgHeader header; + struct StgBlockingQueueElement_ *blocking_queue; + StgMutClosure *mut_link; +} StgFetchMeBlockingQueue; + +/* entry in a blocking queue, indicating a request from a TSO on another PE */ +typedef struct StgBlockedFetch_ { + StgHeader header; + struct StgBlockingQueueElement_ *link; + StgMutClosure *mut_link; + StgClosure *node; + globalAddr ga; +} StgBlockedFetch; #endif #endif /* CLOSURES_H */ diff --git a/ghc/includes/Constants.h b/ghc/includes/Constants.h index bf7c83e..3983196 100644 --- a/ghc/includes/Constants.h +++ b/ghc/includes/Constants.h @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- - * $Id: Constants.h,v 1.7 1999/10/27 09:58:36 simonmar Exp $ + * $Id: Constants.h,v 1.8 2000/01/13 14:34:00 hwloidl Exp $ * * (c) The GHC Team, 1998-1999 * @@ -18,11 +18,13 @@ Header Sizes NOTE: keep these in line with the real definitions in Closures.h + HWL: checked GRAN_HDR_SIZE; ok -------------------------------------------------------------------------- */ #define STD_HDR_SIZE 1 #define PROF_HDR_SIZE 1 #define GRAN_HDR_SIZE 1 +#define PAR_HDR_SIZE 0 #define TICKY_HDR_SIZE 0 #define ARR_HDR_SIZE 1 @@ -36,11 +38,13 @@ NOTE: keep these in line with the real definitions in InfoTables.h NOTE: the PROF, and GRAN values are *wrong* (ToDo) + HWL: checked GRAN_ITBL_SIZE; ok -------------------------------------------------------------------------- */ #define STD_ITBL_SIZE 3 #define PROF_ITBL_SIZE 1 #define GRAN_ITBL_SIZE 1 +#define PAR_ITBL_SIZE 0 #define TICKY_ITBL_SIZE 0 /* ----------------------------------------------------------------------------- @@ -98,9 +102,13 @@ /* ----------------------------------------------------------------------------- Update Frame Layout + GranSim uses an additional word as bitmask in the update frame; actually, + not really necessary, but uses standard closure layout that way + NB: UF_RET etc are *wrong* in a GranSim setup; should be increased by 1 + if compiling for GranSim (currently not used in compiler) -- HWL -------------------------------------------------------------------------- */ - #define NOSCC_UF_SIZE 3 +#define GRAN_UF_SIZE 4 #define SCC_UF_SIZE 4 #define UF_RET 0 @@ -112,9 +120,11 @@ SEQ frame size I don't think seq frames really need sccs --SDM + They 
don't need a GranSim bitmask either, but who cares anyway -- HWL -------------------------------------------------------------------------- */ #define NOSCC_SEQ_FRAME_SIZE 2 +#define GRAN_SEQ_FRAME_SIZE 3 #define SCC_SEQ_FRAME_SIZE 3 /* ----------------------------------------------------------------------------- diff --git a/ghc/includes/GranSim.h b/ghc/includes/GranSim.h new file mode 100644 index 0000000..88c6ad9 --- /dev/null +++ b/ghc/includes/GranSim.h @@ -0,0 +1,327 @@ +/* + Time-stamp: + $Id: GranSim.h,v 1.2 2000/01/13 14:34:00 hwloidl Exp $ + + Headers for GranSim specific objects. + + Note that in GranSim we have one run-queue and blocking-queue for each + processor. Therefore, this header file redefines variables like + run_queue_hd to be relative to CurrentProc. The main arrays of runnable + and blocking queues are defined in Schedule.c. The important STG-called + GranSim macros (e.g. for fetching nodes) are at the end of this + file. Usually they are just wrappers to proper C functions in GranSim.c. */ + +#ifndef GRANSIM_H +#define GRANSIM_H + +#if !defined(GRAN) + +//Dummy definitions for basic GranSim macros (see GranSim.h) +#define DO_GRAN_ALLOCATE(n) /* nothing */ +#define DO_GRAN_UNALLOCATE(n) /* nothing */ +#define DO_GRAN_FETCH(node) /* nothing */ +#define DO_GRAN_EXEC(arith,branch,load,store,floats) /* nothing */ +#define GRAN_FETCH_AND_RESCHEDULE(liveness_mask,reenter) /* nothing */ +#define GRAN_RESCHEDULE(liveness_mask,reenter) /* nothing */ + +#endif + +#if defined(GRAN) /* whole file */ + +extern StgTSO *CurrentTSOs[]; + +//@node Headers for GranSim specific objects, , , +//@section Headers for GranSim specific objects + +//@menu +//* Includes:: +//* Externs and prototypes:: +//* Run and blocking queues:: +//* Spark queues:: +//* Processor related stuff:: +//* GranSim costs:: +//* STG called GranSim functions:: +//* STG-called routines:: +//@end menu + +//@node Includes, Externs and prototypes, Headers for GranSim specific objects, Headers for GranSim specific objects +//@subsection Includes + +/* +#include "Closures.h" +#include "TSO.h" +#include "Rts.h" +*/ + +//@node Externs and prototypes, Run and blocking queues, Includes, Headers for GranSim specific objects +//@subsection Externs and prototypes + +/* Global constants */ +extern char *gran_event_names[]; +extern char *proc_status_names[]; +extern char *event_names[]; + +/* Vars checked from within STG land */ +extern rtsBool NeedToReSchedule, IgnoreEvents, IgnoreYields; +; +extern rtsTime TimeOfNextEvent, TimeOfLastEvent, EndOfTimeSlice; + +/* costs for basic operations (copied from RTS flags) */ +extern nat gran_arith_cost, gran_branch_cost, gran_load_cost, gran_store_cost, gran_float_cost; + +extern nat SparksAvail; /* How many sparks are available */ +extern nat SurplusThreads; /* How many excess threads are there */ +extern nat sparksIgnored, sparksCreated; + +//@node Run and blocking queues, Spark queues, Externs and prototypes, Headers for GranSim specific objects +//@subsection Run and blocking queues + +/* declared in Schedule.c */ +extern StgTSO *run_queue_hds[], *run_queue_tls[]; +extern StgTSO *blocked_queue_hds[], *blocked_queue_tls[]; +extern StgTSO *ccalling_threadss[]; + +#define run_queue_hd run_queue_hds[CurrentProc] +#define run_queue_tl run_queue_tls[CurrentProc] +#define blocked_queue_hd blocked_queue_hds[CurrentProc] +#define blocked_queue_tl blocked_queue_tls[CurrentProc] +#define pending_sparks_hd pending_sparks_hds[CurrentProc] +#define pending_sparks_tl 
pending_sparks_tls[CurrentProc] +#define ccalling_threads ccalling_threadss[CurrentProc] + +//@node Spark queues, Processor related stuff, Run and blocking queues, Headers for GranSim specific objects +//@subsection Spark queues + +/* +In GranSim we use a double linked list to represent spark queues. + +This is more flexible, but slower, than the array of pointers +representation used in GUM. We use the flexibility to define new fields in +the rtsSpark structure, representing e.g. granularity info (see HWL's PhD +thesis), or info about the parent of a spark. +*/ + +/* Sparks and spark queues */ +typedef struct rtsSpark_ +{ + StgClosure *node; + StgInt name, global; + StgInt gran_info; /* for granularity improvement mechanisms */ + PEs creator; /* PE that created this spark (unused) */ + struct rtsSpark_ *prev, *next; +} rtsSpark; +typedef rtsSpark *rtsSparkQ; + +/* The spark queues, proper */ +/* In GranSim this is a globally visible array of spark queues */ +extern rtsSparkQ pending_sparks_hds[]; +extern rtsSparkQ pending_sparks_tls[]; + +/* Prototypes of those spark routines visible to compiler generated .hc */ +/* Routines only used inside the RTS are defined in rts/parallel GranSimRts.h */ +rtsSpark *newSpark(StgClosure *node, + StgInt name, StgInt gran_info, StgInt size_info, + StgInt par_info, StgInt local); +void add_to_spark_queue(rtsSpark *spark); + +//@node Processor related stuff, GranSim costs, Spark queues, Headers for GranSim specific objects +//@subsection Processor related stuff + +extern PEs CurrentProc; +extern rtsTime CurrentTime[]; + +/* Maximum number of PEs that can be simulated */ +#define MAX_PROC 32 /* (BITS_IN(StgWord)) */ // ToDo: fix this!! +//#if MAX_PROC==16 +//#else +//#error MAX_PROC should be 32 on this architecture +//#endif + +#define CurrentTSO CurrentTSOs[CurrentProc] + +/* Processor numbers to bitmasks and vice-versa */ +#define MainProc 0 /* Id of main processor */ +#define NO_PRI 0 /* dummy priority */ +#define MAX_PRI 10000 /* max possible priority */ +#define MAIN_PRI MAX_PRI /* priority of main thread */ + +/* GrAnSim uses IdleProcs as bitmask to indicate which procs are idle */ +#define PE_NUMBER(n) (1l << (long)n) +#define ThisPE PE_NUMBER(CurrentProc) +#define MainPE PE_NUMBER(MainProc) +#define Everywhere (~0l) +#define Nowhere (0l) +#define Now CurrentTime[CurrentProc] + +#define IS_LOCAL_TO(ga,proc) ((1l << (PEs) proc) & ga) + +#define GRAN_TIME_SLICE 1000 /* max time between 2 ReSchedules */ + +//@node GranSim costs, STG called GranSim functions, Processor related stuff, Headers for GranSim specific objects +//@subsection GranSim costs + +/* Default constants for communication (see RtsFlags on how to change them) */ + +#define LATENCY 1000 /* Latency for single packet */ +#define ADDITIONAL_LATENCY 100 /* Latency for additional packets */ +#define BASICBLOCKTIME 10 +#define FETCHTIME (LATENCY*2+MSGUNPACKTIME) +#define LOCALUNBLOCKTIME 10 +#define GLOBALUNBLOCKTIME (LATENCY+MSGUNPACKTIME) + +#define MSGPACKTIME 0 /* Cost of creating a packet */ +#define MSGUNPACKTIME 0 /* Cost of receiving a packet */ +#define MSGTIDYTIME 0 /* Cost of cleaning up after send */ + +/* How much to increase GrAnSims internal packet size if an overflow + occurs. + NB: This is a GrAnSim internal variable and is independent of the + simulated packet buffer size. 
+*/ + +#define GRANSIM_DEFAULT_PACK_BUFFER_SIZE 400 +#define REALLOC_SZ 200 + +/* extern W_ gran_mpacktime, gran_mtidytime, gran_munpacktime; */ + +/* Thread cost model */ +#define THREADCREATETIME (25+THREADSCHEDULETIME) +#define THREADQUEUETIME 12 /* Cost of adding a thread to the running/runnable queue */ +#define THREADDESCHEDULETIME 75 /* Cost of descheduling a thread */ +#define THREADSCHEDULETIME 75 /* Cost of scheduling a thread */ +#define THREADCONTEXTSWITCHTIME (THREADDESCHEDULETIME+THREADSCHEDULETIME) + +/* Instruction Cost model (SPARC, including cache misses) */ +#define ARITH_COST 1 +#define BRANCH_COST 2 +#define LOAD_COST 4 +#define STORE_COST 4 +#define FLOAT_COST 1 /* ? */ + +#define HEAPALLOC_COST 11 + +#define PRI_SPARK_OVERHEAD 5 +#define PRI_SCHED_OVERHEAD 5 + +//@node STG called GranSim functions, STG-called routines, GranSim costs, Headers for GranSim specific objects +//@subsection STG called GranSim functions + +/* STG called GranSim functions */ +void GranSimAllocate(StgInt n); +void GranSimUnallocate(StgInt n); +void GranSimExec(StgWord ariths, StgWord branches, StgWord loads, StgWord stores, StgWord floats); +StgInt GranSimFetch(StgClosure *node); +void GranSimSpark(StgInt local, StgClosure *node); +void GranSimSparkAt(rtsSpark *spark, StgClosure *where,StgInt identifier); +void GranSimSparkAtAbs(rtsSpark *spark, PEs proc, StgInt identifier); +void GranSimBlock(StgTSO *tso, PEs proc, StgClosure *node); + + +//@node STG-called routines, , STG called GranSim functions, Headers for GranSim specific objects +//@subsection STG-called routines + +/* Wrapped version of calls to GranSim-specific STG routines */ + +/* +#define DO_PERFORM_RESCHEDULE(liveness, always_reenter_node) PerformReschedule_wrapper(liveness, always_reenter_node) +*/ +#define DO_GRAN_ALLOCATE(n) STGCALL1(GranSimAllocate, n) +#define DO_GRAN_UNALLOCATE(n) STGCALL1(GranSimUnallocate, n) +#define DO_GRAN_FETCH(node) STGCALL1(GranSimFetch, node) +#define DO_GRAN_EXEC(arith,branch,load,store,floats) GranSimExec(arith,branch,load,store,floats) + +/* + ToDo: Clean up this mess of GRAN macros!!! -- HWL +*/ +// DO_GRAN_FETCH((StgClosure*)R1.p); +#define GRAN_FETCH() /* nothing */ + +#define GRAN_FETCH_AND_RESCHEDULE(liveness,reenter) \ + DO_GRAN_FETCH((StgClosure*)R1.p); \ + DO_GRAN_YIELD(liveness,ENTRY_CODE((D_)(*R1.p))); +// RESTORE_EVERYTHING is done implicitly before entering threaded world agian + +/* + This is the only macro currently enabled; + It should check whether it is time for the current thread to yield + (e.g. if there is a more recent event in the queue) and it should check + whether node is local, via a call to GranSimFetch. + ToDo: split this in 2 routines: + - GRAN_YIELD (as it is below) + - GRAN_FETCH (the rest of this macro) + emit only these 2 macros based on node's liveness + node alive: emit both macros + node not alive: do only a GRAN_YIELD + + replace gran_yield_? with gran_block_? 
(they really block the current + thread) +*/ +#define GRAN_RESCHEDULE(liveness,ptrs) \ + if (RET_STGCALL1(StgInt, GranSimFetch, (StgClosure*)R1.p)) {\ + EXTFUN_RTS(gran_block_##ptrs); \ + JMP_(gran_block_##ptrs); \ + } else { \ + if (TimeOfLastEvent < CurrentTime[CurrentProc] && \ + HEAP_ALLOCED((StgClosure *)R1.p) && \ + LOOKS_LIKE_GHC_INFO(get_itbl((StgClosure *)R1.p))) { \ + EXTFUN_RTS(gran_yield_##ptrs); \ + JMP_(gran_yield_##ptrs); \ + } \ + /* GRAN_YIELD(ptrs) */ \ + } + + +// YIELD(liveness,reenter) + +// GRAN_YIELD(liveness_mask); + +// GRAN_FETCH_AND_RESCHEDULE(liveness_mask,reenter) + +#define THREAD_CONTEXT_SWITCH(liveness_mask,reenter) \ + do { \ + if (context_switch /* OR_INTERVAL_EXPIRED */) { \ + GRAN_RESCHEDULE(liveness_mask,reenter); \ + } }while(0) + +#define GRAN_EXEC(arith,branch,load,store,floats) \ + { \ + W_ cost = gran_arith_cost*arith + \ + gran_branch_cost*branch + \ + gran_load_cost*load + \ + gran_store_cost*store + \ + gran_float_cost*floats; \ + CurrentTSO->gran.exectime += cost; \ + CurrentTime[CurrentProc] += cost; \ + } + +/* In GranSim we first check whether there is an event to handle; only if + this is the case (or the time slice is over in case of fair scheduling) + we do a yield, which is very similar to that in the concurrent world + ToDo: check whether gran_yield_? can be merged with other yielding codes +*/ + +#define DO_GRAN_YIELD(ptrs) if (!IgnoreYields && \ + TimeOfLastEvent < CurrentTime[CurrentProc] && \ + HEAP_ALLOCED((StgClosure *)R1.p) && \ + LOOKS_LIKE_GHC_INFO(get_itbl((StgClosure *)R1.p))) { \ + EXTFUN_RTS(gran_yield_##ptrs); \ + JMP_(gran_yield_##ptrs); \ + } + +#define GRAN_YIELD(ptrs) \ + { \ + extern nat context_switch; \ + if ( (CurrentTime[CurrentProc]>=EndOfTimeSlice) || \ + ((CurrentTime[CurrentProc]>=TimeOfNextEvent) && \ + (TimeOfNextEvent!=0) && !IgnoreEvents )) { \ + /* context_switch = 1; */ \ + DO_GRAN_YIELD(ptrs); \ + } \ + } + +#define ADD_TO_SPARK_QUEUE(spark) \ + STGCALL1(add_to_spark_queue,spark) \ + +#endif /* GRAN */ + +#endif /* GRANSIM_H */ diff --git a/ghc/includes/InfoMacros.h b/ghc/includes/InfoMacros.h index 91900d4..a85529b 100644 --- a/ghc/includes/InfoMacros.h +++ b/ghc/includes/InfoMacros.h @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- - * $Id: InfoMacros.h,v 1.8 1999/11/30 11:44:32 simonmar Exp $ + * $Id: InfoMacros.h,v 1.9 2000/01/13 14:34:00 hwloidl Exp $ * * (c) The GHC Team, 1998-1999 * @@ -31,8 +31,35 @@ #define INIT_VECTOR #endif +/* + On the GRAN/PAR specific parts of the InfoTables: + + In both GranSim and GUM we use revertible black holes (RBH) when putting + an updatable closure into a packet for communication. The entry code for + an RBH performs standard blocking (as with any kind of BH). The info + table for the RBH resides just before the one for the std info + table. (NB: there is one RBH ITBL for every ITBL of an updatable + closure.) The @rbh_infoptr@ field in the ITBL points from the std ITBL to + the RBH ITBL and vice versa. This is used by the RBH_INFOPTR and + REVERT_INFOPTR macros to turn an updatable node into an RBH and vice + versa. Note, that the only case where we have to revert the RBH in its + original form is when a packet is sent back because of garbage collection + on another PE. In the RTS for GdH we will use this reversion mechanism in + order to deal with faults in the system. + ToDo: Check that RBHs are needed for all the info tables below. 
From a quick + check of the macros generated in the libs it seems that all of them are used + for generating THUNKs. + Possible optimisation: Note that any RBH ITBL is a fixed distance away from + the actual ITBL. We could inline this offset as a constant into the RTS and + avoid the rbh_infoptr fields altogether (Jim did that in the old RTS). + -- HWL +*/ + + /* function/thunk info tables --------------------------------------------- */ +#if defined(GRAN) || defined(PAR) + #define \ INFO_TABLE_SRT(info, /* info-table label */ \ entry, /* entry code label */ \ @@ -41,17 +68,73 @@ INFO_TABLE_SRT(info, /* info-table label */ \ type, /* closure type */ \ info_class, entry_class, /* C storage classes */ \ prof_descr, prof_type) /* profiling info */ \ + entry_class(RBH_##entry); \ entry_class(entry); \ + info_class INFO_TBL_CONST StgInfoTable info; \ + info_class INFO_TBL_CONST StgInfoTable RBH_##info = { \ + layout : { payload : {ptrs,nptrs} }, \ + SRT_INFO(RBH,srt_,srt_off_,srt_len_), \ + INCLUDE_RBH_INFO(info), \ + INIT_ENTRY(RBH_##entry), \ + INIT_VECTOR \ + } ; \ + StgFunPtr RBH_##entry (void) { JMP_(RBH_entry); } ; \ info_class INFO_TBL_CONST StgInfoTable info = { \ layout : { payload : {ptrs,nptrs} }, \ SRT_INFO(type,srt_,srt_off_,srt_len_), \ + INCLUDE_RBH_INFO(RBH_##info), \ INIT_ENTRY(entry), \ INIT_VECTOR \ } +#else + +#define \ +INFO_TABLE_SRT(info, /* info-table label */ \ + entry, /* entry code label */ \ + ptrs, nptrs, /* closure layout info */\ + srt_, srt_off_, srt_len_, /* SRT info */ \ + type, /* closure type */ \ + info_class, entry_class, /* C storage classes */ \ + prof_descr, prof_type) /* profiling info */ \ + entry_class(entry); \ + info_class INFO_TBL_CONST StgInfoTable info = { \ + layout : { payload : {ptrs,nptrs} }, \ + SRT_INFO(type,srt_,srt_off_,srt_len_), \ + INIT_ENTRY(entry), \ + INIT_VECTOR \ + } + +#endif /* direct-return address info tables --------------------------------------*/ +#if defined(GRAN) || defined(PAR) + +#define \ +INFO_TABLE_SRT_BITMAP(info, entry, bitmap_, srt_, srt_off_, srt_len_, \ + type, info_class, entry_class, \ + prof_descr, prof_type) \ + entry_class(RBH_##entry); \ + entry_class(entry); \ + info_class INFO_TBL_CONST StgInfoTable info; \ + info_class INFO_TBL_CONST StgInfoTable RBH_##info = { \ + layout : { bitmap : (StgWord32)bitmap_ }, \ + SRT_INFO(RBH,srt_,srt_off_,srt_len_), \ + INCLUDE_RBH_INFO(info), \ + INIT_ENTRY(RBH_##entry), \ + INIT_VECTOR \ + }; \ + StgFunPtr RBH_##entry (void) { JMP_(RBH_entry); } ; \ + info_class INFO_TBL_CONST StgInfoTable info = { \ + layout : { bitmap : (StgWord32)bitmap_ }, \ + SRT_INFO(type,srt_,srt_off_,srt_len_), \ + INCLUDE_RBH_INFO(RBH_##info), \ + INIT_ENTRY(entry), \ + INIT_VECTOR \ + } +#else + #define \ INFO_TABLE_SRT_BITMAP(info, entry, bitmap_, srt_, srt_off_, srt_len_, \ type, info_class, entry_class, \ @@ -63,9 +146,36 @@ INFO_TABLE_SRT_BITMAP(info, entry, bitmap_, srt_, srt_off_, srt_len_, \ INIT_ENTRY(entry), \ INIT_VECTOR \ } +#endif /* info-table without an SRT -----------------------------------------------*/ +#if defined(GRAN) || defined(PAR) + +#define \ +INFO_TABLE(info, entry, ptrs, nptrs, type, info_class, \ + entry_class, prof_descr, prof_type) \ + entry_class(RBH_##entry); \ + entry_class(entry); \ + info_class INFO_TBL_CONST StgInfoTable info; \ + info_class INFO_TBL_CONST StgInfoTable RBH_##info = { \ + layout : { payload : {ptrs,nptrs} }, \ + STD_INFO(RBH), \ + INCLUDE_RBH_INFO(info), \ + INIT_ENTRY(RBH_##entry), \ + INIT_VECTOR \ + }; \ + StgFunPtr RBH_##entry (void) { 
JMP_(RBH_entry); } ; \ + info_class INFO_TBL_CONST StgInfoTable info = { \ + layout : { payload : {ptrs,nptrs} }, \ + STD_INFO(type), \ + INCLUDE_RBH_INFO(RBH_##info), \ + INIT_ENTRY(entry), \ + INIT_VECTOR \ + } + +#else + #define \ INFO_TABLE(info, entry, ptrs, nptrs, type, info_class, \ entry_class, prof_descr, prof_type) \ @@ -77,8 +187,36 @@ INFO_TABLE(info, entry, ptrs, nptrs, type, info_class, \ INIT_VECTOR \ } +#endif + /* special selector-thunk info table ---------------------------------------*/ +#if defined(GRAN) || defined(PAR) + +#define \ +INFO_TABLE_SELECTOR(info, entry, offset, info_class, \ + entry_class, prof_descr, prof_type) \ + entry_class(RBH_##entry); \ + entry_class(entry); \ + info_class INFO_TBL_CONST StgInfoTable info; \ + info_class INFO_TBL_CONST StgInfoTable RBH_##info = { \ + layout : { selector_offset : offset }, \ + STD_INFO(RBH), \ + INCLUDE_RBH_INFO(info), \ + INIT_ENTRY(RBH_##entry), \ + INIT_VECTOR \ + }; \ + StgFunPtr RBH_##entry (void) { JMP_(RBH_entry); } ; \ + info_class INFO_TBL_CONST StgInfoTable info = { \ + layout : { selector_offset : offset }, \ + STD_INFO(THUNK_SELECTOR), \ + INCLUDE_RBH_INFO(RBH_##info), \ + INIT_ENTRY(entry), \ + INIT_VECTOR \ + } + +#else + #define \ INFO_TABLE_SELECTOR(info, entry, offset, info_class, \ entry_class, prof_descr, prof_type) \ @@ -90,6 +228,8 @@ INFO_TABLE_SELECTOR(info, entry, offset, info_class, \ INIT_VECTOR \ } +#endif + /* constructor info table --------------------------------------------------*/ #define \ diff --git a/ghc/includes/InfoTables.h b/ghc/includes/InfoTables.h index 0f8a659..b3db5e5 100644 --- a/ghc/includes/InfoTables.h +++ b/ghc/includes/InfoTables.h @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- - * $Id: InfoTables.h,v 1.17 1999/07/16 09:41:12 panne Exp $ + * $Id: InfoTables.h,v 1.18 2000/01/13 14:34:00 hwloidl Exp $ * * (c) The GHC Team, 1998-1999 * @@ -36,12 +36,15 @@ typedef struct { Parallelism info -------------------------------------------------------------------------- */ -#ifdef PAR +#if 0 && (defined(PAR) || defined(GRAN)) -#define PAR_INFO_WORDS 0 +// CURRENTLY UNUSED +// ToDo: use this in StgInfoTable (mutually recursive) -- HWL + +#define PAR_INFO_WORDS 1 typedef struct { - /* empty */ + StgInfoTable *rbh_infoptr; /* infoptr to the RBH */ } StgParInfo; #else /* !PAR */ @@ -54,6 +57,54 @@ typedef struct { #endif /* PAR */ +/* + Copied from ghc-0.29; ToDo: check this code -- HWL + + In the parallel system, all updatable closures have corresponding + revertible black holes. When we are assembly-mangling, we guarantee + that the revertible black hole code precedes the normal entry code, so + that the RBH info table resides at a fixed offset from the normal info + table. Otherwise, we add the RBH info table pointer to the end of the + normal info table and vice versa. + + Currently has to use a !RBH_MAGIC_OFFSET setting. + Still todo: init of par.infoptr field in all infotables!! 
+*/ + +#if defined(PAR) || defined(GRAN) +# define RBH_INFO_OFFSET (GEN_INFO_OFFSET+GEN_INFO_WORDS) + +# ifdef RBH_MAGIC_OFFSET + +# error magic offset not yet implemented + +# define RBH_INFO_WORDS 0 +# define INCLUDE_RBH_INFO(infoptr) + +# define RBH_INFOPTR(infoptr) (((P_)infoptr) - RBH_MAGIC_OFFSET) +# define REVERT_INFOPTR(infoptr) (((P_)infoptr) + RBH_MAGIC_OFFSET) + +# else + +# define RBH_INFO_WORDS 1 +# define INCLUDE_RBH_INFO(info) rbh_infoptr : &(info) + +# define RBH_INFOPTR(infoptr) (((StgInfoTable *)(infoptr))->rbh_infoptr) +# define REVERT_INFOPTR(infoptr) (((StgInfoTable *)(infoptr))->rbh_infoptr) + +# endif + +/* see ParallelRts.h */ +// EXTFUN(RBH_entry); +//StgClosure *convertToRBH(StgClosure *closure); +//#if defined(GRAN) +//void convertFromRBH(StgClosure *closure); +//#elif defined(PAR) +//void convertToFetchMe(StgPtr closure, globalAddr *ga); +//#endif + +#endif + /* ----------------------------------------------------------------------------- Debugging info -------------------------------------------------------------------------- */ @@ -98,11 +149,27 @@ extern StgWord16 closure_flags[]; #define closureFlags(c) (closure_flags[get_itbl(c)->type]) -#define closure_STATIC(c) ( closureFlags(c) & _STA) +#define closure_HNF(c) ( closureFlags(c) & _HNF) +#define closure_BITMAP(c) ( closureFlags(c) & _BTM) +#define closure_NON_SPARK(c) ( (closureFlags(c) & _NS)) #define closure_SHOULD_SPARK(c) (!(closureFlags(c) & _NS)) +#define closure_STATIC(c) ( closureFlags(c) & _STA) +#define closure_THUNK(c) ( closureFlags(c) & _THU) #define closure_MUTABLE(c) ( closureFlags(c) & _MUT) #define closure_UNPOINTED(c) ( closureFlags(c) & _UPT) +#define closure_SRT(c) ( closureFlags(c) & _SRT) + +/* same as above but for info-ptr rather than closure */ +#define ipFlags(ip) (closure_flags[ip->type]) +#define ip_HNF(ip) ( ipFlags(ip) & _HNF) +#define ip_BITMAP(ip) ( ipFlags(ip) & _BTM) +#define ip_SHOULD_SPARK(ip) (!(ipFlags(ip) & _NS)) +#define ip_STATIC(ip) ( ipFlags(ip) & _STA) +#define ip_THUNK(ip) ( ipFlags(ip) & _THU) +#define ip_MUTABLE(ip) ( ipFlags(ip) & _MUT) +#define ip_UNPOINTED(ip) ( ipFlags(ip) & _UPT) +#define ip_SRT(ip) ( ipFlags(ip) & _SRT) /* ----------------------------------------------------------------------------- Info Tables @@ -153,8 +220,9 @@ typedef StgClosure* StgSRT[]; typedef struct _StgInfoTable { StgSRT *srt; /* pointer to the SRT table */ -#ifdef PAR - StgParInfo par; +#if defined(PAR) || defined(GRAN) + // StgParInfo par; + struct _StgInfoTable *rbh_infoptr; #endif #ifdef PROFILING /* StgProfInfo prof; */ diff --git a/ghc/includes/Parallel.h b/ghc/includes/Parallel.h new file mode 100644 index 0000000..e9a6ef1 --- /dev/null +++ b/ghc/includes/Parallel.h @@ -0,0 +1,342 @@ +/* + Time-stamp: + + Definitions for parallel machines. + + This section contains definitions applicable only to programs compiled + to run on a parallel machine, i.e. on GUM. Some of these definitions + are also used when simulating parallel execution, i.e. on GranSim. 
+*/ + +/* + ToDo: Check the PAR specfic part of this file + Move stuff into Closures.h and ClosureMacros.h + Clean-up GRAN specific code + -- HWL +*/ + +#ifndef PARALLEL_H +#define PARALLEL_H + +#if defined(PAR) || defined(GRAN) /* whole file */ + +//@node Parallel definitions, End of File +//@section Parallel definitions + +//@menu +//* Basic definitions:: +//* GUM:: +//* GranSim:: +//@end menu + +//@node Basic definitions, GUM, Parallel definitions, Parallel definitions +//@subsection Basic definitions + +/* SET_PAR_HDR and SET_STATIC_PAR_HDR now live in ClosureMacros.h */ + +/* Needed for dumping routines */ +#if defined(PAR) +# define NODE_STR_LEN 20 +# define TIME_STR_LEN 120 +# define TIME rtsTime +# define CURRENT_TIME msTime() +# define TIME_ON_PROC(p) msTime() +# define CURRENT_PROC thisPE +# define BINARY_STATS RtsFlags.ParFlags.ParStats.Binary +#elif defined(GRAN) +# define NODE_STR_LEN 20 +# define TIME_STR_LEN 120 +# define TIME rtsTime +# define CURRENT_TIME CurrentTime[CurrentProc] +# define TIME_ON_PROC(p) CurrentTime[p] +# define CURRENT_PROC CurrentProc +# define BINARY_STATS RtsFlags.GranFlags.GranSimStats.Binary +#endif + +#if defined(PAR) +# define MAX_PES 256 /* Maximum number of processors */ + /* MAX_PES is enforced by SysMan, which does not + allow more than this many "processors". + This is important because PackGA [GlobAddr.lc] + **assumes** that a PE# can fit in 8+ bits. + */ + +# define SPARK_POOLS 2 /* no. of spark pools */ +# define REQUIRED_POOL 0 /* idx of pool of mandatory sparks (concurrency) */ +# define ADVISORY_POOL 1 /* idx of pool of advisory sparks (parallelism) */ +#endif + +//@menu +//* GUM:: +//* GranSim:: +//@end menu +//*/ + +//@node GUM, GranSim, Basic definitions, Parallel definitions +//@subsection GUM + +#if defined(PAR) +/* +Symbolic constants for the packing code. + +This constant defines how many words of data we can pack into a single +packet in the parallel (GUM) system. 
+*/ + +//@menu +//* Types:: +//* Externs:: +//* Prototypes:: +//* Macros:: +//@end menu +//*/ + +//@node Types, Externs, GUM, GUM +//@subsubsection Types + +/* Sparks and spark queues */ +typedef StgClosure *rtsSpark; +typedef rtsSpark *rtsSparkQ; + +typedef struct rtsPackBuffer_ { + StgInt /* nat */ id; + StgInt /* nat */ size; + StgInt /* nat */ unpacked_size; + StgTSO *tso; + StgWord *buffer[0]; +} rtsPackBuffer; + +#define PACK_BUFFER_HDR_SIZE 4 + +//@node Externs, Prototypes, Types, GUM +//@subsubsection Externs + +// extern rtsBool do_sp_profile; + +extern globalAddr theGlobalFromGA, theGlobalToGA; +extern StgBlockedFetch *PendingFetches; +extern GlobalTaskId *allPEs; + +extern rtsBool IAmMainThread, GlobalStopPending; +//extern rtsBool fishing; +extern rtsTime last_fish_arrived_at; +extern nat outstandingFishes; +extern GlobalTaskId SysManTask; +extern int seed; /* pseudo-random-number generator seed: */ + /* Initialised in ParInit */ +extern StgInt threadId; /* Number of Threads that have existed on a PE */ +extern GlobalTaskId mytid; + +extern GlobalTaskId *allPEs; +extern nat nPEs; +extern nat sparksIgnored, sparksCreated, threadsIgnored, threadsCreated; +extern nat advisory_thread_count; + +extern rtsBool InGlobalGC; /* Are we in the midst of performing global GC */ + +static ullong startTime; /* start of comp; in RtsStartup.c */ + +/* the spark pools proper */ +extern rtsSpark *pending_sparks_hd[]; /* ptr to start of a spark pool */ +extern rtsSpark *pending_sparks_tl[]; /* ptr to end of a spark pool */ +extern rtsSpark *pending_sparks_lim[]; +extern rtsSpark *pending_sparks_base[]; +extern nat spark_limit[]; + +extern rtsPackBuffer *PackBuffer; /* size: can be set via option */ +extern rtsPackBuffer *buffer; /* HWL_ */ +extern rtsPackBuffer *freeBuffer; /* HWL_ */ +extern rtsPackBuffer *packBuffer; /* HWL_ */ +extern rtsPackBuffer *gumPackBuffer; + +extern int thisPE; + +/* From Global.c */ +extern GALA *freeGALAList; +extern GALA *freeIndirections; +extern GALA *liveIndirections; +extern GALA *liveRemoteGAs; + +/* +extern HashTable *taskIDtoPEtable; +extern HashTable *LAtoGALAtable; +extern HashTable *pGAtoGALAtable; +*/ + +//@node Prototypes, Macros, Externs, GUM +//@subsubsection Prototypes + +/* From ParInit.c */ +void initParallelSystem(void); +void SynchroniseSystem(void); +void par_exit(StgInt n); + +PEs taskIDtoPE (GlobalTaskId gtid); +void registerTask (GlobalTaskId gtid); +globalAddr *LAGAlookup (StgClosure *addr); +StgClosure *GALAlookup (globalAddr *ga); +//static GALA *allocIndirection (StgPtr addr); +globalAddr *makeGlobal (StgClosure *addr, rtsBool preferred); +globalAddr *setRemoteGA (StgClosure *addr, globalAddr *ga, rtsBool preferred); +void splitWeight (globalAddr *to, globalAddr *from); +globalAddr *addWeight (globalAddr *ga); +void initGAtables (void); +void RebuildLAGAtable (void); +StgWord PackGA (StgWord pe, int slot); + +//@node Macros, , Prototypes, GUM +//@subsubsection Macros + +/* delay (in us) between dying fish returning and sending out a new fish */ +#define FISH_DELAY 1000 +/* max no. of outstanding spark steals */ +#define MAX_FISHES 1 + +// ToDo: check which of these is actually needed! 
+
+# define PACK_HEAP_REQUIRED ((RtsFlags.ParFlags.packBufferSize - PACK_HDR_SIZE) / (PACK_GA_SIZE + _FHS) * (MIN_UPD_SIZE + 2))
+
+# define MAX_GAS (RtsFlags.ParFlags.packBufferSize / PACK_GA_SIZE)
+
+
+# define PACK_GA_SIZE 3 /* Size of a packed GA in words */
+ /* Size of a packed fetch-me in words */
+# define PACK_FETCHME_SIZE (PACK_GA_SIZE + FIXED_HS)
+
+# define PACK_HDR_SIZE 1 /* Words of header in a packet */
+
+# define PACK_PLC_SIZE 2 /* Size of a packed PLC in words */
+
+/*
+ Definitions relating to the entire parallel-only fixed-header field.
+
+ On GUM, the global addresses for each local closure are stored in a
+ separate hash table, rather than with the closure in the heap. We call
+ @getGA@ to look up the global address associated with a local closure (0
+ is returned for local closures that have no global address), and @setGA@
+ to store a new global address for a local closure which did not
+ previously have one. */
+
+# define GA_HDR_SIZE 0
+
+# define GA(closure) getGA(closure)
+
+# define SET_GA(closure, ga) setGA(closure,ga)
+# define SET_STATIC_GA(closure)
+# define SET_GRAN_HDR(closure,pe)
+# define SET_STATIC_PROCS(closure)
+
+# define MAX_GA_WEIGHT 0 /* Treat as 2^n */
+
+/* At the moment, there is no activity profiling for GUM. This may change. */
+# define SET_TASK_ACTIVITY(act) /* nothing */
+
+#endif /* PAR */
+
+//@node GranSim, , GUM, Parallel definitions
+//@subsection GranSim
+
+#if defined(GRAN)
+/* ToDo: Check which of the PAR routines are needed in GranSim -- HWL */
+
+//@menu
+//* Types::
+//* Prototypes::
+//* Macros::
+//@end menu
+//*/
+
+//@node Types, Prototypes, GranSim, GranSim
+//@subsubsection Types
+
+typedef struct rtsPackBuffer_ {
+ StgInt /* nat */ id;
+ StgInt /* nat */ size;
+ StgInt /* nat */ unpacked_size;
+ StgTSO *tso;
+ StgClosure **buffer;
+} rtsPackBuffer;
+
+//@node Prototypes, Macros, Types, GranSim
+//@subsubsection Prototypes
+
+
+/* main packing functions */
+/*
+rtsPackBuffer *PackNearbyGraph(StgClosure* closure, StgTSO* tso, nat *packbuffersize);
+rtsPackBuffer *PackOneNode(StgClosure* closure, StgTSO* tso, nat *packbuffersize);
+void PrintPacket(rtsPackBuffer *buffer);
+StgClosure *UnpackGraph(rtsPackBuffer* buffer);
+*/
+/* important auxiliary functions */
+
+/*
+OLD CODE -- HWL
+void InitPackBuffer(void);
+P_ AllocateHeap (W_ size);
+P_ PackNearbyGraph (P_ closure, P_ tso, W_ *packbuffersize);
+P_ PackOneNode (P_ closure, P_ tso, W_ *packbuffersize);
+P_ UnpackGraph (P_ buffer);
+
+void InitClosureQueue (void);
+P_ DeQueueClosure(void);
+void QueueClosure (P_ closure);
+// rtsBool QueueEmpty();
+void PrintPacket (P_ buffer);
+*/
+
+// StgInfoTable *get_closure_info(StgClosure* node, unsigned int /* nat */ *size, unsigned int /* nat */ *ptrs, unsigned int /* nat */ *nonptrs, unsigned int /* nat */ *vhs, char *info_hdr_ty);
+// int /* rtsBool */ IS_BLACK_HOLE(StgClosure* node) ;
+
+//@node Macros, , Prototypes, GranSim
+//@subsubsection Macros
+
+/* max no. of outstanding spark steals */
+#define MAX_FISHES 1
+
+/* These are needed in the packing code to get the size of the packet
+ right. The closures themselves are never built in GrAnSim.
*/
+# define FETCHME_VHS IND_VHS
+# define FETCHME_HS IND_HS
+
+# define FETCHME_GA_LOCN FETCHME_HS
+
+# define FETCHME_CLOSURE_SIZE(closure) IND_CLOSURE_SIZE(closure)
+# define FETCHME_CLOSURE_NoPTRS(closure) 0L
+# define FETCHME_CLOSURE_NoNONPTRS(closure) (IND_CLOSURE_SIZE(closure)-IND_VHS)
+
+# define MAX_GAS (RtsFlags.GranFlags.packBufferSize / PACK_GA_SIZE)
+# define PACK_GA_SIZE 3 /* Size of a packed GA in words */
+ /* Size of a packed fetch-me in words */
+# define PACK_FETCHME_SIZE (PACK_GA_SIZE + FIXED_HS)
+# define PACK_HDR_SIZE 4 /* Words of header in a packet */
+
+# define PACK_HEAP_REQUIRED \
+ (RtsFlags.GranFlags.packBufferSize * sizeofW(StgClosure*) + \
+ 2 * sizeofW(StgInt) + sizeofW(StgTSO*))
+
+# define PACK_FLAG_LOCN 0
+# define PACK_TSO_LOCN 1
+# define PACK_UNPACKED_SIZE_LOCN 2
+# define PACK_SIZE_LOCN 3
+# define MAGIC_PACK_FLAG 0xfabc
+
+# define GA_HDR_SIZE 1
+
+# define PROCS_HDR_POSN PAR_HDR_POSN
+# define PROCS_HDR_SIZE 1
+
+/* Accessing components of the field */
+# define PROCS(closure) ((closure)->header.gran.procs)
+/* SET_PROCS is now SET_GRAN_HEADER in ClosureMacros.h. */
+
+#endif /* GRAN */
+
+//@node End of File, , Parallel definitions
+//@section End of File
+
+#endif /* defined(PAR) || defined(GRAN) whole file */
+
+#endif /* PARALLEL_H */
+
+
diff --git a/ghc/includes/PrimOps.h b/ghc/includes/PrimOps.h
index d11de24..0d97628 100644
--- a/ghc/includes/PrimOps.h
+++ b/ghc/includes/PrimOps.h
@@ -1,5 +1,5 @@
 /* -----------------------------------------------------------------------------
- * $Id: PrimOps.h,v 1.43 2000/01/12 15:15:17 simonmar Exp $
+ * $Id: PrimOps.h,v 1.44 2000/01/13 14:34:00 hwloidl Exp $
 *
 * (c) The GHC Team, 1998-1999
 *
@@ -734,6 +734,118 @@ EF_(unblockAsyncExceptionszh_fast);
 
 extern int cmp_thread(const StgTSO *tso1, const StgTSO *tso2);
 
+/* ------------------------------------------------------------------------
+ Parallel PrimOps
+
+ A par in the Haskell code is ultimately translated to a parzh macro
+ (with a case wrapped around it to guarantee that the macro is actually
+ executed; see compiler/prelude/PrimOps.lhs)
+ ---------------------------------------------------------------------- */
+
+#if defined(GRAN)
+// hash coding changed from 2.10 to 4.00
+#define parzh(r,node) parZh(r,node)
+
+#define parZh(r,node) \
+ PARZh(r,node,1,0,0,0,0,0)
+
+#define parAtzh(r,node,where,identifier,gran_info,size_info,par_info,rest) \
+ parATZh(r,node,where,identifier,gran_info,size_info,par_info,rest,1)
+
+#define parAtAbszh(r,node,proc,identifier,gran_info,size_info,par_info,rest) \
+ parATZh(r,node,proc,identifier,gran_info,size_info,par_info,rest,2)
+
+#define parAtRelzh(r,node,proc,identifier,gran_info,size_info,par_info,rest) \
+ parATZh(r,node,proc,identifier,gran_info,size_info,par_info,rest,3)
+
+#define parAtForNowzh(r,node,where,identifier,gran_info,size_info,par_info,rest) \
+ parATZh(r,node,where,identifier,gran_info,size_info,par_info,rest,0)
+
+#define parATZh(r,node,where,identifier,gran_info,size_info,par_info,rest,local) \
+{ \
+ if (closure_SHOULD_SPARK((StgClosure*)node)) { \
+ rtsSpark *result; \
+ result = RET_STGCALL6(rtsSpark*, newSpark, \
+ node,identifier,gran_info,size_info,par_info,local); \
+ if (local==2) { /* special case for parAtAbs */ \
+ STGCALL3(GranSimSparkAtAbs, result,(I_)where,identifier);\
+ } else if (local==3) { /* special case for parAtRel */ \
+ STGCALL3(GranSimSparkAtAbs, result,(I_)(CurrentProc+where),identifier); \
+ } else { \
+ STGCALL3(GranSimSparkAt, result,where,identifier); \
+ } \
+ } \
+}
+
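The trailing `local' argument that the wrappers above hand to parATZh is not a boolean; it encodes the placement variant. Reading the macro bodies back gives this mapping, summarised here as a comment:

    /* placement encoding used by parATZh (derived from the macros above):
       0 ... parAtForNow -> GranSimSparkAt
       1 ... parAt       -> GranSimSparkAt
       2 ... parAtAbs    -> GranSimSparkAtAbs(result, where, ...)
       3 ... parAtRel    -> GranSimSparkAtAbs(result, CurrentProc+where, ...) */

so only the parAtAbs/parAtRel cases take the absolute-placement entry point.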
+#define parLocalzh(r,node,identifier,gran_info,size_info,par_info,rest) \
+ PARZh(r,node,rest,identifier,gran_info,size_info,par_info,1)
+
+#define parGlobalzh(r,node,identifier,gran_info,size_info,par_info,rest) \
+ PARZh(r,node,rest,identifier,gran_info,size_info,par_info,0)
+
+#define PARZh(r,node,rest,identifier,gran_info,size_info,par_info,local) \
+{ \
+ if (closure_SHOULD_SPARK((StgClosure*)node)) { \
+ rtsSpark *result; \
+ result = RET_STGCALL6(rtsSpark*, newSpark, \
+ node,identifier,gran_info,size_info,par_info,local);\
+ STGCALL1(add_to_spark_queue,result); \
+ STGCALL2(GranSimSpark, local,(P_)node); \
+ } \
+}
+
+#define copyablezh(r,node) \
+ /* copyable not yet implemented!! */
+
+#define noFollowzh(r,node) \
+ /* noFollow not yet implemented!! */
+
+#endif /* GRAN */
+
+#if 0
+
+# if defined(GRAN)
+/* ToDo: Use a parallel ticky macro for this */
+# define COUNT_SPARK(node) { (CurrentTSO->gran.globalsparks)++; sparksCreated++; }
+# elif defined(PAR)
+# define COUNT_SPARK(node) { (CurrentTSO->par.globalsparks)++; sparksCreated++; }
+# endif
+
+/*
+ Note that we must bump the required thread count NOW, rather
+ than when the thread is actually created.
+
+ forkzh not needed any more; see ghc/rts/PrimOps.hc
+*/
+#define forkzh(r,liveness,node) \
+{ \
+ extern nat context_switch; \
+ while (pending_sparks_tl[REQUIRED_POOL] == pending_sparks_lim[REQUIRED_POOL]) \
+ DO_YIELD((liveness << 1) | 1); \
+ if (closure_SHOULD_SPARK((StgClosure *)node)) { \
+ *pending_sparks_tl[REQUIRED_POOL]++ = (P_)(node); \
+ } else { \
+ sparksIgnored++; \
+ } \
+ context_switch = 1; \
+}
+
+// old version of par (previously used in GUM)
+
+#define parzh(r,node) \
+{ \
+ extern nat context_switch; \
+ COUNT_SPARK(node); \
+ if (closure_SHOULD_SPARK((StgClosure *)node) && \
+ pending_sparks_tl[ADVISORY_POOL] < pending_sparks_lim[ADVISORY_POOL]) {\
+ *pending_sparks_tl[ADVISORY_POOL]++ = (StgClosure *)(node); \
+ } else { \
+ sparksIgnored++; \
+ } \
+ r = context_switch = 1; \
+}
+#endif /* 0 */
+
 #if defined(SMP) || defined(PAR)
 #define parzh(r,node) \
 { \
diff --git a/ghc/includes/Rts.h b/ghc/includes/Rts.h
index 7d35118..40deb1e 100644
--- a/ghc/includes/Rts.h
+++ b/ghc/includes/Rts.h
@@ -1,5 +1,5 @@
 /* -----------------------------------------------------------------------------
- * $Id: Rts.h,v 1.11 2000/01/13 12:40:15 simonmar Exp $
+ * $Id: Rts.h,v 1.12 2000/01/13 14:34:01 hwloidl Exp $
 *
 * (c) The GHC Team, 1998-1999
 *
@@ -42,17 +42,10 @@
 #define MAX_RTS_ARGS 32
 
 /* -----------------------------------------------------------------------------
- Useful typedefs
+ Assertions and Debuggery
 -------------------------------------------------------------------------- */
 
-typedef unsigned int nat; /* at least 32 bits (like int) */
-typedef unsigned long lnat; /* at least 32 bits */
-typedef unsigned long long ullong; /* at least 32 bits */
-
-typedef enum {
- rtsFalse = 0,
- rtsTrue
-} rtsBool;
+#define IF_RTSFLAGS(c,s) if (RtsFlags.c) { s; }
 
 /* -----------------------------------------------------------------------------
 Assertions and Debuggery
diff --git a/ghc/includes/RtsTypes.h b/ghc/includes/RtsTypes.h
new file mode 100644
index 0000000..10c4bde
--- /dev/null
+++ b/ghc/includes/RtsTypes.h
@@ -0,0 +1,76 @@
+/*
+ Time-stamp: 
+
+ RTS specific types.
+*/
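Two of the definitions that follow are easy to misread, so here is a hedged usage sketch (none of this code is in the patch; mytid and MAX_GA_WEIGHT come from Parallel.h, and the la/next values are placeholders):

    ullong big = LL(0x100000000);          /* ullong literals need the LL suffix */

    GALA g;                                /* a (global address, local address) pair */
    g.ga.payload.gc.gtid = mytid;          /* owning PE's task id */
    g.ga.payload.gc.slot = 0;
    g.ga.weight = MAX_GA_WEIGHT;           /* 0 is treated as full weight, 2^n */
    g.la = NULL;                           /* local heap address, filled in later */
    g.next = NULL;
    g.preferred = rtsTrue;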
+
+/* -------------------------------------------------------------------------
+ Generally useful typedefs
+ ------------------------------------------------------------------------- */
+
+#ifndef RTS_TYPES_H
+#define RTS_TYPES_H
+
+typedef unsigned int nat; /* at least 32 bits (like int) */
+typedef unsigned long lnat; /* at least 32 bits */
+typedef unsigned long long ullong; /* at least 32 bits */
+
+/* LL(x): literal suffix for ullong constants (long long is not ANSI) */
+#if defined(__GNUC__)
+#define LL(x) (x##LL)
+#else
+#define LL(x) (x##L)
+#endif
+
+typedef enum {
+ rtsFalse = 0,
+ rtsTrue
+} rtsBool;
+
+/*
+ Types specific to the parallel runtime system.
+*/
+
+#if defined(PAR)
+/* types only needed in the parallel system */
+typedef struct hashtable ParHashTable;
+typedef struct hashlist ParHashList;
+
+// typedef double REAL_TIME;
+// typedef W_ TIME;
+// typedef GlobalTaskId Proc;
+typedef int GlobalTaskId;
+typedef ullong rtsTime;
+typedef GlobalTaskId PEs;
+typedef unsigned int rtsWeight;
+typedef int rtsPacket;
+typedef int OpCode;
+
+/* Global addresses i.e. unique ids in a parallel setup; needed in Closures.h */
+typedef struct {
+ union {
+ StgPtr plc;
+ struct {
+ GlobalTaskId gtid;
+ int slot;
+ } gc;
+ } payload;
+ rtsWeight weight;
+} globalAddr;
+
+/* (GA, LA) pairs */
+typedef struct gala {
+ globalAddr ga;
+ StgPtr la;
+ struct gala *next;
+ rtsBool preferred;
+} GALA;
+
+#elif defined(GRAN)
+
+typedef lnat rtsTime;
+typedef StgWord PEs;
+
+#endif
+
+#endif /* RTS_TYPES_H */
diff --git a/ghc/includes/SchedAPI.h b/ghc/includes/SchedAPI.h
index 317a177..18c48f5 100644
--- a/ghc/includes/SchedAPI.h
+++ b/ghc/includes/SchedAPI.h
@@ -1,5 +1,5 @@
 /* -----------------------------------------------------------------------------
- * $Id: SchedAPI.h,v 1.8 1999/11/18 12:10:17 sewardj Exp $
+ * $Id: SchedAPI.h,v 1.9 2000/01/13 14:34:01 hwloidl Exp $
 *
 * (c) The GHC Team 1998
 *
@@ -11,6 +11,11 @@
 #ifndef SCHEDAPI_H
 #define SCHEDAPI_H
 
+#if defined(GRAN)
+// NO_PRI: the dummy (default) priority passed to createThread in GranSim
+#define NO_PRI 0
+#endif
+
 /*
 * schedule() plus the thread creation functions are not part
 * part of the external RTS API, so leave them out if we're
@@ -22,8 +27,11 @@ SchedulerStatus waitThread(StgTSO *main_thread, /*out*/StgClosure **ret);
 /*
 * Creating threads
 */
-
+#if defined(GRAN)
+StgTSO *createThread(nat stack_size, StgInt pri);
+#else
 StgTSO *createThread(nat stack_size);
+#endif
 void scheduleThread(StgTSO *tso);
 
 static inline void pushClosure (StgTSO *tso, StgClosure *c) {
@@ -38,7 +46,11 @@ static inline void pushRealWorld (StgTSO *tso) {
 static inline StgTSO *
 createGenThread(nat stack_size, StgClosure *closure) {
 StgTSO *t;
+#if defined(GRAN)
+ t = createThread(stack_size, NO_PRI);
+#else
 t = createThread(stack_size);
+#endif
 pushClosure(t,closure);
 return t;
}
@@ -46,7 +58,11 @@ createGenThread(nat stack_size, StgClosure *closure) {
 static inline StgTSO *
 createIOThread(nat stack_size, StgClosure *closure) {
 StgTSO *t;
+#if defined(GRAN)
+ t = createThread(stack_size, NO_PRI);
+#else
 t = createThread(stack_size);
+#endif
 pushRealWorld(t);
 pushClosure(t,closure);
 return t;
@@ -60,7 +76,11 @@ createIOThread(nat stack_size, StgClosure *closure) {
 static inline StgTSO *
 createStrictIOThread(nat stack_size, StgClosure *closure) {
 StgTSO *t;
+#if defined(GRAN)
+ t = createThread(stack_size, NO_PRI);
+#else
 t = createThread(stack_size);
+#endif
 pushClosure(t,closure);
 pushClosure(t,(StgClosure*)&forceIO_closure);
 return t;
diff --git a/ghc/includes/Stg.h b/ghc/includes/Stg.h
index c8d729d..0ae31a0 100644 --- a/ghc/includes/Stg.h +++ b/ghc/includes/Stg.h @@ -1,5 +1,5 @@ /* ----------------------------------------------------------------------------- - * $Id: Stg.h,v 1.21 1999/11/09 15:57:40 simonmar Exp $ + * $Id: Stg.h,v 1.22 2000/01/13 14:34:01 hwloidl Exp $ * * (c) The GHC Team, 1998-1999 * @@ -103,6 +103,7 @@ void _stgAssert (char *, unsigned int); /* Global type definitions*/ #include "StgTypes.h" +#include "RtsTypes.h" /* Global constaints */ #include "Constants.h" @@ -116,6 +117,12 @@ void _stgAssert (char *, unsigned int); #include "InfoTables.h" #include "TSO.h" +/* Simulated-parallel information */ +#include "GranSim.h" + +/* Parallel information */ +#include "Parallel.h" + /* STG/Optimised-C related stuff */ #include "SMP.h" #include "MachRegs.h" diff --git a/ghc/includes/StgMacros.h b/ghc/includes/StgMacros.h index bc269ad..ab78687 100644 --- a/ghc/includes/StgMacros.h +++ b/ghc/includes/StgMacros.h @@ -1,5 +1,5 @@ /* ----------------------------------------------------------------------------- - * $Id: StgMacros.h,v 1.19 1999/11/22 16:44:30 sewardj Exp $ + * $Id: StgMacros.h,v 1.20 2000/01/13 14:34:01 hwloidl Exp $ * * (c) The GHC Team, 1998-1999 * @@ -138,6 +138,7 @@ static inline int IS_ARG_TAG( StgWord p ) { return p <= ARGTAG_MAX; } } #define HP_CHK(headroom,ret,r,layout,tag_assts) \ + DO_GRAN_ALLOCATE(headroom) \ if ((Hp += headroom) > HpLim) { \ EXTFUN_RTS(stg_chk_##layout); \ tag_assts \ @@ -146,6 +147,7 @@ static inline int IS_ARG_TAG( StgWord p ) { return p <= ARGTAG_MAX; } } #define HP_STK_CHK(stk_headroom,hp_headroom,ret,r,layout,tag_assts) \ + DO_GRAN_ALLOCATE(hp_headroom) \ if (Sp - stk_headroom < SpLim || (Hp += hp_headroom) > HpLim) { \ EXTFUN_RTS(stg_chk_##layout); \ tag_assts \ @@ -165,6 +167,10 @@ static inline int IS_ARG_TAG( StgWord p ) { return p <= ARGTAG_MAX; } functions. In all these cases, node points to a closure that we can just enter to restart the heap check (the NP stands for 'node points'). + In the NP case GranSim absolutely has to check whether the current node + resides on the current processor. Otherwise a FETCH event has to be + scheduled. All that is done in GranSimFetch. -- HWL + HpLim points to the LAST WORD of valid allocation space. 
-------------------------------------------------------------------------- */ @@ -176,6 +182,7 @@ static inline int IS_ARG_TAG( StgWord p ) { return p <= ARGTAG_MAX; } } #define HP_CHK_NP(headroom,ptrs,tag_assts) \ + DO_GRAN_ALLOCATE(headroom) \ if ((Hp += (headroom)) > HpLim) { \ EXTFUN_RTS(stg_gc_enter_##ptrs); \ tag_assts \ @@ -183,6 +190,7 @@ static inline int IS_ARG_TAG( StgWord p ) { return p <= ARGTAG_MAX; } } #define HP_CHK_SEQ_NP(headroom,ptrs,tag_assts) \ + DO_GRAN_ALLOCATE(headroom) \ if ((Hp += (headroom)) > HpLim) { \ EXTFUN_RTS(stg_gc_seq_##ptrs); \ tag_assts \ @@ -190,6 +198,7 @@ static inline int IS_ARG_TAG( StgWord p ) { return p <= ARGTAG_MAX; } } #define HP_STK_CHK_NP(stk_headroom, hp_headroom, ptrs, tag_assts) \ + DO_GRAN_ALLOCATE(hp_headroom) \ if ((Sp - (stk_headroom)) < SpLim || (Hp += (hp_headroom)) > HpLim) { \ EXTFUN_RTS(stg_gc_enter_##ptrs); \ tag_assts \ @@ -200,6 +209,7 @@ static inline int IS_ARG_TAG( StgWord p ) { return p <= ARGTAG_MAX; } /* Heap checks for branches of a primitive case / unboxed tuple return */ #define GEN_HP_CHK_ALT(headroom,lbl,tag_assts) \ + DO_GRAN_ALLOCATE(headroom) \ if ((Hp += (headroom)) > HpLim) { \ EXTFUN_RTS(lbl); \ tag_assts \ @@ -341,6 +351,25 @@ EF_(stg_gen_block); JMP_(stg_block_##ptrs); \ } +#if defined(PAR) +/* + Similar to BLOCK_NP but separates the saving of the thread state from the + actual jump via an StgReturn +*/ + +#define SAVE_THREAD_STATE(ptrs) \ + ASSERT(ptrs==1); \ + Sp -= 1; \ + Sp[0] = R1.w; \ + SaveThreadState(); + +#define THREAD_RETURN(ptrs) \ + ASSERT(ptrs==1); \ + CurrentTSO->whatNext = ThreadEnterGHC; \ + R1.i = ThreadBlocked; \ + JMP_(StgReturn); +#endif + /* ----------------------------------------------------------------------------- CCall_GC needs to push a dummy stack frame containing the contents of volatile registers and variables. diff --git a/ghc/includes/StgMiscClosures.h b/ghc/includes/StgMiscClosures.h index d9c3489..e0ed424 100644 --- a/ghc/includes/StgMiscClosures.h +++ b/ghc/includes/StgMiscClosures.h @@ -1,5 +1,5 @@ /* ----------------------------------------------------------------------------- - * $Id: StgMiscClosures.h,v 1.15 1999/11/02 15:05:53 simonmar Exp $ + * $Id: StgMiscClosures.h,v 1.16 2000/01/13 14:34:01 hwloidl Exp $ * * (c) The GHC Team, 1998-1999 * @@ -31,6 +31,9 @@ STGFUN(WHITEHOLE_entry); STGFUN(SE_BLACKHOLE_entry); STGFUN(SE_CAF_BLACKHOLE_entry); #endif +#if defined(PAR) || defined(GRAN) +STGFUN(RBH_entry); +#endif STGFUN(BCO_entry); STGFUN(EVACUATED_entry); STGFUN(FOREIGN_entry); @@ -50,6 +53,15 @@ STGFUN(MUT_CONS_entry); STGFUN(END_MUT_LIST_entry); STGFUN(dummy_ret_entry); +/* this is the NIL ptr for a TSO queue (e.g. 
runnable queue) */ +#define END_TSO_QUEUE ((StgTSO *)(void*)&END_TSO_QUEUE_closure) +#if defined(PAR) || defined(GRAN) +/* this is the NIL ptr for a blocking queue */ +# define END_BQ_QUEUE ((StgBlockingQueueElement *)(void*)&END_TSO_QUEUE_closure) +/* this is the NIL ptr for a blocked fetch queue (as in PendingFetches in GUM) */ +# define END_BF_QUEUE ((StgBlockedFetch *)(void*)&END_TSO_QUEUE_closure) +#endif + /* info tables */ extern DLL_IMPORT_RTS const StgInfoTable IND_info; @@ -69,6 +81,9 @@ extern DLL_IMPORT_RTS const StgInfoTable WHITEHOLE_info; extern DLL_IMPORT_RTS const StgInfoTable SE_BLACKHOLE_info; extern DLL_IMPORT_RTS const StgInfoTable SE_CAF_BLACKHOLE_info; #endif +#if defined(PAR) || defined(GRAN) +extern DLL_IMPORT_RTS const StgInfoTable RBH_info; +#endif extern DLL_IMPORT_RTS const StgInfoTable BCO_info; extern DLL_IMPORT_RTS const StgInfoTable EVACUATED_info; extern DLL_IMPORT_RTS const StgInfoTable FOREIGN_info; diff --git a/ghc/includes/TSO.h b/ghc/includes/TSO.h index 5cc34be..ce46e00 100644 --- a/ghc/includes/TSO.h +++ b/ghc/includes/TSO.h @@ -1,5 +1,5 @@ /* ----------------------------------------------------------------------------- - * $Id: TSO.h,v 1.9 1999/12/01 14:34:49 simonmar Exp $ + * $Id: TSO.h,v 1.10 2000/01/13 14:34:01 hwloidl Exp $ * * (c) The GHC Team, 1998-1999 * @@ -10,6 +10,30 @@ #ifndef TSO_H #define TSO_H +#if defined(GRAN) || defined(PAR) +// magic marker for TSOs; debugging only +#define TSO_MAGIC 4321 + +typedef struct { + StgInt pri; + StgInt magic; + StgInt sparkname; + rtsTime startedat; + rtsBool exported; + StgInt basicblocks; + StgInt allocs; + rtsTime exectime; + rtsTime fetchtime; + rtsTime fetchcount; + rtsTime blocktime; + StgInt blockcount; + rtsTime blockedat; + StgInt globalsparks; + StgInt localsparks; + rtsTime clock; +} StgTSOStatBuf; +#endif + #if defined(PROFILING) typedef struct { CostCentreStack *CCCS; /* thread's current CCS */ @@ -20,14 +44,21 @@ typedef struct { #endif /* PROFILING */ #if defined(PAR) -typedef struct { -} StgTSOParInfo; +typedef StgTSOStatBuf StgTSOParInfo; #else /* !PAR */ typedef struct { } StgTSOParInfo; #endif /* PAR */ -#if defined(TICKY_TICKY) +#if defined(GRAN) +typedef StgTSOStatBuf StgTSOGranInfo; +#else /* !GRAN */ +typedef struct { +} StgTSOGranInfo; +#endif /* GRAN */ + + +#if defined(TICKY) typedef struct { } StgTSOTickyInfo; #else /* !TICKY_TICKY */ @@ -86,6 +117,9 @@ typedef enum { BlockedOnRead, BlockedOnWrite, BlockedOnDelay +#if defined(PAR) + , BlockedOnGA // blocked on a remote closure represented by a Global Address +#endif } StgTSOBlockReason; typedef union { @@ -93,6 +127,9 @@ typedef union { struct StgTSO_ *tso; int fd; unsigned int delay; +#if defined(PAR) + globalAddr ga; +#endif } StgTSOBlockInfo; /* @@ -104,6 +141,7 @@ typedef union { typedef struct StgTSO_ { StgHeader header; struct StgTSO_* link; + /* SDM and HWL agree that it would be cool to have a list of all TSOs */ StgMutClosure * mut_link; /* TSO's are mutable of course! */ StgTSOWhatNext whatNext; StgTSOBlockReason why_blocked; @@ -113,7 +151,7 @@ typedef struct StgTSO_ { StgTSOTickyInfo ticky; StgTSOProfInfo prof; StgTSOParInfo par; - /* GranSim Info? */ + StgTSOGranInfo gran; /* The thread stack... 
*/
 StgWord stack_size; /* stack size in *words* */
diff --git a/ghc/includes/Updates.h b/ghc/includes/Updates.h
index d814c10..5378b6c 100644
--- a/ghc/includes/Updates.h
+++ b/ghc/includes/Updates.h
@@ -1,5 +1,5 @@
 /* -----------------------------------------------------------------------------
- * $Id: Updates.h,v 1.15 1999/11/09 15:47:09 simonmar Exp $
+ * $Id: Updates.h,v 1.16 2000/01/13 14:34:01 hwloidl Exp $
 *
 * (c) The GHC Team, 1998-1999
 *
@@ -91,18 +91,80 @@
 Awaken any threads waiting on this computation
 -------------------------------------------------------------------------- */
 
+#if defined(PAR)
+
+/*
+ In a parallel setup several types of closures might have a blocking queue:
+ BLACKHOLE_BQ ... same as in the default concurrent setup; it will be
+ reawakened via calling UPD_IND on that closure after
+ having finished the computation of the graph
+ FETCH_ME_BQ ... a global indirection (FETCH_ME) may be entered by a
+ local TSO, turning it into a FETCH_ME_BQ; it will be
+ reawakened via calling processResume
+ RBH ... a revertible black hole may be entered by another
+ local TSO, putting it onto its blocking queue; since
+ RBHs only exist while the corresponding closure is in
+ transit, they will be reawakened via calling
+ convertToFetchMe (upon processing an ACK message)
+
+ In a parallel setup a blocking queue may contain 3 types of closures:
+ TSO ... as in the default concurrent setup
+ BLOCKED_FETCH ... indicating that a TSO on another PE is waiting for
+ the result of the current computation
+ CONSTR ... an RBHSave closure (which contains data ripped out of
+ the closure to make room for a blocking queue; since
+ it only contains data we use the existing type of
+ a CONSTR closure); this closure is the end of a
+ blocking queue for an RBH closure; it only exists in
+ this kind of blocking queue and must be at the end
+ of the queue
+*/
+extern void awakenBlockedQueue(StgBlockingQueueElement *q, StgClosure *node);
+#define DO_AWAKEN_BQ(bqe, node) STGCALL2(awakenBlockedQueue, bqe, node);
+
+#define AWAKEN_BQ(info,closure) \
+ if (info == &BLACKHOLE_BQ_info || \
+ info == &FETCH_ME_BQ_info || \
+ get_itbl(closure)->type == RBH) { \
+ StgBlockingQueueElement *bqe = ((StgBlockingQueue *)closure)->blocking_queue;\
+ ASSERT(bqe!=END_BQ_QUEUE); \
+ DO_AWAKEN_BQ(bqe, closure); \
+ }
+
+#elif defined(GRAN)
+
+extern void awakenBlockedQueue(StgBlockingQueueElement *q, StgClosure *node);
+#define DO_AWAKEN_BQ(bq, node) STGCALL2(awakenBlockedQueue, bq, node);
+
+/* In GranSim we don't have FETCH_ME or FETCH_ME_BQ closures, so they are
+ not checked. The rest of the code is the same as for GUM.
+*/
+#define AWAKEN_BQ(info,closure) \
+ if (info == &BLACKHOLE_BQ_info || \
+ get_itbl(closure)->type == RBH) { \
+ StgBlockingQueueElement *bqe = ((StgBlockingQueue *)closure)->blocking_queue;\
+ ASSERT(bqe!=END_BQ_QUEUE); \
+ DO_AWAKEN_BQ(bqe, closure); \
+ }
+
+
+#else /* !GRAN && !PAR */
+
 extern void awakenBlockedQueue(StgTSO *q);
+#define DO_AWAKEN_BQ(closure) \
+ STGCALL1(awakenBlockedQueue, \
+ ((StgBlockingQueue *)closure)->blocking_queue);
 
 #define AWAKEN_BQ(info,closure) \
 	if (info == &BLACKHOLE_BQ_info) { \
- STGCALL1(awakenBlockedQueue, \
- ((StgBlockingQueue *)closure)->blocking_queue); \
+ DO_AWAKEN_BQ(closure); \
 	}
 
+#endif /* GRAN || PAR */
 
-/* -----------------------------------------------------------------------------
+/* -------------------------------------------------------------------------
 Push an update frame on the stack.
- -------------------------------------------------------------------------- */ + ------------------------------------------------------------------------- */ #if defined(PROFILING) #define PUSH_STD_CCCS(frame) frame->header.prof.ccs = CCCS diff --git a/ghc/lib/std/Makefile b/ghc/lib/std/Makefile index 51b68d2..853c599 100644 --- a/ghc/lib/std/Makefile +++ b/ghc/lib/std/Makefile @@ -47,6 +47,13 @@ ifneq "$(way)" "" SRC_HC_OPTS += -hisuf $(way_)hi endif +# HWL: for debugging GranSim generate .hc and .s files +SRC_HC_OPTS += -keep-hc-files-too -keep-s-files-too +# # HWL: why isn't that on by default !!???????????? +# ifeq "$(way)" "mg" +# SRC_HC_OPTS += -gransim +# endif + # per-module flags PrelArrExtra_HC_OPTS += -monly-2-regs @@ -55,7 +62,7 @@ PrelArrExtra_HC_OPTS += -monly-2-regs PrelNumExtra_HC_OPTS += -H24m -K2m PrelPack_HC_OPTS += -K4m -PrelBase_HC_OPTS += -H12m +PrelBase_HC_OPTS += -H32m -K32m PrelRead_HC_OPTS += -H20m PrelTup_HC_OPTS += -H12m -K2m PrelNum_HC_OPTS += -H12m -K4m diff --git a/ghc/lib/std/PrelHandle.lhs b/ghc/lib/std/PrelHandle.lhs index 85289ad..d65c234 100644 --- a/ghc/lib/std/PrelHandle.lhs +++ b/ghc/lib/std/PrelHandle.lhs @@ -29,7 +29,9 @@ import PrelShow import PrelAddr ( Addr, nullAddr ) import PrelReal ( toInteger ) import PrelPack ( packString ) +#ifndef __PARALLEL_HASKELL__ import PrelWeak ( addForeignFinalizer ) +#endif import Ix #ifdef __CONCURRENT_HASKELL__ diff --git a/ghc/lib/std/PrelWeak.lhs b/ghc/lib/std/PrelWeak.lhs index 3b09a39..354332b 100644 --- a/ghc/lib/std/PrelWeak.lhs +++ b/ghc/lib/std/PrelWeak.lhs @@ -7,6 +7,8 @@ \begin{code} {-# OPTIONS -fno-implicit-prelude #-} +#ifndef __PARALLEL_HASKELL__ + module PrelWeak where import PrelGHC @@ -43,4 +45,6 @@ instance Eq (Weak v) where (Weak w1) == (Weak w2) = w1 `sameWeak#` w2 -} +#endif + \end{code} diff --git a/ghc/mk/paths.mk b/ghc/mk/paths.mk index be6dd7d..88e39ae 100644 --- a/ghc/mk/paths.mk +++ b/ghc/mk/paths.mk @@ -1,5 +1,5 @@ # ----------------------------------------------------------------------------- -# $Id: paths.mk,v 1.17 2000/01/10 11:59:55 simonmar Exp $ +# $Id: paths.mk,v 1.18 2000/01/13 14:34:02 hwloidl Exp $ # # ghc project specific make variables # @@ -25,7 +25,6 @@ GHC_INCLUDE_DIR := $(TOP)/includes GHC_UTILS_DIR := $(TOP)/utils GHC_INTERPRETER_DIR := $(TOP)/interpreter -GHC_SYSMAN_DIR := $(GHC_RUNTIME_DIR)/gum GHC_HSP_DIR := $(GHC_HSC_DIR) GHC_MKDEPENDHS_DIR := $(GHC_UTILS_DIR)/mkdependHS GHC_HSCPP_DIR := $(GHC_UTILS_DIR)/hscpp @@ -37,11 +36,12 @@ GHC_HSCPP = $(GHC_HSCPP_DIR)/hscpp GHC_MKDEPENDHS = $(GHC_MKDEPENDHS_DIR)/mkdependHS-inplace GHC_HSP = $(GHC_HSP_DIR)/hsp GHC_HSC = $(GHC_HSC_DIR)/hsc -GHC_SYSMAN = $(GHC_RUNTIME_DIR)/gum/SysMan - UNLIT = $(GHC_UNLIT_DIR)/unlit GHC_UNLIT = $(GHC_UNLIT_DIR)/unlit +GHC_SYSMAN = $(GHC_RUNTIME_DIR)/parallel/SysMan +GHC_SYSMAN_DIR = $(GHC_RUNTIME_DIR)/parallel + #----------------------------------------------------------------------------- # Stuff for the C-compiling phase in particular... diff --git a/ghc/rts/ClosureFlags.c b/ghc/rts/ClosureFlags.c index a7fdb0b..89e98e4 100644 --- a/ghc/rts/ClosureFlags.c +++ b/ghc/rts/ClosureFlags.c @@ -1,5 +1,5 @@ /* ----------------------------------------------------------------------------- - * $Id: ClosureFlags.c,v 1.5 2000/01/12 12:39:20 simonmar Exp $ + * $Id: ClosureFlags.c,v 1.6 2000/01/13 14:34:02 hwloidl Exp $ * * (c) The GHC Team 1998-1999 * @@ -23,6 +23,7 @@ StgWord16 closure_flags[] = { * to thunks.) 
*/ +/* 0 1 2 3 4 5 6 7 */ /* HNF BTM NS STA THU MUT UPT SRT */ /* INVALID_OBJECT */ ( 0 ), @@ -84,8 +85,14 @@ StgWord16 closure_flags[] = { /* WEAK */ (_HNF| _NS| _UPT ), /* FOREIGN */ (_HNF| _NS| _UPT ), /* STABLE_NAME */ (_HNF| _NS| _UPT ), + /* TSO */ (_HNF| _NS| _MUT|_UPT ), -/* BLOCKED_FETCH */ (_HNF| _NS ), -/* FETCH_ME */ (_HNF| _NS ), -/* EVACUATED */ ( 0 ) +/* BLOCKED_FETCH */ (_HNF| _NS| _MUT|_UPT ), +/* FETCH_ME */ (_HNF| _NS| _MUT|_UPT ), +/* FETCH_ME_BQ */ ( _NS| _MUT|_UPT ), +/* RBH */ ( _NS| _MUT|_UPT ), + +/* EVACUATED */ ( 0 ), + +/* N_CLOSURE_TYPES */ ( 0 ) }; diff --git a/ghc/rts/Exception.hc b/ghc/rts/Exception.hc index f19f212..7fdd6fd 100644 --- a/ghc/rts/Exception.hc +++ b/ghc/rts/Exception.hc @@ -1,5 +1,5 @@ /* ----------------------------------------------------------------------------- - * $Id: Exception.hc,v 1.2 1999/12/02 09:52:41 simonmar Exp $ + * $Id: Exception.hc,v 1.3 2000/01/13 14:34:02 hwloidl Exp $ * * (c) The GHC Team, 1998-1999 * @@ -14,6 +14,9 @@ #include "Storage.h" #include "RtsUtils.h" #include "RtsFlags.h" +#if defined(PAR) +# include "FetchMe.h" +#endif /* ----------------------------------------------------------------------------- Exception Primitives @@ -62,7 +65,16 @@ FN_(unblockAsyncExceptionszh_ret_entry) { FB_ ASSERT(CurrentTSO->blocked_exceptions != NULL); +#if defined(GRAN) +# error FixME +#elif defined(PAR) + // is CurrentTSO->block_info.closure always set to the node + // holding the blocking queue !? -- HWL + awakenBlockedQueue(CurrentTSO->blocked_exceptions, + CurrentTSO->block_info.closure); +#else awakenBlockedQueue(CurrentTSO->blocked_exceptions); +#endif CurrentTSO->blocked_exceptions = NULL; Sp++; JMP_(ENTRY_CODE(Sp[0])); @@ -76,7 +88,16 @@ FN_(unblockAsyncExceptionszh_fast) STK_CHK_GEN(2, R1_PTR, unblockAsyncExceptionszh_fast, ); if (CurrentTSO->blocked_exceptions != NULL) { +#if defined(GRAN) +# error FixME +#elif defined(PAR) + // is CurrentTSO->block_info.closure always set to the node + // holding the blocking queue !? 
-- HWL + awakenBlockedQueue(CurrentTSO->blocked_exceptions, + CurrentTSO->block_info.closure); +#else awakenBlockedQueue(CurrentTSO->blocked_exceptions); +#endif CurrentTSO->blocked_exceptions = NULL; Sp--; Sp[0] = (W_)&blockAsyncExceptionszh_ret_info; diff --git a/ghc/rts/GC.c b/ghc/rts/GC.c index f3ce4c6..3665034 100644 --- a/ghc/rts/GC.c +++ b/ghc/rts/GC.c @@ -1,5 +1,5 @@ /* ----------------------------------------------------------------------------- - * $Id: GC.c,v 1.68 1999/12/01 15:07:00 simonmar Exp $ + * $Id: GC.c,v 1.69 2000/01/13 14:34:02 hwloidl Exp $ * * (c) The GHC Team 1998-1999 * @@ -7,6 +7,25 @@ * * ---------------------------------------------------------------------------*/ +//@menu +//* Includes:: +//* STATIC OBJECT LIST:: +//* Static function declarations:: +//* Garbage Collect:: +//* Weak Pointers:: +//* Evacuation:: +//* Scavenging:: +//* Reverting CAFs:: +//* Sanity code for CAF garbage collection:: +//* Lazy black holing:: +//* Stack squeezing:: +//* Pausing a thread:: +//* Index:: +//@end menu + +//@node Includes, STATIC OBJECT LIST +//@subsection Includes + #include "Rts.h" #include "RtsFlags.h" #include "RtsUtils.h" @@ -23,9 +42,21 @@ #include "SchedAPI.h" #include "Weak.h" #include "StablePriv.h" +#if defined(GRAN) || defined(PAR) +# include "GranSimRts.h" +# include "ParallelRts.h" +# include "FetchMe.h" +# if defined(DEBUG) +# include "Printer.h" +# include "ParallelDebug.h" +# endif +#endif StgCAF* enteredCAFs; +//@node STATIC OBJECT LIST, Static function declarations, Includes +//@subsection STATIC OBJECT LIST + /* STATIC OBJECT LIST. * * During GC: @@ -96,6 +127,9 @@ bdescr *old_to_space; lnat new_blocks; /* blocks allocated during this GC */ lnat g0s0_pcnt_kept = 30; /* percentage of g0s0 live at last minor GC */ +//@node Static function declarations, Garbage Collect, STATIC OBJECT LIST +//@subsection Static function declarations + /* ----------------------------------------------------------------------------- Static function declarations -------------------------------------------------------------------------- */ @@ -119,6 +153,9 @@ static void scavenge_mut_once_list ( generation *g ); static void gcCAFs ( void ); #endif +//@node Garbage Collect, Weak Pointers, Static function declarations +//@subsection Garbage Collect + /* ----------------------------------------------------------------------------- GarbageCollect @@ -141,6 +178,7 @@ static void gcCAFs ( void ); - free from-space in each step, and set from-space = to-space. 
-------------------------------------------------------------------------- */ +//@cindex GarbageCollect void GarbageCollect(void (*get_roots)(void)) { @@ -153,6 +191,11 @@ void GarbageCollect(void (*get_roots)(void)) CostCentreStack *prev_CCS; #endif +#if defined(DEBUG) && defined(GRAN) + IF_DEBUG(gc, belch("@@ Starting garbage collection at %ld (%lx)\n", + Now, Now)) +#endif + /* tell the stats department that we've started a GC */ stat_startGC(); @@ -176,8 +219,10 @@ void GarbageCollect(void (*get_roots)(void)) major_gc = (N == RtsFlags.GcFlags.generations-1); /* check stack sanity *before* GC (ToDo: check all threads) */ - /*IF_DEBUG(sanity, checkTSO(MainTSO,0)); */ - IF_DEBUG(sanity, checkFreeListSanity()); +#if defined(GRAN) + // ToDo!: check sanity IF_DEBUG(sanity, checkTSOsSanity()); +#endif + IF_DEBUG(sanity, checkFreeListSanity()); /* Initialise the static object lists */ @@ -296,6 +341,8 @@ void GarbageCollect(void (*get_roots)(void)) /* Do the mut-once lists first */ for (g = RtsFlags.GcFlags.generations-1; g > N; g--) { + IF_PAR_DEBUG(verbose, + printMutOnceList(&generations[g])); scavenge_mut_once_list(&generations[g]); evac_gen = g; for (st = generations[g].n_steps-1; st >= 0; st--) { @@ -304,6 +351,8 @@ void GarbageCollect(void (*get_roots)(void)) } for (g = RtsFlags.GcFlags.generations-1; g > N; g--) { + IF_PAR_DEBUG(verbose, + printMutableList(&generations[g])); scavenge_mutable_list(&generations[g]); evac_gen = g; for (st = generations[g].n_steps-1; st >= 0; st--) { @@ -317,6 +366,19 @@ void GarbageCollect(void (*get_roots)(void)) evac_gen = 0; get_roots(); +#if defined(PAR) + /* And don't forget to mark the TSO if we got here direct from + * Haskell! */ + /* Not needed in a seq version? + if (CurrentTSO) { + CurrentTSO = (StgTSO *)MarkRoot((StgClosure *)CurrentTSO); + } + */ + + /* Mark the entries in the GALA table of the parallel system */ + markLocalGAs(major_gc); +#endif + /* Mark the weak pointer list, and prepare to detect dead weak * pointers. */ @@ -577,7 +639,7 @@ void GarbageCollect(void (*get_roots)(void)) int pc_free; adjusted_blocks = (RtsFlags.GcFlags.maxHeapSize - 2 * blocks); - IF_DEBUG(gc, fprintf(stderr, "Near maximum heap size of 0x%x blocks, blocks = %d, adjusted to %d\n", RtsFlags.GcFlags.maxHeapSize, blocks, adjusted_blocks)); + IF_DEBUG(gc, fprintf(stderr, "@@ Near maximum heap size of 0x%x blocks, blocks = %d, adjusted to %d\n", RtsFlags.GcFlags.maxHeapSize, blocks, adjusted_blocks)); pc_free = adjusted_blocks * 100 / RtsFlags.GcFlags.maxHeapSize; if (pc_free < RtsFlags.GcFlags.pcFreeHeap) /* might even be < 0 */ { heapOverflow(); @@ -648,6 +710,11 @@ void GarbageCollect(void (*get_roots)(void)) */ resetNurseries(); +#if defined(PAR) + /* Reconstruct the Global Address tables used in GUM */ + RebuildGAtables(major_gc); +#endif + /* start any pending finalizers */ scheduleFinalizers(old_weak_ptr_list); @@ -675,6 +742,9 @@ void GarbageCollect(void (*get_roots)(void)) stat_endGC(allocated, collected, live, copied, N); } +//@node Weak Pointers, Evacuation, Garbage Collect +//@subsection Weak Pointers + /* ----------------------------------------------------------------------------- Weak Pointers @@ -694,6 +764,7 @@ void GarbageCollect(void (*get_roots)(void)) probably be optimised by keeping per-generation lists of weak pointers, but for a few weak pointers this scheme will work. 
-------------------------------------------------------------------------- */ +//@cindex traverse_weak_ptr_list static rtsBool traverse_weak_ptr_list(void) @@ -782,6 +853,8 @@ traverse_weak_ptr_list(void) evacuated need to be evacuated now. -------------------------------------------------------------------------- */ +//@cindex cleanup_weak_ptr_list + static void cleanup_weak_ptr_list ( StgWeak **list ) { @@ -809,6 +882,8 @@ cleanup_weak_ptr_list ( StgWeak **list ) closure if it is alive, or NULL otherwise. -------------------------------------------------------------------------- */ +//@cindex isAlive + StgClosure * isAlive(StgClosure *p) { @@ -823,10 +898,14 @@ isAlive(StgClosure *p) * for static closures with an empty SRT or CONSTR_STATIC_NOCAFs. */ +#if 1 || !defined(PAR) /* ignore closures in generations that we're not collecting. */ + /* In GUM we use this routine when rebuilding GA tables; for some + reason it has problems with the LOOKS_LIKE_STATIC macro -- HWL */ if (LOOKS_LIKE_STATIC(p) || Bdescr((P_)p)->gen->no > N) { return p; } +#endif switch (info->type) { @@ -850,12 +929,24 @@ isAlive(StgClosure *p) } } +//@cindex MarkRoot StgClosure * MarkRoot(StgClosure *root) { + //if (root != END_TSO_QUEUE) return evacuate(root); } +//@cindex MarkRootHWL +StgClosure * +MarkRootHWL(StgClosure *root) +{ + StgClosure *new = evacuate(root); + upd_evacuee(root, new); + return new; +} + +//@cindex addBlock static void addBlock(step *step) { bdescr *bd = allocBlock(); @@ -877,6 +968,8 @@ static void addBlock(step *step) new_blocks++; } +//@cindex upd_evacuee + static __inline__ void upd_evacuee(StgClosure *p, StgClosure *dest) { @@ -884,6 +977,8 @@ upd_evacuee(StgClosure *p, StgClosure *dest) ((StgEvacuated *)p)->evacuee = dest; } +//@cindex copy + static __inline__ StgClosure * copy(StgClosure *src, nat size, step *step) { @@ -925,6 +1020,8 @@ copy(StgClosure *src, nat size, step *step) * used to optimise evacuation of BLACKHOLEs. */ +//@cindex copyPart + static __inline__ StgClosure * copyPart(StgClosure *src, nat size_to_reserve, nat size_to_copy, step *step) { @@ -953,6 +1050,9 @@ copyPart(StgClosure *src, nat size_to_reserve, nat size_to_copy, step *step) return (StgClosure *)dest; } +//@node Evacuation, Scavenging, Weak Pointers +//@subsection Evacuation + /* ----------------------------------------------------------------------------- Evacuate a large object @@ -964,6 +1064,8 @@ copyPart(StgClosure *src, nat size_to_reserve, nat size_to_copy, step *step) evacuated, or 0 otherwise. -------------------------------------------------------------------------- */ +//@cindex evacuate_large + static inline void evacuate_large(StgPtr p, rtsBool mutable) { @@ -1026,6 +1128,8 @@ evacuate_large(StgPtr p, rtsBool mutable) the promotion until the next GC. -------------------------------------------------------------------------- */ +//@cindex mkMutCons + static StgClosure * mkMutCons(StgClosure *ptr, generation *gen) { @@ -1075,7 +1179,7 @@ mkMutCons(StgClosure *ptr, generation *gen) didn't manage to evacuate this object into evac_gen. 
-------------------------------------------------------------------------- */
-
+//@cindex evacuate
 
 static StgClosure *
 evacuate(StgClosure *q)
@@ -1085,6 +1189,9 @@ evacuate(StgClosure *q)
 step *step;
 const StgInfoTable *info;
 
+ nat size, ptrs, nonptrs, vhs;
+ char str[80];
+
 loop:
 if (HEAP_ALLOCED(q)) {
 bd = Bdescr((P_)q);
@@ -1110,7 +1217,15 @@ loop:
 ASSERT(q && (LOOKS_LIKE_GHC_INFO(GET_INFO(q))
 || IS_HUGS_CONSTR_INFO(GET_INFO(q))));
 info = get_itbl(q);
-
+ /*
+ if (info->type==RBH) {
+ info = REVERT_INFOPTR(info);
+ IF_DEBUG(gc,
+ belch("@_ Trying to evacuate an RBH %p (%s); reverting to IP %p (%s)",
+ q, info_type(q), info, info_type_by_ip(info)));
+ }
+ */
+
 switch (info -> type) {
 
 case BCO:
@@ -1328,7 +1443,7 @@ loop:
 case CATCH_FRAME:
 case SEQ_FRAME:
 /* shouldn't see these */
- barf("evacuate: stack frame\n");
+ barf("evacuate: stack frame at %p\n", q);
 
 case AP_UPD:
 case PAP:
@@ -1347,7 +1462,7 @@ loop:
 if (evac_gen > 0) { /* optimisation */
 StgClosure *p = ((StgEvacuated*)q)->evacuee;
 if (Bdescr((P_)p)->gen->no < evac_gen) {
- /* fprintf(stderr,"evac failed!\n");*/
+ IF_DEBUG(gc, belch("@@ evacuate: evac of EVACUATED node %p failed!", p));
 failed_to_evac = rtsTrue;
 TICK_GC_FAILED_PROMOTION();
 }
@@ -1417,10 +1532,44 @@ loop:
 }
 }
 
+#if defined(PAR)
+ case RBH: // cf. BLACKHOLE_BQ
+ {
+ //StgInfoTable *rip = get_closure_info(q, &size, &ptrs, &nonptrs, &vhs, str);
+ to = copy(q,BLACKHOLE_sizeW(),step);
+ //ToDo: derive size etc from reverted IP
+ //to = copy(q,size,step);
+ recordMutable((StgMutClosure *)to);
+ IF_DEBUG(gc,
+ belch("@@ evacuate: RBH %p (%s) to %p (%s)",
+ q, info_type(q), to, info_type(to)));
+ return to;
+ }
+
+ case BLOCKED_FETCH:
+ ASSERT(sizeofW(StgBlockedFetch) >= MIN_NONUPD_SIZE);
+ to = copy(q,sizeofW(StgBlockedFetch),step);
+ IF_DEBUG(gc,
+ belch("@@ evacuate: %p (%s) to %p (%s)",
+ q, info_type(q), to, info_type(to)));
+ return to;
+
 case FETCH_ME:
- fprintf(stderr,"evacuate: unimplemented/strange closure type\n");
- return q;
+ ASSERT(sizeofW(StgFetchMe) >= MIN_UPD_SIZE);
+ to = copy(q,sizeofW(StgFetchMe),step);
+ IF_DEBUG(gc,
+ belch("@@ evacuate: %p (%s) to %p (%s)",
+ q, info_type(q), to, info_type(to)));
+ return to;
+
+ case FETCH_ME_BQ:
+ ASSERT(sizeofW(StgFetchMeBlockingQueue) >= MIN_UPD_SIZE);
+ to = copy(q,sizeofW(StgFetchMeBlockingQueue),step);
+ IF_DEBUG(gc,
+ belch("@@ evacuate: %p (%s) to %p (%s)",
+ q, info_type(q), to, info_type(to)));
+ return to;
+#endif
 
 default:
 barf("evacuate: strange closure type %d", (int)(info->type));
@@ -1433,6 +1582,7 @@ loop:
 relocate_TSO is called just after a TSO has been copied from src to
 dest. It adjusts the update frame list for the new location.
 -------------------------------------------------------------------------- */
+//@cindex relocate_TSO
 
 StgTSO *
 relocate_TSO(StgTSO *src, StgTSO *dest)
@@ -1481,6 +1631,11 @@ relocate_TSO(StgTSO *src, StgTSO *dest)
 return dest;
}
 
+//@node Scavenging, Reverting CAFs, Evacuation
+//@subsection Scavenging
+
+//@cindex scavenge_srt
+
 static inline void
 scavenge_srt(const StgInfoTable *info)
{
@@ -1548,7 +1703,7 @@ scavengeTSO (StgTSO *tso)
 scavenging a mutable object where early promotion isn't such a good
 idea.
-------------------------------------------------------------------------- */ - +//@cindex scavenge static void scavenge(step *step) @@ -1582,6 +1737,11 @@ scavenge(step *step) || IS_HUGS_CONSTR_INFO(GET_INFO((StgClosure *)p)))); info = get_itbl((StgClosure *)p); + /* + if (info->type==RBH) + info = REVERT_INFOPTR(info); + */ + switch (info -> type) { case BCO: @@ -1849,8 +2009,72 @@ scavenge(step *step) break; } +#if defined(PAR) + case RBH: // cf. BLACKHOLE_BQ + { + // nat size, ptrs, nonptrs, vhs; + // char str[80]; + // StgInfoTable *rip = get_closure_info(p, &size, &ptrs, &nonptrs, &vhs, str); + StgRBH *rbh = (StgRBH *)p; + (StgClosure *)rbh->blocking_queue = + evacuate((StgClosure *)rbh->blocking_queue); + if (failed_to_evac) { + failed_to_evac = rtsFalse; + recordMutable((StgMutClosure *)rbh); + } + IF_DEBUG(gc, + belch("@@ scavenge: RBH %p (%s) (new blocking_queue link=%p)", + p, info_type(p), (StgClosure *)rbh->blocking_queue)); + // ToDo: use size of reverted closure here! + p += BLACKHOLE_sizeW(); + break; + } + case BLOCKED_FETCH: + { + StgBlockedFetch *bf = (StgBlockedFetch *)p; + /* follow the pointer to the node which is being demanded */ + (StgClosure *)bf->node = + evacuate((StgClosure *)bf->node); + /* follow the link to the rest of the blocking queue */ + (StgClosure *)bf->link = + evacuate((StgClosure *)bf->link); + if (failed_to_evac) { + failed_to_evac = rtsFalse; + recordMutable((StgMutClosure *)bf); + } + IF_DEBUG(gc, + belch("@@ scavenge: %p (%s); node is now %p; exciting, isn't it", + bf, info_type((StgClosure *)bf), + bf->node, info_type(bf->node))); + p += sizeofW(StgBlockedFetch); + break; + } + case FETCH_ME: + IF_DEBUG(gc, + belch("@@ scavenge: HWL claims nothing to do for %p (%s)", + p, info_type((StgClosure *)p))); + p += sizeofW(StgFetchMe); + break; // nothing to do in this case + + case FETCH_ME_BQ: // cf. BLACKHOLE_BQ + { + StgFetchMeBlockingQueue *fmbq = (StgFetchMeBlockingQueue *)p; + (StgClosure *)fmbq->blocking_queue = + evacuate((StgClosure *)fmbq->blocking_queue); + if (failed_to_evac) { + failed_to_evac = rtsFalse; + recordMutable((StgMutClosure *)fmbq); + } + IF_DEBUG(gc, + belch("@@ scavenge: %p (%s) exciting, isn't it", + p, info_type((StgClosure *)p))); + p += sizeofW(StgFetchMeBlockingQueue); + break; + } +#endif + case EVACUATED: barf("scavenge: unimplemented/strange closure type\n"); @@ -1879,6 +2103,8 @@ scavenge(step *step) because they contain old-to-new generation pointers. Only certain objects can have this property. -------------------------------------------------------------------------- */ +//@cindex scavenge_one + static rtsBool scavenge_one(StgClosure *p) { @@ -1890,6 +2116,11 @@ scavenge_one(StgClosure *p) info = get_itbl(p); + /* ngoq moHqu'! + if (info->type==RBH) + info = REVERT_INFOPTR(info); // if it's an RBH, look at the orig closure + */ + switch (info -> type) { case FUN: @@ -1976,6 +2207,7 @@ scavenge_one(StgClosure *p) generations older than the one being collected) as roots. We also remove non-mutable objects from the mutable list at this point. 
-------------------------------------------------------------------------- */
+//@cindex scavenge_mut_once_list
 
static void
scavenge_mut_once_list(generation *gen)
@@ -1997,6 +2229,10 @@ scavenge_mut_once_list(generation *gen)
 || IS_HUGS_CONSTR_INFO(GET_INFO(p))));
 
 info = get_itbl(p);
+ /*
+ if (info->type==RBH)
+ info = REVERT_INFOPTR(info); // if it's an RBH, look at the orig closure
+ */
 switch(info->type) {
 
 case IND_OLDGEN:
@@ -2008,7 +2244,8 @@ scavenge_mut_once_list(generation *gen)
 ((StgIndOldGen *)p)->indirectee =
 evacuate(((StgIndOldGen *)p)->indirectee);
 
-#if 0
+#ifdef DEBUG
+ if (RtsFlags.DebugFlags.gc)
 /* Debugging code to print out the size of the thing we just
 * promoted
 */
@@ -2107,6 +2344,7 @@ scavenge_mut_once_list(generation *gen)
 gen->mut_once_list = new_list;
}
 
+//@cindex scavenge_mutable_list
 
static void
scavenge_mutable_list(generation *gen)
@@ -2127,6 +2365,10 @@ scavenge_mutable_list(generation *gen)
 || IS_HUGS_CONSTR_INFO(GET_INFO(p))));
 
 info = get_itbl(p);
+ /*
+ if (info->type==RBH)
+ info = REVERT_INFOPTR(info); // if it's an RBH, look at the orig closure
+ */
 switch(info->type) {
 
 case MUT_ARR_PTRS_FROZEN:
@@ -2136,6 +2378,10 @@ scavenge_mutable_list(generation *gen)
 {
 StgPtr end, q;
 
+ IF_DEBUG(gc,
+ belch("@@ scavenge_mut_list: scavenging MUT_ARR_PTRS_FROZEN %p; size: %#x ; next: %p",
+ p, mut_arr_ptrs_sizeW((StgMutArrPtrs*)p), p->mut_link));
+
 end = (P_)p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
 evac_gen = gen->no;
 for (q = (P_)((StgMutArrPtrs *)p)->payload; q < end; q++) {
@@ -2158,6 +2404,10 @@ scavenge_mutable_list(generation *gen)
 {
 StgPtr end, q;
 
+ IF_DEBUG(gc,
+ belch("@@ scavenge_mut_list: scavenging MUT_ARR_PTRS %p; size: %#x ; next: %p",
+ p, mut_arr_ptrs_sizeW((StgMutArrPtrs*)p), p->mut_link));
+
 end = (P_)p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
 for (q = (P_)((StgMutArrPtrs *)p)->payload; q < end; q++) {
 (StgClosure *)*q = evacuate((StgClosure *)*q);
@@ -2170,6 +2420,10 @@ scavenge_mutable_list(generation *gen)
 * it from the mutable list if possible by promoting whatever it
 * points to.
 */
+ IF_DEBUG(gc,
+ belch("@@ scavenge_mut_list: scavenging MUT_VAR %p; var: %p ; next: %p",
+ p, ((StgMutVar *)p)->var, p->mut_link));
+
 ASSERT(p->header.info != &MUT_CONS_info);
 ((StgMutVar *)p)->var = evacuate(((StgMutVar *)p)->var);
 p->mut_link = gen->mut_list;
@@ -2179,6 +2433,11 @@ scavenge_mutable_list(generation *gen)
 case MVAR:
 {
 StgMVar *mvar = (StgMVar *)p;
+
+ IF_DEBUG(gc,
+ belch("@@ scavenge_mut_list: scavenging MVAR %p; head: %p; tail: %p; value: %p ; next: %p",
+ mvar, mvar->head, mvar->tail, mvar->value, p->mut_link));
+
 (StgClosure *)mvar->head = evacuate((StgClosure *)mvar->head);
 (StgClosure *)mvar->tail = evacuate((StgClosure *)mvar->tail);
 (StgClosure *)mvar->value = evacuate((StgClosure *)mvar->value);
@@ -2205,6 +2464,11 @@ scavenge_mutable_list(generation *gen)
 case BLACKHOLE_BQ:
 {
 StgBlockingQueue *bh = (StgBlockingQueue *)p;
+
+ IF_DEBUG(gc,
+ belch("@@ scavenge_mut_list: scavenging BLACKHOLE_BQ (%p); next: %p",
+ p, p->mut_link));
+
 (StgClosure *)bh->blocking_queue =
 evacuate((StgClosure *)bh->blocking_queue);
 p->mut_link = gen->mut_list;
@@ -2233,6 +2497,8 @@ scavenge_mutable_list(generation *gen)
 }
 continue;
 
+ // HWL: old PAR code deleted here
+
 default:
 /* shouldn't have anything else on the mutables list */
 barf("scavenge_mutable_list: strange object? 
%d", (int)(info->type)); @@ -2240,6 +2506,8 @@ scavenge_mutable_list(generation *gen) } } +//@cindex scavenge_static + static void scavenge_static(void) { @@ -2255,7 +2523,10 @@ scavenge_static(void) while (p != END_OF_STATIC_LIST) { info = get_itbl(p); - + /* + if (info->type==RBH) + info = REVERT_INFOPTR(info); // if it's an RBH, look at the orig closure + */ /* make sure the info pointer is into text space */ ASSERT(p && (LOOKS_LIKE_GHC_INFO(GET_INFO(p)) || IS_HUGS_CONSTR_INFO(GET_INFO(p)))); @@ -2324,6 +2595,7 @@ scavenge_static(void) objects pointed to by it. We can use the same code for walking PAPs, since these are just sections of copied stack. -------------------------------------------------------------------------- */ +//@cindex scavenge_stack static void scavenge_stack(StgPtr p, StgPtr stack_end) @@ -2332,6 +2604,8 @@ scavenge_stack(StgPtr p, StgPtr stack_end) const StgInfoTable* info; StgWord32 bitmap; + IF_DEBUG(sanity, belch(" scavenging stack between %p and %p", p, stack_end)); + /* * Each time around this loop, we are looking at a chunk of stack * that starts with either a pending argument section or an @@ -2380,8 +2654,18 @@ scavenge_stack(StgPtr p, StgPtr stack_end) /* probably a slow-entry point return address: */ case FUN: case FUN_STATIC: - p++; + { +#if 0 + StgPtr old_p = p; + p++; p++; + IF_DEBUG(sanity, + belch("HWL: scavenge_stack: FUN(_STATIC) adjusting p from %p to %p (instead of %p)", + old_p, p, old_p+1)); +#else + p++; /* what if FHS!=1 !? -- HWL */ +#endif goto follow_srt; + } /* Specialised code for update frames, since they're so common. * We *know* the updatee points to a BLACKHOLE, CAF_BLACKHOLE, @@ -2436,14 +2720,29 @@ scavenge_stack(StgPtr p, StgPtr stack_end) } /* small bitmap (< 32 entries, or 64 on a 64-bit machine) */ - case RET_BCO: - case RET_SMALL: - case RET_VEC_SMALL: case STOP_FRAME: case CATCH_FRAME: case SEQ_FRAME: + { + StgPtr old_p = p; // debugging only -- HWL + /* stack frames like these are ordinary closures and therefore may + contain setup-specific fixed-header words (as in GranSim!); + therefore, these cases should not use p++ but &(p->payload) -- HWL */ + IF_DEBUG(gran, IF_DEBUG(sanity, printObj(p))); + bitmap = info->layout.bitmap; + + p = (StgPtr)&(((StgClosure *)p)->payload); + IF_DEBUG(sanity, + belch("HWL: scavenge_stack: (STOP|CATCH|SEQ)_FRAME adjusting p from %p to %p (instead of %p)", + old_p, p, old_p+1)); + goto small_bitmap; + } + case RET_BCO: + case RET_SMALL: + case RET_VEC_SMALL: bitmap = info->layout.bitmap; p++; + /* this assumes that the payload starts immediately after the info-ptr */ small_bitmap: while (bitmap != 0) { if ((bitmap & 1) == 0) { @@ -2504,6 +2803,7 @@ scavenge_stack(StgPtr p, StgPtr stack_end) objects are (repeatedly) mutable, so most of the time evac_gen will be zero. --------------------------------------------------------------------------- */ +//@cindex scavenge_large static void scavenge_large(step *step) @@ -2580,6 +2880,7 @@ scavenge_large(step *step) case TSO: scavengeTSO((StgTSO *)p); + // HWL: old PAR code deleted here continue; default: @@ -2588,6 +2889,8 @@ scavenge_large(step *step) } } +//@cindex zero_static_object_list + static void zero_static_object_list(StgClosure* first_static) { @@ -2610,6 +2913,8 @@ zero_static_object_list(StgClosure* first_static) * It doesn't do any harm to zero all the mutable link fields on the * mutable list. 
*/ +//@cindex zero_mutable_list + static void zero_mutable_list( StgMutClosure *first ) { @@ -2621,9 +2926,13 @@ zero_mutable_list( StgMutClosure *first ) } } +//@node Reverting CAFs, Sanity code for CAF garbage collection, Scavenging +//@subsection Reverting CAFs + /* ----------------------------------------------------------------------------- Reverting CAFs -------------------------------------------------------------------------- */ +//@cindex RevertCAFs void RevertCAFs(void) { @@ -2639,6 +2948,8 @@ void RevertCAFs(void) enteredCAFs = END_CAF_LIST; } +//@cindex revert_dead_CAFs + void revert_dead_CAFs(void) { StgCAF* caf = enteredCAFs; @@ -2660,6 +2971,9 @@ void revert_dead_CAFs(void) } } +//@node Sanity code for CAF garbage collection, Lazy black holing, Reverting CAFs +//@subsection Sanity code for CAF garbage collection + /* ----------------------------------------------------------------------------- Sanity code for CAF garbage collection. @@ -2673,6 +2987,8 @@ void revert_dead_CAFs(void) -------------------------------------------------------------------------- */ #ifdef DEBUG +//@cindex gcCAFs + static void gcCAFs(void) { @@ -2710,6 +3026,9 @@ gcCAFs(void) } #endif +//@node Lazy black holing, Stack squeezing, Sanity code for CAF garbage collection +//@subsection Lazy black holing + /* ----------------------------------------------------------------------------- Lazy black holing. @@ -2717,6 +3036,7 @@ gcCAFs(void) some work, we have to run down the stack and black-hole all the closures referred to by update frames. -------------------------------------------------------------------------- */ +//@cindex threadLazyBlackHole static void threadLazyBlackHole(StgTSO *tso) @@ -2772,6 +3092,9 @@ threadLazyBlackHole(StgTSO *tso) } } +//@node Stack squeezing, Pausing a thread, Lazy black holing +//@subsection Stack squeezing + /* ----------------------------------------------------------------------------- * Stack squeezing * @@ -2779,6 +3102,7 @@ threadLazyBlackHole(StgTSO *tso) * lazy black holing here. 
 *
 * -------------------------------------------------------------------------- */
+//@cindex threadSqueezeStack
 
static void
threadSqueezeStack(StgTSO *tso)
@@ -2789,6 +3113,14 @@ threadSqueezeStack(StgTSO *tso)
 StgUpdateFrame *prev_frame; /* Temporally previous */
 StgPtr bottom;
 rtsBool prev_was_update_frame;
+#if DEBUG
+ StgUpdateFrame *top_frame;
+ nat upd_frames=0, stop_frames=0, catch_frames=0, seq_frames=0,
+ bhs=0, squeezes=0;
+ void printObj( StgClosure *obj ); // from Printer.c
+
+ top_frame = tso->su;
+#endif
 
 bottom = &(tso->stack[tso->stack_size]);
 frame = tso->su;
@@ -2814,6 +3146,30 @@ threadSqueezeStack(StgTSO *tso)
 frame->link = next_frame;
 next_frame = frame;
 frame = prev_frame;
+#if DEBUG
+ IF_DEBUG(sanity,
+ if (!(frame>=top_frame && frame<=bottom)) {
+ printObj((StgClosure *)prev_frame);
+ barf("threadSqueezeStack: current frame is rubbish %p; previous was %p\n",
+ frame, prev_frame);
+ })
+ switch (get_itbl(frame)->type) {
+ case UPDATE_FRAME: upd_frames++;
+ if (frame->updatee->header.info == &BLACKHOLE_info)
+ bhs++;
+ break;
+ case STOP_FRAME: stop_frames++;
+ break;
+ case CATCH_FRAME: catch_frames++;
+ break;
+ case SEQ_FRAME: seq_frames++;
+ break;
+ default:
+ printObj((StgClosure *)prev_frame);
+ barf("Found non-frame during stack squeezing at %p (prev frame was %p)\n",
+ frame, prev_frame);
+ }
+#endif
 if (get_itbl(frame)->type == UPDATE_FRAME
 && frame->updatee->header.info == &BLACKHOLE_info) {
 break;
@@ -2863,8 +3219,9 @@ threadSqueezeStack(StgTSO *tso)
 StgClosure *updatee_keep = prev_frame->updatee;
 StgClosure *updatee_bypass = frame->updatee;
 
-#if 0 /* DEBUG */
- fprintf(stderr, "squeezing frame at %p\n", frame);
+#if DEBUG
+ IF_DEBUG(gc, fprintf(stderr, "@@ squeezing frame at %p\n", frame));
+ squeezes++;
 #endif
 
 /* Deal with blocking queues. If both updatees have blocked
@@ -2949,9 +3306,10 @@ threadSqueezeStack(StgTSO *tso)
 else
 next_frame_bottom = tso->sp - 1;
 
-#if 0 /* DEBUG */
- fprintf(stderr, "sliding [%p, %p] by %ld\n", sp, next_frame_bottom,
- displacement);
+#if DEBUG
+ IF_DEBUG(gc,
+ fprintf(stderr, "sliding [%p, %p] by %ld\n", sp, next_frame_bottom,
+ displacement))
 #endif
 
 while (sp >= next_frame_bottom) {
@@ -2965,8 +3323,16 @@ threadSqueezeStack(StgTSO *tso)
 tso->sp += displacement;
 tso->su = prev_frame;
+#if DEBUG
+ IF_DEBUG(gc,
+ fprintf(stderr, "@@ threadSqueezeStack: squeezed %d update-frames; found %d BHs; found %d update-, %d stop-, %d catch, %d seq-frames\n",
+ squeezes, bhs, upd_frames, stop_frames, catch_frames, seq_frames))
+#endif
}
 
+//@node Pausing a thread, Index, Stack squeezing
+//@subsection Pausing a thread
+
 /* -----------------------------------------------------------------------------
 * Pausing a thread
 *
 * We are taking a short break from running Haskell code
 * here. We also take the opportunity to do stack squeezing if it's
 * turned on.
 * -------------------------------------------------------------------------- */
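threadPaused below is the entry point the rest of the RTS is expected to use; a sketch of a typical call site when a thread is descheduled (the surrounding scheduler code is not part of this patch, so this is indicative only):

    /* on a context switch, roughly: */
    SaveThreadState();           /* flush Sp/Su/Hp back into the TSO */
    threadPaused(CurrentTSO);    /* lazy black-holing and/or stack squeezing */

Doing the work here keeps both optimisations off the per-update hot path: they run once per pause instead of once per thunk update.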
* -------------------------------------------------------------------------- */ +//@cindex threadPaused void threadPaused(StgTSO *tso) @@ -2983,3 +3350,83 @@ threadPaused(StgTSO *tso) else threadLazyBlackHole(tso); } + +#if DEBUG +//@cindex printMutOnceList +void +printMutOnceList(generation *gen) +{ + const StgInfoTable *info; + StgMutClosure *p, *next, *new_list; + + p = gen->mut_once_list; + new_list = END_MUT_LIST; + next = p->mut_link; + + evac_gen = gen->no; + failed_to_evac = rtsFalse; + + fprintf(stderr, "@@ Mut once list %p: ", gen->mut_once_list); + for (; p != END_MUT_LIST; p = next, next = p->mut_link) { + fprintf(stderr, "%p (%s), ", + p, info_type((StgClosure *)p)); + } + fputc('\n', stderr); +} + +//@cindex printMutableList +void +printMutableList(generation *gen) +{ + const StgInfoTable *info; + StgMutClosure *p, *next; + + p = gen->saved_mut_list; + next = p->mut_link; + + evac_gen = 0; + failed_to_evac = rtsFalse; + + fprintf(stderr, "@@ Mutable list %p: ", gen->saved_mut_list); + for (; p != END_MUT_LIST; p = next, next = p->mut_link) { + fprintf(stderr, "%p (%s), ", + p, info_type((StgClosure *)p)); + } + fputc('\n', stderr); +} +#endif /* DEBUG */ + +//@node Index, , Pausing a thread +//@subsection Index + +//@index +//* GarbageCollect:: @cindex\s-+GarbageCollect +//* MarkRoot:: @cindex\s-+MarkRoot +//* RevertCAFs:: @cindex\s-+RevertCAFs +//* addBlock:: @cindex\s-+addBlock +//* cleanup_weak_ptr_list:: @cindex\s-+cleanup_weak_ptr_list +//* copy:: @cindex\s-+copy +//* copyPart:: @cindex\s-+copyPart +//* evacuate:: @cindex\s-+evacuate +//* evacuate_large:: @cindex\s-+evacuate_large +//* gcCAFs:: @cindex\s-+gcCAFs +//* isAlive:: @cindex\s-+isAlive +//* mkMutCons:: @cindex\s-+mkMutCons +//* relocate_TSO:: @cindex\s-+relocate_TSO +//* revert_dead_CAFs:: @cindex\s-+revert_dead_CAFs +//* scavenge:: @cindex\s-+scavenge +//* scavenge_large:: @cindex\s-+scavenge_large +//* scavenge_mut_once_list:: @cindex\s-+scavenge_mut_once_list +//* scavenge_mutable_list:: @cindex\s-+scavenge_mutable_list +//* scavenge_one:: @cindex\s-+scavenge_one +//* scavenge_srt:: @cindex\s-+scavenge_srt +//* scavenge_stack:: @cindex\s-+scavenge_stack +//* scavenge_static:: @cindex\s-+scavenge_static +//* threadLazyBlackHole:: @cindex\s-+threadLazyBlackHole +//* threadPaused:: @cindex\s-+threadPaused +//* threadSqueezeStack:: @cindex\s-+threadSqueezeStack +//* traverse_weak_ptr_list:: @cindex\s-+traverse_weak_ptr_list +//* upd_evacuee:: @cindex\s-+upd_evacuee +//* zero_mutable_list:: @cindex\s-+zero_mutable_list +//* zero_static_object_list:: @cindex\s-+zero_static_object_list +//@end index diff --git a/ghc/rts/GC.h b/ghc/rts/GC.h index dc7beb8..212620e 100644 --- a/ghc/rts/GC.h +++ b/ghc/rts/GC.h @@ -1,5 +1,5 @@ /* ----------------------------------------------------------------------------- - * $Id: GC.h,v 1.4 1999/02/05 16:02:43 simonm Exp $ + * $Id: GC.h,v 1.5 2000/01/13 14:34:03 hwloidl Exp $ * * (c) The GHC Team, 1998-1999 * @@ -9,3 +9,4 @@ void threadPaused(StgTSO *); StgClosure *isAlive(StgClosure *p); +void GarbageCollect(void (*get_roots)(void)); diff --git a/ghc/rts/Hash.h b/ghc/rts/Hash.h index ac0df5c..74ff321 100644 --- a/ghc/rts/Hash.h +++ b/ghc/rts/Hash.h @@ -1,5 +1,5 @@ /*----------------------------------------------------------------------------- - * $Id: Hash.h,v 1.1 1999/01/27 12:11:26 simonm Exp $ + * $Id: Hash.h,v 1.2 2000/01/13 14:34:03 hwloidl Exp $ * * (c) The GHC Team, 1999 * @@ -14,3 +14,4 @@ void insertHashTable ( HashTable *table, StgWord key, void *data ); void * 
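/* A usage sketch for the GarbageCollect() prototype exported in GC.h
   above (illustration only, not part of the patch): the caller supplies
   a callback that marks its roots, assuming MarkRoot() (listed in the
   GC.c index above) evacuates a root and returns its new address.
   mark_my_roots and my_root are invented names.

     static StgClosure *my_root;

     static void mark_my_roots(void)
     {
         my_root = MarkRoot(my_root);
     }

     ...
     GarbageCollect(mark_my_roots);
*/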
removeHashTable ( HashTable *table, StgWord key, void *data ); void freeHashTable ( HashTable *table, void (*freeDataFun)(void *) ); HashTable * allocHashTable ( void ); + diff --git a/ghc/rts/HeapStackCheck.hc b/ghc/rts/HeapStackCheck.hc index fc29ba7..1a30f44 100644 --- a/ghc/rts/HeapStackCheck.hc +++ b/ghc/rts/HeapStackCheck.hc @@ -1,5 +1,5 @@ /* ----------------------------------------------------------------------------- - * $Id: HeapStackCheck.hc,v 1.10 1999/11/09 15:46:51 simonmar Exp $ + * $Id: HeapStackCheck.hc,v 1.11 2000/01/13 14:34:03 hwloidl Exp $ * * (c) The GHC Team, 1998-1999 * @@ -276,6 +276,334 @@ EXTFUN(stg_gc_enter_8) FE_ } +#if defined(GRAN) +/* + ToDo: merge the block and yield macros, calling something like BLOCK(N) + at the end; +*/ + +/* + Should we actually ever do a yield in such a case?? -- HWL +*/ +EXTFUN(gran_yield_0) +{ + FB_ + SaveThreadState(); + CurrentTSO->whatNext = ThreadEnterGHC; + R1.i = ThreadYielding; + JMP_(StgReturn); + FE_ +} + +EXTFUN(gran_yield_1) +{ + FB_ + Sp -= 1; + Sp[0] = R1.w; + SaveThreadState(); + CurrentTSO->whatNext = ThreadEnterGHC; + R1.i = ThreadYielding; + JMP_(StgReturn); + FE_ +} + +/*- 2 Regs--------------------------------------------------------------------*/ + +EXTFUN(gran_yield_2) +{ + FB_ + Sp -= 2; + Sp[1] = R2.w; + Sp[0] = R1.w; + SaveThreadState(); + CurrentTSO->whatNext = ThreadEnterGHC; + R1.i = ThreadYielding; + JMP_(StgReturn); + FE_ +} + +/*- 3 Regs -------------------------------------------------------------------*/ + +EXTFUN(gran_yield_3) +{ + FB_ + Sp -= 3; + Sp[2] = R3.w; + Sp[1] = R2.w; + Sp[0] = R1.w; + SaveThreadState(); + CurrentTSO->whatNext = ThreadEnterGHC; + R1.i = ThreadYielding; + JMP_(StgReturn); + FE_ +} + +/*- 4 Regs -------------------------------------------------------------------*/ + +EXTFUN(gran_yield_4) +{ + FB_ + Sp -= 4; + Sp[3] = R4.w; + Sp[2] = R3.w; + Sp[1] = R2.w; + Sp[0] = R1.w; + SaveThreadState(); + CurrentTSO->whatNext = ThreadEnterGHC; + R1.i = ThreadYielding; + JMP_(StgReturn); + FE_ +} + +/*- 5 Regs -------------------------------------------------------------------*/ + +EXTFUN(gran_yield_5) +{ + FB_ + Sp -= 5; + Sp[4] = R5.w; + Sp[3] = R4.w; + Sp[2] = R3.w; + Sp[1] = R2.w; + Sp[0] = R1.w; + SaveThreadState(); + CurrentTSO->whatNext = ThreadEnterGHC; + R1.i = ThreadYielding; + JMP_(StgReturn); + FE_ +} + +/*- 6 Regs -------------------------------------------------------------------*/ + +EXTFUN(gran_yield_6) +{ + FB_ + Sp -= 6; + Sp[5] = R6.w; + Sp[4] = R5.w; + Sp[3] = R4.w; + Sp[2] = R3.w; + Sp[1] = R2.w; + Sp[0] = R1.w; + SaveThreadState(); + CurrentTSO->whatNext = ThreadEnterGHC; + R1.i = ThreadYielding; + JMP_(StgReturn); + FE_ +} + +/*- 7 Regs -------------------------------------------------------------------*/ + +EXTFUN(gran_yield_7) +{ + FB_ + Sp -= 7; + Sp[6] = R7.w; + Sp[5] = R6.w; + Sp[4] = R5.w; + Sp[3] = R4.w; + Sp[2] = R3.w; + Sp[1] = R2.w; + Sp[0] = R1.w; + SaveThreadState(); + CurrentTSO->whatNext = ThreadEnterGHC; + R1.i = ThreadYielding; + JMP_(StgReturn); + FE_ +} + +/*- 8 Regs -------------------------------------------------------------------*/ + +EXTFUN(gran_yield_8) +{ + FB_ + Sp -= 8; + Sp[7] = R8.w; + Sp[6] = R7.w; + Sp[5] = R6.w; + Sp[4] = R5.w; + Sp[3] = R4.w; + Sp[2] = R3.w; + Sp[1] = R2.w; + Sp[0] = R1.w; + SaveThreadState(); + CurrentTSO->whatNext = ThreadEnterGHC; + R1.i = ThreadYielding; + JMP_(StgReturn); + FE_ +} + +// the same routines but with a block rather than a yield + +EXTFUN(gran_block_1) +{ + FB_ + Sp -= 1; + Sp[0] = R1.w; + 
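/* A sketch (not in the patch) of the pattern shared by the gran_yield_N
   and gran_block_N entry points in this file: flush N registers to the
   stack, save the thread state, and hand a result code back to the
   scheduler. A hypothetical generator macro, assuming the FB_/FE_
   brackets and register fields behave exactly as in the code above:

     #define GRAN_STOP_BODY(N,SAVE_REGS,RESULT) \
       FB_                                      \
       Sp -= (N);                               \
       SAVE_REGS;                               \
       SaveThreadState();                       \
       CurrentTSO->whatNext = ThreadEnterGHC;   \
       R1.i = (RESULT);                         \
       JMP_(StgReturn);                         \
       FE_

   gran_yield_2 would then pass ThreadYielding and gran_block_2
   ThreadBlocked, with SAVE_REGS = (Sp[1]=R2.w, Sp[0]=R1.w); gran_yield_0
   saves no registers at all. */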
SaveThreadState(); + CurrentTSO->whatNext = ThreadEnterGHC; + R1.i = ThreadBlocked; + JMP_(StgReturn); + FE_ +} + +/*- 2 Regs--------------------------------------------------------------------*/ + +EXTFUN(gran_block_2) +{ + FB_ + Sp -= 2; + Sp[1] = R2.w; + Sp[0] = R1.w; + SaveThreadState(); + CurrentTSO->whatNext = ThreadEnterGHC; + R1.i = ThreadBlocked; + JMP_(StgReturn); + FE_ +} + +/*- 3 Regs -------------------------------------------------------------------*/ + +EXTFUN(gran_block_3) +{ + FB_ + Sp -= 3; + Sp[2] = R3.w; + Sp[1] = R2.w; + Sp[0] = R1.w; + SaveThreadState(); + CurrentTSO->whatNext = ThreadEnterGHC; + R1.i = ThreadBlocked; + JMP_(StgReturn); + FE_ +} + +/*- 4 Regs -------------------------------------------------------------------*/ + +EXTFUN(gran_block_4) +{ + FB_ + Sp -= 4; + Sp[3] = R4.w; + Sp[2] = R3.w; + Sp[1] = R2.w; + Sp[0] = R1.w; + SaveThreadState(); + CurrentTSO->whatNext = ThreadEnterGHC; + R1.i = ThreadBlocked; + JMP_(StgReturn); + FE_ +} + +/*- 5 Regs -------------------------------------------------------------------*/ + +EXTFUN(gran_block_5) +{ + FB_ + Sp -= 5; + Sp[4] = R5.w; + Sp[3] = R4.w; + Sp[2] = R3.w; + Sp[1] = R2.w; + Sp[0] = R1.w; + SaveThreadState(); + CurrentTSO->whatNext = ThreadEnterGHC; + R1.i = ThreadBlocked; + JMP_(StgReturn); + FE_ +} + +/*- 6 Regs -------------------------------------------------------------------*/ + +EXTFUN(gran_block_6) +{ + FB_ + Sp -= 6; + Sp[5] = R6.w; + Sp[4] = R5.w; + Sp[3] = R4.w; + Sp[2] = R3.w; + Sp[1] = R2.w; + Sp[0] = R1.w; + SaveThreadState(); + CurrentTSO->whatNext = ThreadEnterGHC; + R1.i = ThreadBlocked; + JMP_(StgReturn); + FE_ +} + +/*- 7 Regs -------------------------------------------------------------------*/ + +EXTFUN(gran_block_7) +{ + FB_ + Sp -= 7; + Sp[6] = R7.w; + Sp[5] = R6.w; + Sp[4] = R5.w; + Sp[3] = R4.w; + Sp[2] = R3.w; + Sp[1] = R2.w; + Sp[0] = R1.w; + SaveThreadState(); + CurrentTSO->whatNext = ThreadEnterGHC; + R1.i = ThreadBlocked; + JMP_(StgReturn); + FE_ +} + +/*- 8 Regs -------------------------------------------------------------------*/ + +EXTFUN(gran_block_8) +{ + FB_ + Sp -= 8; + Sp[7] = R8.w; + Sp[6] = R7.w; + Sp[5] = R6.w; + Sp[4] = R5.w; + Sp[3] = R4.w; + Sp[2] = R3.w; + Sp[1] = R2.w; + Sp[0] = R1.w; + SaveThreadState(); + CurrentTSO->whatNext = ThreadEnterGHC; + R1.i = ThreadBlocked; + JMP_(StgReturn); + FE_ +} + +#endif + +#if 0 && defined(PAR) + +/* + Similar to stg_block_1 (called via StgMacro BLOCK_NP) but separates the + saving of the thread state from the actual jump via an StgReturn. + We need this separation because we call RTS routines in blocking entry codes + before jumping back into the RTS (see parallel/FetchMe.hc). 
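
   A sketch of the intended control flow in such a blocking entry point
   (schematic rather than literal C; sendFetch stands for whatever RTS
   routine needs to run in between):

       par_block_1_no_jump();   -- save R1 and the thread state
       sendFetch(...);          -- arbitrary RTS work is now safe
       JMP_(par_jump);          -- set ThreadBlocked, re-enter the scheduler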
+*/ + +EXTFUN(par_block_1_no_jump) +{ + FB_ + Sp -= 1; + Sp[0] = R1.w; + SaveThreadState(); + FE_ +} + +EXTFUN(par_jump) +{ + FB_ + CurrentTSO->whatNext = ThreadEnterGHC; + R1.i = ThreadBlocked; + JMP_(StgReturn); + FE_ +} + +#endif + /* ----------------------------------------------------------------------------- For a case expression on a polymorphic or function-typed object, if the default branch (there can only be one branch) of the case fails diff --git a/ghc/rts/Main.c b/ghc/rts/Main.c index 09e6e21..1c721b7 100644 --- a/ghc/rts/Main.c +++ b/ghc/rts/Main.c @@ -1,5 +1,5 @@ /* ----------------------------------------------------------------------------- - * $Id: Main.c,v 1.13 2000/01/13 12:40:15 simonmar Exp $ + * $Id: Main.c,v 1.14 2000/01/13 14:34:03 hwloidl Exp $ * * (c) The GHC Team 1998-1999 * @@ -16,21 +16,25 @@ #include "RtsUtils.h" #ifdef DEBUG -#include "Printer.h" /* for printing */ +# include "Printer.h" /* for printing */ #endif #ifdef INTERPRETER -#include "Assembler.h" +# include "Assembler.h" #endif #ifdef PAR -#include "ParInit.h" -#include "Parallel.h" -#include "LLC.h" +# include "ParInit.h" +# include "Parallel.h" +# include "LLC.h" +#endif + +#if defined(GRAN) || defined(PAR) +# include "GranSimRts.h" #endif #ifdef HAVE_WINDOWS_H -#include +# include #endif @@ -41,24 +45,65 @@ int main(int argc, char *argv[]) { int exit_status; - SchedulerStatus status; + /* all GranSim/GUM init is done in startupHaskell; sets IAmMainThread! */ + startupHaskell(argc,argv); -# ifndef PAR - /* ToDo: want to start with a larger stack size */ - status = rts_evalIO((StgClosure *)&mainIO_closure, NULL); -# else + /* kick off the computation by creating the main thread with a pointer + to mainIO_closure representing the computation of the overall program; + then enter the scheduler with this thread and off we go; + + the same for GranSim (we have only one instance of this code) + + in a parallel setup, where we have many instances of this code + running on different PEs, we should do this only for the main PE + (IAmMainThread is set in startupHaskell) + */ + +# if defined(PAR) + +# if DEBUG + { /* a wait loop to allow attachment of gdb to UNIX threads */ + nat i, j, s; + + for (i=0, s=0; ihead != (StgTSO *)&END_TSO_QUEUE_closure) { ASSERT(mvar->head->why_blocked == BlockedOnMVar); +#if defined(GRAN) +# error FixME +#elif defined(PAR) + // ToDo: check 2nd arg (mvar) is right + mvar->head = RET_STGCALL2(StgTSO *,unblockOne,mvar->head,mvar); +#else mvar->head = RET_STGCALL1(StgTSO *,unblockOne,mvar->head); +#endif if (mvar->head == (StgTSO *)&END_TSO_QUEUE_closure) { mvar->tail = (StgTSO *)&END_TSO_QUEUE_closure; } diff --git a/ghc/rts/Printer.c b/ghc/rts/Printer.c index cbb20dd..600d0a2 100644 --- a/ghc/rts/Printer.c +++ b/ghc/rts/Printer.c @@ -1,6 +1,5 @@ - /* ----------------------------------------------------------------------------- - * $Id: Printer.c,v 1.18 1999/11/29 18:59:46 sewardj Exp $ + * $Id: Printer.c,v 1.19 2000/01/13 14:34:04 hwloidl Exp $ * * Copyright (c) 1994-1999. 
* @@ -18,6 +17,12 @@ #include "Bytecodes.h" /* for InstrPtr */ #include "Disassembler.h" +#include "Printer.h" + +// HWL: explicit fixed header size to make debugging easier +int fixed_hs = FIXED_HS, itbl_sz = sizeofW(StgInfoTable), + uf_sz=sizeofW(StgUpdateFrame), sf_sz=sizeofW(StgSeqFrame); + /* -------------------------------------------------------------------------- * local function decls * ------------------------------------------------------------------------*/ @@ -198,6 +203,14 @@ void printClosure( StgClosure *obj ) fprintf(stderr,")\n"); break; +#if defined(GRAN) || defined(PAR) + case RBH: + fprintf(stderr,"RBH("); + printPtr((StgPtr)stgCast(StgRBH*,obj)->blocking_queue); + fprintf(stderr,")\n"); + break; +#endif + case CONSTR: case CONSTR_1_0: case CONSTR_0_1: case CONSTR_1_1: case CONSTR_0_2: case CONSTR_2_0: @@ -305,6 +318,13 @@ void printClosure( StgClosure *obj ) } } +/* +void printGraph( StgClosure *obj ) +{ + printClosure(obj); +} +*/ + StgPtr printStackObj( StgPtr sp ) { /*fprintf(stderr,"Stack[%d] = ", &stgStack[STACK_SIZE] - sp); */ @@ -678,7 +698,7 @@ static void printZcoded( const char *raw ) /* Causing linking trouble on Win32 plats, so I'm disabling this for now. */ -#if defined(HAVE_BFD_H) && !defined(_WIN32) +#if defined(HAVE_BFD_H) && !defined(_WIN32) && defined(USE_BSD) #include diff --git a/ghc/rts/RtsFlags.c b/ghc/rts/RtsFlags.c index c3c0515..8324e1a 100644 --- a/ghc/rts/RtsFlags.c +++ b/ghc/rts/RtsFlags.c @@ -1,5 +1,5 @@ /* ----------------------------------------------------------------------------- - * $Id: RtsFlags.c,v 1.23 2000/01/13 12:40:15 simonmar Exp $ + * $Id: RtsFlags.c,v 1.24 2000/01/13 14:34:04 hwloidl Exp $ * * (c) The AQUA Project, Glasgow University, 1994-1997 * (c) The GHC Team, 1998-1999 @@ -8,6 +8,19 @@ * * ---------------------------------------------------------------------------*/ +//@menu +//* Includes:: +//* Constants:: +//* Static function decls:: +//* Command-line option parsing routines:: +//* GranSim specific options:: +//* Aux fcts:: +//@end menu +//*/ + +//@node Includes, Constants +//@subsection Includes + #include "Rts.h" #include "RtsFlags.h" #include "RtsUtils.h" @@ -36,12 +49,100 @@ char **prog_argv = NULL; int rts_argc; /* ditto */ char *rts_argv[MAX_RTS_ARGS]; +//@node Constants, Static function decls, Includes +//@subsection Constants + /* * constants, used later */ #define RTS 1 #define PGM 0 +#if defined(GRAN) + +char *gran_debug_opts_strs[] = { + "DEBUG (-bDe, -bD1): event_trace; printing event trace.\n", + "DEBUG (-bDE, -bD2): event_stats; printing event statistics.\n", + "DEBUG (-bDb, -bD4): bq; check blocking queues\n", + "DEBUG (-bDG, -bD8): pack; routines for (un-)packing graph structures.\n", + "DEBUG (-bDq, -bD16): checkSparkQ; check consistency of the spark queues.\n", + "DEBUG (-bDf, -bD32): thunkStealing; print forwarding of fetches.\n", + "DEBUG (-bDr, -bD64): randomSteal; stealing sparks/threads from random PEs.\n", + "DEBUG (-bDF, -bD128): findWork; searching spark-pools (local & remote), thread queues for work.\n", + "DEBUG (-bDu, -bD256): unused; currently unused flag.\n", + "DEBUG (-bDS, -bD512): pri; priority sparking or scheduling.\n", + "DEBUG (-bD:, -bD1024): checkLight; check GranSim-Light setup.\n", + "DEBUG (-bDo, -bD2048): sortedQ; check whether spark/thread queues are sorted.\n", + "DEBUG (-bDz, -bD4096): blockOnFetch; check for blocked on fetch.\n", + "DEBUG (-bDP, -bD8192): packBuffer; routines handling pack buffer (GranSim internal!).\n", + "DEBUG (-bDt, -bD16384): blockOnFetch_sanity; 
check for TSO asleep on fetch.\n", +}; + +/* one character codes for the available debug options */ +char gran_debug_opts_flags[] = { + 'e', 'E', 'b', 'G', 'q', 'f', 'r', 'F', 'u', 'S', ':', 'o', 'z', 'P', 't' +}; + +/* prefix strings printed with the debug messages of the corresponding type */ +char *gran_debug_opts_prefix[] = { + "", /* event_trace */ + "", /* event_stats */ + "##", /* bq */ + "**", /* pack */ + "^^", /* checkSparkQ */ + "==", /* thunkStealing */ + "^^", /* randomSteal */ + "+-", /* findWork */ + "", /* unused */ + "++", /* pri */ + "::", /* checkLight */ + "##", /* sortedQ */ + "", /* blockOnFetch */ + "", /* packBuffer */ + "" /* blockOnFetch_sanity */ +}; + +#elif defined(PAR) + +char *par_debug_opts_strs[] = { + "DEBUG (-qDv, -qD1): verbose; be generally verbose with parallel related stuff.\n", + "DEBUG (-qDt, -qD2): trace; trace messages.\n", + "DEBUG (-qDs, -qD4): schedule; scheduling of parallel threads.\n", + "DEBUG (-qDe, -qD8): free; free messages.\n", + "DEBUG (-qDr, -qD16): resume; resume messages.\n", + "DEBUG (-qDw, -qD32): weight; print weights for GC.\n", + "DEBUG (-qDF, -qD64): fetch; fetch messages.\n", + "DEBUG (-qDa, -qD128): ack; ack messages.\n", + "DEBUG (-qDf, -qD256): fish; fish messages.\n", + "DEBUG (-qDo, -qD512): forward; forwarding messages to other PEs.\n", + "DEBUG (-qDp, -qD1024): pack; packing and unpacking graphs.\n" +}; + +/* one character codes for the available debug options */ +char par_debug_opts_flags[] = { + 'v', 't', 's', 'e', 'r', 'w', 'F', 'a', 'f', 'o', 'p' +}; + +/* prefix strings printed with the debug messages of the corresponding type */ +char *par_debug_opts_prefix[] = { + " ", /* verbose */ + "..", /* trace */ + "--", /* schedule */ + "!!", /* free */ + "[]", /* resume */ + ";;", /* weight */ + "%%", /* fetch */ + ",,", /* ack */ + "$$", /* fish */ + "", /* forward */ + "**" /* pack */ +}; + +#endif /* PAR */ + +//@node Static function decls, Command-line option parsing routines, Constants +//@subsection Static function decls + /* ----------------------------------------------------------------------------- Static function decls -------------------------------------------------------------------------- */ @@ -56,6 +157,20 @@ open_stats_file ( static I_ decode(const char *s); static void bad_option(const char *s); +#if defined(GRAN) +static void enable_GranSimLight(void); +static void process_gran_option(int arg, int *rts_argc, char *rts_argv[], rtsBool *error); +static void set_GranSim_debug_options(nat n); +static void help_GranSim_debug_options(nat n); +#elif defined(PAR) +static void process_par_option(int arg, int *rts_argc, char *rts_argv[], rtsBool *error); +static void set_par_debug_options(nat n); +static void help_par_debug_options(nat n); +#endif + +//@node Command-line option parsing routines, GranSim specific options, Static function decls +//@subsection Command-line option parsing routines + /* ----------------------------------------------------------------------------- * Command-line option parsing routines. 
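 *
 * A worked example of the -qD<n>/-bD<n> bitmask encoding handled below
 * (derived from the tables above, no new behaviour): in a GUM setup,
 * -qD5 decodes to n = 5 = binary 101, so bits 0 and 2 are set and
 * set_par_debug_options() switches on Debug.verbose and Debug.schedule;
 * the same effect can be had by name with -qDv -qDs.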
* ---------------------------------------------------------------------------*/ @@ -109,82 +224,92 @@ void initRtsFlagsDefaults(void) #endif #ifdef PAR - RtsFlags.ParFlags.parallelStats = rtsFalse; - RtsFlags.ParFlags.granSimStats = rtsFalse; - RtsFlags.ParFlags.granSimStats_Binary = rtsFalse; + RtsFlags.ParFlags.ParStats.Full = rtsFalse; + RtsFlags.ParFlags.ParStats.Binary = rtsFalse; + RtsFlags.ParFlags.ParStats.Sparks = rtsFalse; + RtsFlags.ParFlags.ParStats.Heap = rtsFalse; + RtsFlags.ParFlags.ParStats.NewLogfile = rtsFalse; + RtsFlags.ParFlags.ParStats.Global = rtsFalse; + RtsFlags.ParFlags.outputDisabled = rtsFalse; RtsFlags.ParFlags.packBufferSize = 1024; + + RtsFlags.ParFlags.maxThreads = 1024; + RtsFlags.ParFlags.maxFishes = MAX_FISHES; + RtsFlags.ParFlags.fishDelay = FISH_DELAY; #endif #if defined(PAR) || defined(SMP) RtsFlags.ParFlags.maxLocalSparks = 4096; -#endif +#endif /* PAR || SMP */ + +#if defined(GRAN) + /* ToDo: check defaults for GranSim and GUM */ + RtsFlags.ConcFlags.ctxtSwitchTime = CS_MIN_MILLISECS; /* In milliseconds */ + RtsFlags.ConcFlags.maxThreads = 65536; // refers to mandatory threads + RtsFlags.GcFlags.maxStkSize = (1024 * 1024) / sizeof(W_); + RtsFlags.GcFlags.initialStkSize = 1024 / sizeof(W_); + + RtsFlags.GranFlags.GranSimStats.Full = rtsFalse; + RtsFlags.GranFlags.GranSimStats.Suppressed = rtsFalse; + RtsFlags.GranFlags.GranSimStats.Binary = rtsFalse; + RtsFlags.GranFlags.GranSimStats.Sparks = rtsFalse; + RtsFlags.GranFlags.GranSimStats.Heap = rtsFalse; + RtsFlags.GranFlags.GranSimStats.NewLogfile = rtsFalse; + RtsFlags.GranFlags.GranSimStats.Global = rtsFalse; -#ifdef GRAN - RtsFlags.GranFlags.granSimStats = rtsFalse; - RtsFlags.GranFlags.granSimStats_suppressed = rtsFalse; - RtsFlags.GranFlags.granSimStats_Binary = rtsFalse; - RtsFlags.GranFlags.granSimStats_Sparks = rtsFalse; - RtsFlags.GranFlags.granSimStats_Heap = rtsFalse; - RtsFlags.GranFlags.labelling = rtsFalse; RtsFlags.GranFlags.packBufferSize = 1024; RtsFlags.GranFlags.packBufferSize_internal = GRANSIM_DEFAULT_PACK_BUFFER_SIZE; - RtsFlags.GranFlags.proc = MAX_PROC; - RtsFlags.GranFlags.max_fishes = MAX_FISHES; - RtsFlags.GranFlags.time_slice = GRAN_TIME_SLICE; - RtsFlags.GranFlags.Light = rtsFalse; - - RtsFlags.GranFlags.gran_latency = LATENCY; - RtsFlags.GranFlags.gran_additional_latency = ADDITIONAL_LATENCY; - RtsFlags.GranFlags.gran_fetchtime = FETCHTIME; - RtsFlags.GranFlags.gran_lunblocktime = LOCALUNBLOCKTIME; - RtsFlags.GranFlags.gran_gunblocktime = GLOBALUNBLOCKTIME; - RtsFlags.GranFlags.gran_mpacktime = MSGPACKTIME; - RtsFlags.GranFlags.gran_munpacktime = MSGUNPACKTIME; - RtsFlags.GranFlags.gran_mtidytime = MSGTIDYTIME; - - RtsFlags.GranFlags.gran_threadcreatetime = THREADCREATETIME; - RtsFlags.GranFlags.gran_threadqueuetime = THREADQUEUETIME; - RtsFlags.GranFlags.gran_threaddescheduletime = THREADDESCHEDULETIME; - RtsFlags.GranFlags.gran_threadscheduletime = THREADSCHEDULETIME; - RtsFlags.GranFlags.gran_threadcontextswitchtime = THREADCONTEXTSWITCHTIME; - - RtsFlags.GranFlags.gran_arith_cost = ARITH_COST; - RtsFlags.GranFlags.gran_branch_cost = BRANCH_COST; - RtsFlags.GranFlags.gran_load_cost = LOAD_COST; - RtsFlags.GranFlags.gran_store_cost = STORE_COST; - RtsFlags.GranFlags.gran_float_cost = FLOAT_COST; - - RtsFlags.GranFlags.gran_heapalloc_cost = HEAPALLOC_COST; - - RtsFlags.GranFlags.gran_pri_spark_overhead = PRI_SPARK_OVERHEAD; - RtsFlags.GranFlags.gran_pri_sched_overhead = PRI_SCHED_OVERHEAD; - - RtsFlags.GranFlags.DoFairSchedule = rtsFalse; - 
RtsFlags.GranFlags.DoReScheduleOnFetch = rtsFalse; - RtsFlags.GranFlags.DoStealThreadsFirst = rtsFalse; - RtsFlags.GranFlags.SimplifiedFetch = rtsFalse; - RtsFlags.GranFlags.DoAlwaysCreateThreads = rtsFalse; - RtsFlags.GranFlags.DoGUMMFetching = rtsFalse; - RtsFlags.GranFlags.DoThreadMigration = rtsFalse; - RtsFlags.GranFlags.FetchStrategy = 2; + RtsFlags.GranFlags.proc = MAX_PROC; + RtsFlags.GranFlags.Fishing = rtsFalse; + RtsFlags.GranFlags.maxFishes = MAX_FISHES; + RtsFlags.GranFlags.time_slice = GRAN_TIME_SLICE; + RtsFlags.GranFlags.Light = rtsFalse; + + RtsFlags.GranFlags.Costs.latency = LATENCY; + RtsFlags.GranFlags.Costs.additional_latency = ADDITIONAL_LATENCY; + RtsFlags.GranFlags.Costs.fetchtime = FETCHTIME; + RtsFlags.GranFlags.Costs.lunblocktime = LOCALUNBLOCKTIME; + RtsFlags.GranFlags.Costs.gunblocktime = GLOBALUNBLOCKTIME; + RtsFlags.GranFlags.Costs.mpacktime = MSGPACKTIME; + RtsFlags.GranFlags.Costs.munpacktime = MSGUNPACKTIME; + RtsFlags.GranFlags.Costs.mtidytime = MSGTIDYTIME; + + RtsFlags.GranFlags.Costs.threadcreatetime = THREADCREATETIME; + RtsFlags.GranFlags.Costs.threadqueuetime = THREADQUEUETIME; + RtsFlags.GranFlags.Costs.threaddescheduletime = THREADDESCHEDULETIME; + RtsFlags.GranFlags.Costs.threadscheduletime = THREADSCHEDULETIME; + RtsFlags.GranFlags.Costs.threadcontextswitchtime = THREADCONTEXTSWITCHTIME; + + RtsFlags.GranFlags.Costs.arith_cost = ARITH_COST; + RtsFlags.GranFlags.Costs.branch_cost = BRANCH_COST; + RtsFlags.GranFlags.Costs.load_cost = LOAD_COST; + RtsFlags.GranFlags.Costs.store_cost = STORE_COST; + RtsFlags.GranFlags.Costs.float_cost = FLOAT_COST; + + RtsFlags.GranFlags.Costs.heapalloc_cost = HEAPALLOC_COST; + + RtsFlags.GranFlags.Costs.pri_spark_overhead = PRI_SPARK_OVERHEAD; + RtsFlags.GranFlags.Costs.pri_sched_overhead = PRI_SCHED_OVERHEAD; + + RtsFlags.GranFlags.DoFairSchedule = rtsFalse; + RtsFlags.GranFlags.DoAsyncFetch = rtsFalse; + RtsFlags.GranFlags.DoStealThreadsFirst = rtsFalse; + RtsFlags.GranFlags.DoAlwaysCreateThreads = rtsFalse; + RtsFlags.GranFlags.DoBulkFetching = rtsFalse; + RtsFlags.GranFlags.DoThreadMigration = rtsFalse; + RtsFlags.GranFlags.FetchStrategy = 2; RtsFlags.GranFlags.PreferSparksOfLocalNodes = rtsFalse; - RtsFlags.GranFlags.DoPrioritySparking = rtsFalse; - RtsFlags.GranFlags.DoPriorityScheduling = rtsFalse; - RtsFlags.GranFlags.SparkPriority = 0; - RtsFlags.GranFlags.SparkPriority2 = 0; - RtsFlags.GranFlags.RandomPriorities = rtsFalse; - RtsFlags.GranFlags.InversePriorities = rtsFalse; - RtsFlags.GranFlags.IgnorePriorities = rtsFalse; - RtsFlags.GranFlags.ThunksToPack = 0; - RtsFlags.GranFlags.RandomSteal = rtsTrue; - RtsFlags.GranFlags.NoForward = rtsFalse; - RtsFlags.GranFlags.PrintFetchMisses = rtsFalse; - - RtsFlags.GranFlags.debug = 0x0; - RtsFlags.GranFlags.event_trace = rtsFalse; - RtsFlags.GranFlags.event_trace_all = rtsFalse; + RtsFlags.GranFlags.DoPrioritySparking = rtsFalse; + RtsFlags.GranFlags.DoPriorityScheduling = rtsFalse; + RtsFlags.GranFlags.SparkPriority = 0; + RtsFlags.GranFlags.SparkPriority2 = 0; + RtsFlags.GranFlags.RandomPriorities = rtsFalse; + RtsFlags.GranFlags.InversePriorities = rtsFalse; + RtsFlags.GranFlags.IgnorePriorities = rtsFalse; + RtsFlags.GranFlags.ThunksToPack = 0; + RtsFlags.GranFlags.RandomSteal = rtsTrue; #endif #ifdef TICKY_TICKY @@ -279,10 +404,15 @@ usage_text[] = { # ifdef SMP " -N Use OS threads (default: 1)", # endif +" -e Size of spark pools (default 100)", +" -t Set maximum number of advisory threads per PE (default 32)", +" -o Set stack chunk size (default 1024)", + 
# ifdef PAR -" -q Enable activity profile (output files in ~/*.gr)", -" -qb Enable binary activity profile (output file /tmp/.gb)", -" -Q Set pack-buffer size (default: 1024)", +" -qP Enable activity profile (output files in ~/*.gr)", +" -qQ Set pack-buffer size (default: 1024)", +" -qd Turn on PVM-ish debugging", +" -qO Disable output for performance measurement", # endif # if defined(SMP) || defined(PAR) " -e Maximum number of outstanding local sparks (default: 4096)", @@ -470,6 +600,8 @@ error = rtsTrue; if ((n>>7)&1) RtsFlags.DebugFlags.sanity = rtsTrue; if ((n>>8)&1) RtsFlags.DebugFlags.stable = rtsTrue; if ((n>>9)&1) RtsFlags.DebugFlags.prof = rtsTrue; + if ((n>>10)&1) RtsFlags.DebugFlags.gran = rtsTrue; + if ((n>>11)&1) RtsFlags.DebugFlags.par = rtsTrue; } break; #endif @@ -546,7 +678,7 @@ error = rtsTrue; RtsFlags.GcFlags.giveStats ++; #ifdef PAR /* Opening all those files would almost certainly fail... */ - RtsFlags.ParFlags.parallelStats = rtsTrue; + RtsFlags.ParFlags.ParStats.Full = rtsTrue; RtsFlags.GcFlags.statsFile = stderr; /* temporary; ToDo: rm */ #else RtsFlags.GcFlags.statsFile @@ -776,37 +908,9 @@ error = rtsTrue; } ) break; - case 'O': + case 'q': PAR_BUILD_ONLY( - RtsFlags.ParFlags.outputDisabled = rtsTrue; - ) break; - - case 'q': /* activity profile option */ - PAR_BUILD_ONLY( - if (rts_argv[arg][2] == 'b') - RtsFlags.ParFlags.granSimStats_Binary = rtsTrue; - else - RtsFlags.ParFlags.granSimStats = rtsTrue; - ) break; - -#if 0 /* or??? */ - case 'q': /* quasi-parallel profile option */ - GRAN_BUILD_ONLY ( - if (rts_argv[arg][2] == 'v') - do_qp_prof = 2; - else - do_qp_prof++; - ) break; -#endif /* 0??? */ - - case 'Q': /* Set pack buffer size */ - PAR_BUILD_ONLY( - if (rts_argv[arg][2] != '\0') { - RtsFlags.ParFlags.packBufferSize = decode(rts_argv[arg]+2); - } else { - prog_belch("missing size of PackBuffer (for -Q)"); - error = rtsTrue; - } + process_par_option(arg, rts_argc, rts_argv, &error); ) break; /* =========== GRAN =============================== */ @@ -870,6 +974,942 @@ error = rtsTrue; } } +#if defined(GRAN) + +//@node GranSim specific options, Aux fcts, Command-line option parsing routines +//@subsection GranSim specific options + +static void +enable_GranSimLight(void) { + + fprintf(stderr,"GrAnSim Light enabled (infinite number of processors; 0 communication costs)\n"); + RtsFlags.GranFlags.Light=rtsTrue; + RtsFlags.GranFlags.Costs.latency = + RtsFlags.GranFlags.Costs.fetchtime = + RtsFlags.GranFlags.Costs.additional_latency = + RtsFlags.GranFlags.Costs.gunblocktime = + RtsFlags.GranFlags.Costs.lunblocktime = + RtsFlags.GranFlags.Costs.threadcreatetime = + RtsFlags.GranFlags.Costs.threadqueuetime = + RtsFlags.GranFlags.Costs.threadscheduletime = + RtsFlags.GranFlags.Costs.threaddescheduletime = + RtsFlags.GranFlags.Costs.threadcontextswitchtime = 0; + + RtsFlags.GranFlags.Costs.mpacktime = + RtsFlags.GranFlags.Costs.munpacktime = 0; + + RtsFlags.GranFlags.DoFairSchedule = rtsTrue; + RtsFlags.GranFlags.DoAsyncFetch = rtsFalse; + RtsFlags.GranFlags.DoAlwaysCreateThreads = rtsTrue; + /* FetchStrategy is irrelevant in GrAnSim-Light */ + + /* GrAnSim Light often creates an abundance of parallel threads, + each with its own stack etc. Therefore, it's in general a good + idea to use small stack chunks (use the -o option to + increase it again). 
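
      For example (hypothetical invocation, consistent with the option
      handling below): running a program as

          prog +RTS -b: -bP -RTS

      enters this mode via the ':' case, so proc is forced to 1, all the
      Costs fields are zeroed, and a full .gr profile is requested.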
+ */ + // RtsFlags.ConcFlags.stkChunkSize = 100; + + RtsFlags.GranFlags.proc = 1; +} + +static void +process_gran_option(int arg, int *rts_argc, char *rts_argv[], rtsBool *error) +{ + if (rts_argv[arg][1] != 'b') /* All GranSim options start with -b */ + return; + + /* or a ridiculously idealised simulator */ + if(strcmp((rts_argv[arg]+2),"oring")==0) { + RtsFlags.GranFlags.Costs.latency = + RtsFlags.GranFlags.Costs.fetchtime = + RtsFlags.GranFlags.Costs.additional_latency = + RtsFlags.GranFlags.Costs.gunblocktime = + RtsFlags.GranFlags.Costs.lunblocktime = + RtsFlags.GranFlags.Costs.threadcreatetime = + RtsFlags.GranFlags.Costs.threadqueuetime = + RtsFlags.GranFlags.Costs.threadscheduletime = + RtsFlags.GranFlags.Costs.threaddescheduletime = + RtsFlags.GranFlags.Costs.threadcontextswitchtime = 0; + + RtsFlags.GranFlags.Costs.mpacktime = + RtsFlags.GranFlags.Costs.munpacktime = 0; + + RtsFlags.GranFlags.Costs.arith_cost = + RtsFlags.GranFlags.Costs.float_cost = + RtsFlags.GranFlags.Costs.load_cost = + RtsFlags.GranFlags.Costs.store_cost = + RtsFlags.GranFlags.Costs.branch_cost = 0; + + RtsFlags.GranFlags.Costs.heapalloc_cost = 1; + + /* ++RtsFlags.GranFlags.DoFairSchedule; */ + RtsFlags.GranFlags.DoStealThreadsFirst = rtsTrue; /* -bZ */ + RtsFlags.GranFlags.DoThreadMigration = rtsTrue; /* -bM */ + RtsFlags.GranFlags.GranSimStats.Full = rtsTrue; /* -bP */ + return; + } + + /* or a somewhat idealised simulator */ + if(strcmp((rts_argv[arg]+2),"onzo")==0) { + RtsFlags.GranFlags.Costs.latency = + RtsFlags.GranFlags.Costs.fetchtime = + RtsFlags.GranFlags.Costs.additional_latency = + RtsFlags.GranFlags.Costs.gunblocktime = + RtsFlags.GranFlags.Costs.lunblocktime = + RtsFlags.GranFlags.Costs.threadcreatetime = + RtsFlags.GranFlags.Costs.threadqueuetime = + RtsFlags.GranFlags.Costs.threadscheduletime = + RtsFlags.GranFlags.Costs.threaddescheduletime = + RtsFlags.GranFlags.Costs.threadcontextswitchtime = 0; + + RtsFlags.GranFlags.Costs.mpacktime = + RtsFlags.GranFlags.Costs.munpacktime = 0; + + RtsFlags.GranFlags.Costs.heapalloc_cost = 1; + + /* RtsFlags.GranFlags.DoFairSchedule = rtsTrue; */ /* -b-R */ + /* RtsFlags.GranFlags.DoStealThreadsFirst = rtsTrue; */ /* -b-T */ + RtsFlags.GranFlags.DoAsyncFetch = rtsTrue; /* -bZ */ + RtsFlags.GranFlags.DoThreadMigration = rtsTrue; /* -bM */ + RtsFlags.GranFlags.GranSimStats.Full = rtsTrue; /* -bP */ +# if defined(GRAN_CHECK) && defined(GRAN) + RtsFlags.GranFlags.Debug.event_stats = rtsTrue; /* print event statistics */ +# endif + return; + } + + /* Communication and task creation cost parameters */ + switch(rts_argv[arg][2]) { + case '.': + IgnoreYields = rtsTrue; // HWL HACK + break; + + case ':': + enable_GranSimLight(); /* set flags for GrAnSim-Light mode */ + break; + + case 'l': + if (rts_argv[arg][3] != '\0') + { + RtsFlags.GranFlags.Costs.gunblocktime = + RtsFlags.GranFlags.Costs.latency = decode(rts_argv[arg]+3); + RtsFlags.GranFlags.Costs.fetchtime = 2*RtsFlags.GranFlags.Costs.latency; + } + else + RtsFlags.GranFlags.Costs.latency = LATENCY; + break; + + case 'a': + if (rts_argv[arg][3] != '\0') + RtsFlags.GranFlags.Costs.additional_latency = decode(rts_argv[arg]+3); + else + RtsFlags.GranFlags.Costs.additional_latency = ADDITIONAL_LATENCY; + break; + + case 'm': + if (rts_argv[arg][3] != '\0') + RtsFlags.GranFlags.Costs.mpacktime = decode(rts_argv[arg]+3); + else + RtsFlags.GranFlags.Costs.mpacktime = MSGPACKTIME; + break; + + case 'x': + if (rts_argv[arg][3] != '\0') + RtsFlags.GranFlags.Costs.mtidytime = decode(rts_argv[arg]+3); + else + 
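/* An aside (sketch, not in the patch): the cost cases in this switch
   share one decode pattern -- "-b<c><n>" sets the corresponding Costs
   field to <n>, while a bare "-b<c>" falls back to the compile-time
   default (or to 0 in a few cases, as in the 'x' case here). Worked
   example from case 'l' above: "-bl100" sets Costs.latency = 100,
   Costs.gunblocktime = 100 and, since a fetch is modelled as a round
   trip, Costs.fetchtime = 2*latency = 200. */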
RtsFlags.GranFlags.Costs.mtidytime = 0; + break; + + case 'r': + if (rts_argv[arg][3] != '\0') + RtsFlags.GranFlags.Costs.munpacktime = decode(rts_argv[arg]+3); + else + RtsFlags.GranFlags.Costs.munpacktime = MSGUNPACKTIME; + break; + + case 'g': + if (rts_argv[arg][3] != '\0') + RtsFlags.GranFlags.Costs.fetchtime = decode(rts_argv[arg]+3); + else + RtsFlags.GranFlags.Costs.fetchtime = FETCHTIME; + break; + + case 'n': + if (rts_argv[arg][3] != '\0') + RtsFlags.GranFlags.Costs.gunblocktime = decode(rts_argv[arg]+3); + else + RtsFlags.GranFlags.Costs.gunblocktime = GLOBALUNBLOCKTIME; + break; + + case 'u': + if (rts_argv[arg][3] != '\0') + RtsFlags.GranFlags.Costs.lunblocktime = decode(rts_argv[arg]+3); + else + RtsFlags.GranFlags.Costs.lunblocktime = LOCALUNBLOCKTIME; + break; + + /* Thread-related metrics */ + case 't': + if (rts_argv[arg][3] != '\0') + RtsFlags.GranFlags.Costs.threadcreatetime = decode(rts_argv[arg]+3); + else + RtsFlags.GranFlags.Costs.threadcreatetime = THREADCREATETIME; + break; + + case 'q': + if (rts_argv[arg][3] != '\0') + RtsFlags.GranFlags.Costs.threadqueuetime = decode(rts_argv[arg]+3); + else + RtsFlags.GranFlags.Costs.threadqueuetime = THREADQUEUETIME; + break; + + case 'c': + if (rts_argv[arg][3] != '\0') + RtsFlags.GranFlags.Costs.threadscheduletime = decode(rts_argv[arg]+3); + else + RtsFlags.GranFlags.Costs.threadscheduletime = THREADSCHEDULETIME; + + RtsFlags.GranFlags.Costs.threadcontextswitchtime = RtsFlags.GranFlags.Costs.threadscheduletime + + RtsFlags.GranFlags.Costs.threaddescheduletime; + break; + + case 'd': + if (rts_argv[arg][3] != '\0') + RtsFlags.GranFlags.Costs.threaddescheduletime = decode(rts_argv[arg]+3); + else + RtsFlags.GranFlags.Costs.threaddescheduletime = THREADDESCHEDULETIME; + + RtsFlags.GranFlags.Costs.threadcontextswitchtime = RtsFlags.GranFlags.Costs.threadscheduletime + + RtsFlags.GranFlags.Costs.threaddescheduletime; + break; + + /* Instruction Cost Metrics */ + case 'A': + if (rts_argv[arg][3] != '\0') + RtsFlags.GranFlags.Costs.arith_cost = decode(rts_argv[arg]+3); + else + RtsFlags.GranFlags.Costs.arith_cost = ARITH_COST; + break; + + case 'F': + if (rts_argv[arg][3] != '\0') + RtsFlags.GranFlags.Costs.float_cost = decode(rts_argv[arg]+3); + else + RtsFlags.GranFlags.Costs.float_cost = FLOAT_COST; + break; + + case 'B': + if (rts_argv[arg][3] != '\0') + RtsFlags.GranFlags.Costs.branch_cost = decode(rts_argv[arg]+3); + else + RtsFlags.GranFlags.Costs.branch_cost = BRANCH_COST; + break; + + case 'L': + if (rts_argv[arg][3] != '\0') + RtsFlags.GranFlags.Costs.load_cost = decode(rts_argv[arg]+3); + else + RtsFlags.GranFlags.Costs.load_cost = LOAD_COST; + break; + + case 'S': + if (rts_argv[arg][3] != '\0') + RtsFlags.GranFlags.Costs.store_cost = decode(rts_argv[arg]+3); + else + RtsFlags.GranFlags.Costs.store_cost = STORE_COST; + break; + + case 'H': + if (rts_argv[arg][3] != '\0') + RtsFlags.GranFlags.Costs.heapalloc_cost = decode(rts_argv[arg]+3); + else + RtsFlags.GranFlags.Costs.heapalloc_cost = 0; + break; + + case 'y': + RtsFlags.GranFlags.DoAsyncFetch = rtsTrue; + if (rts_argv[arg][3] != '\0') + RtsFlags.GranFlags.FetchStrategy = decode(rts_argv[arg]+3); + else + RtsFlags.GranFlags.FetchStrategy = 2; + if (RtsFlags.GranFlags.FetchStrategy == 0) + RtsFlags.GranFlags.DoAsyncFetch = rtsFalse; + break; + + case 'K': /* sort overhead (per elem in spark list) */ + if (rts_argv[arg][3] != '\0') + RtsFlags.GranFlags.Costs.pri_spark_overhead = decode(rts_argv[arg]+3); + else + RtsFlags.GranFlags.Costs.pri_spark_overhead = 
PRI_SPARK_OVERHEAD; + fprintf(stderr,"Overhead for pri spark: %d (per elem).\n", + RtsFlags.GranFlags.Costs.pri_spark_overhead); + break; + + case 'O': /* sort overhead (per elem in spark list) */ + if (rts_argv[arg][3] != '\0') + RtsFlags.GranFlags.Costs.pri_sched_overhead = decode(rts_argv[arg]+3); + else + RtsFlags.GranFlags.Costs.pri_sched_overhead = PRI_SCHED_OVERHEAD; + fprintf(stderr,"Overhead for pri sched: %d (per elem).\n", + RtsFlags.GranFlags.Costs.pri_sched_overhead); + break; + + /* General Parameters */ + case 'p': + if (rts_argv[arg][3] != '\0') + { + RtsFlags.GranFlags.proc = decode(rts_argv[arg]+3); + if (RtsFlags.GranFlags.proc==0) { + enable_GranSimLight(); /* set flags for GrAnSim-Light mode */ + } else if (RtsFlags.GranFlags.proc > MAX_PROC || + RtsFlags.GranFlags.proc < 1) + { + fprintf(stderr,"setupRtsFlags: no more than %u processors allowed\n", + MAX_PROC); + *error = rtsTrue; + } + } + else + RtsFlags.GranFlags.proc = MAX_PROC; + break; + + case 'f': + RtsFlags.GranFlags.Fishing = rtsTrue; + if (rts_argv[arg][3] != '\0') + RtsFlags.GranFlags.maxFishes = decode(rts_argv[arg]+3); + else + RtsFlags.GranFlags.maxFishes = MAX_FISHES; + break; + + case 'w': + if (rts_argv[arg][3] != '\0') + RtsFlags.GranFlags.time_slice = decode(rts_argv[arg]+3); + else + RtsFlags.GranFlags.time_slice = GRAN_TIME_SLICE; + break; + + case 'C': + RtsFlags.GranFlags.DoAlwaysCreateThreads=rtsTrue; + RtsFlags.GranFlags.DoThreadMigration=rtsTrue; + break; + + case 'G': + fprintf(stderr,"Bulk fetching enabled.\n"); + RtsFlags.GranFlags.DoBulkFetching=rtsTrue; + break; + + case 'M': + fprintf(stderr,"Thread migration enabled.\n"); + RtsFlags.GranFlags.DoThreadMigration=rtsTrue; + break; + + case 'R': + fprintf(stderr,"Fair Scheduling enabled.\n"); + RtsFlags.GranFlags.DoFairSchedule=rtsTrue; + break; + + case 'I': + fprintf(stderr,"Priority Scheduling enabled.\n"); + RtsFlags.GranFlags.DoPriorityScheduling=rtsTrue; + break; + + case 'T': + RtsFlags.GranFlags.DoStealThreadsFirst=rtsTrue; + RtsFlags.GranFlags.DoThreadMigration=rtsTrue; + break; + + case 'Z': + RtsFlags.GranFlags.DoAsyncFetch=rtsTrue; + break; + +/* case 'z': */ +/* RtsFlags.GranFlags.SimplifiedFetch=rtsTrue; */ +/* break; */ + + case 'N': + RtsFlags.GranFlags.PreferSparksOfLocalNodes=rtsTrue; + break; + + case 'b': + RtsFlags.GranFlags.GranSimStats.Binary=rtsTrue; + break; + + case 'P': + /* format is -bP<c> where <c> is one char describing kind of profile */ + RtsFlags.GranFlags.GranSimStats.Full = rtsTrue; + switch(rts_argv[arg][3]) { + case '\0': break; // nothing special, just an ordinary profile + case '0': RtsFlags.GranFlags.GranSimStats.Suppressed = rtsTrue; + break; + case 'b': RtsFlags.GranFlags.GranSimStats.Binary = rtsTrue; + break; + case 's': RtsFlags.GranFlags.GranSimStats.Sparks = rtsTrue; + break; + case 'h': RtsFlags.GranFlags.GranSimStats.Heap = rtsTrue; + break; + case 'n': RtsFlags.GranFlags.GranSimStats.NewLogfile = rtsTrue; + break; + case 'g': RtsFlags.GranFlags.GranSimStats.Global = rtsTrue; + break; + default: barf("Unknown option -bP%c", rts_argv[arg][3]); + } + break; + + case 's': + RtsFlags.GranFlags.GranSimStats.Sparks=rtsTrue; + break; + + case 'h': + RtsFlags.GranFlags.GranSimStats.Heap=rtsTrue; + break; + + case 'Y': /* syntax: -bY<n>[,<n>] n ...
pos int */ + if (rts_argv[arg][3] != '\0') { + char *arg0, *tmp; + + arg0 = rts_argv[arg]+3; + if ((tmp = strstr(arg0,","))==NULL) { + RtsFlags.GranFlags.SparkPriority = decode(arg0); + fprintf(stderr,"SparkPriority: %u.\n",RtsFlags.GranFlags.SparkPriority); + } else { + *(tmp++) = '\0'; + RtsFlags.GranFlags.SparkPriority = decode(arg0); + RtsFlags.GranFlags.SparkPriority2 = decode(tmp); + fprintf(stderr,"SparkPriority: %u.\n", + RtsFlags.GranFlags.SparkPriority); + fprintf(stderr,"SparkPriority2:%u.\n", + RtsFlags.GranFlags.SparkPriority2); + if (RtsFlags.GranFlags.SparkPriority2 < + RtsFlags.GranFlags.SparkPriority) { + fprintf(stderr,"WARNING: 2nd pri < main pri (%u<%u); 2nd pri has no effect\n", + RtsFlags.GranFlags.SparkPriority2, + RtsFlags.GranFlags.SparkPriority); + } + } + } else { + /* plain pri spark is now invoked with -bX + RtsFlags.GranFlags.DoPrioritySparking = 1; + fprintf(stderr,"PrioritySparking.\n"); + */ + } + break; + + case 'Q': + if (rts_argv[arg][3] != '\0') { + RtsFlags.GranFlags.ThunksToPack = decode(rts_argv[arg]+3); + } else { + RtsFlags.GranFlags.ThunksToPack = 1; + } + fprintf(stderr,"Thunks To Pack in one packet: %u.\n", + RtsFlags.GranFlags.ThunksToPack); + break; + + case 'e': + RtsFlags.GranFlags.RandomSteal = rtsFalse; + fprintf(stderr,"Deterministic mode (no random stealing)\n"); + break; + + /* The following class of options contains eXperimental */ + /* features in connection with exploiting granularity */ + /* information. I.e. if -bY is chosen these options */ + /* tell the RTS what to do with the supplied info --HWL */ + + case 'W': + if (rts_argv[arg][3] != '\0') { + RtsFlags.GranFlags.packBufferSize_internal = decode(rts_argv[arg]+3); + } else { + RtsFlags.GranFlags.packBufferSize_internal = GRANSIM_DEFAULT_PACK_BUFFER_SIZE; + } + fprintf(stderr,"Size of GranSim internal pack buffer: %u.\n", + RtsFlags.GranFlags.packBufferSize_internal); + break; + + case 'X': + switch(rts_argv[arg][3]) { + + case '\0': + RtsFlags.GranFlags.DoPrioritySparking = 1; + fprintf(stderr,"Priority Sparking with Normal Priorities.\n"); + RtsFlags.GranFlags.InversePriorities = rtsFalse; + RtsFlags.GranFlags.RandomPriorities = rtsFalse; + RtsFlags.GranFlags.IgnorePriorities = rtsFalse; + break; + + case 'I': + RtsFlags.GranFlags.DoPrioritySparking = 1; + fprintf(stderr,"Priority Sparking with Inverse Priorities.\n"); + RtsFlags.GranFlags.InversePriorities++; + break; + + case 'R': + RtsFlags.GranFlags.DoPrioritySparking = 1; + fprintf(stderr,"Priority Sparking with Random Priorities.\n"); + RtsFlags.GranFlags.RandomPriorities++; + break; + + case 'N': + RtsFlags.GranFlags.DoPrioritySparking = 1; + fprintf(stderr,"Priority Sparking with No Priorities.\n"); + RtsFlags.GranFlags.IgnorePriorities++; + break; + + default: + bad_option( rts_argv[arg] ); + break; + } + break; + + case '-': + switch(rts_argv[arg][3]) { + + case 'C': + RtsFlags.GranFlags.DoAlwaysCreateThreads=rtsFalse; + RtsFlags.GranFlags.DoThreadMigration=rtsFalse; + break; + + case 'G': + RtsFlags.GranFlags.DoBulkFetching=rtsFalse; + break; + + case 'M': + RtsFlags.GranFlags.DoThreadMigration=rtsFalse; + break; + + case 'R': + RtsFlags.GranFlags.DoFairSchedule=rtsFalse; + break; + + case 'T': + RtsFlags.GranFlags.DoStealThreadsFirst=rtsFalse; + RtsFlags.GranFlags.DoThreadMigration=rtsFalse; + break; + + case 'Z': + RtsFlags.GranFlags.DoAsyncFetch=rtsFalse; + break; + + case 'N': + RtsFlags.GranFlags.PreferSparksOfLocalNodes=rtsFalse; + break; + + case 'P': + RtsFlags.GranFlags.GranSimStats.Suppressed=rtsTrue; + 
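/* Aside (not in the patch): this '-' block gives each option a negative
   form -- "-b-<c>" switches off what "-b<c>" switches on, e.g. -bM
   enables thread migration and -b-M disables it. Note that -b-P does not
   clear GranSimStats.Full; as just above, it sets Suppressed instead. */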
break; + + case 's': + RtsFlags.GranFlags.GranSimStats.Sparks=rtsFalse; + break; + + case 'h': + RtsFlags.GranFlags.GranSimStats.Heap=rtsFalse; + break; + + case 'b': + RtsFlags.GranFlags.GranSimStats.Binary=rtsFalse; + break; + + case 'X': + RtsFlags.GranFlags.DoPrioritySparking = rtsFalse; + break; + + case 'Y': + RtsFlags.GranFlags.DoPrioritySparking = rtsFalse; + RtsFlags.GranFlags.SparkPriority = rtsFalse; + break; + + case 'I': + RtsFlags.GranFlags.DoPriorityScheduling = rtsFalse; + break; + + case 'e': + RtsFlags.GranFlags.RandomSteal = rtsFalse; + break; + + default: + bad_option( rts_argv[arg] ); + break; + } + break; + +# if defined(GRAN_CHECK) && defined(GRAN) + case 'D': + switch(rts_argv[arg][3]) { + case 'Q': /* Set pack buffer size (same as 'Q' in GUM) */ + if (rts_argv[arg][4] != '\0') { + RtsFlags.GranFlags.packBufferSize = decode(rts_argv[arg]+4); + fprintf(stderr,"Pack buffer size: %d\n", + RtsFlags.GranFlags.packBufferSize); + } else { + fprintf(stderr, "setupRtsFlags: missing size of PackBuffer (for -Q)\n"); + *error = rtsTrue; + } + break; + + default: + if (isdigit(rts_argv[arg][3])) {/* Set all debugging options in one */ + /* hack warning: interpret the flags as a binary number */ + nat n = decode(rts_argv[arg]+3); + set_GranSim_debug_options(n); + } else { + nat i; + for (i=0; i<=MAX_GRAN_DEBUG_OPTION; i++) + if (rts_argv[arg][3] == gran_debug_opts_flags[i]) + break; + + if (i==MAX_GRAN_DEBUG_OPTION+1) { + fprintf(stderr, "Valid GranSim debug options are:\n"); + help_GranSim_debug_options(MAX_GRAN_DEBUG_MASK); + bad_option( rts_argv[arg] ); + } else { // flag found; now set it + set_GranSim_debug_options(GRAN_DEBUG_MASK(i)); // 2^i + } + } + break; + +#if 0 + case 'e': /* event trace; also -bD1 */ + fprintf(stderr,"DEBUG: event_trace; printing event trace.\n"); + RtsFlags.GranFlags.Debug.event_trace = rtsTrue; + /* RtsFlags.GranFlags.event_trace=rtsTrue; */ + break; + + case 'E': /* event statistics; also -bD2 */ + fprintf(stderr,"DEBUG: event_stats; printing event statistics.\n"); + RtsFlags.GranFlags.Debug.event_stats = rtsTrue; + /* RtsFlags.GranFlags.Debug |= 0x20; print event statistics */ + break; + + case 'f': /* thunkStealing; also -bD4 */ + fprintf(stderr,"DEBUG: thunkStealing; printing forwarding of FETCHNODES.\n"); + RtsFlags.GranFlags.Debug.thunkStealing = rtsTrue; + /* RtsFlags.GranFlags.Debug |= 0x2; print fwd messages */ + break; + + case 'z': /* blockOnFetch; also -bD8 */ + fprintf(stderr,"DEBUG: blockOnFetch; check for blocked on fetch.\n"); + RtsFlags.GranFlags.Debug.blockOnFetch = rtsTrue; + /* RtsFlags.GranFlags.Debug |= 0x4; debug non-reschedule-on-fetch */ + break; + + case 't': /* blockOnFetch_sanity; also -bD16 */ + fprintf(stderr,"DEBUG: blockOnFetch_sanity; check for TSO asleep on fetch.\n"); + RtsFlags.GranFlags.Debug.blockOnFetch_sanity = rtsTrue; + /* RtsFlags.GranFlags.Debug |= 0x10; debug TSO asleep for fetch */ + break; + + case 'S': /* priSpark; also -bD32 */ + fprintf(stderr,"DEBUG: priSpark; priority sparking.\n"); + RtsFlags.GranFlags.Debug.priSpark = rtsTrue; + break; + + case 's': /* priSched; also -bD64 */ + fprintf(stderr,"DEBUG: priSched; priority scheduling.\n"); + RtsFlags.GranFlags.Debug.priSched = rtsTrue; + break; + + case 'F': /* findWork; also -bD128 */ + fprintf(stderr,"DEBUG: findWork; searching spark-pools (local & remote), thread queues for work.\n"); + RtsFlags.GranFlags.Debug.findWork = rtsTrue; + break; + + case 'g': /* globalBlock; also -bD256 */ + fprintf(stderr,"DEBUG: globalBlock; blocking on remote 
closures (FETCHMEs etc in GUM).\n"); + RtsFlags.GranFlags.Debug.globalBlock = rtsTrue; + break; + + case 'G': /* pack; also -bD512 */ + fprintf(stderr,"DEBUG: pack; routines for (un-)packing graph structures.\n"); + RtsFlags.GranFlags.Debug.pack = rtsTrue; + break; + + case 'P': /* packBuffer; also -bD1024 */ + fprintf(stderr,"DEBUG: packBuffer; routines handling pack buffer (GranSim internal!).\n"); + RtsFlags.GranFlags.Debug.packBuffer = rtsTrue; + break; + + case 'o': /* sortedQ; also -bD2048 */ + fprintf(stderr,"DEBUG: sortedQ; check whether spark/thread queues are sorted.\n"); + RtsFlags.GranFlags.Debug.sortedQ = rtsTrue; + break; + + case 'r': /* randomSteal; also -bD4096 */ + fprintf(stderr,"DEBUG: randomSteal; stealing sparks/threads from random PEs.\n"); + RtsFlags.GranFlags.Debug.randomSteal = rtsTrue; + break; + + case 'q': /* checkSparkQ; also -bD8192 */ + fprintf(stderr,"DEBUG: checkSparkQ; check consistency of the spark queues.\n"); + RtsFlags.GranFlags.Debug.checkSparkQ = rtsTrue; + break; + + case ':': /* checkLight; also -bD16384 */ + fprintf(stderr,"DEBUG: checkLight; check GranSim-Light setup.\n"); + RtsFlags.GranFlags.Debug.checkLight = rtsTrue; + break; + + case 'b': /* bq; also -bD32768 */ + fprintf(stderr,"DEBUG: bq; check blocking queues\n"); + RtsFlags.GranFlags.Debug.bq = rtsTrue; + break; + + case 'd': /* all options turned on */ + fprintf(stderr,"DEBUG: all options turned on.\n"); + set_GranSim_debug_options(MAX_GRAN_DEBUG_MASK); + /* RtsFlags.GranFlags.Debug |= 0x40; */ + break; + +/* case '\0': */ +/* RtsFlags.GranFlags.Debug = 1; */ +/* break; */ +#endif + + } + break; +# endif /* GRAN_CHECK */ + default: + bad_option( rts_argv[arg] ); + break; + } +} + +/* + Interpret n as a binary number masking GranSim debug options and set the + corresponding option. See gran_debug_opts_strs for explanations of the flags. +*/ +static void +set_GranSim_debug_options(nat n) { + nat i; + + for (i=0; i<=MAX_GRAN_DEBUG_OPTION; i++) + if ((n>>i)&1) { + fprintf(stderr, gran_debug_opts_strs[i]); + switch (i) { + case 0: RtsFlags.GranFlags.Debug.event_trace = rtsTrue; break; + case 1: RtsFlags.GranFlags.Debug.event_stats = rtsTrue; break; + case 2: RtsFlags.GranFlags.Debug.bq = rtsTrue; break; + case 3: RtsFlags.GranFlags.Debug.pack = rtsTrue; break; + case 4: RtsFlags.GranFlags.Debug.checkSparkQ = rtsTrue; break; + case 5: RtsFlags.GranFlags.Debug.thunkStealing = rtsTrue; break; + case 6: RtsFlags.GranFlags.Debug.randomSteal = rtsTrue; break; + case 7: RtsFlags.GranFlags.Debug.findWork = rtsTrue; break; + case 8: RtsFlags.GranFlags.Debug.unused = rtsTrue; break; + case 9: RtsFlags.GranFlags.Debug.pri = rtsTrue; break; + case 10: RtsFlags.GranFlags.Debug.checkLight = rtsTrue; break; + case 11: RtsFlags.GranFlags.Debug.sortedQ = rtsTrue; break; + case 12: RtsFlags.GranFlags.Debug.blockOnFetch = rtsTrue; break; + case 13: RtsFlags.GranFlags.Debug.packBuffer = rtsTrue; break; + case 14: RtsFlags.GranFlags.Debug.blockOnFetch_sanity = rtsTrue; break; + default: barf("set_GranSim_debug_options: only %d debug options expected", MAX_GRAN_DEBUG_OPTION+1); + } /* switch */ + } /* if */ +} + +/* + Print one line explanation for each of the GranSim debug options specified + in the bitmask n.
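
   For example, help_GranSim_debug_options(6) prints the explanation
   lines for bits 1 and 2 only (event_stats and bq), since 6 = binary 110.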
+*/ +static void +help_GranSim_debug_options(nat n) { + nat i; + + for (i=0; i<=MAX_GRAN_DEBUG_OPTION; i++) + if ((n>>i)&1) + fprintf(stderr, gran_debug_opts_strs[i]); +} + +# elif defined(PAR) + +static void +process_par_option(int arg, int *rts_argc, char *rts_argv[], rtsBool *error) +{ + if (rts_argv[arg][1] != 'q') /* All GUM options start with -q */ + return; + + /* Communication and task creation cost parameters */ + switch(rts_argv[arg][2]) { + case 'e': /* -qe<n> ... allow <n> local sparks */ + if (rts_argv[arg][3] != '\0') { /* otherwise, stick w/ the default */ + RtsFlags.ParFlags.maxLocalSparks + = strtol(rts_argv[arg]+3, (char **) NULL, 10); + + if (RtsFlags.ParFlags.maxLocalSparks <= 0) { + belch("setupRtsFlags: bad value for -e\n"); + *error = rtsTrue; + } + } + IF_PAR_DEBUG(verbose, + belch("-qe: max %d local sparks", + RtsFlags.ParFlags.maxLocalSparks)); + break; + + case 't': + if (rts_argv[arg][3] != '\0') { + RtsFlags.ParFlags.maxThreads + = strtol(rts_argv[arg]+3, (char **) NULL, 10); + } else { + belch("setupRtsFlags: missing size for -qt\n"); + *error = rtsTrue; + } + IF_PAR_DEBUG(verbose, + belch("-qt: max %d threads", + RtsFlags.ParFlags.maxThreads)); + break; + + case 'f': + if (rts_argv[arg][3] != '\0') + RtsFlags.ParFlags.maxFishes = decode(rts_argv[arg]+3); + else + RtsFlags.ParFlags.maxFishes = MAX_FISHES; + IF_PAR_DEBUG(verbose, + belch("-qf: max %d fishes sent out at one time", + RtsFlags.ParFlags.maxFishes)); + break; + + case 'd': + if (rts_argv[arg][3] != '\0') { + RtsFlags.ParFlags.fishDelay + = strtol(rts_argv[arg]+3, (char **) NULL, 10); + } else { + belch("setupRtsFlags: missing fish delay time for -qd\n"); + *error = rtsTrue; + } + IF_PAR_DEBUG(verbose, + belch("-qd: fish delay time %d", + RtsFlags.ParFlags.fishDelay)); + break; + + case 'O': + RtsFlags.ParFlags.outputDisabled = rtsTrue; + IF_PAR_DEBUG(verbose, + belch("-qO: output disabled")); + break; + + case 'P': /* -qP for writing a log file */ + RtsFlags.ParFlags.ParStats.Full = rtsTrue; + /* same encoding as in GranSim after -bP */ + switch(rts_argv[arg][3]) { + case '\0': break; // nothing special, just an ordinary profile + //case '0': RtsFlags.ParFlags.ParStats.Suppressed = rtsTrue; + // break; + case 'b': RtsFlags.ParFlags.ParStats.Binary = rtsTrue; + break; + case 's': RtsFlags.ParFlags.ParStats.Sparks = rtsTrue; + break; + //case 'h': RtsFlags.parFlags.ParStats.Heap = rtsTrue; + // break; + case 'n': RtsFlags.ParFlags.ParStats.NewLogfile = rtsTrue; + break; + case 'g': RtsFlags.ParFlags.ParStats.Global = rtsTrue; + break; + default: barf("Unknown option -qP%c", rts_argv[arg][3]); + } + IF_PAR_DEBUG(verbose, + belch("(-qP) writing to log-file (RtsFlags.ParFlags.ParStats.Full=%s)", + (RtsFlags.ParFlags.ParStats.Full ? "rtsTrue" : "rtsFalse"))); + break; + + case 'Q': /* -qQ<n> ... set pack buffer size to <n> */ + if (rts_argv[arg][3] != '\0') { + RtsFlags.ParFlags.packBufferSize = decode(rts_argv[arg]+3); + } else { + belch("setupRtsFlags: missing size of PackBuffer (for -Q)\n"); + *error = rtsTrue; + } + IF_PAR_DEBUG(verbose, + belch("-qQ: pack buffer size set to %d", + RtsFlags.ParFlags.packBufferSize)); + break; + +# if defined(DEBUG) + case 'w': + if (rts_argv[arg][3] != '\0') { + RtsFlags.ParFlags.wait + = strtol(rts_argv[arg]+3, (char **) NULL, 10); + } else { + RtsFlags.ParFlags.wait = 1000; + } + IF_PAR_DEBUG(verbose, + belch("-qw: length of wait loop after synchr before reduction: %d", + RtsFlags.ParFlags.wait)); + break; + + case 'D': /* -qD ...
all the debugging options */ + if (isdigit(rts_argv[arg][3])) {/* Set all debugging options in one */ + /* hack warning: interpret the flags as a binary number */ + nat n = decode(rts_argv[arg]+3); + set_par_debug_options(n); + } else { + nat i; + for (i=0; i<=MAX_PAR_DEBUG_OPTION; i++) + if (rts_argv[arg][3] == par_debug_opts_flags[i]) + break; + + if (i==MAX_PAR_DEBUG_OPTION+1) { + fprintf(stderr, "Valid GUM debug options are:\n"); + help_par_debug_options(MAX_PAR_DEBUG_MASK); + bad_option( rts_argv[arg] ); + } else { // flag found; now set it + set_par_debug_options(PAR_DEBUG_MASK(i)); // 2^i + } + } + break; +# endif + default: + belch("Unknown option -q%c", rts_argv[arg][2]); + break; + } /* switch */ +} + +/* + Interpret n as a binary number masking Par debug options and set the + corresponding option. See par_debug_opts_strs for explanations of the flags. +*/ +static void +set_par_debug_options(nat n) { + nat i; + + for (i=0; i<=MAX_PAR_DEBUG_OPTION; i++) + if ((n>>i)&1) { + fprintf(stderr, par_debug_opts_strs[i]); + switch (i) { + case 0: RtsFlags.ParFlags.Debug.verbose = rtsTrue; break; + case 1: RtsFlags.ParFlags.Debug.trace = rtsTrue; break; + case 2: RtsFlags.ParFlags.Debug.schedule = rtsTrue; break; + case 3: RtsFlags.ParFlags.Debug.free = rtsTrue; break; + case 4: RtsFlags.ParFlags.Debug.resume = rtsTrue; break; + case 5: RtsFlags.ParFlags.Debug.weight = rtsTrue; break; + case 6: RtsFlags.ParFlags.Debug.fetch = rtsTrue; break; + case 7: RtsFlags.ParFlags.Debug.ack = rtsTrue; break; + case 8: RtsFlags.ParFlags.Debug.fish = rtsTrue; break; + case 9: RtsFlags.ParFlags.Debug.forward = rtsTrue; break; + case 10: RtsFlags.ParFlags.Debug.pack = rtsTrue; break; + default: barf("set_par_debug_options: only %d debug options expected", MAX_PAR_DEBUG_OPTION+1); + } /* switch */ + } /* if */ +} + +/* + Print one line explanation for each of the GUM debug options specified + in the bitmask n. +*/ +static void +help_par_debug_options(nat n) { + nat i; + + for (i=0; i<=MAX_PAR_DEBUG_OPTION; i++) + if ((n>>i)&1) + fprintf(stderr, par_debug_opts_strs[i]); +} + +#endif /* GRAN || PAR */ + +//@node Aux fcts, , GranSim specific options +//@subsection Aux fcts + static FILE * /* return NULL on error */ open_stats_file ( I_ arg, diff --git a/ghc/rts/RtsFlags.h b/ghc/rts/RtsFlags.h index 238e2b6..e3febb3 100644 --- a/ghc/rts/RtsFlags.h +++ b/ghc/rts/RtsFlags.h @@ -1,5 +1,5 @@ /* ----------------------------------------------------------------------------- - * $Id: RtsFlags.h,v 1.19 2000/01/12 15:15:17 simonmar Exp $ + * $Id: RtsFlags.h,v 1.20 2000/01/13 14:34:04 hwloidl Exp $ * * (c) The GHC Team, 1998-1999 * @@ -59,6 +59,8 @@ struct DEBUG_FLAGS { rtsBool stable : 1; /* 256 */ rtsBool prof : 1; /* 512 */ + rtsBool gran : 1; /* 1024 */ + rtsBool par : 1; /* 2048 */ }; #if defined(PROFILING) || defined(PAR) @@ -124,15 +126,46 @@ struct CONCURRENT_FLAGS { }; #ifdef PAR +/* currently the same as GRAN_STATS_FLAGS */ +struct PAR_STATS_FLAGS { + rtsBool Full; /* Full .gr profile (rtsTrue) or only END events? */ + // rtsBool Suppressed; /* No .gr profile at all */ + rtsBool Binary; /* Binary profile? (not yet implemented) */ + rtsBool Sparks; /* Info on sparks in profile? */ + rtsBool Heap; /* Info on heap allocs in profile? */ + rtsBool NewLogfile; /* Use new log-file format? (not yet implemented) */ + rtsBool Global; /* Global statistics?
(printed on shutdown; no log file) */ +}; + +struct PAR_DEBUG_FLAGS { + /* flags to control debugging output in various subsystems */ + rtsBool verbose : 1; /* 1 */ + rtsBool trace : 1; /* 2 */ + rtsBool schedule : 1; /* 4 */ + rtsBool free : 1; /* 8 */ + rtsBool resume : 1; /* 16 */ + rtsBool weight : 1; /* 32 */ + rtsBool fetch : 1; /* 64 */ + rtsBool ack : 1; /* 128 */ + rtsBool fish : 1; /* 256 */ + rtsBool forward : 1; /* 512 */ + rtsBool pack : 1; /* 1024 */ +}; + +#define MAX_PAR_DEBUG_OPTION 10 +#define PAR_DEBUG_MASK(n) ((nat)(ldexp(1,n))) +#define MAX_PAR_DEBUG_MASK ((nat)(ldexp(1,(MAX_PAR_DEBUG_OPTION+1))-1)) + struct PAR_FLAGS { - rtsBool parallelStats; /* Gather parallel statistics */ - rtsBool granSimStats; /* Full .gr profile (rtsTrue) or only END events? */ - rtsBool granSimStats_Binary; - - rtsBool outputDisabled; /* Disable output for performance purposes */ - - unsigned int packBufferSize; - unsigned int maxLocalSparks; + struct PAR_STATS_FLAGS ParStats; /* profile and stats output */ + struct PAR_DEBUG_FLAGS Debug; /* debugging options */ + rtsBool outputDisabled; /* Disable output for performance purposes */ + nat packBufferSize; + nat maxLocalSparks; /* spark pool size */ + nat maxThreads; /* thread pool size */ + nat maxFishes; /* max number of active fishes */ + rtsTime fishDelay; /* delay before sending a new fish */ + long wait; }; #endif /* PAR */ @@ -141,53 +174,88 @@ struct PAR_FLAGS { nat nNodes; /* number of threads to run simultaneously */ unsigned int maxLocalSparks; }; -#endif +#endif /* SMP */ #ifdef GRAN +struct GRAN_STATS_FLAGS { + rtsBool Full; /* Full .gr profile (rtsTrue) or only END events? */ + rtsBool Suppressed; /* No .gr profile at all */ + rtsBool Binary; /* Binary profile? (not yet implemented) */ + rtsBool Sparks; /* Info on sparks in profile? */ + rtsBool Heap; /* Info on heap allocs in profile? */ + rtsBool NewLogfile; /* Use new log-file format? (not yet implemented) */ + rtsBool Global; /* Global statistics? 
(printed on shutdown; no log file) */ +}; + +struct GRAN_COST_FLAGS { + /* Communication Cost Variables -- set in main program */ + nat latency; /* Latency for single packet */ + nat additional_latency; /* Latency for additional packets */ + nat fetchtime; + nat lunblocktime; /* Time for local unblock */ + nat gunblocktime; /* Time for global unblock */ + nat mpacktime; /* Cost of creating a packet */ + nat munpacktime; /* Cost of receiving a packet */ + nat mtidytime; /* Cost of cleaning up after send */ + + nat threadcreatetime; /* Thread creation costs */ + nat threadqueuetime; /* Cost of adding a thread to the running/runnable queue */ + nat threaddescheduletime; /* Cost of descheduling a thread */ + nat threadscheduletime; /* Cost of scheduling a thread */ + nat threadcontextswitchtime; /* Cost of context switch */ + + /* Instruction Costs */ + nat arith_cost; /* arithmetic instructions (+,i,< etc) */ + nat branch_cost; /* branch instructions */ + nat load_cost; /* load into register */ + nat store_cost; /* store into memory */ + nat float_cost; /* floating point operations */ + + nat heapalloc_cost; /* heap allocation costs */ + + /* Overhead for granularity control mechanisms */ + /* overhead per elem of spark queue */ + nat pri_spark_overhead; + /* overhead per elem of thread queue */ + nat pri_sched_overhead; +}; + +struct GRAN_DEBUG_FLAGS { + /* flags to control debugging output in various subsystems */ + rtsBool event_trace : 1; /* 1 */ + rtsBool event_stats : 1; /* 2 */ + rtsBool bq : 1; /* 4 */ + rtsBool pack : 1; /* 8 */ + rtsBool checkSparkQ : 1; /* 16 */ + rtsBool thunkStealing : 1; /* 32 */ + rtsBool randomSteal : 1; /* 64 */ + rtsBool findWork : 1; /* 128 */ + rtsBool unused : 1; /* 256 */ + rtsBool pri : 1; /* 512 */ + rtsBool checkLight : 1; /* 1024 */ + rtsBool sortedQ : 1; /* 2048 */ + rtsBool blockOnFetch : 1; /* 4096 */ + rtsBool packBuffer : 1; /* 8192 */ + rtsBool blockOnFetch_sanity : 1; /* 16384 */ +}; + +#define MAX_GRAN_DEBUG_OPTION 14 +#define GRAN_DEBUG_MASK(n) ((nat)(ldexp(1,n))) +#define MAX_GRAN_DEBUG_MASK ((nat)(ldexp(1,(MAX_GRAN_DEBUG_OPTION+1))-1)) + struct GRAN_FLAGS { - rtsBool granSimStats; /* Full .gr profile (rtsTrue) or only END events? 
*/ - rtsBool granSimStats_suppressed; /* No .gr profile at all */ - rtsBool granSimStats_Binary; - rtsBool granSimStats_Sparks; - rtsBool granSimStats_Heap; - rtsBool labelling; - unsigned int packBufferSize; - unsigned int packBufferSize_internal; - - int proc; /* number of processors */ - int max_fishes; /* max number of spark or thread steals */ - TIME time_slice; /* max time slice of one reduction thread */ - - /* Communication Cost Variables -- set in main program */ - unsigned int gran_latency; /* Latency for single packet */ - unsigned int gran_additional_latency; /* Latency for additional packets */ - unsigned int gran_fetchtime; - unsigned int gran_lunblocktime; /* Time for local unblock */ - unsigned int gran_gunblocktime; /* Time for global unblock */ - unsigned int gran_mpacktime; /* Cost of creating a packet */ - unsigned int gran_munpacktime; /* Cost of receiving a packet */ - unsigned int gran_mtidytime; /* Cost of cleaning up after send */ - - unsigned int gran_threadcreatetime; /* Thread creation costs */ - unsigned int gran_threadqueuetime; /* Cost of adding a thread to the running/runnable queue */ - unsigned int gran_threaddescheduletime; /* Cost of descheduling a thread */ - unsigned int gran_threadscheduletime; /* Cost of scheduling a thread */ - unsigned int gran_threadcontextswitchtime; /* Cost of context switch */ - - /* Instruction Costs */ - unsigned int gran_arith_cost; /* arithmetic instructions (+,i,< etc) */ - unsigned int gran_branch_cost; /* branch instructions */ - unsigned int gran_load_cost; /* load into register */ - unsigned int gran_store_cost; /* store into memory */ - unsigned int gran_float_cost; /* floating point operations */ - - unsigned int gran_heapalloc_cost; /* heap allocation costs */ - - /* Overhead for granularity control mechanisms */ - /* overhead per elem of spark queue */ - unsigned int gran_pri_spark_overhead; - /* overhead per elem of thread queue */ - unsigned int gran_pri_sched_overhead; + struct GRAN_STATS_FLAGS GranSimStats; /* profile and stats output */ + struct GRAN_COST_FLAGS Costs; /* cost metric for simulation */ + struct GRAN_DEBUG_FLAGS Debug; /* debugging options */ + + // rtsBool labelling; + nat packBufferSize; + nat packBufferSize_internal; + + PEs proc; /* number of processors */ + rtsBool Fishing; /* Simulate GUM style fishing mechanism? */ + nat maxFishes; /* max number of spark or thread steals */ + rtsTime time_slice; /* max time slice of one reduction thread */ /* GrAnSim-Light: This version puts no bound on the number of processors but in exchange doesn't model communication costs @@ -198,30 +266,27 @@ struct GRAN_FLAGS { rtsBool Light; rtsBool DoFairSchedule ; /* fair scheduling alg? default: unfair */ - rtsBool DoReScheduleOnFetch ; /* async. communication? */ + rtsBool DoAsyncFetch; /* async. communication? 
*/ rtsBool DoStealThreadsFirst; /* prefer threads over sparks when stealing */ - rtsBool SimplifiedFetch; /* fast but inaccurate fetch modelling */ - rtsBool DoAlwaysCreateThreads; /* eager thread creation */ - rtsBool DoGUMMFetching; /* bulk fetching */ - rtsBool DoThreadMigration; /* allow to move threads */ - int FetchStrategy; /* what to do when waiting for data */ - rtsBool PreferSparksOfLocalNodes; /* prefer local over global sparks */ - rtsBool DoPrioritySparking; /* sparks sorted by priorities */ - rtsBool DoPriorityScheduling; /* threads sorted by priorities */ - int SparkPriority; /* threshold for cut-off mechanism */ - int SparkPriority2; - rtsBool RandomPriorities; - rtsBool InversePriorities; - rtsBool IgnorePriorities; - int ThunksToPack; /* number of thunks in packet + 1 */ - rtsBool RandomSteal; /* steal spark/thread from random proc */ - rtsBool NoForward; /* no forwarding of fetch messages */ - rtsBool PrintFetchMisses; /* print number of fetch misses */ - - unsigned int debug; - rtsBool event_trace; - rtsBool event_trace_all; - + rtsBool DoAlwaysCreateThreads; /* eager thread creation */ + rtsBool DoBulkFetching; /* bulk fetching */ + rtsBool DoThreadMigration; /* allow to move threads */ + nat FetchStrategy; /* what to do when waiting for data */ + rtsBool PreferSparksOfLocalNodes; /* prefer local over global sparks */ + rtsBool DoPrioritySparking; /* sparks sorted by priorities */ + rtsBool DoPriorityScheduling; /* threads sorted by priorities */ + nat SparkPriority; /* threshold for cut-off mechanism */ + nat SparkPriority2; + rtsBool RandomPriorities; + rtsBool InversePriorities; + rtsBool IgnorePriorities; + nat ThunksToPack; /* number of thunks in packet + 1 */ + rtsBool RandomSteal; /* steal spark/thread from random proc */ + rtsBool NoForward; /* no forwarding of fetch messages */ + + // unsigned int debug; + // rtsBool event_trace; + // rtsBool event_trace_all; }; #endif /* GRAN */ diff --git a/ghc/rts/RtsStartup.c b/ghc/rts/RtsStartup.c index 0996ba0..a589b18 100644 --- a/ghc/rts/RtsStartup.c +++ b/ghc/rts/RtsStartup.c @@ -1,5 +1,5 @@ /* ----------------------------------------------------------------------------- - * $Id: RtsStartup.c,v 1.25 1999/12/20 10:34:37 simonpj Exp $ + * $Id: RtsStartup.c,v 1.26 2000/01/13 14:34:04 hwloidl Exp $ * * (c) The GHC Team, 1998-1999 * @@ -25,7 +25,12 @@ # include "ProfHeap.h" #endif -#ifdef PAR +#if defined(GRAN) +#include "GranSimRts.h" +#include "ParallelRts.h" +#endif + +#if defined(PAR) #include "ParInit.h" #include "Parallel.h" #include "LLC.h" @@ -37,6 +42,9 @@ struct RTS_FLAGS RtsFlags; static int rts_has_started_up = 0; +#if defined(PAR) +static ullong startTime = 0; +#endif void startupHaskell(int argc, char *argv[]) @@ -51,10 +59,6 @@ startupHaskell(int argc, char *argv[]) else rts_has_started_up=1; -#if defined(PAR) - int nPEs = 0; /* Number of PEs */ -#endif - /* The very first thing we do is grab the start time...just in case we're * collecting timing statistics. */ @@ -62,13 +66,15 @@ startupHaskell(int argc, char *argv[]) #ifdef PAR /* - *The parallel system needs to be initialised and synchronised before - *the program is run. + * The parallel system needs to be initialised and synchronised before + * the program is run. 
 */
+  fprintf(stderr, "startupHaskell: argv[0]=%s\n", argv[0]);
     if (*argv[0] == '-') { /* Look to see whether we're the Main Thread */
 	IAmMainThread = rtsTrue;
         argv++; argc--;			/* Strip off flag argument */
-/*	fprintf(stderr, "I am Main Thread\n"); */
+	// IF_PAR_DEBUG(verbose,
+	fprintf(stderr, "[%x] I am Main Thread\n", mytid);
     }
 /*
  * Grab the number of PEs out of the argument vector, and
  *
@@ -78,7 +84,6 @@
     argv[1] = argv[0];
     argv++; argc--;
     initEachPEHook();                  /* HWL: hook to be execed on each PE */
-    SynchroniseSystem();
 #endif
 
     /* Set the RTS flags to default values. */
@@ -92,13 +97,10 @@
     prog_argc = argc;
     prog_argv = argv;
 
-#ifdef PAR
-    /* Initialise the parallel system -- before initHeap! */
-    initParallelSystem();
-    /* And start GranSim profiling if required: omitted for now
-    *if (Rtsflags.ParFlags.granSimStats)
-    *init_gr_profiling(rts_argc, rts_argv, prog_argc, prog_argv);
-    */
+#if defined(PAR)
+    /* NB: this really must be done after processing the RTS flags */
+    fprintf(stderr, "Synchronising system (%d PEs)\n", nPEs);
+    SynchroniseSystem();             // calls initParallelSystem etc
 #endif	/* PAR */
 
     /* initialise scheduler data structures (needs to be done before
@@ -106,6 +108,16 @@
      */
     initScheduler();
 
+#if defined(GRAN)
+    /* And start GranSim profiling if required: */
+    if (RtsFlags.GranFlags.GranSimStats.Full)
+      init_gr_simulation(rts_argc, rts_argv, prog_argc, prog_argv);
+#elif defined(PAR)
+    /* And start GUM profiling if required: */
+    if (RtsFlags.ParFlags.ParStats.Full)
+      init_gr_simulation(rts_argc, rts_argv, prog_argc, prog_argv);
+#endif	/* PAR || GRAN */
+
     /* initialize the storage manager */
     initStorage();
@@ -179,12 +191,14 @@ shutdownHaskell(void)
   /* start timing the shutdown */
   stat_startExit();
 
+#if !defined(GRAN)
   /* Finalize any remaining weak pointers */
   finalizeWeakPointersNow();
+#endif
 
 #if defined(GRAN)
-  #error FixMe.
-  if (!RTSflags.GranFlags.granSimStats_suppressed)
+  /* end_gr_simulation prints global stats if requested -- HWL */
+  if (!RtsFlags.GranFlags.GranSimStats.Suppressed)
     end_gr_simulation();
 #endif
 
@@ -220,8 +234,12 @@
 #endif
 
   rts_has_started_up=0;
-}
 
+#if defined(PAR)
+  shutdownParallelSystem(0);
+#endif
+
+}
 /*
  * called from STG-land to exit the program
@@ -230,7 +248,7 @@
 void
 stg_exit(I_ n)
 {
-#ifdef PAR
+#if 0 /* def PAR */
   par_exit(n);
 #else
   exit(n);
diff --git a/ghc/rts/RtsUtils.c b/ghc/rts/RtsUtils.c
index 28fb2f7..5e53b7d 100644
--- a/ghc/rts/RtsUtils.c
+++ b/ghc/rts/RtsUtils.c
@@ -1,5 +1,5 @@
 /* -----------------------------------------------------------------------------
- * $Id: RtsUtils.c,v 1.12 2000/01/13 12:40:16 simonmar Exp $
+ * $Id: RtsUtils.c,v 1.13 2000/01/13 14:34:04 hwloidl Exp $
 *
 * (c) The GHC Team, 1998-1999
 *
@@ -8,6 +8,7 @@
 * ---------------------------------------------------------------------------*/
 
 #include "Rts.h"
+#include "RtsTypes.h"
 #include "RtsAPI.h"
 #include "RtsFlags.h"
 #include "Hooks.h"
@@ -23,6 +24,10 @@
 #include <time.h>
 #endif
 
+#ifdef HAVE_GETTIMEOFDAY
+#include <sys/time.h>
+#endif
+
 #include <stdarg.h>
 
 /* variable-argument error function. */
@@ -182,7 +187,7 @@ resetGenSymZh(void) /* it's your funeral */
 
 /* -----------------------------------------------------------------------------
    Get the current time as a string.  Used in profiling reports.
-------------------------------------------------------------------------- */ -#if defined(PROFILING) || defined(DEBUG) +#if defined(PROFILING) || defined(DEBUG) || defined(PAR) || defined(GRAN) char * time_str(void) { @@ -219,6 +224,44 @@ resetNonBlockingFd(int fd) #endif } +#if 0 +static ullong startTime = 0; + +/* used in a parallel setup */ +ullong +msTime(void) +{ +# if defined(HAVE_GETCLOCK) && !defined(alpha_TARGET_ARCH) + struct timespec tv; + + if (getclock(TIMEOFDAY, &tv) != 0) { + fflush(stdout); + fprintf(stderr, "Clock failed\n"); + stg_exit(EXIT_FAILURE); + } + return tv.tv_sec * LL(1000) + tv.tv_nsec / LL(1000000) - startTime; +# elif HAVE_GETTIMEOFDAY && !defined(alpha_TARGET_ARCH) + struct timeval tv; + + if (gettimeofday(&tv, NULL) != 0) { + fflush(stdout); + fprintf(stderr, "Clock failed\n"); + stg_exit(EXIT_FAILURE); + } + return tv.tv_sec * LL(1000) + tv.tv_usec / LL(1000) - startTime; +# else + time_t t; + if ((t = time(NULL)) == (time_t) -1) { + fflush(stdout); + fprintf(stderr, "Clock failed\n"); + stg_exit(EXIT_FAILURE); + } + return t * LL(1000) - startTime; +# endif +} +#endif + + /* ----------------------------------------------------------------------------- Print large numbers, with punctuation. -------------------------------------------------------------------------- */ diff --git a/ghc/rts/RtsUtils.h b/ghc/rts/RtsUtils.h index 8f5581c..79557e8 100644 --- a/ghc/rts/RtsUtils.h +++ b/ghc/rts/RtsUtils.h @@ -1,5 +1,5 @@ /* ----------------------------------------------------------------------------- - * $Id: RtsUtils.h,v 1.6 2000/01/12 15:15:17 simonmar Exp $ + * $Id: RtsUtils.h,v 1.7 2000/01/13 14:34:04 hwloidl Exp $ * * (c) The GHC Team, 1998-1999 * @@ -30,7 +30,8 @@ extern nat stg_strlen(char *str); /*Defined in Main.c, but made visible here*/ extern void stg_exit(I_ n) __attribute__((noreturn)); -char * time_str(void); - +char *time_str(void); char *ullong_format_string(ullong, char *, rtsBool); +//ullong msTime(void); + diff --git a/ghc/rts/Sanity.c b/ghc/rts/Sanity.c index 920530a..c0a602a 100644 --- a/ghc/rts/Sanity.c +++ b/ghc/rts/Sanity.c @@ -1,5 +1,5 @@ /* ----------------------------------------------------------------------------- - * $Id: Sanity.c,v 1.14 1999/05/21 14:37:12 sof Exp $ + * $Id: Sanity.c,v 1.15 2000/01/13 14:34:04 hwloidl Exp $ * * (c) The GHC Team, 1998-1999 * @@ -14,17 +14,36 @@ * * ---------------------------------------------------------------------------*/ +//@menu +//* Includes:: +//* Macros:: +//* Stack sanity:: +//* Heap Sanity:: +//* TSO Sanity:: +//* Thread Queue Sanity:: +//* Blackhole Sanity:: +//@end menu + +//@node Includes, Macros +//@subsection Includes + #include "Rts.h" -#ifdef DEBUG +#ifdef DEBUG /* whole file */ #include "RtsFlags.h" #include "RtsUtils.h" #include "BlockAlloc.h" #include "Sanity.h" +//@node Macros, Stack sanity, Includes +//@subsection Macros + #define LOOKS_LIKE_PTR(r) (LOOKS_LIKE_STATIC_CLOSURE(r) || ((HEAP_ALLOCED(r) && Bdescr((P_)r)->free != (void *)-1))) +//@node Stack sanity, Heap Sanity, Macros +//@subsection Stack sanity + /* ----------------------------------------------------------------------------- Check stack sanity -------------------------------------------------------------------------- */ @@ -42,6 +61,7 @@ static StgOffset checkLargeBitmap( StgPtr payload, void checkClosureShallow( StgClosure* p ); +//@cindex checkSmallBitmap static StgOffset checkSmallBitmap( StgPtr payload, StgWord32 bitmap ) { @@ -56,7 +76,7 @@ checkSmallBitmap( StgPtr payload, StgWord32 bitmap ) return i; } - 
+//@cindex checkLargeBitmap static StgOffset checkLargeBitmap( StgPtr payload, StgLargeBitmap* large_bitmap ) { @@ -75,6 +95,7 @@ checkLargeBitmap( StgPtr payload, StgLargeBitmap* large_bitmap ) return i; } +//@cindex checkStackClosure StgOffset checkStackClosure( StgClosure* c ) { @@ -91,17 +112,28 @@ checkStackClosure( StgClosure* c ) case RET_BCO: /* small bitmap (<= 32 entries) */ case RET_SMALL: case RET_VEC_SMALL: + return 1 + checkSmallBitmap((StgPtr)c + 1,info->layout.bitmap); + case UPDATE_FRAME: case CATCH_FRAME: case STOP_FRAME: case SEQ_FRAME: - return 1 + checkSmallBitmap((StgPtr)c + 1,info->layout.bitmap); +#if defined(GRAN) + return 2 + +#else + return 1 + +#endif + checkSmallBitmap((StgPtr)c + 1,info->layout.bitmap); case RET_BIG: /* large bitmap (> 32 entries) */ case RET_VEC_BIG: return 1 + checkLargeBitmap((StgPtr)c + 1,info->layout.large_bitmap); case FUN: case FUN_STATIC: /* probably a slow-entry point return address: */ - return 1; +#if 0 && defined(GRAN) + return 2; +#else + return 1; +#endif default: /* if none of the above, maybe it's a closure which looks a * little like an infotable @@ -118,6 +150,7 @@ checkStackClosure( StgClosure* c ) * chunks. */ +//@cindex checkClosureShallow void checkClosureShallow( StgClosure* p ) { @@ -133,6 +166,7 @@ checkClosureShallow( StgClosure* p ) } /* check an individual stack object */ +//@cindex checkStackObject StgOffset checkStackObject( StgPtr sp ) { @@ -151,6 +185,7 @@ checkStackObject( StgPtr sp ) } /* check sections of stack between update frames */ +//@cindex checkStackChunk void checkStackChunk( StgPtr sp, StgPtr stack_end ) { @@ -160,9 +195,10 @@ checkStackChunk( StgPtr sp, StgPtr stack_end ) while (p < stack_end) { p += checkStackObject( p ); } - ASSERT( p == stack_end ); + // ASSERT( p == stack_end ); -- HWL } +//@cindex checkStackChunk StgOffset checkClosure( StgClosure* p ) { @@ -332,13 +368,17 @@ checkClosure( StgClosure* p ) case BLOCKED_FETCH: case FETCH_ME: case EVACUATED: - barf("checkClosure: unimplemented/strange closure type"); + barf("checkClosure: unimplemented/strange closure type %d", + info->type); default: - barf("checkClosure"); + barf("checkClosure (closure type %d)", info->type); } #undef LOOKS_LIKE_PTR } +//@node Heap Sanity, TSO Sanity, Stack sanity +//@subsection Heap Sanity + /* ----------------------------------------------------------------------------- Check Heap Sanity @@ -348,6 +388,7 @@ checkClosure( StgClosure* p ) all the objects in the remainder of the chain. 
   -------------------------------------------------------------------------- */
 
+//@cindex checkHeap
 extern void
 checkHeap(bdescr *bd, StgPtr start)
 {
@@ -377,6 +418,7 @@ checkHeap(bdescr *bd, StgPtr start)
   }
 }
 
+//@cindex checkChain
 extern void
 checkChain(bdescr *bd)
 {
@@ -387,6 +429,7 @@ checkChain(bdescr *bd)
 }
 
 /* check stack - making sure that update frames are linked correctly */
+//@cindex checkStack
 void
 checkStack(StgPtr sp, StgPtr stack_end, StgUpdateFrame* su )
 {
@@ -415,6 +458,10 @@ checkStack(StgPtr sp, StgPtr stack_end, StgUpdateFrame* su )
   ASSERT(stgCast(StgPtr,su) == stack_end);
 }
 
+//@node TSO Sanity, Thread Queue Sanity, Heap Sanity
+//@subsection TSO Sanity
+
+//@cindex checkTSO
 extern void
 checkTSO(StgTSO *tso)
 {
@@ -437,6 +484,69 @@ checkTSO(StgTSO *tso)
   checkStack(sp, stack_end, su);
 }
 
+#if defined(GRAN)
+//@cindex checkTSOsSanity
+extern void
+checkTSOsSanity(void) {
+  nat i, tsos;
+  StgTSO *tso;
+
+  belch("Checking sanity of all runnable TSOs:");
+
+  for (i=0, tsos=0; i<RtsFlags.GranFlags.proc; i++) {
+    for (tso=run_queue_hds[i]; tso!=END_TSO_QUEUE; tso=tso->link) {
+      fprintf(stderr, "TSO %p on PE %d ...", tso, i);
+      checkTSO(tso);
+      fprintf(stderr, "OK, ");
+      tsos++;
+    }
+  }
+
+  belch(" checked %d TSOs on %d PEs; ok\n", tsos, RtsFlags.GranFlags.proc);
+}
+
+//@node Thread Queue Sanity, Blackhole Sanity, TSO Sanity
+//@subsection Thread Queue Sanity
+
+// still GRAN only
+
+//@cindex checkThreadQSanity
+extern rtsBool
+checkThreadQSanity (PEs proc, rtsBool check_TSO_too)
+{
+  StgTSO *tso, *prev;
+
+  /* the NIL value for TSOs is END_TSO_QUEUE; thus, finding NULL is an error */
+  ASSERT(run_queue_hds[proc]!=NULL);
+  ASSERT(run_queue_tls[proc]!=NULL);
+  /* if either head or tail is NIL then the other one must be NIL, too */
+  ASSERT(run_queue_hds[proc]!=END_TSO_QUEUE || run_queue_tls[proc]==END_TSO_QUEUE);
+  ASSERT(run_queue_tls[proc]!=END_TSO_QUEUE || run_queue_hds[proc]==END_TSO_QUEUE);
+  for (tso=run_queue_hds[proc], prev=END_TSO_QUEUE;
+       tso!=END_TSO_QUEUE;
+       prev=tso, tso=tso->link) {
+    ASSERT((prev!=END_TSO_QUEUE || tso==run_queue_hds[proc]) &&
+           (prev==END_TSO_QUEUE || prev->link==tso));
+    if (check_TSO_too)
+      checkTSO(tso);
+  }
+  ASSERT(prev==run_queue_tls[proc]);
+}
+
+//@cindex checkThreadQsSanity
+extern rtsBool
+checkThreadQsSanity (rtsBool check_TSO_too)
+{
+  PEs p;
+
+  for (p=0; p<RtsFlags.GranFlags.proc; p++)
+    checkThreadQSanity(p, check_TSO_too);
+}
+#endif /* GRAN */
+
+//@node Blackhole Sanity, Index, Thread Queue Sanity
+//@subsection Blackhole Sanity
+
 rtsBool
 isBlackhole( StgTSO* tso, StgClosure* p )
 {
   StgUpdateFrame* su = tso->su;
   do {
@@ -474,4 +586,26 @@ rtsBool isBlackhole( StgTSO* tso, StgClosure* p )
 } while (1);
 }
 
+//@node Index,  , Blackhole Sanity
+//@subsection Index
+
+//@index
+//* checkChain::  @cindex\s-+checkChain
+//* checkClosureShallow::  @cindex\s-+checkClosureShallow
+//* checkHeap::  @cindex\s-+checkHeap
+//* checkLargeBitmap::  @cindex\s-+checkLargeBitmap
+//* checkSmallBitmap::  @cindex\s-+checkSmallBitmap
+//* checkStack::  @cindex\s-+checkStack
+//* checkStackChunk::  @cindex\s-+checkStackChunk
+//* checkStackChunk::  @cindex\s-+checkStackChunk
+//* checkStackClosure::  @cindex\s-+checkStackClosure
+//* checkStackObject::  @cindex\s-+checkStackObject
+//* checkTSO::  @cindex\s-+checkTSO
+//* checkTSOsSanity::  @cindex\s-+checkTSOsSanity
+//* checkThreadQSanity::  @cindex\s-+checkThreadQSanity
+//* checkThreadQsSanity::  @cindex\s-+checkThreadQsSanity
+//* isBlackhole::  @cindex\s-+isBlackhole
+//@end index
+
 #endif /* DEBUG */
+
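The queue invariants that checkThreadQSanity asserts (a dedicated sentinel marks the empty queue, NULL always means corruption, and head/tail must agree) can be isolated in a small standalone model. This is an editor's sketch with simplified types, not RTS code:

    #include <assert.h>
    #include <stddef.h>

    /* Toy model of the invariants above: END plays END_TSO_QUEUE. */
    typedef struct Tso { struct Tso *link; } Tso;
    static Tso end_marker;
    #define END (&end_marker)

    static void check_queue(Tso *hd, Tso *tl)
    {
        Tso *t, *prev;
        assert(hd != NULL && tl != NULL);   /* NULL is never a valid queue end */
        assert(hd != END || tl == END);     /* head and tail must agree */
        assert(tl != END || hd == END);     /*   on emptiness            */
        for (t = hd, prev = END; t != END; prev = t, t = t->link)
            assert(prev == END ? t == hd : prev->link == t);
        assert(prev == tl);                 /* tail really is the last element */
    }

    int main(void)
    {
        Tso a, b;
        a.link = &b; b.link = END;
        check_queue(&a, &b);                /* two-element queue is well formed */
        check_queue(END, END);              /* so is the empty queue */
        return 0;
    }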
diff --git a/ghc/rts/Sanity.h b/ghc/rts/Sanity.h
index 6ab9c84..1bd2157 100644
--- a/ghc/rts/Sanity.h
+++ b/ghc/rts/Sanity.h
@@ -1,5 +1,5 @@
 /* -----------------------------------------------------------------------------
- * $Id: Sanity.h,v 1.4 1999/02/05 16:02:52 simonm Exp $
+ * $Id: Sanity.h,v 1.5 2000/01/13 14:34:04 hwloidl Exp $
 *
 * (c) The GHC Team, 1998-1999
 *
@@ -13,6 +13,11 @@ extern void checkHeap  ( bdescr *bd, StgPtr start );
 extern void checkChain ( bdescr *bd );
 extern void checkStack ( StgPtr sp, StgPtr stack_end, StgUpdateFrame* su );
 extern void checkTSO   ( StgTSO* tso );
+#if defined(GRAN)
+extern void checkTSOsSanity(void);
+extern rtsBool checkThreadQSanity (PEs proc, rtsBool check_TSO_too);
+extern rtsBool checkThreadQsSanity (rtsBool check_TSO_too);
+#endif
 
 extern StgOffset checkClosure( StgClosure* p );
 
diff --git a/ghc/rts/Schedule.c b/ghc/rts/Schedule.c
index 1a96f87..d87f7ab 100644
--- a/ghc/rts/Schedule.c
+++ b/ghc/rts/Schedule.c
@@ -1,11 +1,18 @@
-/* -----------------------------------------------------------------------------
- * $Id: Schedule.c,v 1.40 2000/01/13 10:37:31 simonmar Exp $
+/* ---------------------------------------------------------------------------
+ * $Id: Schedule.c,v 1.41 2000/01/13 14:34:05 hwloidl Exp $
 *
 * (c) The GHC Team, 1998-1999
 *
 * Scheduler
 *
- * ---------------------------------------------------------------------------*/
+ * The main scheduling code in GranSim is quite different from that in std
+ * (concurrent) Haskell: while concurrent Haskell just iterates over the
+ * threads in the runnable queue, GranSim is event driven, i.e. it iterates
+ * over the events in the global event queue.  -- HWL
+ * --------------------------------------------------------------------------*/
+
+//@node Main scheduling code, , ,
+//@section Main scheduling code
 
 /* Version with scheduler monitor support for SMPs.
@@ -27,6 +34,23 @@
   SDM & KH, 10/99
 */
 
+//@menu
+//* Includes::
+//* Variables and Data structures::
+//* Prototypes::
+//* Main scheduling loop::
+//* Suspend and Resume::
+//* Run queue code::
+//* Garbage Collection Routines::
+//* Blocking Queue Routines::
+//* Exception Handling Routines::
+//* Debugging Routines::
+//* Index::
+//@end menu
+
+//@node Includes, Variables and Data structures, Main scheduling code, Main scheduling code
+//@subsection Includes
+
 #include "Rts.h"
 #include "SchedAPI.h"
 #include "RtsUtils.h"
@@ -48,9 +72,21 @@
 #include "Sanity.h"
 #include "Stats.h"
 #include "Sparks.h"
+#if defined(GRAN) || defined(PAR)
+# include "GranSimRts.h"
+# include "GranSim.h"
+# include "ParallelRts.h"
+# include "Parallel.h"
+# include "ParallelDebug.h"
+# include "FetchMe.h"
+# include "HLC.h"
+#endif
 
 #include <stdarg.h>
 
+//@node Variables and Data structures, Prototypes, Includes, Main scheduling code
+//@subsection Variables and Data structures
+
 /* Main threads:
 *
 * These are the threads which clients have requested that we run.
@@ -65,6 +101,7 @@
 *
 * Main threads information is kept in a linked list:
 */
+//@cindex StgMainThread
 typedef struct StgMainThread_ {
   StgTSO *         tso;
   SchedulerStatus  stat;
@@ -83,6 +120,47 @@ static StgMainThread *main_threads;
 
 /* Thread queues.
 * Locks required: sched_mutex.
 */
+
+#if DEBUG
+char *whatNext_strs[] = {
+  "ThreadEnterGHC",
+  "ThreadRunGHC",
+  "ThreadEnterHugs",
+  "ThreadKilled",
+  "ThreadComplete"
+};
+
+char *threadReturnCode_strs[] = {
+  "HeapOverflow",			/* might also be StackOverflow */
+  "StackOverflow",
+  "ThreadYielding",
+  "ThreadBlocked",
+  "ThreadFinished"
+};
+#endif
+
+#if defined(GRAN)
+
+StgTSO* ActiveTSO = NULL; /* for assigning system costs; GranSim-Light only */
+// rtsTime TimeOfNextEvent, EndOfTimeSlice;            now in GranSim.c
+
+/*
+  In GranSim we have a runnable and a blocked queue for each processor.
+  In order to minimise code changes new arrays run_queue_hds/tls
+  are created.
run_queue_hd is then a short cut (macro) for + run_queue_hds[CurrentProc] (see GranSim.h). + -- HWL +*/ +StgTSO *run_queue_hds[MAX_PROC], *run_queue_tls[MAX_PROC]; +StgTSO *blocked_queue_hds[MAX_PROC], *blocked_queue_tls[MAX_PROC]; +StgTSO *ccalling_threadss[MAX_PROC]; + +#else /* !GRAN */ + +//@cindex run_queue_hd +//@cindex run_queue_tl +//@cindex blocked_queue_hd +//@cindex blocked_queue_tl StgTSO *run_queue_hd, *run_queue_tl; StgTSO *blocked_queue_hd, *blocked_queue_tl; @@ -93,6 +171,7 @@ static StgTSO *suspended_ccalling_threads; static void GetRoots(void); static StgTSO *threadStackOverflow(StgTSO *tso); +#endif /* KH: The following two flags are shared memory locations. There is no need to lock them, since they are only unset at the end of a scheduler @@ -100,14 +179,17 @@ static StgTSO *threadStackOverflow(StgTSO *tso); */ /* flag set by signal handler to precipitate a context switch */ +//@cindex context_switch nat context_switch; /* if this flag is set as well, give up execution */ +//@cindex interrupted rtsBool interrupted; /* Next thread ID to allocate. * Locks required: sched_mutex */ +//@cindex next_thread_id StgThreadID next_thread_id = 1; /* @@ -132,10 +214,19 @@ StgThreadID next_thread_id = 1; * Locks required: sched_mutex. */ #ifdef SMP -Capability *free_capabilities; /* Available capabilities for running threads */ -nat n_free_capabilities; /* total number of available capabilities */ +//@cindex free_capabilities +//@cindex n_free_capabilities +Capability *free_capabilities; /* Available capabilities for running threads */ +nat n_free_capabilities; /* total number of available capabilities */ +#else +//@cindex MainRegTable +Capability MainRegTable; /* for non-SMP, we have one global capability */ +#endif + +#if defined(GRAN) +StgTSO *CurrentTSOs[MAX_PROC]; #else -Capability MainRegTable; /* for non-SMP, we have one global capability */ +StgTSO *CurrentTSO; #endif rtsBool ready_to_gc; @@ -143,6 +234,7 @@ rtsBool ready_to_gc; /* All our current task ids, saved in case we need to kill them later. */ #ifdef SMP +//@cindex task_ids task_info *task_ids; #endif @@ -157,6 +249,10 @@ static void sched_belch(char *s, ...); #endif #ifdef SMP +//@cindex sched_mutex +//@cindex term_mutex +//@cindex thread_ready_cond +//@cindex gc_pending_cond pthread_mutex_t sched_mutex = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_t term_mutex = PTHREAD_MUTEX_INITIALIZER; pthread_cond_t thread_ready_cond = PTHREAD_COND_INITIALIZER; @@ -165,7 +261,35 @@ pthread_cond_t gc_pending_cond = PTHREAD_COND_INITIALIZER; nat await_death; #endif -/* ----------------------------------------------------------------------------- +#if defined(PAR) +StgTSO *LastTSO; +rtsTime TimeOfLastYield; +#endif + +/* + * The thread state for the main thread. +// ToDo: check whether not needed any more +StgTSO *MainTSO; + */ + + +//@node Prototypes, Main scheduling loop, Variables and Data structures, Main scheduling code +//@subsection Prototypes + +#if 0 && defined(GRAN) +// ToDo: replace these with macros +static /* inline */ void add_to_run_queue(StgTSO* tso); +static /* inline */ void push_on_run_queue(StgTSO* tso); +static /* inline */ StgTSO *take_off_run_queue(StgTSO *tso); + +/* Thread management */ +void initScheduler(void); +#endif + +//@node Main scheduling loop, Suspend and Resume, Prototypes, Main scheduling code +//@subsection Main scheduling loop + +/* --------------------------------------------------------------------------- Main scheduling loop. 
We use round-robin scheduling, each thread returning to the @@ -184,18 +308,35 @@ nat await_death; * waiting for work, or * waiting for a GC to complete. - -------------------------------------------------------------------------- */ - + ------------------------------------------------------------------------ */ +//@cindex schedule static void schedule( void ) { StgTSO *t; Capability *cap; StgThreadReturnCode ret; +#if defined(GRAN) + rtsEvent *event; +#elif defined(PAR) + rtsSpark spark; + StgTSO *tso; + GlobalTaskId pe; +#endif ACQUIRE_LOCK(&sched_mutex); +#if defined(GRAN) +# error ToDo: implement GranSim scheduler +#elif defined(PAR) + while (!GlobalStopPending) { /* GlobalStopPending set in par_exit */ + + if (PendingFetches != END_BF_QUEUE) { + processFetches(); + } +#else while (1) { +#endif /* If we're interrupted (the user pressed ^C, or some other * termination condition occurred), kill all the currently running @@ -267,7 +408,7 @@ schedule( void ) * number of threads in the run queue equal to the number of * free capabilities. */ -#if defined(SMP) || defined(PAR) +#if defined(SMP) { nat n = n_free_capabilities; StgTSO *tso = run_queue_hd; @@ -284,11 +425,12 @@ schedule( void ) if (spark == NULL) { break; /* no more sparks in the pool */ } else { + // I'd prefer this to be done in activateSpark -- HWL StgTSO *tso; tso = createThread_(RtsFlags.GcFlags.initialStkSize, rtsTrue); pushClosure(tso,spark); PUSH_ON_RUN_QUEUE(tso); -#ifdef ToDo +#ifdef PAR advisory_thread_count++; #endif @@ -304,7 +446,7 @@ schedule( void ) pthread_cond_signal(&thread_ready_cond); } } -#endif /* SMP || PAR */ +#endif /* SMP */ /* Check whether any waiting threads need to be woken up. If the * run queue is empty, and there are no other tasks running, we @@ -375,10 +517,114 @@ schedule( void ) IF_DEBUG(scheduler, sched_belch("work now available")); } #endif + +#if defined(GRAN) +# error ToDo: implement GranSim scheduler +#elif defined(PAR) + // ToDo: phps merge with spark activation above + /* check whether we have local work and send requests if we have none */ + if (run_queue_hd == END_TSO_QUEUE) { /* no runnable threads */ + /* :-[ no local threads => look out for local sparks */ + if (advisory_thread_count < RtsFlags.ParFlags.maxThreads && + (pending_sparks_hd[REQUIRED_POOL] < pending_sparks_tl[REQUIRED_POOL] || + pending_sparks_hd[ADVISORY_POOL] < pending_sparks_tl[ADVISORY_POOL])) { + /* + * ToDo: add GC code check that we really have enough heap afterwards!! + * Old comment: + * If we're here (no runnable threads) and we have pending + * sparks, we must have a space problem. Get enough space + * to turn one of those pending sparks into a + * thread... + */ + + spark = findSpark(); /* get a spark */ + if (spark != (rtsSpark) NULL) { + tso = activateSpark(spark); /* turn the spark into a thread */ + IF_PAR_DEBUG(verbose, + belch("== [%x] schedule: Created TSO %p (%d); %d threads active", + mytid, tso, tso->id, advisory_thread_count)); + + if (tso==END_TSO_QUEUE) { // failed to activate spark -> back to loop + belch("^^ failed to activate spark"); + goto next_thread; + } // otherwise fall through & pick-up new tso + } else { + IF_PAR_DEBUG(verbose, + belch("^^ no local sparks (spark pool contains only NFs: %d)", + spark_queue_len(ADVISORY_POOL))); + goto next_thread; + } + } else + /* =8-[ no local sparks => look for work on other PEs */ + { + /* + * We really have absolutely no work. Send out a fish + * (there may be some out there already), and wait for + * something to arrive. 
We clearly can't run any threads + * until a SCHEDULE or RESUME arrives, and so that's what + * we're hoping to see. (Of course, we still have to + * respond to other types of messages.) + */ + if (//!fishing && + outstandingFishes < RtsFlags.ParFlags.maxFishes ) { // && + // (last_fish_arrived_at+FISH_DELAY < CURRENT_TIME)) { + /* fishing set in sendFish, processFish; + avoid flooding system with fishes via delay */ + pe = choosePE(); + sendFish(pe, mytid, NEW_FISH_AGE, NEW_FISH_HISTORY, + NEW_FISH_HUNGER); + } + + processMessages(); + goto next_thread; + // ReSchedule(0); + } + } else if (PacketsWaiting()) { /* Look for incoming messages */ + processMessages(); + } + + /* Now we are sure that we have some work available */ + ASSERT(run_queue_hd != END_TSO_QUEUE); + /* Take a thread from the run queue, if we have work */ + t = take_off_run_queue(END_TSO_QUEUE); + + /* ToDo: write something to the log-file + if (RTSflags.ParFlags.granSimStats && !sameThread) + DumpGranEvent(GR_SCHEDULE, RunnableThreadsHd); + */ + + CurrentTSO = t; + + IF_DEBUG(scheduler, belch("--^^ %d sparks on [%#x] (hd=%x; tl=%x; lim=%x)", + spark_queue_len(ADVISORY_POOL), CURRENT_PROC, + pending_sparks_hd[ADVISORY_POOL], + pending_sparks_tl[ADVISORY_POOL], + pending_sparks_lim[ADVISORY_POOL])); + + IF_DEBUG(scheduler, belch("--== %d threads on [%#x] (hd=%x; tl=%x)", + run_queue_len(), CURRENT_PROC, + run_queue_hd, run_queue_tl)); + + if (t != LastTSO) { + /* + we are running a different TSO, so write a schedule event to log file + NB: If we use fair scheduling we also have to write a deschedule + event for LastTSO; with unfair scheduling we know that the + previous tso has blocked whenever we switch to another tso, so + we don't need it in GUM for now + */ + DumpRawGranEvent(CURRENT_PROC, CURRENT_PROC, + GR_SCHEDULE, t, (StgClosure *)NULL, 0, 0); + + } + +#else /* !GRAN && !PAR */ /* grab a thread from the run queue */ t = POP_RUN_QUEUE(); + +#endif /* grab a capability */ @@ -403,6 +649,7 @@ schedule( void ) IF_DEBUG(scheduler,sched_belch("running thread %d", t->id)); + /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */ /* Run the current thread */ switch (cap->rCurrentTSO->whatNext) { @@ -433,6 +680,7 @@ schedule( void ) default: barf("schedule: invalid whatNext field"); } + /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */ /* Costs for the scheduler are assigned to CCS_SYSTEM */ #ifdef PROFILING @@ -487,6 +735,14 @@ schedule( void ) break; case ThreadYielding: +#if defined(GRAN) + IF_DEBUG(gran, + DumpGranEvent(GR_DESCHEDULE, t)); + globalGranStats.tot_yields++; +#elif defined(PAR) + IF_DEBUG(par, + DumpGranEvent(GR_DESCHEDULE, t)); +#endif /* put the thread back on the run queue. Then, if we're ready to * GC, check whether this is the last task to stop. If so, wake * up the GC thread. getThread will block during a GC until the @@ -507,6 +763,13 @@ schedule( void ) break; case ThreadBlocked: +#if defined(GRAN) +# error ToDo: implement GranSim scheduler +#elif defined(PAR) + IF_DEBUG(par, + DumpGranEvent(GR_DESCHEDULE, t)); +#else +#endif /* don't need to do anything. 
Either the thread is blocked on * I/O, in which case we'll have called addToBlockedQueue * previously, or it's blocked on an MVar or Blackhole, in which @@ -527,6 +790,13 @@ schedule( void ) */ IF_DEBUG(scheduler,belch("thread %ld finished", t->id)); t->whatNext = ThreadComplete; +#if defined(GRAN) + // ToDo: endThread(t, CurrentProc); // clean-up the thread +#elif defined(PAR) + advisory_thread_count--; + if (RtsFlags.ParFlags.ParStats.Full) + DumpEndEvent(CURRENT_PROC, t, rtsFalse /* not mandatory */); +#endif break; default: @@ -540,10 +810,11 @@ schedule( void ) #endif #ifdef SMP - if (ready_to_gc && n_free_capabilities == RtsFlags.ParFlags.nNodes) { + if (ready_to_gc && n_free_capabilities == RtsFlags.ParFlags.nNodes) #else - if (ready_to_gc) { + if (ready_to_gc) #endif + { /* everybody back, start the GC. * Could do it in this thread, or signal a condition var * to do it in another thread. Either way, we need to @@ -558,10 +829,26 @@ schedule( void ) pthread_cond_broadcast(&gc_pending_cond); #endif } +#if defined(GRAN) + next_thread: + IF_GRAN_DEBUG(unused, + print_eventq(EventHd)); + + event = get_next_event(); + +#elif defined(PAR) + next_thread: + /* ToDo: wait for next message to arrive rather than busy wait */ + +#else /* GRAN */ + /* not any more + next_thread: + t = take_off_run_queue(END_TSO_QUEUE); + */ +#endif /* GRAN */ } /* end of while(1) */ } - /* A hack for Hugs concurrency support. Needs sanitisation (?) */ void deleteAllThreads ( void ) { @@ -577,8 +864,12 @@ void deleteAllThreads ( void ) blocked_queue_hd = blocked_queue_tl = END_TSO_QUEUE; } +/* startThread and insertThread are now in GranSim.c -- HWL */ -/* ----------------------------------------------------------------------------- +//@node Suspend and Resume, Run queue code, Main scheduling loop, Main scheduling code +//@subsection Suspend and Resume + +/* --------------------------------------------------------------------------- * Suspending & resuming Haskell threads. * * When making a "safe" call to C (aka _ccall_GC), the task gives back @@ -591,7 +882,7 @@ void deleteAllThreads ( void ) * duration of the call, on the susepended_ccalling_threads queue. We * give out a token to the task, which it can use to resume the thread * on return from the C function. - * -------------------------------------------------------------------------- */ + * ------------------------------------------------------------------------- */ StgInt suspendThread( Capability *cap ) @@ -660,17 +951,18 @@ resumeThread( StgInt tok ) return cap; } -/* ----------------------------------------------------------------------------- + +/* --------------------------------------------------------------------------- * Static functions - * -------------------------------------------------------------------------- */ + * ------------------------------------------------------------------------ */ static void unblockThread(StgTSO *tso); -/* ----------------------------------------------------------------------------- +/* --------------------------------------------------------------------------- * Comparing Thread ids. * * This is used from STG land in the implementation of the * instances of Eq/Ord for ThreadIds. 
- * -------------------------------------------------------------------------- */ + * ------------------------------------------------------------------------ */ int cmp_thread(const StgTSO *tso1, const StgTSO *tso2) { @@ -682,7 +974,7 @@ int cmp_thread(const StgTSO *tso1, const StgTSO *tso2) return 0; } -/* ----------------------------------------------------------------------------- +/* --------------------------------------------------------------------------- Create a new thread. The new thread starts with the given stack size. Before the @@ -692,19 +984,50 @@ int cmp_thread(const StgTSO *tso1, const StgTSO *tso2) createGenThread() and createIOThread() (in SchedAPI.h) are convenient packaged versions of this function. - -------------------------------------------------------------------------- */ + ------------------------------------------------------------------------ */ +//@cindex createThread +#if defined(GRAN) +/* currently pri (priority) is only used in a GRAN setup -- HWL */ +StgTSO * +createThread(nat stack_size, StgInt pri) +{ + return createThread_(stack_size, rtsFalse, pri); +} +static StgTSO * +createThread_(nat size, rtsBool have_lock, StgInt pri) +{ +#else StgTSO * -createThread(nat size) +createThread(nat stack_size) { - return createThread_(size, rtsFalse); + return createThread_(stack_size, rtsFalse); } static StgTSO * createThread_(nat size, rtsBool have_lock) { - StgTSO *tso; - nat stack_size; +#endif + StgTSO *tso; + nat stack_size; + + /* First check whether we should create a thread at all */ +#if defined(PAR) + /* check that no more than RtsFlags.ParFlags.maxThreads threads are created */ + if (advisory_thread_count >= RtsFlags.ParFlags.maxThreads) { + threadsIgnored++; + belch("{createThread}Daq ghuH: refusing to create another thread; no more than %d threads allowed (currently %d)", + RtsFlags.ParFlags.maxThreads, advisory_thread_count); + return END_TSO_QUEUE; + } + threadsCreated++; +#endif + +#if defined(GRAN) + ASSERT(!RtsFlags.GranFlags.Light || CurrentProc==0); +#endif + + // ToDo: check whether size = stack_size - TSO_STRUCT_SIZEW /* catch ridiculously small stack sizes */ if (size < MIN_STACK_WORDS + TSO_STRUCT_SIZEW) { @@ -716,9 +1039,13 @@ createThread_(nat size, rtsBool have_lock) stack_size = size - TSO_STRUCT_SIZEW; + // Hmm, this CCS_MAIN is not protected by a PROFILING cpp var; SET_HDR(tso, &TSO_info, CCS_MAIN); - tso->whatNext = ThreadEnterGHC; - +#if defined(GRAN) + SET_GRAN_HDR(tso, ThisPE); +#endif + tso->whatNext = ThreadEnterGHC; + /* tso->id needs to be unique. For now we use a heavyweight mutex to protect the increment operation on next_thread_id. In future, we could use an atomic increment instead. 
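The comment above leaves the atomic increment as future work; the patch itself keeps the heavyweight mutex around next_thread_id++. Purely as an illustration of the alternative, a standalone sketch using the GCC/Clang builtin __sync_fetch_and_add (which postdates this code; the type is a stand-in):

    #include <stdio.h>

    /* Standalone sketch only; StgThreadID is a stand-in typedef here. */
    typedef unsigned int StgThreadID;
    static volatile StgThreadID next_thread_id = 1;

    static StgThreadID
    allocThreadId(void)
    {
        /* atomically returns the old counter value and bumps it by one,
           so two concurrent callers can never obtain the same id */
        return __sync_fetch_and_add(&next_thread_id, 1);
    }

    int main(void)
    {
        printf("%u %u\n", allocThreadId(), allocThreadId());  /* 1 2 */
        return 0;
    }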
@@ -746,13 +1073,69 @@ createThread_(nat size, rtsBool have_lock) SET_HDR((StgClosure*)tso->sp,(StgInfoTable *)&stg_stop_thread_info,CCS_MAIN); tso->su = (StgUpdateFrame*)tso->sp; + IF_DEBUG(scheduler,belch("---- Initialised TSO %ld (%p), stack size = %lx words", + tso->id, tso, tso->stack_size)); + + // ToDo: check this +#if defined(GRAN) + tso->link = END_TSO_QUEUE; + /* uses more flexible routine in GranSim */ + insertThread(tso, CurrentProc); +#else + add_to_run_queue(tso); +#endif + +#if defined(GRAN) + tso->gran.pri = pri; + tso->gran.magic = TSO_MAGIC; // debugging only + tso->gran.sparkname = 0; + tso->gran.startedat = CURRENT_TIME; + tso->gran.exported = 0; + tso->gran.basicblocks = 0; + tso->gran.allocs = 0; + tso->gran.exectime = 0; + tso->gran.fetchtime = 0; + tso->gran.fetchcount = 0; + tso->gran.blocktime = 0; + tso->gran.blockcount = 0; + tso->gran.blockedat = 0; + tso->gran.globalsparks = 0; + tso->gran.localsparks = 0; + if (RtsFlags.GranFlags.Light) + tso->gran.clock = Now; /* local clock */ + else + tso->gran.clock = 0; + + IF_DEBUG(gran,printTSO(tso)); +#elif defined(PAR) + tso->par.sparkname = 0; + tso->par.startedat = CURRENT_TIME; + tso->par.exported = 0; + tso->par.basicblocks = 0; + tso->par.allocs = 0; + tso->par.exectime = 0; + tso->par.fetchtime = 0; + tso->par.fetchcount = 0; + tso->par.blocktime = 0; + tso->par.blockcount = 0; + tso->par.blockedat = 0; + tso->par.globalsparks = 0; + tso->par.localsparks = 0; +#endif + +#if defined(GRAN) + globalGranStats.tot_threads_created++; + globalGranStats.threads_created_on_PE[CurrentProc]++; + globalGranStats.tot_sq_len += spark_queue_len(CurrentProc); + globalGranStats.tot_sq_probes++; +#endif + IF_DEBUG(scheduler,sched_belch("created thread %ld, stack size = %lx words", tso->id, tso->stack_size)); return tso; } - -/* ----------------------------------------------------------------------------- +/* --------------------------------------------------------------------------- * scheduleThread() * * scheduleThread puts a thread on the head of the runnable queue. @@ -760,7 +1143,7 @@ createThread_(nat size, rtsBool have_lock) * The caller of scheduleThread must create the thread using e.g. * createThread and push an appropriate closure * on this thread's stack before the scheduler is invoked. - * -------------------------------------------------------------------------- */ + * ------------------------------------------------------------------------ */ void scheduleThread(StgTSO *tso) @@ -779,14 +1162,13 @@ scheduleThread(StgTSO *tso) RELEASE_LOCK(&sched_mutex); } - -/* ----------------------------------------------------------------------------- +/* --------------------------------------------------------------------------- * startTasks() * * Start up Posix threads to run each of the scheduler tasks. * I believe the task ids are not needed in the system as defined. - * KH @ 25/10/99 - * -------------------------------------------------------------------------- */ + * KH @ 25/10/99 + * ------------------------------------------------------------------------ */ #ifdef SMP static void * @@ -797,7 +1179,7 @@ taskStart( void *arg STG_UNUSED ) } #endif -/* ----------------------------------------------------------------------------- +/* --------------------------------------------------------------------------- * initScheduler() * * Initialise the scheduler. This resets all the queues - if the @@ -805,7 +1187,7 @@ taskStart( void *arg STG_UNUSED ) * next pass. * * This now calls startTasks(), so should only be called once! 
 KH @ 25/10/99
- * -------------------------------------------------------------------------- */
+ * ------------------------------------------------------------------------ */
 
 #ifdef SMP
 static void
@@ -819,12 +1201,26 @@ term_handler(int sig STG_UNUSED)
 }
 #endif
 
-void initScheduler(void)
+//@cindex initScheduler
+void
+initScheduler(void)
 {
+#if defined(GRAN)
+  nat i;
+
+  for (i=0; i<MAX_PROC; i++) {
+    run_queue_hds[i]      = END_TSO_QUEUE;
+    run_queue_tls[i]      = END_TSO_QUEUE;
+    blocked_queue_hds[i]  = END_TSO_QUEUE;
+    blocked_queue_tls[i]  = END_TSO_QUEUE;
+    ccalling_threadss[i]  = END_TSO_QUEUE;
+  }
+#else
   run_queue_hd      = END_TSO_QUEUE;
   run_queue_tl      = END_TSO_QUEUE;
   blocked_queue_hd  = END_TSO_QUEUE;
   blocked_queue_tl  = END_TSO_QUEUE;
+#endif
 
   suspended_ccalling_threads  = END_TSO_QUEUE;
 
@@ -1009,42 +1405,127 @@ waitThread(StgTSO *tso, /*out*/StgClosure **ret)
   return stat;
 }
 
-
-/* -----------------------------------------------------------------------------
-   Debugging: why is a thread blocked
-   -------------------------------------------------------------------------- */
-
-#ifdef DEBUG
-void printThreadBlockage(StgTSO *tso)
+//@node Run queue code, Garbage Collection Routines, Suspend and Resume, Main scheduling code
+//@subsection Run queue code
+
+#if 0
+/*
+  NB: In GranSim we have many run queues; run_queue_hd is actually a macro
+      unfolding to run_queue_hds[CurrentProc], thus CurrentProc is an
+      implicit global variable that has to be correct when calling these
+      fcts -- HWL
+*/
+
+/* Put the new thread on the head of the runnable queue.
+ * The caller of createThread better push an appropriate closure
+ * on this thread's stack before the scheduler is invoked.
+ */
+static /* inline */ void
+add_to_run_queue(tso)
+StgTSO* tso;
 {
-  switch (tso->why_blocked) {
-  case BlockedOnRead:
-    fprintf(stderr,"blocked on read from fd %d", tso->block_info.fd);
-    break;
-  case BlockedOnWrite:
-    fprintf(stderr,"blocked on write to fd %d", tso->block_info.fd);
-    break;
-  case BlockedOnDelay:
-    fprintf(stderr,"blocked on delay of %d ms", tso->block_info.delay);
-    break;
-  case BlockedOnMVar:
-    fprintf(stderr,"blocked on an MVar");
-    break;
-  case BlockedOnException:
-    fprintf(stderr,"blocked on delivering an exception to thread %d",
-	    tso->block_info.tso->id);
-    break;
-  case BlockedOnBlackHole:
-    fprintf(stderr,"blocked on a black hole");
-    break;
-  case NotBlocked:
-    fprintf(stderr,"not blocked");
-    break;
+  ASSERT(tso!=run_queue_hd && tso!=run_queue_tl);
+  tso->link = run_queue_hd;
+  run_queue_hd = tso;
+  if (run_queue_tl == END_TSO_QUEUE) {
+    run_queue_tl = tso;
   }
 }
-#endif
 
+/* Put the new thread at the end of the runnable queue. */
+static /* inline */ void
+push_on_run_queue(tso)
+StgTSO* tso;
+{
+  ASSERT(get_itbl((StgClosure *)tso)->type == TSO);
+  ASSERT(run_queue_hd!=NULL && run_queue_tl!=NULL);
+  ASSERT(tso!=run_queue_hd && tso!=run_queue_tl);
+  if (run_queue_hd == END_TSO_QUEUE) {
+    run_queue_hd = tso;
+  } else {
+    run_queue_tl->link = tso;
+  }
+  run_queue_tl = tso;
+}
+
+/*
+  Should be inlined because it's used very often in schedule.  The tso
+  argument is actually only needed in GranSim, where we want to have the
+  possibility to schedule *any* TSO on the run queue, irrespective of the
+  actual ordering. Therefore, if tso is not the nil TSO then we traverse
+  the run queue and dequeue the tso, adjusting the links in the queue.
+*/
+//@cindex take_off_run_queue
+static /* inline */ StgTSO*
+take_off_run_queue(StgTSO *tso) {
+  StgTSO *t, *prev;
+
+  /*
+    qetlaHbogh Qu' ngaSbogh ghomDaQ {tso} yIteq!
+
+    if tso is specified, unlink that tso from the run_queue (doesn't have
+    to be at the beginning of the queue); GranSim only
+  */
+  if (tso!=END_TSO_QUEUE) {
+    /* find tso in queue */
+    for (t=run_queue_hd, prev=END_TSO_QUEUE;
+	 t!=END_TSO_QUEUE && t!=tso;
+	 prev=t, t=t->link)
+      /* nothing */ ;
+    ASSERT(t==tso);
+    /* now actually dequeue the tso */
+    if (prev!=END_TSO_QUEUE) {
+      ASSERT(run_queue_hd!=t);
+      prev->link = t->link;
+    } else {
+      /* t is at beginning of thread queue */
+      ASSERT(run_queue_hd==t);
+      run_queue_hd = t->link;
+    }
+    /* t is at end of thread queue */
+    if (t->link==END_TSO_QUEUE) {
+      ASSERT(t==run_queue_tl);
+      run_queue_tl = prev;
+    } else {
+      ASSERT(run_queue_tl!=t);
+    }
+    t->link = END_TSO_QUEUE;
+  } else {
+    /* take tso from the beginning of the queue; std concurrent code */
+    t = run_queue_hd;
+    if (t != END_TSO_QUEUE) {
+      run_queue_hd = t->link;
+      t->link = END_TSO_QUEUE;
+      if (run_queue_hd == END_TSO_QUEUE) {
+	run_queue_tl = END_TSO_QUEUE;
+      }
+    }
+  }
+  return t;
+}
+
+#endif /* 0 */
+
+nat
+run_queue_len(void)
+{
+  nat i;
+  StgTSO *tso;
+
+  for (i=0, tso=run_queue_hd;
+       tso != END_TSO_QUEUE;
+       i++, tso=tso->link)
+    /* nothing */ ;
+
+  return i;
+}
+
+
+//@node Garbage Collection Routines, Blocking Queue Routines, Run queue code, Main scheduling code
+//@subsection Garbage Collection Routines
+
+/* ---------------------------------------------------------------------------
   Where are the roots that we know about?
 
        - all the threads on the runnable queue
@@ -1052,7 +1533,7 @@
       - all the threads on the blocked queue
       - all the thread currently executing a _ccall_GC
       - all the "main threads"
 
-   -------------------------------------------------------------------------- */
+   ------------------------------------------------------------------------ */
 
 /* This has to be protected either by the scheduler monitor, or by the
    garbage collection monitor (probably the latter).
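take_off_run_queue above has to keep run_queue_hd and run_queue_tl consistent whether the TSO is unlinked from the head, the middle, or the tail of the queue. A toy standalone model of exactly those invariants (simplified types, plain asserts in place of the RTS ASSERTs; an editor's sketch, not RTS code):

    #include <assert.h>
    #include <stddef.h>

    /* Toy model: NULL plays END_TSO_QUEUE. */
    typedef struct Tso { struct Tso *link; } Tso;
    static Tso *hd = NULL, *tl = NULL;

    static void take_off(Tso *t)   /* mirrors take_off_run_queue's unlinking */
    {
        Tso *p, *prev = NULL;
        for (p = hd; p != NULL && p != t; prev = p, p = p->link)
            /* nothing */ ;
        assert(p == t);                         /* t must be in the queue */
        if (prev != NULL) prev->link = t->link; else hd = t->link;
        if (t->link == NULL) tl = prev;         /* t was the tail */
        t->link = NULL;
    }

    int main(void)
    {
        Tso a = { NULL }, b = { NULL }, c = { NULL };
        hd = &a; a.link = &b; b.link = &c; tl = &c;
        take_off(&b);   /* middle: head and tail untouched */
        assert(hd == &a && a.link == &c && tl == &c);
        take_off(&c);   /* tail: tl retreats to the predecessor */
        assert(tl == &a && a.link == NULL);
        take_off(&a);   /* head: queue becomes empty */
        assert(hd == NULL && tl == NULL);
        return 0;
    }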
@@ -1062,12 +1543,36 @@
 static void GetRoots(void)
 {
   StgMainThread *m;
+  nat i;
+
+#if defined(GRAN)
+  for (i=0; i<=RtsFlags.GranFlags.proc; i++) {
+    if ((run_queue_hds[i] != END_TSO_QUEUE) && ((run_queue_hds[i] != NULL)))
+      run_queue_hds[i]    = (StgTSO *)MarkRoot((StgClosure *)run_queue_hds[i]);
+    if ((run_queue_tls[i] != END_TSO_QUEUE) && ((run_queue_tls[i] != NULL)))
+      run_queue_tls[i]    = (StgTSO *)MarkRoot((StgClosure *)run_queue_tls[i]);
+
+    if ((blocked_queue_hds[i] != END_TSO_QUEUE) && ((blocked_queue_hds[i] != NULL)))
+      blocked_queue_hds[i] = (StgTSO *)MarkRoot((StgClosure *)blocked_queue_hds[i]);
+    if ((blocked_queue_tls[i] != END_TSO_QUEUE) && ((blocked_queue_tls[i] != NULL)))
+      blocked_queue_tls[i] = (StgTSO *)MarkRoot((StgClosure *)blocked_queue_tls[i]);
+    if ((ccalling_threadss[i] != END_TSO_QUEUE) && ((ccalling_threadss[i] != NULL)))
+      ccalling_threadss[i] = (StgTSO *)MarkRoot((StgClosure *)ccalling_threadss[i]);
+  }
+
+  markEventQueue();
+
+#elif defined(PAR)
+  run_queue_hd      = (StgTSO *)MarkRoot((StgClosure *)run_queue_hd);
+  run_queue_tl      = (StgTSO *)MarkRoot((StgClosure *)run_queue_tl);
+
+  blocked_queue_hd  = (StgTSO *)MarkRoot((StgClosure *)blocked_queue_hd);
+  blocked_queue_tl  = (StgTSO *)MarkRoot((StgClosure *)blocked_queue_tl);
+#else
   run_queue_hd      = (StgTSO *)MarkRoot((StgClosure *)run_queue_hd);
   run_queue_tl      = (StgTSO *)MarkRoot((StgClosure *)run_queue_tl);
 
   blocked_queue_hd  = (StgTSO *)MarkRoot((StgClosure *)blocked_queue_hd);
   blocked_queue_tl  = (StgTSO *)MarkRoot((StgClosure *)blocked_queue_tl);
+#endif
 
   for (m = main_threads; m != NULL; m = m->link) {
     m->tso = (StgTSO *)MarkRoot((StgClosure *)m->tso);
@@ -1205,10 +1710,93 @@ threadStackOverflow(StgTSO *tso)
   return dest;
 }
 
+//@node Blocking Queue Routines, Exception Handling Routines, Garbage Collection Routines, Main scheduling code
+//@subsection Blocking Queue Routines
+
 /* ---------------------------------------------------------------------------
   Wake up a queue that was blocked on some resource.
-   -------------------------------------------------------------------------- */
+   ------------------------------------------------------------------------ */
+
+// ToDo: check push_on_run_queue vs.
PUSH_ON_RUN_QUEUE + +#if defined(GRAN) +# error FixME +#elif defined(PAR) +static inline void +unblockCount ( StgBlockingQueueElement *bqe, StgClosure *node ) +{ + /* write RESUME events to log file and + update blocked and fetch time (depending on type of the orig closure) */ + if (RtsFlags.ParFlags.ParStats.Full) { + DumpRawGranEvent(CURRENT_PROC, CURRENT_PROC, + GR_RESUMEQ, ((StgTSO *)bqe), ((StgTSO *)bqe)->block_info.closure, + 0, 0 /* spark_queue_len(ADVISORY_POOL) */); + + switch (get_itbl(node)->type) { + case FETCH_ME_BQ: + ((StgTSO *)bqe)->par.fetchtime += CURRENT_TIME-((StgTSO *)bqe)->par.blockedat; + break; + case RBH: + case FETCH_ME: + case BLACKHOLE_BQ: + ((StgTSO *)bqe)->par.blocktime += CURRENT_TIME-((StgTSO *)bqe)->par.blockedat; + break; + default: + barf("{unblockOneLocked}Daq Qagh: unexpected closure in blocking queue"); + } + } +} +#endif + +#if defined(GRAN) +# error FixME +#elif defined(PAR) +static StgBlockingQueueElement * +unblockOneLocked(StgBlockingQueueElement *bqe, StgClosure *node) +{ + StgBlockingQueueElement *next; + + switch (get_itbl(bqe)->type) { + case TSO: + ASSERT(((StgTSO *)bqe)->why_blocked != NotBlocked); + /* if it's a TSO just push it onto the run_queue */ + next = bqe->link; + // ((StgTSO *)bqe)->link = END_TSO_QUEUE; // debugging? + PUSH_ON_RUN_QUEUE((StgTSO *)bqe); + THREAD_RUNNABLE(); + unblockCount(bqe, node); + /* reset blocking status after dumping event */ + ((StgTSO *)bqe)->why_blocked = NotBlocked; + break; + + case BLOCKED_FETCH: + /* if it's a BLOCKED_FETCH put it on the PendingFetches list */ + next = bqe->link; + bqe->link = PendingFetches; + PendingFetches = bqe; + break; +# if defined(DEBUG) + /* can ignore this case in a non-debugging setup; + see comments on RBHSave closures above */ + case CONSTR: + /* check that the closure is an RBHSave closure */ + ASSERT(get_itbl((StgClosure *)bqe) == &RBH_Save_0_info || + get_itbl((StgClosure *)bqe) == &RBH_Save_1_info || + get_itbl((StgClosure *)bqe) == &RBH_Save_2_info); + break; + + default: + barf("{unblockOneLocked}Daq Qagh: Unexpected IP (%#lx; %s) in blocking queue at %#lx\n", + get_itbl((StgClosure *)bqe), info_type((StgClosure *)bqe), + (StgClosure *)bqe); +# endif + } + // IF_DEBUG(scheduler,sched_belch("waking up thread %ld", tso->id)); + return next; +} + +#else /* !GRAN && !PAR */ static StgTSO * unblockOneLocked(StgTSO *tso) { @@ -1223,7 +1811,20 @@ unblockOneLocked(StgTSO *tso) IF_DEBUG(scheduler,sched_belch("waking up thread %ld", tso->id)); return next; } +#endif +#if defined(GRAN) +# error FixME +#elif defined(PAR) +inline StgTSO * +unblockOne(StgTSO *tso, StgClosure *node) +{ + ACQUIRE_LOCK(&sched_mutex); + tso = unblockOneLocked(tso, node); + RELEASE_LOCK(&sched_mutex); + return tso; +} +#else inline StgTSO * unblockOne(StgTSO *tso) { @@ -1232,7 +1833,35 @@ unblockOne(StgTSO *tso) RELEASE_LOCK(&sched_mutex); return tso; } +#endif +#if defined(GRAN) +# error FixME +#elif defined(PAR) +void +awakenBlockedQueue(StgBlockingQueueElement *q, StgClosure *node) +{ + StgBlockingQueueElement *bqe, *next; + + ACQUIRE_LOCK(&sched_mutex); + + IF_PAR_DEBUG(verbose, + belch("## AwBQ for node %p on [%x]: ", + node, mytid)); + + ASSERT(get_itbl(q)->type == TSO || + get_itbl(q)->type == BLOCKED_FETCH || + get_itbl(q)->type == CONSTR); + + bqe = q; + while (get_itbl(bqe)->type==TSO || + get_itbl(bqe)->type==BLOCKED_FETCH) { + bqe = unblockOneLocked(bqe, node); + } + RELEASE_LOCK(&sched_mutex); +} + +#else /* !GRAN && !PAR */ void awakenBlockedQueue(StgTSO *tso) { @@ -1242,11 +1871,275 @@ 
awakenBlockedQueue(StgTSO *tso) } RELEASE_LOCK(&sched_mutex); } +#endif -/* ----------------------------------------------------------------------------- +#if 0 +// ngoq ngo' + +#if defined(GRAN) +/* + Awakening a blocking queue in GranSim means checking for each of the + TSOs in the queue whether they are local or not, issuing a ResumeThread + or an UnblockThread event, respectively. The basic iteration over the + blocking queue is the same as in the standard setup. +*/ +void +awaken_blocked_queue(StgBlockingQueueElement *q, StgClosure *node) +{ + StgBlockingQueueElement *bqe, *next; + StgTSO *tso; + PEs node_loc, tso_loc; + rtsTime bq_processing_time = 0; + nat len = 0, len_local = 0; + + IF_GRAN_DEBUG(bq, + belch("## AwBQ for node %p on PE %d @ %ld by TSO %d (%p): ", \ + node, CurrentProc, CurrentTime[CurrentProc], + CurrentTSO->id, CurrentTSO)); + + node_loc = where_is(node); + + ASSERT(get_itbl(q)->type == TSO || // q is either a TSO or an RBHSave + get_itbl(q)->type == CONSTR); // closure (type constructor) + ASSERT(is_unique(node)); + + /* FAKE FETCH: magically copy the node to the tso's proc; + no Fetch necessary because in reality the node should not have been + moved to the other PE in the first place + */ + if (CurrentProc!=node_loc) { + IF_GRAN_DEBUG(bq, + belch("## node %p is on PE %d but CurrentProc is %d (TSO %d); assuming fake fetch and adjusting bitmask (old: %#x)", + node, node_loc, CurrentProc, CurrentTSO->id, + // CurrentTSO, where_is(CurrentTSO), + node->header.gran.procs)); + node->header.gran.procs = (node->header.gran.procs) | PE_NUMBER(CurrentProc); + IF_GRAN_DEBUG(bq, + belch("## new bitmask of node %p is %#x", + node, node->header.gran.procs)); + if (RtsFlags.GranFlags.GranSimStats.Global) { + globalGranStats.tot_fake_fetches++; + } + } + + next = q; + // ToDo: check: ASSERT(CurrentProc==node_loc); + while (get_itbl(next)->type==TSO) { // q != END_TSO_QUEUE) { + bqe = next; + next = bqe->link; + /* + bqe points to the current element in the queue + next points to the next element in the queue + */ + tso = (StgTSO *)bqe; // wastes an assignment to get the type right + tso_loc = where_is(tso); + if (IS_LOCAL_TO(PROCS(node),tso_loc)) { // TSO is local + /* !fake_fetch => TSO is on CurrentProc is same as IS_LOCAL_TO */ + ASSERT(CurrentProc!=node_loc || tso_loc==CurrentProc); + bq_processing_time += RtsFlags.GranFlags.Costs.lunblocktime; + // insertThread(tso, node_loc); + new_event(tso_loc, tso_loc, + CurrentTime[CurrentProc]+bq_processing_time, + ResumeThread, + tso, node, (rtsSpark*)NULL); + tso->link = END_TSO_QUEUE; // overwrite link just to be sure + len_local++; + len++; + } else { // TSO is remote (actually should be FMBQ) + bq_processing_time += RtsFlags.GranFlags.Costs.mpacktime; + bq_processing_time += RtsFlags.GranFlags.Costs.gunblocktime; + new_event(tso_loc, CurrentProc, + CurrentTime[CurrentProc]+bq_processing_time+ + RtsFlags.GranFlags.Costs.latency, + UnblockThread, + tso, node, (rtsSpark*)NULL); + tso->link = END_TSO_QUEUE; // overwrite link just to be sure + bq_processing_time += RtsFlags.GranFlags.Costs.mtidytime; + len++; + } + /* the thread-queue-overhead is accounted for in either Resume or UnblockThread */ + IF_GRAN_DEBUG(bq, + fprintf(stderr," %s TSO %d (%p) [PE %d] (blocked_on=%p) (next=%p) ,", + (node_loc==tso_loc ? 
"Local" : "Global"), + tso->id, tso, CurrentProc, tso->block_info.closure, tso->link)) + tso->block_info.closure = NULL; + IF_DEBUG(scheduler,belch("-- Waking up thread %ld (%p)", + tso->id, tso)); + } + + /* if this is the BQ of an RBH, we have to put back the info ripped out of + the closure to make room for the anchor of the BQ */ + if (next!=END_BQ_QUEUE) { + ASSERT(get_itbl(node)->type == RBH && get_itbl(next)->type == CONSTR); + /* + ASSERT((info_ptr==&RBH_Save_0_info) || + (info_ptr==&RBH_Save_1_info) || + (info_ptr==&RBH_Save_2_info)); + */ + /* cf. convertToRBH in RBH.c for writing the RBHSave closure */ + ((StgRBH *)node)->blocking_queue = ((StgRBHSave *)next)->payload[0]; + ((StgRBH *)node)->mut_link = ((StgRBHSave *)next)->payload[1]; + + IF_GRAN_DEBUG(bq, + belch("## Filled in RBH_Save for %p (%s) at end of AwBQ", + node, info_type(node))); + } + + /* statistics gathering */ + if (RtsFlags.GranFlags.GranSimStats.Global) { + globalGranStats.tot_bq_processing_time += bq_processing_time; + globalGranStats.tot_bq_len += len; // total length of all bqs awakened + globalGranStats.tot_bq_len_local += len_local; // same for local TSOs only + globalGranStats.tot_awbq++; // total no. of bqs awakened + } + IF_GRAN_DEBUG(bq, + fprintf(stderr,"## BQ Stats of %p: [%d entries, %d local] %s\n", + node, len, len_local, (next!=END_TSO_QUEUE) ? "RBH" : "")); +} + +#elif defined(PAR) + +/* + Awakening a blocking queue in GUM has to check whether an entry in the + queue is a normal TSO or a BLOCKED_FETCH. The later indicates that a TSO is + waiting for the result of this computation on another PE. Thus, when + finding a BLOCKED_FETCH we have to send off a message to that PE. + Actually, we defer sending off a message, by just putting the BLOCKED_FETCH + onto the PendingFetches queue, which will be later traversed by + processFetches, sending off a RESUME message for each BLOCKED_FETCH. + + NB: There is no check for an RBHSave closure (type CONSTR) in the code + below. The reason is, if we awaken the BQ of an RBH closure (RBHSaves + only exist at the end of such BQs) we know that the closure has been + unpacked successfully on the other PE, and we can discard the info + contained in the RBHSave closure. The current closure will be turned + into a FetchMe closure anyway. 
+*/ +void +awaken_blocked_queue(StgBlockingQueueElement *q, StgClosure *node) +{ + StgBlockingQueueElement *bqe, *next; + + IF_PAR_DEBUG(verbose, + belch("## AwBQ for node %p on [%x]: ", + node, mytid)); + + ASSERT(get_itbl(q)->type == TSO || + get_itbl(q)->type == BLOCKED_FETCH || + get_itbl(q)->type == CONSTR); + + next = q; + while (get_itbl(next)->type==TSO || + get_itbl(next)->type==BLOCKED_FETCH) { + bqe = next; + switch (get_itbl(bqe)->type) { + case TSO: + /* if it's a TSO just push it onto the run_queue */ + next = bqe->link; +#if defined(DEBUG) + ((StgTSO *)bqe)->link = END_TSO_QUEUE; // debugging only +#endif + push_on_run_queue((StgTSO *)bqe); // HWL: was: PUSH_ON_RUN_QUEUE(tso); + + /* write RESUME events to log file and + update blocked and fetch time (depending on type of the orig closure) */ + if (RtsFlags.ParFlags.ParStats.Full) { + DumpRawGranEvent(CURRENT_PROC, CURRENT_PROC, + GR_RESUMEQ, ((StgTSO *)bqe), ((StgTSO *)bqe)->block_info.closure, + 0, spark_queue_len(ADVISORY_POOL)); + + switch (get_itbl(node)->type) { + case FETCH_ME_BQ: + ((StgTSO *)bqe)->par.fetchtime += CURRENT_TIME-((StgTSO *)bqe)->par.blockedat; + break; + case RBH: + case FETCH_ME: + case BLACKHOLE_BQ: + ((StgTSO *)bqe)->par.blocktime += CURRENT_TIME-((StgTSO *)bqe)->par.blockedat; + break; + default: + barf("{awaken_blocked_queue}Daq Qagh: unexpected closure %p (%s) with blocking queue", + node, info_type(node)); + } + } + /* reset block_info.closure field after dumping event */ + ((StgTSO *)bqe)->block_info.closure = NULL; + + /* rest of this branch is debugging only */ + IF_PAR_DEBUG(verbose, + fprintf(stderr," TSO %d (%p) [PE %lx] (block_info.closure=%p) (next=%p) ,", + ((StgTSO *)bqe)->id, (StgTSO *)bqe, + mytid, ((StgTSO *)bqe)->block_info.closure, ((StgTSO *)bqe)->link)); + + IF_DEBUG(scheduler, + if (!RtsFlags.ParFlags.Debug.verbose) + belch("-- Waking up thread %ld (%p)", + ((StgTSO *)bqe)->id, (StgTSO *)bqe)); + break; + + case BLOCKED_FETCH: + /* if it's a BLOCKED_FETCH put it on the PendingFetches list */ + next = bqe->link; + bqe->link = PendingFetches; + PendingFetches = bqe; + // bqe.tso->block_info.closure = NULL; + + /* rest of this branch is debugging only */ + IF_PAR_DEBUG(verbose, + fprintf(stderr," BLOCKED_FETCH (%p) on node %p [PE %lx] (next=%p) ,", + ((StgBlockedFetch *)bqe), + ((StgBlockedFetch *)bqe)->node, + mytid, ((StgBlockedFetch *)bqe)->link)); + break; + +# if defined(DEBUG) + /* can ignore this case in a non-debugging setup; + see comments on RBHSave closures above */ + case CONSTR: + /* check that the closure is an RBHSave closure */ + ASSERT(get_itbl((StgClosure *)bqe) == &RBH_Save_0_info || + get_itbl((StgClosure *)bqe) == &RBH_Save_1_info || + get_itbl((StgClosure *)bqe) == &RBH_Save_2_info); + break; + + default: + barf("{awaken_blocked_queue}Daq Qagh: Unexpected IP (%#lx; %s) in blocking queue at %#lx\n", + get_itbl((StgClosure *)bqe), info_type((StgClosure *)bqe), + (StgClosure *)bqe); +# endif + } + } +} + +#else /* !GRAN && !PAR */ + +void +awaken_blocked_queue(StgTSO *q) { awakenBlockedQueue(q); } + +/* +{ + StgTSO *tso; + + while (q != END_TSO_QUEUE) { + ASSERT(get_itbl(q)->type == TSO); + tso = q; + q = tso->link; + push_on_run_queue(tso); // HWL: was: PUSH_ON_RUN_QUEUE(tso); + //tso->block_info.closure = NULL; + IF_DEBUG(scheduler, belch("-- Waking up thread %ld (%p)", tso->id, tso)); + } +} +*/ +#endif /* GRAN */ +#endif /* 0 */ + +//@node Exception Handling Routines, Debugging Routines, Blocking Queue Routines, Main scheduling code +//@subsection Exception 
Handling Routines + +/* --------------------------------------------------------------------------- Interrupt execution - usually called inside a signal handler so it mustn't do anything fancy. - -------------------------------------------------------------------------- */ + ------------------------------------------------------------------------ */ void interruptStgRts(void) @@ -1260,6 +2153,7 @@ interruptStgRts(void) This is for use when we raise an exception in another thread, which may be blocked. + This has nothing to do with the UnblockThread event in GranSim. -- HWL -------------------------------------------------------------------------- */ static void @@ -1593,11 +2487,202 @@ raiseAsync(StgTSO *tso, StgClosure *exception) barf("raiseAsync"); } +//@node Debugging Routines, Index, Exception Handling Routines, Main scheduling code +//@subsection Debugging Routines + /* ----------------------------------------------------------------------------- - Debuggery... + Debugging: why is a thread blocked -------------------------------------------------------------------------- */ #ifdef DEBUG + +void printThreadBlockage(StgTSO *tso) +{ + switch (tso->why_blocked) { + case BlockedOnRead: + fprintf(stderr,"blocked on read from fd %d", tso->block_info.fd); + break; + case BlockedOnWrite: + fprintf(stderr,"blocked on write to fd %d", tso->block_info.fd); + break; + case BlockedOnDelay: + fprintf(stderr,"blocked on delay of %d ms", tso->block_info.delay); + break; + case BlockedOnMVar: + fprintf(stderr,"blocked on an MVar"); + break; + case BlockedOnException: + fprintf(stderr,"blocked on delivering an exception to thread %d", + tso->block_info.tso->id); + break; + case BlockedOnBlackHole: + fprintf(stderr,"blocked on a black hole"); + break; + case NotBlocked: + fprintf(stderr,"not blocked"); + break; +#if defined(PAR) + case BlockedOnGA: + fprintf(stderr,"blocked on global address"); + break; +#endif + } +} + +/* + Print a whole blocking queue attached to node (debugging only). +*/ +//@cindex print_bq +# if defined(PAR) +void +print_bq (StgClosure *node) +{ + StgBlockingQueueElement *bqe; + StgTSO *tso; + rtsBool end; + + fprintf(stderr,"## BQ of closure %p (%s): ", + node, info_type(node)); + + /* should cover all closures that may have a blocking queue */ + ASSERT(get_itbl(node)->type == BLACKHOLE_BQ || + get_itbl(node)->type == FETCH_ME_BQ || + get_itbl(node)->type == RBH); + + ASSERT(node!=(StgClosure*)NULL); // sanity check + /* + NB: In a parallel setup a BQ of an RBH must end with an RBH_Save closure; + */ + for (bqe = ((StgBlockingQueue*)node)->blocking_queue, end = (bqe==END_BQ_QUEUE); + !end; // iterate until bqe points to a CONSTR + end = (get_itbl(bqe)->type == CONSTR) || (bqe->link==END_BQ_QUEUE), bqe = end ? 
END_BQ_QUEUE : bqe->link) { + ASSERT(bqe != END_BQ_QUEUE); // sanity check + ASSERT(bqe != (StgTSO*)NULL); // sanity check + /* types of closures that may appear in a blocking queue */ + ASSERT(get_itbl(bqe)->type == TSO || + get_itbl(bqe)->type == BLOCKED_FETCH || + get_itbl(bqe)->type == CONSTR); + /* only BQs of an RBH end with an RBH_Save closure */ + ASSERT(get_itbl(bqe)->type != CONSTR || get_itbl(node)->type == RBH); + + switch (get_itbl(bqe)->type) { + case TSO: + fprintf(stderr," TSO %d (%x),", + ((StgTSO *)bqe)->id, ((StgTSO *)bqe)); + break; + case BLOCKED_FETCH: + fprintf(stderr," BF (node=%p, ga=((%x, %d, %x)),", + ((StgBlockedFetch *)bqe)->node, + ((StgBlockedFetch *)bqe)->ga.payload.gc.gtid, + ((StgBlockedFetch *)bqe)->ga.payload.gc.slot, + ((StgBlockedFetch *)bqe)->ga.weight); + break; + case CONSTR: + fprintf(stderr," %s (IP %p),", + (get_itbl(bqe) == &RBH_Save_0_info ? "RBH_Save_0" : + get_itbl(bqe) == &RBH_Save_1_info ? "RBH_Save_1" : + get_itbl(bqe) == &RBH_Save_2_info ? "RBH_Save_2" : + "RBH_Save_?"), get_itbl(bqe)); + break; + default: + barf("Unexpected closure type %s in blocking queue of %p (%s)", + info_type(bqe), node, info_type(node)); + break; + } + } /* for */ + fputc('\n', stderr); +} +# elif defined(GRAN) +void +print_bq (StgClosure *node) +{ + StgBlockingQueueElement *bqe; + StgTSO *tso; + PEs node_loc, tso_loc; + rtsBool end; + + /* should cover all closures that may have a blocking queue */ + ASSERT(get_itbl(node)->type == BLACKHOLE_BQ || + get_itbl(node)->type == FETCH_ME_BQ || + get_itbl(node)->type == RBH); + + ASSERT(node!=(StgClosure*)NULL); // sanity check + node_loc = where_is(node); + + fprintf(stderr,"## BQ of closure %p (%s) on [PE %d]: ", + node, info_type(node), node_loc); + + /* + NB: In a parallel setup a BQ of an RBH must end with an RBH_Save closure; + */ + for (bqe = ((StgBlockingQueue*)node)->blocking_queue, end = (bqe==END_BQ_QUEUE); + !end; // iterate until bqe points to a CONSTR + end = (get_itbl(bqe)->type == CONSTR) || (bqe->link==END_BQ_QUEUE), bqe = end ? END_BQ_QUEUE : bqe->link) { + ASSERT(bqe != END_BQ_QUEUE); // sanity check + ASSERT(bqe != (StgTSO*)NULL); // sanity check + /* types of closures that may appear in a blocking queue */ + ASSERT(get_itbl(bqe)->type == TSO || + get_itbl(bqe)->type == CONSTR); + /* only BQs of an RBH end with an RBH_Save closure */ + ASSERT(get_itbl(bqe)->type != CONSTR || get_itbl(node)->type == RBH); + + tso_loc = where_is((StgClosure *)bqe); + switch (get_itbl(bqe)->type) { + case TSO: + fprintf(stderr," TSO %d (%x) on [PE %d],", + ((StgTSO *)bqe)->id, ((StgTSO *)bqe), tso_loc); + break; + case CONSTR: + fprintf(stderr," %s (IP %p),", + (get_itbl(bqe) == &RBH_Save_0_info ? "RBH_Save_0" : + get_itbl(bqe) == &RBH_Save_1_info ? "RBH_Save_1" : + get_itbl(bqe) == &RBH_Save_2_info ? 
"RBH_Save_2" : + "RBH_Save_?"), get_itbl(bqe)); + break; + default: + barf("Unexpected closure type %s in blocking queue of %p (%s)", + info_type(bqe), node, info_type(node)); + break; + } + } /* for */ + fputc('\n', stderr); +} +#else +/* + Nice and easy: only TSOs on the blocking queue +*/ +void +print_bq (StgClosure *node) +{ + StgTSO *tso; + + ASSERT(node!=(StgClosure*)NULL); // sanity check + for (tso = ((StgBlockingQueue*)node)->blocking_queue; + tso != END_TSO_QUEUE; + tso=tso->link) { + ASSERT(tso!=(StgTSO*)NULL && tso!=END_TSO_QUEUE); // sanity check + ASSERT(get_itbl(tso)->type == TSO); // guess what, sanity check + fprintf(stderr," TSO %d (%x),", tso->id, tso); + } + fputc('\n', stderr); +} +# endif + +/* A debugging function used all over the place in GranSim and GUM. + Dummy function in other setups. +*/ +# if !defined(GRAN) && !defined(PAR) +char * +info_type(StgClosure *closure){ + return "petaQ"; +} + +char * +info_type_by_ip(StgInfoTable *ip){ + return "petaQ"; +} +#endif + static void sched_belch(char *s, ...) { @@ -1611,4 +2696,33 @@ sched_belch(char *s, ...) vfprintf(stderr, s, ap); fprintf(stderr, "\n"); } -#endif + +#endif /* DEBUG */ + +//@node Index, , Debugging Routines, Main scheduling code +//@subsection Index + +//@index +//* MainRegTable:: @cindex\s-+MainRegTable +//* StgMainThread:: @cindex\s-+StgMainThread +//* awaken_blocked_queue:: @cindex\s-+awaken_blocked_queue +//* blocked_queue_hd:: @cindex\s-+blocked_queue_hd +//* blocked_queue_tl:: @cindex\s-+blocked_queue_tl +//* context_switch:: @cindex\s-+context_switch +//* createThread:: @cindex\s-+createThread +//* free_capabilities:: @cindex\s-+free_capabilities +//* gc_pending_cond:: @cindex\s-+gc_pending_cond +//* initScheduler:: @cindex\s-+initScheduler +//* interrupted:: @cindex\s-+interrupted +//* n_free_capabilities:: @cindex\s-+n_free_capabilities +//* next_thread_id:: @cindex\s-+next_thread_id +//* print_bq:: @cindex\s-+print_bq +//* run_queue_hd:: @cindex\s-+run_queue_hd +//* run_queue_tl:: @cindex\s-+run_queue_tl +//* sched_mutex:: @cindex\s-+sched_mutex +//* schedule:: @cindex\s-+schedule +//* take_off_run_queue:: @cindex\s-+take_off_run_queue +//* task_ids:: @cindex\s-+task_ids +//* term_mutex:: @cindex\s-+term_mutex +//* thread_ready_cond:: @cindex\s-+thread_ready_cond +//@end index diff --git a/ghc/rts/Schedule.h b/ghc/rts/Schedule.h index f559efc..1c93099 100644 --- a/ghc/rts/Schedule.h +++ b/ghc/rts/Schedule.h @@ -1,13 +1,26 @@ /* ----------------------------------------------------------------------------- - * $Id: Schedule.h,v 1.12 2000/01/12 15:15:18 simonmar Exp $ + * $Id: Schedule.h,v 1.13 2000/01/13 14:34:05 hwloidl Exp $ * * (c) The GHC Team 1998-1999 * * Prototypes for functions in Schedule.c * (RTS internal scheduler interface) * - * ---------------------------------------------------------------------------*/ + * -------------------------------------------------------------------------*/ +//@menu +//* Scheduler Functions:: +//* Scheduler Vars and Data Types:: +//* Some convenient macros:: +//* Index:: +//@end menu + +//@node Scheduler Functions, Scheduler Vars and Data Types +//@subsection Scheduler Functions + +//@cindex initScheduler +//@cindex exitScheduler +//@cindex startTasks /* initScheduler(), exitScheduler(), startTasks() * * Called from STG : no @@ -19,6 +32,7 @@ void exitScheduler( void ); void startTasks( void ); #endif +//@cindex awakenBlockedQueue /* awakenBlockedQueue() * * Takes a pointer to the beginning of a blocked TSO queue, and @@ -27,8 +41,15 @@ void startTasks( 
void ); * Called from STG : yes * Locks assumed : none */ +#if defined(GRAN) +# error FixME +#elif defined(PAR) +void awakenBlockedQueue(StgBlockingQueueElement *q, StgClosure *node); +#else void awakenBlockedQueue(StgTSO *tso); +#endif +//@cindex unblockOne /* unblockOne() * * Takes a pointer to the beginning of a blocked TSO queue, and @@ -37,8 +58,15 @@ void awakenBlockedQueue(StgTSO *tso); * Called from STG : yes * Locks assumed : none */ +#if defined(GRAN) +# error FixME +#elif defined(PAR) +StgTSO *unblockOne(StgTSO *tso, StgClosure *node); +#else StgTSO *unblockOne(StgTSO *tso); +#endif +//@cindex raiseAsync /* raiseAsync() * * Raises an exception asynchronously in the specified thread. @@ -48,6 +76,7 @@ StgTSO *unblockOne(StgTSO *tso); */ void raiseAsync(StgTSO *tso, StgClosure *exception); +//@cindex awaitEvent /* awaitEvent() * * Raises an exception asynchronously in the specified thread. @@ -57,6 +86,33 @@ void raiseAsync(StgTSO *tso, StgClosure *exception); */ void awaitEvent(rtsBool wait); /* In Select.c */ +// ToDo: check whether all fcts below are used in the SMP version, too +//@cindex awaken_blocked_queue +#if defined(GRAN) +void awaken_blocked_queue(StgBlockingQueueElement *q, StgClosure *node); +void unlink_from_bq(StgTSO* tso, StgClosure* node); +void initThread(StgTSO *tso, nat stack_size, StgInt pri); +#elif defined(PAR) +nat run_queue_len(void); +void awaken_blocked_queue(StgBlockingQueueElement *q, StgClosure *node); +void initThread(StgTSO *tso, nat stack_size); +#else +char *info_type(StgClosure *closure); // dummy +char *info_type_by_ip(StgInfoTable *ip); // dummy +void awaken_blocked_queue(StgTSO *q); +void initThread(StgTSO *tso, nat stack_size); +#endif + +// debugging only +#ifdef DEBUG +extern void printThreadBlockage(StgTSO *tso); +#endif +void print_bq (StgClosure *node); + +//@node Scheduler Vars and Data Types, Some convenient macros, Scheduler Functions +//@subsection Scheduler Vars and Data Types + +//@cindex context_switch /* Context switch flag. * Locks required : sched_mutex */ @@ -65,6 +121,7 @@ extern rtsBool interrupted; extern nat ticks_since_select; +//@cindex Capability /* Capability type */ typedef StgRegTable Capability; @@ -85,16 +142,16 @@ extern Capability MainRegTable; extern StgTSO *run_queue_hd, *run_queue_tl; extern StgTSO *blocked_queue_hd, *blocked_queue_tl; -#ifdef DEBUG -extern void printThreadBlockage(StgTSO *tso); -#endif - #ifdef SMP +//@cindex sched_mutex +//@cindex thread_ready_cond +//@cindex gc_pending_cond extern pthread_mutex_t sched_mutex; extern pthread_cond_t thread_ready_cond; extern pthread_cond_t gc_pending_cond; #endif +//@cindex task_info #ifdef SMP typedef struct { pthread_t id; @@ -108,9 +165,19 @@ typedef struct { extern task_info *task_ids; #endif +#if !defined(GRAN) +extern StgTSO *run_queue_hd, *run_queue_tl; +extern StgTSO *blocked_queue_hd, *blocked_queue_tl; +#endif + /* Needed by Hugs. */ void interruptStgRts ( void ); +// ?? needed -- HWL +void raiseAsync(StgTSO *tso, StgClosure *exception); + +//@node Some convenient macros, Index, Scheduler Vars and Data Types +//@subsection Some convenient macros /* ----------------------------------------------------------------------------- * Some convenient macros... @@ -119,6 +186,7 @@ void interruptStgRts ( void ); #define END_TSO_QUEUE ((StgTSO *)(void*)&END_TSO_QUEUE_closure) #define END_CAF_LIST ((StgCAF *)(void*)&END_TSO_QUEUE_closure) +//@cindex APPEND_TO_RUN_QUEUE /* Add a thread to the end of the run queue. 
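+ *
+ * A minimal usage sketch (assuming tso is a runnable thread; see the
+ * NOTE below for the link invariant):
+ *
+ *   tso->link = END_TSO_QUEUE;
+ *   APPEND_TO_RUN_QUEUE(tso);
+ *   THREAD_RUNNABLE();   // wake a waiting task in the SMP build
+ *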
* NOTE: tso->link should be END_TSO_QUEUE before calling this macro. */ @@ -131,6 +199,7 @@ void interruptStgRts ( void ); } \ run_queue_tl = tso; +//@cindex PUSH_ON_RUN_QUEUE /* Push a thread on the beginning of the run queue. Used for * newly awakened threads, so they get run as soon as possible. */ @@ -140,7 +209,8 @@ void interruptStgRts ( void ); if (run_queue_tl == END_TSO_QUEUE) { \ run_queue_tl = tso; \ } - + +//@cindex POP_RUN_QUEUE /* Pop the first thread off the runnable queue. */ #define POP_RUN_QUEUE() \ @@ -155,6 +225,7 @@ void interruptStgRts ( void ); t; \ }) +//@cindex APPEND_TO_BLOCKED_QUEUE /* Add a thread to the end of the blocked queue. */ #define APPEND_TO_BLOCKED_QUEUE(tso) \ @@ -166,6 +237,7 @@ void interruptStgRts ( void ); } \ blocked_queue_tl = tso; +//@cindex THREAD_RUNNABLE /* Signal that a runnable thread has become available, in * case there are any waiting tasks to execute it. */ @@ -179,3 +251,27 @@ void interruptStgRts ( void ); #define THREAD_RUNNABLE() /* nothing */ #endif +//@node Index, , Some convenient macros +//@subsection Index + +//@index +//* APPEND_TO_BLOCKED_QUEUE:: @cindex\s-+APPEND_TO_BLOCKED_QUEUE +//* APPEND_TO_RUN_QUEUE:: @cindex\s-+APPEND_TO_RUN_QUEUE +//* Capability:: @cindex\s-+Capability +//* POP_RUN_QUEUE :: @cindex\s-+POP_RUN_QUEUE +//* PUSH_ON_RUN_QUEUE:: @cindex\s-+PUSH_ON_RUN_QUEUE +//* THREAD_RUNNABLE:: @cindex\s-+THREAD_RUNNABLE +//* awaitEvent:: @cindex\s-+awaitEvent +//* awakenBlockedQueue:: @cindex\s-+awakenBlockedQueue +//* awaken_blocked_queue:: @cindex\s-+awaken_blocked_queue +//* context_switch:: @cindex\s-+context_switch +//* exitScheduler:: @cindex\s-+exitScheduler +//* gc_pending_cond:: @cindex\s-+gc_pending_cond +//* initScheduler:: @cindex\s-+initScheduler +//* raiseAsync:: @cindex\s-+raiseAsync +//* sched_mutex:: @cindex\s-+sched_mutex +//* startTasks:: @cindex\s-+startTasks +//* task_info:: @cindex\s-+task_info +//* thread_ready_cond:: @cindex\s-+thread_ready_cond +//* unblockOne:: @cindex\s-+unblockOne +//@end index diff --git a/ghc/rts/StgMiscClosures.hc b/ghc/rts/StgMiscClosures.hc index 8c436d8..4809be7 100644 --- a/ghc/rts/StgMiscClosures.hc +++ b/ghc/rts/StgMiscClosures.hc @@ -1,5 +1,5 @@ /* ----------------------------------------------------------------------------- - * $Id: StgMiscClosures.hc,v 1.30 1999/11/30 11:43:26 simonmar Exp $ + * $Id: StgMiscClosures.hc,v 1.31 2000/01/13 14:34:05 hwloidl Exp $ * * (c) The GHC Team, 1998-1999 * @@ -9,12 +9,17 @@ #include "Rts.h" #include "RtsUtils.h" +#include "RtsFlags.h" #include "StgMiscClosures.h" #include "HeapStackCheck.h" /* for stg_gen_yield */ #include "Storage.h" #include "StoragePriv.h" #include "ProfRts.h" #include "SMP.h" +#if defined(GRAN) || defined(PAR) +# include "GranSimRts.h" /* for DumpRawGranEvent */ +# include "StgRun.h" /* for StgReturn and register saving */ +#endif #ifdef HAVE_STDIO_H #include @@ -25,6 +30,20 @@ */ #define DUMP_ERRMSG(msg) STGCALL2(fprintf,stderr,msg) +/* + Template for the entry code of non-enterable closures. +*/ + +#define NON_ENTERABLE_ENTRY_CODE(type) \ +STGFUN(type##_entry) \ +{ \ + FB_ \ + DUMP_ERRMSG(#type " object entered!\n"); \ + STGCALL1(raiseError, errorHandler); \ + stg_exit(EXIT_FAILURE); /* not executed */ \ + FE_ \ +} + /* ----------------------------------------------------------------------------- Entry code for an indirection. 
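+
+/* For illustration: NON_ENTERABLE_ENTRY_CODE (defined at the top of this
+   file) expands mechanically, so NON_ENTERABLE_ENTRY_CODE(TSO), used
+   further down, generates the following entry code:
+
+     STGFUN(TSO_entry)
+     {
+       FB_
+       DUMP_ERRMSG("TSO object entered!\n");
+       STGCALL1(raiseError, errorHandler);
+       stg_exit(EXIT_FAILURE);  // not executed
+       FE_
+     }
+*/
+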
@@ -185,6 +204,11 @@ INFO_TABLE(BLACKHOLE_info, BLACKHOLE_entry,0,2,BLACKHOLE,,EF_,0,0); STGFUN(BLACKHOLE_entry) { FB_ +#if defined(GRAN) + /* Before overwriting TSO_LINK */ + STGCALL3(GranSimBlock,CurrentTSO,CurrentProc,(StgClosure *)R1.p /*Node*/); +#endif + #ifdef SMP CMPXCHG(R1.cl->header.info, &BLACKHOLE_info, &WHITEHOLE_info); #endif @@ -192,15 +216,43 @@ STGFUN(BLACKHOLE_entry) TICK_ENT_BH(); /* Put ourselves on the blocking queue for this black hole */ - CurrentTSO->link = (StgTSO *)&END_TSO_QUEUE_closure; +#if defined(GRAN) || defined(PAR) + /* in fact, only difference is the type of the end-of-queue marker! */ + CurrentTSO->link = END_BQ_QUEUE; + ((StgBlockingQueue *)R1.p)->blocking_queue = (StgBlockingQueueElement *)CurrentTSO; +#else + CurrentTSO->link = END_TSO_QUEUE; ((StgBlockingQueue *)R1.p)->blocking_queue = CurrentTSO; +#endif + /* jot down why and on what closure we are blocked */ CurrentTSO->why_blocked = BlockedOnBlackHole; CurrentTSO->block_info.closure = R1.cl; + /* closure is mutable since something has just been added to its BQ */ recordMutable((StgMutClosure *)R1.cl); /* Change the BLACKHOLE into a BLACKHOLE_BQ */ ((StgBlockingQueue *)R1.p)->header.info = &BLACKHOLE_BQ_info; + +#if defined(PAR) + /* Save the Thread State here, before calling RTS routines below! */ + SAVE_THREAD_STATE(1); + + /* if collecting stats update the execution time etc */ + if (RtsFlags.ParFlags.ParStats.Full) { + /* Note that CURRENT_TIME may perform an unsafe call */ + //rtsTime now = CURRENT_TIME; /* Now */ + CurrentTSO->par.exectime += CURRENT_TIME - CurrentTSO->par.blockedat; + CurrentTSO->par.blockcount++; + CurrentTSO->par.blockedat = CURRENT_TIME; + DumpRawGranEvent(CURRENT_PROC, thisPE, + GR_BLOCK, CurrentTSO, (StgClosure *)R1.p, 0); + } + + THREAD_RETURN(1); /* back to the scheduler */ +#else /* stg_gen_block is too heavyweight, use a specialised one */ BLOCK_NP(1); +#endif + FE_ } @@ -208,6 +260,11 @@ INFO_TABLE(BLACKHOLE_BQ_info, BLACKHOLE_BQ_entry,1,1,BLACKHOLE_BQ,,EF_,0,0); STGFUN(BLACKHOLE_BQ_entry) { FB_ +#if defined(GRAN) + /* Before overwriting TSO_LINK */ + STGCALL3(GranSimBlock,CurrentTSO,CurrentProc,(StgClosure *)R1.p /*Node*/); +#endif + #ifdef SMP CMPXCHG(R1.cl->header.info, &BLACKHOLE_BQ_info, &WHITEHOLE_info); #endif @@ -215,42 +272,156 @@ STGFUN(BLACKHOLE_BQ_entry) TICK_ENT_BH(); /* Put ourselves on the blocking queue for this black hole */ - CurrentTSO->why_blocked = BlockedOnBlackHole; - CurrentTSO->block_info.closure = R1.cl; CurrentTSO->link = ((StgBlockingQueue *)R1.p)->blocking_queue; ((StgBlockingQueue *)R1.p)->blocking_queue = CurrentTSO; + /* jot down why and on what closure we are blocked */ + CurrentTSO->why_blocked = BlockedOnBlackHole; + CurrentTSO->block_info.closure = R1.cl; #ifdef SMP ((StgBlockingQueue *)R1.p)->header.info = &BLACKHOLE_BQ_info; #endif +#if defined(PAR) + /* Save the Thread State here, before calling RTS routines below! 
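+     (Presumably because the RTS calls below, e.g. DumpRawGranEvent, are
+     ordinary C calls that must see a consistent TSO, and because
+     THREAD_RETURN(1) hands control back to the scheduler rather than
+     falling through to the FE_.)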
*/ + SAVE_THREAD_STATE(1); + + /* if collecting stats update the execution time etc */ + if (RtsFlags.ParFlags.ParStats.Full) { + /* Note that CURRENT_TIME may perform an unsafe call */ + //rtsTime now = CURRENT_TIME; /* Now */ + CurrentTSO->par.exectime += CURRENT_TIME - CurrentTSO->par.blockedat; + CurrentTSO->par.blockcount++; + CurrentTSO->par.blockedat = CURRENT_TIME; + DumpRawGranEvent(CURRENT_PROC, thisPE, + GR_BLOCK, CurrentTSO, (StgClosure *)R1.p, 0); + } + + THREAD_RETURN(1); /* back to the scheduler */ +#else /* stg_gen_block is too heavyweight, use a specialised one */ BLOCK_NP(1); +#endif + FE_ +} + +/* + Revertible black holes are needed in the parallel world, to handle + negative acknowledgements of messages containing updatable closures. + The idea is that when the original message is transmitted, the closure + is turned into a revertible black hole...an object which acts like a + black hole when local threads try to enter it, but which can be reverted + back to the original closure if necessary. + + It's actually a lot like a blocking queue (BQ) entry, because revertible + black holes are initially set up with an empty blocking queue. +*/ + +#if defined(PAR) || defined(GRAN) + +INFO_TABLE(RBH_info, RBH_entry,1,1,RBH,,EF_,0,0); +STGFUN(RBH_entry) +{ + FB_ +# if defined(GRAN) + /* mainly statistics gathering for GranSim simulation */ + STGCALL3(GranSimBlock,CurrentTSO,CurrentProc,(StgClosure *)R1.p /*Node*/); +# endif + + /* exactly the same as a BLACKHOLE_BQ_entry -- HWL */ + /* Put ourselves on the blocking queue for this black hole */ + CurrentTSO->link = ((StgBlockingQueue *)R1.p)->blocking_queue; + ((StgBlockingQueue *)R1.p)->blocking_queue = CurrentTSO; + /* jot down why and on what closure we are blocked */ + CurrentTSO->why_blocked = BlockedOnBlackHole; + CurrentTSO->block_info.closure = R1.cl; + +#if defined(PAR) + /* Save the Thread State here, before calling RTS routines below! 
*/ + SAVE_THREAD_STATE(1); + + /* if collecting stats update the execution time etc */ + if (RtsFlags.ParFlags.ParStats.Full) { + /* Note that CURRENT_TIME may perform an unsafe call */ + //rtsTime now = CURRENT_TIME; /* Now */ + CurrentTSO->par.exectime += CURRENT_TIME - CurrentTSO->par.blockedat; + CurrentTSO->par.blockcount++; + CurrentTSO->par.blockedat = CURRENT_TIME; + DumpRawGranEvent(CURRENT_PROC, thisPE, + GR_BLOCK, CurrentTSO, (StgClosure *)R1.p, 0); + } + + THREAD_RETURN(1); /* back to the scheduler */ +#else + /* saves thread state and leaves thread in ThreadEnterGHC state; */ + /* stg_gen_block is too heavyweight, use a specialised one */ + BLOCK_NP(1); +#endif + FE_ } +INFO_TABLE(RBH_Save_0_info, RBH_Save_0_entry,0,2,CONSTR,,EF_,0,0); +NON_ENTERABLE_ENTRY_CODE(RBH_Save_0); + +INFO_TABLE(RBH_Save_1_info, RBH_Save_1_entry,1,1,CONSTR,,EF_,0,0); +NON_ENTERABLE_ENTRY_CODE(RBH_Save_1); + +INFO_TABLE(RBH_Save_2_info, RBH_Save_2_entry,2,0,CONSTR,,EF_,0,0); +NON_ENTERABLE_ENTRY_CODE(RBH_Save_2); +#endif /* defined(PAR) || defined(GRAN) */ + /* identical to BLACKHOLEs except for the infotag */ INFO_TABLE(CAF_BLACKHOLE_info, CAF_BLACKHOLE_entry,0,2,CAF_BLACKHOLE,,EF_,0,0); STGFUN(CAF_BLACKHOLE_entry) { FB_ +#if defined(GRAN) + /* mainly statistics gathering for GranSim simulation */ + STGCALL3(GranSimBlock,CurrentTSO,CurrentProc,(StgClosure *)R1.p /*Node*/); +#endif + #ifdef SMP CMPXCHG(R1.cl->header.info, &CAF_BLACKHOLE_info, &WHITEHOLE_info); +#endif TICK_ENT_BH(); /* Put ourselves on the blocking queue for this black hole */ - CurrentTSO->link = (StgTSO *)&END_TSO_QUEUE_closure; +#if defined(GRAN) || defined(PAR) + /* in fact, only difference is the type of the end-of-queue marker! */ + CurrentTSO->link = END_BQ_QUEUE; + ((StgBlockingQueue *)R1.p)->blocking_queue = (StgBlockingQueueElement *)CurrentTSO; +#else + CurrentTSO->link = END_TSO_QUEUE; ((StgBlockingQueue *)R1.p)->blocking_queue = CurrentTSO; +#endif + /* jot down why and on what closure we are blocked */ CurrentTSO->why_blocked = BlockedOnBlackHole; CurrentTSO->block_info.closure = R1.cl; + /* closure is mutable since something has just been added to its BQ */ recordMutable((StgMutClosure *)R1.cl); /* Change the CAF_BLACKHOLE into a BLACKHOLE_BQ */ ((StgBlockingQueue *)R1.p)->header.info = &BLACKHOLE_BQ_info; - /* stg_gen_block is too heavyweight, use a specialised one */ - BLOCK_NP(1); +#if defined(PAR) + /* Save the Thread State here, before calling RTS routines below! */ + SAVE_THREAD_STATE(1); + + /* if collecting stats update the execution time etc */ + if (RtsFlags.ParFlags.ParStats.Full) { + /* Note that CURRENT_TIME may perform an unsafe call */ + //rtsTime now = CURRENT_TIME; /* Now */ + CurrentTSO->par.exectime += CURRENT_TIME - CurrentTSO->par.blockedat; + CurrentTSO->par.blockcount++; + CurrentTSO->par.blockedat = CURRENT_TIME; + DumpRawGranEvent(CURRENT_PROC, thisPE, + GR_BLOCK, CurrentTSO, (StgClosure *)R1.p, 0); + } + + THREAD_RETURN(1); /* back to the scheduler */ #else - JMP_(BLACKHOLE_entry); + /* stg_gen_block is too heavyweight, use a specialised one */ + BLOCK_NP(1); #endif FE_ @@ -301,17 +472,9 @@ EF_(BCO_entry) { /* ----------------------------------------------------------------------------- Some static info tables for things that don't get entered, and therefore don't need entry code (i.e. 
boxed but unpointed objects) + NON_ENTERABLE_ENTRY_CODE now defined at the beginning of the file -------------------------------------------------------------------------- */ -#define NON_ENTERABLE_ENTRY_CODE(type) \ -STGFUN(type##_entry) \ -{ \ - FB_ \ - DUMP_ERRMSG(#type " object entered!\n"); \ - STGCALL1(shutdownHaskellAndExit, EXIT_FAILURE); \ - FE_ \ -} - INFO_TABLE(TSO_info, TSO_entry, 0,0,TSO,,EF_,0,0); NON_ENTERABLE_ENTRY_CODE(TSO); diff --git a/ghc/rts/Storage.h b/ghc/rts/Storage.h index c996edf..17076bf 100644 --- a/ghc/rts/Storage.h +++ b/ghc/rts/Storage.h @@ -1,5 +1,5 @@ /* ----------------------------------------------------------------------------- - * $Id: Storage.h,v 1.13 1999/11/11 11:49:26 simonmar Exp $ + * $Id: Storage.h,v 1.14 2000/01/13 14:34:05 hwloidl Exp $ * * (c) The GHC Team, 1998-1999 * @@ -178,5 +178,10 @@ updateWithPermIndirection(const StgInfoTable *info, StgClosure *p1, StgClosure * extern StgCAF* enteredCAFs; +#if defined(DEBUG) +void printMutOnceList(generation *gen); +void printMutableList(generation *gen); +#endif DEBUG + #endif STORAGE_H diff --git a/ghc/rts/Updates.hc b/ghc/rts/Updates.hc index 53f2476..38e69e8 100644 --- a/ghc/rts/Updates.hc +++ b/ghc/rts/Updates.hc @@ -1,5 +1,5 @@ /* ----------------------------------------------------------------------------- - * $Id: Updates.hc,v 1.24 1999/12/01 14:34:39 simonmar Exp $ + * $Id: Updates.hc,v 1.25 2000/01/13 14:34:05 hwloidl Exp $ * * (c) The GHC Team, 1998-1999 * @@ -13,6 +13,9 @@ #include "HeapStackCheck.h" #include "Storage.h" #include "ProfRts.h" +#if defined(GRAN) || defined(PAR) +# include "FetchMe.h" +#endif /* The update frame return address must be *polymorphic*, that means @@ -245,11 +248,6 @@ EXTFUN(stg_update_PAP) */ Fun = R1.cl; -#if defined(GRAN_COUNT) -#error Fixme. - ++nPAPs; -#endif - /* Just copy the whole block of stack between the stack pointer * and the update frame pointer. */ diff --git a/ghc/rts/parallel/0Hash.c b/ghc/rts/parallel/0Hash.c new file mode 100644 index 0000000..56e6646 --- /dev/null +++ b/ghc/rts/parallel/0Hash.c @@ -0,0 +1,321 @@ +/*----------------------------------------------------------------------------- + * $Id: 0Hash.c,v 1.2 2000/01/13 14:34:06 hwloidl Exp $ + * + * (c) The AQUA Project, Glasgow University, 1995-1998 + * (c) The GHC Team, 1999 + * + * Dynamically expanding linear hash tables, as described in + * Per-\AAke Larson, ``Dynamic Hash Tables,'' CACM 31(4), April 1988, + * pp. 446 -- 457. 
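+ *
+ * A small worked example of the scheme (with toy sizes; the real minimum
+ * table size is HSEGSIZE = 1024): suppose max = 4 buckets, so mask1 = 3
+ * and mask2 = 7.  A key k is first hashed to k & 3; if that bucket lies
+ * below the split pointer it has already been split, and the key is
+ * re-hashed to k & 7, landing either in the old bucket or in its image
+ * in the upper half of the table.  Each expansion step splits exactly
+ * one bucket and advances the split pointer; once split reaches max,
+ * the table size doubles, mask1 takes over the old mask2, and mask2
+ * gains another bit.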
+ * -------------------------------------------------------------------------- */ + +/* + Replaced with ghc/rts/Hash.c in the new RTS +*/ + +#if 0 + +#include "Rts.h" +#include "Hash.h" +#include "RtsUtils.h" + +#define HSEGSIZE 1024 /* Size of a single hash table segment */ + /* Also the minimum size of a hash table */ +#define HDIRSIZE 1024 /* Size of the segment directory */ + /* Maximum hash table size is HSEGSIZE * HDIRSIZE */ +#define HLOAD 5 /* Maximum average load of a single hash bucket */ + +#define HCHUNK (1024 * sizeof(W_) / sizeof(HashList)) + /* Number of HashList cells to allocate in one go */ + + +/* Linked list of (key, data) pairs for separate chaining */ +struct hashlist { + StgWord key; + void *data; + struct hashlist *next; /* Next cell in bucket chain (same hash value) */ +}; + +typedef struct hashlist HashList; + +struct hashtable { + int split; /* Next bucket to split when expanding */ + int max; /* Max bucket of smaller table */ + int mask1; /* Mask for doing the mod of h_1 (smaller table) */ + int mask2; /* Mask for doing the mod of h_2 (larger table) */ + int kcount; /* Number of keys */ + int bcount; /* Number of buckets */ + HashList **dir[HDIRSIZE]; /* Directory of segments */ +}; + +/* ----------------------------------------------------------------------------- + * Hash first using the smaller table. If the bucket is less than the + * next bucket to be split, re-hash using the larger table. + * -------------------------------------------------------------------------- */ + +static int +hash(HashTable *table, W_ key) +{ + int bucket; + + /* Strip the boring zero bits */ + key /= sizeof(StgWord); + + /* Mod the size of the hash table (a power of 2) */ + bucket = key & table->mask1; + + if (bucket < table->split) { + /* Mod the size of the expanded hash table (also a power of 2) */ + bucket = key & table->mask2; + } + return bucket; +} + +/* ----------------------------------------------------------------------------- + * Allocate a new segment of the dynamically growing hash table. + * -------------------------------------------------------------------------- */ + +static void +allocSegment(HashTable *table, int segment) +{ + table->dir[segment] = stgMallocBytes(HSEGSIZE * sizeof(HashList *), + "allocSegment"); +} + + +/* ----------------------------------------------------------------------------- + * Expand the larger hash table by one bucket, and split one bucket + * from the smaller table into two parts. Only the bucket referenced + * by @table->split@ is affected by the expansion. + * -------------------------------------------------------------------------- */ + +static void +expand(HashTable *table) +{ + int oldsegment; + int oldindex; + int newbucket; + int newsegment; + int newindex; + HashList *hl; + HashList *next; + HashList *old, *new; + + if (table->split + table->max >= HDIRSIZE * HSEGSIZE) + /* Wow! That's big. Too big, so don't expand. 
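+	 With HSEGSIZE = HDIRSIZE = 1024 this caps the table at 2^20
+	 buckets; beyond that point inserts still succeed, but the
+	 bucket chains simply grow past the HLOAD target of 5.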
*/ + return; + + /* Calculate indices of bucket to split */ + oldsegment = table->split / HSEGSIZE; + oldindex = table->split % HSEGSIZE; + + newbucket = table->max + table->split; + + /* And the indices of the new bucket */ + newsegment = newbucket / HSEGSIZE; + newindex = newbucket % HSEGSIZE; + + if (newindex == 0) + allocSegment(table, newsegment); + + if (++table->split == table->max) { + table->split = 0; + table->max *= 2; + table->mask1 = table->mask2; + table->mask2 = table->mask2 << 1 | 1; + } + table->bcount++; + + /* Split the bucket, paying no attention to the original order */ + + old = new = NULL; + for (hl = table->dir[oldsegment][oldindex]; hl != NULL; hl = next) { + next = hl->next; + if (hash(table, hl->key) == newbucket) { + hl->next = new; + new = hl; + } else { + hl->next = old; + old = hl; + } + } + table->dir[oldsegment][oldindex] = old; + table->dir[newsegment][newindex] = new; + + return; +} + +void * +lookupHashTable(HashTable *table, StgWord key) +{ + int bucket; + int segment; + int index; + HashList *hl; + + bucket = hash(table, key); + segment = bucket / HSEGSIZE; + index = bucket % HSEGSIZE; + + for (hl = table->dir[segment][index]; hl != NULL; hl = hl->next) + if (hl->key == key) + return hl->data; + + /* It's not there */ + return NULL; +} + +/* ----------------------------------------------------------------------------- + * We allocate the hashlist cells in large chunks to cut down on malloc + * overhead. Although we keep a free list of hashlist cells, we make + * no effort to actually return the space to the malloc arena. + * -------------------------------------------------------------------------- */ + +static HashList *freeList = NULL; + +static HashList * +allocHashList(void) +{ + HashList *hl, *p; + + if ((hl = freeList) != NULL) { + freeList = hl->next; + } else { + hl = stgMallocBytes(HCHUNK * sizeof(HashList), "allocHashList"); + + freeList = hl + 1; + for (p = freeList; p < hl + HCHUNK - 1; p++) + p->next = p + 1; + p->next = NULL; + } + return hl; +} + +static void +freeHashList(HashList *hl) +{ + hl->next = freeList; + freeList = hl; +} + +void +insertHashTable(HashTable *table, StgWord key, void *data) +{ + int bucket; + int segment; + int index; + HashList *hl; + + /* We want no duplicates */ + ASSERT(lookupHashTable(table, key) == NULL); + + /* When the average load gets too high, we expand the table */ + if (++table->kcount >= HLOAD * table->bcount) + expand(table); + + bucket = hash(table, key); + segment = bucket / HSEGSIZE; + index = bucket % HSEGSIZE; + + hl = allocHashList(); + + hl->key = key; + hl->data = data; + hl->next = table->dir[segment][index]; + table->dir[segment][index] = hl; + +} + +void * +removeHashTable(HashTable *table, StgWord key, void *data) +{ + int bucket; + int segment; + int index; + HashList *hl; + HashList *prev = NULL; + + bucket = hash(table, key); + segment = bucket / HSEGSIZE; + index = bucket % HSEGSIZE; + + for (hl = table->dir[segment][index]; hl != NULL; hl = hl->next) { + if (hl->key == key && (data == NULL || hl->data == data)) { + if (prev == NULL) + table->dir[segment][index] = hl->next; + else + prev->next = hl->next; + table->kcount--; + return hl->data; + } + prev = hl; + } + + /* It's not there */ + ASSERT(data == NULL); + return NULL; +} + +/* ----------------------------------------------------------------------------- + * When we free a hash table, we are also good enough to free the + * data part of each (key, data) pair, as long as our caller can tell + * us how to do it. 
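+ *
+ * For example, a table whose data entries were obtained from malloc can
+ * be torn down with
+ *
+ *   freeHashTable(table, free);
+ *
+ * while passing NULL as freeDataFun frees the table structure only.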
+ * -------------------------------------------------------------------------- */ + +void +freeHashTable(HashTable *table, void (*freeDataFun)(void *) ) +{ + long segment; + long index; + HashList *hl; + HashList *next; + + /* The last bucket with something in it is table->max + table->split - 1 */ + segment = (table->max + table->split - 1) / HSEGSIZE; + index = (table->max + table->split - 1) % HSEGSIZE; + + while (segment >= 0) { + while (index >= 0) { + for (hl = table->dir[segment][index]; hl != NULL; hl = next) { + next = hl->next; + if (freeDataFun != NULL) + (*freeDataFun)(hl->data); + freeHashList(hl); + } + index--; + } + free(table->dir[segment]); + segment--; + index = HSEGSIZE - 1; + } + free(table); +} + +/* ----------------------------------------------------------------------------- + * When we initialize a hash table, we set up the first segment as well, + * initializing all of the first segment's hash buckets to NULL. + * -------------------------------------------------------------------------- */ + +HashTable * +allocHashTable(void) +{ + HashTable *table; + HashList **hb; + + table = stgMallocBytes(sizeof(HashTable),"allocHashTable"); + + allocSegment(table, 0); + + for (hb = table->dir[0]; hb < table->dir[0] + HSEGSIZE; hb++) + *hb = NULL; + + table->split = 0; + table->max = HSEGSIZE; + table->mask1 = HSEGSIZE - 1; + table->mask2 = 2 * HSEGSIZE - 1; + table->kcount = 0; + table->bcount = HSEGSIZE; + + return table; +} +#endif diff --git a/ghc/rts/parallel/0Parallel.h b/ghc/rts/parallel/0Parallel.h new file mode 100644 index 0000000..d52bf00 --- /dev/null +++ b/ghc/rts/parallel/0Parallel.h @@ -0,0 +1,414 @@ +/* + Time-stamp: + + Definitions for parallel machines. + +This section contains definitions applicable only to programs compiled +to run on a parallel machine, i.e. on GUM. Some of these definitions +are also used when simulating parallel execution, i.e. on GranSim. + */ + +/* + ToDo: Check the PAR specfic part of this file + Move stuff into Closures.h and ClosureMacros.h + Clean-up GRAN specific code + -- HWL + */ + +#ifndef PARALLEL_H +#define PARALLEL_H + +#if defined(PAR) || defined(GRAN) /* whole file */ + +#include "Rts.h" +#include "GranSim.h" +//#include "ClosureTypes.h" + +//@menu +//* Basic definitions:: +//* Externs and types:: +//* Dummy defs:: +//* Par specific fixed headers:: +//* Parallel only heap objects:: +//* Packing definitions:: +//* End of File:: +//@end menu +//*/ + +//@node Basic definitions, Externs and types +//@section Basic definitions + +/* SET_PAR_HDR and SET_STATIC_PAR_HDR now live in ClosureMacros.h */ + +/* Needed for dumping routines */ +#if defined(PAR) +# define TIME ullong +# define CURRENT_TIME msTime() +# define TIME_ON_PROC(p) msTime() +# define CURRENT_PROC thisPE +# define BINARY_STATS RtsFlags.ParFlags.granSimStats_Binary +#elif defined(GRAN) +# define TIME rtsTime +# define CURRENT_TIME CurrentTime[CurrentProc] +# define TIME_ON_PROC(p) CurrentTime[p] +# define CURRENT_PROC CurrentProc +# define BINARY_STATS RtsFlags.GranFlags.granSimStats_Binary +#endif + +#if defined(PAR) +# define MAX_PES 256 /* Maximum number of processors */ + /* MAX_PES is enforced by SysMan, which does not + allow more than this many "processors". + This is important because PackGA [GlobAddr.lc] + **assumes** that a PE# can fit in 8+ bits. 
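+			   (With MAX_PES = 256 a PE number ranges over
+			   0..255 and thus fits in exactly 8 bits.)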
+ */ +#endif + +//@node Externs and types, Dummy defs, Basic definitions +//@section Externs and types + +#if defined(PAR) +/* GUM: one spark queue on each PE, and each PE sees only its own spark queue */ +extern rtsSparkQ pending_sparks_hd; +extern rtsSparkQ pending_sparks_tl; +#elif defined(GRAN) +/* GranSim: a globally visible array of spark queues */ +extern rtsSparkQ pending_sparks_hds[]; +extern rtsSparkQ pending_sparks_tls[]; +#endif +extern unsigned int /* nat */ spark_queue_len(PEs proc); + +extern StgInt SparksAvail; /* How many sparks are available */ + +/* prototypes of spark routines */ +/* ToDo: check whether all have to be visible -- HWL */ +#if defined(GRAN) +rtsSpark *newSpark(StgClosure *node, StgInt name, StgInt gran_info, StgInt size_info, StgInt par_info, StgInt local); +void disposeSpark(rtsSpark *spark); +void disposeSparkQ(rtsSparkQ spark); +void add_to_spark_queue(rtsSpark *spark); +void delete_from_spark_queue (rtsSpark *spark); +#endif + +#define STATS_FILENAME_MAXLEN 128 + +/* Where to write the log file */ +//extern FILE *gr_file; +extern char gr_filename[STATS_FILENAME_MAXLEN]; + +#if defined(GRAN) +int init_gr_simulation(char *rts_argv[], int rts_argc, char *prog_argv[], int prog_argc); +void end_gr_simulation(void); +#endif + +#if defined(PAR) +extern I_ do_sp_profile; + +extern P_ PendingFetches; +extern GLOBAL_TASK_ID *PEs; + +extern rtsBool IAmMainThread, GlobalStopPending; +extern rtsBool fishing; +extern GLOBAL_TASK_ID SysManTask; +extern int seed; /*pseudo-random-number generator seed:*/ + /*Initialised in ParInit*/ +extern I_ threadId; /*Number of Threads that have existed on a PE*/ +extern GLOBAL_TASK_ID mytid; + +extern int nPEs; + +extern rtsBool InGlobalGC; /* Are we in the midst of performing global GC */ + +extern HashTable *pGAtoGALAtable; +extern HashTable *LAtoGALAtable; +extern GALA *freeIndirections; +extern GALA *liveIndirections; +extern GALA *freeGALAList; +extern GALA *liveRemoteGAs; +extern int thisPE; + +void RunParallelSystem (StgPtr program_closure); +void initParallelSystem(); +void SynchroniseSystem(); + +void registerTask (GLOBAL_TASK_ID gtid); +globalAddr *LAGAlookup (P_ addr); +P_ GALAlookup (globalAddr *ga); +globalAddr *MakeGlobal (P_ addr, rtsBool preferred); +globalAddr *setRemoteGA (P_ addr, globalAddr *ga, rtsBool preferred); +void splitWeight (globalAddr *to, globalAddr *from); +globalAddr *addWeight (globalAddr *ga); +void initGAtables(); +W_ taskIDtoPE (GLOBAL_TASK_ID gtid); +void RebuildLAGAtable(); + +void *lookupHashTable (HashTable *table, StgWord key); +void insertHashTable (HashTable *table, StgWord key, void *data); +void freeHashTable (HashTable *table, void (*freeDataFun) ((void *data))); +HashTable *allocHashTable(); +void *removeHashTable (HashTable *table, StgWord key, void *data); +#endif /* PAR */ + +/* Interface for dumping routines (i.e. writing to log file) */ +void DumpGranEvent(GranEventType name, StgTSO *tso); +void DumpRawGranEvent(PEs proc, PEs p, GranEventType name, + StgTSO *tso, StgClosure *node, StgInt sparkname, StgInt len); +//void DumpEndEvent(PEs proc, StgTSO *tso, rtsBool mandatory_thread); + +//@node Dummy defs, Par specific fixed headers, Externs and types +//@section Dummy defs + +/* +Get this out of the way. These are all null definitions. 
+*/ + + +//# define GA_HDR_SIZE 0 +//# define GA(closure) /*nothing */ + +//# define SET_GA(closure,ga) /* nothing */ +//# define SET_STATIC_GA(closure) /* nothing */ +//# define SET_GRAN_HDR(closure,pe) /* nothing */ +//# define SET_STATIC_PROCS(closure) /* nothing */ + +//# define SET_TASK_ACTIVITY(act) /* nothing */ + +#if defined(GRAN) + +# define GA_HDR_SIZE 1 + +# define PROCS_HDR_POSN PAR_HDR_POSN +# define PROCS_HDR_SIZE 1 + +/* Accessing components of the field */ +# define PROCS(closure) ((closure)->header.gran.procs) +/* SET_PROCS is now SET_GRAN_HEADER in ClosureMacros.h. */ +#endif + + +//@node Par specific fixed headers, Parallel only heap objects, Dummy defs +//@section Par specific fixed headers + +/* +Definitions relating to the entire parallel-only fixed-header field. + +On GUM, the global addresses for each local closure are stored in a separate +hash table, rather then with the closure in the heap. We call @getGA@ to +look up the global address associated with a local closure (0 is returned +for local closures that have no global address), and @setGA@ to store a new +global address for a local closure which did not previously have one. +*/ + +#if defined(PAR) + +# define GA_HDR_SIZE 0 + +# define GA(closure) getGA(closure) + +# define SET_GA(closure, ga) setGA(closure,ga) +# define SET_STATIC_GA(closure) +# define SET_GRAN_HDR(closure,pe) +# define SET_STATIC_PROCS(closure) + +# define MAX_GA_WEIGHT 0 /* Treat as 2^n */ + +W_ PackGA ((W_, int)); + /* There was a PACK_GA macro here; but we turned it into the PackGA + routine [GlobAddr.lc] (because it needs to do quite a bit of + paranoia checking. Phil & Will (95/08) + */ + +/* At the moment, there is no activity profiling for GUM. This may change. */ +# define SET_TASK_ACTIVITY(act) /* nothing */ +#endif + +//@node Parallel only heap objects, Packing definitions, Par specific fixed headers +//@section Parallel only heap objects + +// NB: The following definitons are BOTH for GUM and GrAnSim -- HWL + +/* All in Closures.h and CLosureMacros.h */ + +//@node Packing definitions, End of File, Parallel only heap objects +//@section Packing definitions + +//@menu +//* GUM:: +//* GranSim:: +//@end menu +//*/ + +//@node GUM, GranSim, Packing definitions, Packing definitions +//@subsection GUM + +#if defined(PAR) +/* +Symbolic constants for the packing code. + +This constant defines how many words of data we can pack into a single +packet in the parallel (GUM) system. 
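+
+(The constant in question is the run-time option
+RtsFlags.ParFlags.packBufferSize, used in the macros below; e.g. MAX_GAS
+bounds the number of global addresses per packet by
+RtsFlags.ParFlags.packBufferSize / PACK_GA_SIZE.)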
+*/ + +//@menu +//* Externs:: +//* Prototypes:: +//* Macros:: +//@end menu +//*/ + +//@node Externs, Prototypes, GUM, GUM +//@subsubsection Externs + +extern W_ *PackBuffer; /* size: can be set via option */ +extern long *buffer; /* HWL_ */ +extern W_ *freeBuffer; /* HWL_ */ +extern W_ *packBuffer; /* HWL_ */ + +extern void InitPackBuffer(STG_NO_ARGS); +extern void InitMoreBuffers(STG_NO_ARGS); +extern void InitPendingGABuffer(W_ size); +extern void AllocClosureQueue(W_ size); + +//@node Prototypes, Macros, Externs, GUM +//@subsubsection Prototypes + +void InitPackBuffer(); +P_ PackTSO (P_ tso, W_ *size); +P_ PackStkO (P_ stko, W_ *size); +P_ AllocateHeap (W_ size); /* Doesn't belong */ + +void InitClosureQueue (); +P_ DeQueueClosure(); +void QueueClosure (P_ closure); +rtsBool QueueEmpty(); +void PrintPacket (P_ buffer); + +P_ get_closure_info (P_ closure, W_ *size, W_ *ptrs, W_ *nonptrs, W_ *vhs, char *type); + +rtsBool isOffset (globalAddr *ga), + isFixed (globalAddr *ga); + +void doGlobalGC(); + +P_ PackNearbyGraph (P_ closure,W_ *size); +P_ UnpackGraph (W_ *buffer, globalAddr **gamap, W_ *nGAs); + + +//@node Macros, , Prototypes, GUM +//@subsubsection Macros + +# define PACK_HEAP_REQUIRED \ + ((RtsFlags.ParFlags.packBufferSize - PACK_HDR_SIZE) / (PACK_GA_SIZE + _FHS) * (SPEC_HS + 2)) + +# define MAX_GAS (RtsFlags.ParFlags.packBufferSize / PACK_GA_SIZE) + + +# define PACK_GA_SIZE 3 /* Size of a packed GA in words */ + /* Size of a packed fetch-me in words */ +# define PACK_FETCHME_SIZE (PACK_GA_SIZE + FIXED_HS) + +# define PACK_HDR_SIZE 1 /* Words of header in a packet */ + +# define PACK_PLC_SIZE 2 /* Size of a packed PLC in words */ + +#endif /* PAR */ + +//@node GranSim, , GUM, Packing definitions +//@subsection GranSim + +#if defined(GRAN) +/* ToDo: Check which of the PAR routines are needed in GranSim -- HWL */ + +//@menu +//* Types:: +//* Prototypes:: +//* Macros:: +//@end menu +//*/ + +//@node Types, Prototypes, GranSim, GranSim +//@subsubsection Types + +typedef struct rtsPackBuffer_ { + StgInt /* nat */ size; + StgInt /* nat */ unpacked_size; + StgTSO *tso; + StgClosure **buffer; +} rtsPackBuffer; + +//@node Prototypes, Macros, Types, GranSim +//@subsubsection Prototypes + + +/* main packing functions */ +/* +rtsPackBuffer *PackNearbyGraph(StgClosure* closure, StgTSO* tso, nat *packbuffersize); +rtsPackBuffer *PackOneNode(StgClosure* closure, StgTSO* tso, nat *packbuffersize); +void PrintPacket(rtsPackBuffer *buffer); +StgClosure *UnpackGraph(rtsPackBuffer* buffer); +*/ +/* important auxiliary functions */ + +//StgInfoTable *get_closure_info(StgClosure* node, nat *size, nat *ptrs, nat *nonptrs, nat *vhs, char *info_hdr_ty); +int IS_BLACK_HOLE(StgClosure* node); +StgClosure *IS_INDIRECTION(StgClosure* node); +int IS_THUNK(StgClosure* closure); +char *display_info_type(StgClosure* closure, char *str); + +/* +OLD CODE -- HWL +void InitPackBuffer(void); +P_ AllocateHeap (W_ size); +P_ PackNearbyGraph (P_ closure, P_ tso, W_ *packbuffersize); +P_ PackOneNode (P_ closure, P_ tso, W_ *packbuffersize); +P_ UnpackGraph (P_ buffer); + +void InitClosureQueue (void); +P_ DeQueueClosure(void); +void QueueClosure (P_ closure); +// rtsBool QueueEmpty(); +void PrintPacket (P_ buffer); +*/ + +// StgInfoTable *get_closure_info(StgClosure* node, unsigned int /* nat */ *size, unsigned int /* nat */ *ptrs, unsigned int /* nat */ *nonptrs, unsigned int /* nat */ *vhs, char *info_hdr_ty); +// int /* rtsBool */ IS_BLACK_HOLE(StgClosure* node) ; + +//@node Macros, , Prototypes, GranSim 
+//@subsubsection Macros + +/* These are needed in the packing code to get the size of the packet + right. The closures itself are never built in GrAnSim. */ +# define FETCHME_VHS IND_VHS +# define FETCHME_HS IND_HS + +# define FETCHME_GA_LOCN FETCHME_HS + +# define FETCHME_CLOSURE_SIZE(closure) IND_CLOSURE_SIZE(closure) +# define FETCHME_CLOSURE_NoPTRS(closure) 0L +# define FETCHME_CLOSURE_NoNONPTRS(closure) (IND_CLOSURE_SIZE(closure)-IND_VHS) + +# define MAX_GAS (RtsFlags.GranFlags.packBufferSize / PACK_GA_SIZE) +# define PACK_GA_SIZE 3 /* Size of a packed GA in words */ + /* Size of a packed fetch-me in words */ +# define PACK_FETCHME_SIZE (PACK_GA_SIZE + FIXED_HS) +# define PACK_HDR_SIZE 4 /* Words of header in a packet */ + +# define PACK_HEAP_REQUIRED \ + (RtsFlags.GranFlags.packBufferSize * sizeofW(StgClosure*) + \ + 2 * sizeofW(StgInt) + sizeofW(StgTSO*)) + +# define PACK_FLAG_LOCN 0 +# define PACK_TSO_LOCN 1 +# define PACK_UNPACKED_SIZE_LOCN 2 +# define PACK_SIZE_LOCN 3 +# define MAGIC_PACK_FLAG 0xfabc + +#endif /* GRAN */ + +//@node End of File, , Packing definitions +//@section End of File + +#endif /* defined(PAR) || defined(GRAN) whole file */ +#endif /* Parallel_H */ + + diff --git a/ghc/rts/parallel/0Unpack.c b/ghc/rts/parallel/0Unpack.c new file mode 100644 index 0000000..fc4a8e5 --- /dev/null +++ b/ghc/rts/parallel/0Unpack.c @@ -0,0 +1,440 @@ +/* + Time-stamp: + + Unpacking closures which have been exported to remote processors + + This module defines routines for unpacking closures in the parallel + runtime system (GUM). + + In the case of GrAnSim, this module defines routines for *simulating* the + unpacking of closures as it is done in the parallel runtime system. +*/ + +/* + Code in this file has been merged with Pack.c +*/ + +#if 0 + +//@node Unpacking closures, , , +//@section Unpacking closures + +//@menu +//* Includes:: +//* Prototypes:: +//* GUM code:: +//* GranSim Code:: +//* Index:: +//@end menu +//*/ + +//@node Includes, Prototypes, Unpacking closures, Unpacking closures +//@subsection Includes + +#include "Rts.h" +#include "RtsFlags.h" +#include "GranSimRts.h" +#include "ParallelRts.h" +#include "ParallelDebug.h" +#include "FetchMe.h" +#include "Storage.h" + +//@node Prototypes, GUM code, Includes, Unpacking closures +//@subsection Prototypes + +void InitPacking(void); +# if defined(PAR) +void InitPackBuffer(void); +# endif +/* Interface for ADT of closure queues */ +void AllocClosureQueue(nat size); +void InitClosureQueue(void); +rtsBool QueueEmpty(void); +void QueueClosure(StgClosure *closure); +StgClosure *DeQueueClosure(void); + +StgPtr AllocateHeap(nat size); + +//@node GUM code, GranSim Code, Prototypes, Unpacking closures +//@subsection GUM code + +#if defined(PAR) + +//@node Local Definitions, , GUM code, GUM code +//@subsubsection Local Definitions + +//@cindex PendingGABuffer +static globalAddr *PendingGABuffer; +/* is initialised in main; */ + +//@cindex InitPendingGABuffer +void +InitPendingGABuffer(size) +nat size; +{ + PendingGABuffer = (globalAddr *) + stgMallocBytes((size-PACK_HDR_SIZE)*2*sizeof(globalAddr), + "InitPendingGABuffer"); +} + +/* + @CommonUp@ commons up two closures which we have discovered to be + variants of the same object. One is made an indirection to the other. 
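+
+  After CommonUp(src, dst), src has been overwritten with an indirection
+  to dst.  The update is done with updateWithIndirection rather than
+  UPD_IND so that the blocking queue saved in bqe is not awakened as a
+  side effect of the update; it is woken explicitly afterwards instead.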
*/ + +//@cindex CommonUp +void +CommonUp(StgClosure *src, StgClosure *dst) +{ + StgBlockingQueueElement *bqe; + + ASSERT(src != dst); + switch (get_itbl(src)->type) { + case BLACKHOLE_BQ: + bqe = ((StgBlockingQueue *)src)->blocking_queue; + break; + + case FETCH_ME_BQ: + bqe = ((StgFetchMeBlockingQueue *)src)->blocking_queue; + break; + + case RBH: + bqe = ((StgRBH *)src)->blocking_queue; + break; + + case BLACKHOLE: + case FETCH_ME: + bqe = END_BQ_QUEUE; + break; + + default: + /* Don't common up anything else */ + return; + } + /* We do not use UPD_IND because that would awaken the bq, too */ + // UPD_IND(src, dst); + updateWithIndirection(get_itbl(src), src, dst); + //ASSERT(!IS_BIG_MOTHER(INFO_PTR(dst))); + if (bqe != END_BQ_QUEUE) + awaken_blocked_queue(bqe, src); +} + +/* + @UnpackGraph@ unpacks the graph contained in a message buffer. It + returns a pointer to the new graph. The @gamap@ parameter is set to + point to an array of (oldGA,newGA) pairs which were created as a result + of unpacking the buffer; @nGAs@ is set to the number of GA pairs which + were created. + + The format of graph in the pack buffer is as defined in @Pack.lc@. */ + +//@cindex UnpackGraph +StgClosure * +UnpackGraph(packBuffer, gamap, nGAs) +rtsPackBuffer *packBuffer; +globalAddr **gamap; +nat *nGAs; +{ + nat size, ptrs, nonptrs, vhs; + StgWord **buffer, **bufptr, **slotptr; + globalAddr ga, *gaga; + StgClosure *closure, *existing, + *graphroot, *graph, *parent; + StgInfoTable *ip, *oldip; + nat bufsize, i, + pptr = 0, pptrs = 0, pvhs; + char str[80]; + + InitPackBuffer(); /* in case it isn't already init'd */ + graphroot = (StgClosure *)NULL; + + gaga = PendingGABuffer; + + InitClosureQueue(); + + /* Unpack the header */ + bufsize = packBuffer->size; + buffer = packBuffer->buffer; + bufptr = buffer; + + /* allocate heap */ + if (bufsize > 0) { + graph = allocate(bufsize); + ASSERT(graph != NULL); + } + + parent = (StgClosure *)NULL; + + do { + /* This is where we will ultimately save the closure's address */ + slotptr = bufptr; + + /* First, unpack the next GA or PLC */ + ga.weight = (rtsWeight) *bufptr++; + + if (ga.weight > 0) { + ga.payload.gc.gtid = (GlobalTaskId) *bufptr++; + ga.payload.gc.slot = (int) *bufptr++; + } else + ga.payload.plc = (StgPtr) *bufptr++; + + /* Now unpack the closure body, if there is one */ + if (isFixed(&ga)) { + /* No more to unpack; just set closure to local address */ + IF_PAR_DEBUG(pack, + belch("Unpacked PLC at %x", ga.payload.plc)); + closure = ga.payload.plc; + } else if (isOffset(&ga)) { + /* No more to unpack; just set closure to cached address */ + ASSERT(parent != (StgClosure *)NULL); + closure = (StgClosure *) buffer[ga.payload.gc.slot]; + } else { + /* Now we have to build something. */ + + ASSERT(bufsize > 0); + + /* + * Close your eyes. You don't want to see where we're looking. You + * can't get closure info until you've unpacked the variable header, + * but you don't know how big it is until you've got closure info. + * So...we trust that the closure in the buffer is organized the + * same way as they will be in the heap...at least up through the + * end of the variable header. 
+ */ + ip = get_closure_info(bufptr, &size, &ptrs, &nonptrs, &vhs, str); + + /* + Remember, the generic closure layout is as follows: + +-------------------------------------------------+ + | FIXED HEADER | VARIABLE HEADER | PTRS | NON-PRS | + +-------------------------------------------------+ + */ + /* Fill in the fixed header */ + for (i = 0; i < FIXED_HS; i++) + ((StgPtr)graph)[i] = *bufptr++; + + if (ip->type == FETCH_ME) + size = ptrs = nonptrs = vhs = 0; + + /* Fill in the packed variable header */ + for (i = 0; i < vhs; i++) + ((StgPtr)graph)[FIXED_HS + i] = *bufptr++; + + /* Pointers will be filled in later */ + + /* Fill in the packed non-pointers */ + for (i = 0; i < nonptrs; i++) + ((StgPtr)graph)[FIXED_HS + i + vhs + ptrs] = *bufptr++; + + /* Indirections are never packed */ + // ASSERT(INFO_PTR(graph) != (W_) Ind_info_TO_USE); + + /* Add to queue for processing */ + QueueClosure(graph); + + /* + * Common up the new closure with any existing closure having the same + * GA + */ + + if ((existing = GALAlookup(&ga)) == NULL) { + globalAddr *newGA; + /* Just keep the new object */ + IF_PAR_DEBUG(pack, + belch("Unpacking new (%x, %d, %x)\n", + ga.payload.gc.gtid, ga.payload.gc.slot, ga.weight)); + + closure = graph; + newGA = setRemoteGA(graph, &ga, rtsTrue); + if (ip->type == FETCH_ME) + // FETCHME_GA(closure) = newGA; + ((StgFetchMe *)closure)->ga = newGA; + } else { + /* Two closures, one global name. Someone loses */ + oldip = get_itbl(existing); + + if ((oldip->type == FETCH_ME || IS_BLACK_HOLE(existing)) && + ip->type != FETCH_ME) { + + /* What we had wasn't worth keeping */ + closure = graph; + CommonUp(existing, graph); + } else { + + /* + * Either we already had something worthwhile by this name or + * the new thing is just another FetchMe. However, the thing we + * just unpacked has to be left as-is, or the child unpacking + * code will fail. Remember that the way pointer words are + * filled in depends on the info pointers of the parents being + * the same as when they were packed. + */ + IF_PAR_DEBUG(pack, + belch("Unpacking old (%x, %d, %x), keeping %#lx", + ga.payload.gc.gtid, ga.payload.gc.slot, ga.weight, + existing)); + + closure = existing; + } + /* Pool the total weight in the stored ga */ + (void) addWeight(&ga); + } + + /* Sort out the global address mapping */ + if ((ip_THUNK(ip) && !ip_UNPOINTED(ip)) || + (ip_MUTABLE(ip) && ip->type != FETCH_ME)) { + /* Make up new GAs for single-copy closures */ + globalAddr *newGA = makeGlobal(closure, rtsTrue); + + ASSERT(closure == graph); + + /* Create an old GA to new GA mapping */ + *gaga++ = ga; + splitWeight(gaga, newGA); + ASSERT(gaga->weight == 1L << (BITS_IN(unsigned) - 1)); + gaga++; + } + graph += FIXED_HS + (size < MIN_UPD_SIZE ? MIN_UPD_SIZE : size); + } + + /* + * Set parent pointer to point to chosen closure. If we're at the top of + * the graph (our parent is NULL), then we want to arrange to return the + * chosen closure to our caller (possibly in place of the allocated graph + * root.) 
+ */
+    if (parent == NULL)
+      graphroot = closure;
+    else
+      ((StgPtr)parent)[FIXED_HS + pvhs + pptr] = (StgWord) closure;
+
+    /* Save closure pointer for resolving offsets */
+    *slotptr = (StgWord) closure;
+
+    /* Locate next parent pointer */
+    pptr++;
+    while (pptr + 1 > pptrs) {
+      parent = DeQueueClosure();
+
+      if (parent == NULL)
+        break;
+      else {
+        (void) get_closure_info(parent, &size, &pptrs, &nonptrs,
+                                &pvhs, str);
+        pptr = 0;
+      }
+    }
+  } while (parent != NULL);
+
+  ASSERT(bufsize == 0 || graph - 1 <= SAVE_Hp);
+
+  *gamap = PendingGABuffer;
+  *nGAs = (gaga - PendingGABuffer) / 2;
+
+  /* ToDo: are we *certain* graphroot has been set??? WDP 95/07 */
+  ASSERT(graphroot!=NULL);
+  return (graphroot);
+}
+#endif /* PAR */
+
+//@node GranSim Code, Index, GUM code, Unpacking closures
+//@subsection GranSim Code
+
+/*
+  For GrAnSim: In general no actual unpacking should be necessary. We just
+  have to walk over the graph and set the bitmasks appropriately. -- HWL */
+
+//@node Unpacking, , GranSim Code, GranSim Code
+//@subsubsection Unpacking
+
+#if defined(GRAN)
+void
+CommonUp(StgClosure *src, StgClosure *dst)
+{
+  barf("CommonUp: should never be entered in a GranSim setup");
+}
+
+/* This code fakes the unpacking of a somewhat virtual buffer */
+StgClosure*
+UnpackGraph(buffer)
+rtsPackBuffer* buffer;
+{
+  nat size, ptrs, nonptrs, vhs,
+      bufptr = 0;
+  StgClosure *closure, *graphroot, *graph;
+  StgInfoTable *ip;
+  StgWord bufsize, unpackedsize,
+          pptr = 0, pptrs = 0, pvhs;
+  StgTSO* tso;
+  char str[240], str1[80];
+  int i;
+
+  bufptr = 0;
+  graphroot = buffer->buffer[0];
+
+  tso = buffer->tso;
+
+  /* Unpack the header */
+  unpackedsize = buffer->unpacked_size;
+  bufsize = buffer->size;
+
+  IF_GRAN_DEBUG(pack,
+                belch("<<< Unpacking <<%d>> (buffer @ %p):\n    (root @ %p, PE %d,size=%d), demanded by TSO %d (%p)[PE %d]",
+                      buffer->id, buffer, graphroot, where_is(graphroot),
+                      bufsize, tso->id, tso,
+                      where_is((StgClosure *)tso)));
+
+  do {
+    closure = buffer->buffer[bufptr++]; /* that's all we need for GrAnSim -- HWL */
+
+    /* Actually only ip is needed; rest is useful for TESTING -- HWL */
+    ip = get_closure_info(closure,
+                          &size, &ptrs, &nonptrs, &vhs, str);
+
+    IF_GRAN_DEBUG(pack,
+                  sprintf(str, "**  (%p): Changing bitmask[%s]: 0x%x ",
+                          closure, (closure_HNF(closure) ? "NF" : "__"),
+                          PROCS(closure)));
+
+    if (ip->type == RBH) {
+      closure->header.gran.procs = PE_NUMBER(CurrentProc);    /* Move node */
+
+      IF_GRAN_DEBUG(pack,
+                    strcat(str, " (converting RBH) "));
+
+      convertFromRBH(closure);  /* In GUM that's done by convertToFetchMe */
+    } else if (IS_BLACK_HOLE(closure)) {
+      closure->header.gran.procs |= PE_NUMBER(CurrentProc);   /* Copy node */
+    } else if ( (closure->header.gran.procs & PE_NUMBER(CurrentProc)) == 0 ) {
+      if (closure_HNF(closure))
+        closure->header.gran.procs |= PE_NUMBER(CurrentProc); /* Copy node */
+      else
+        closure->header.gran.procs = PE_NUMBER(CurrentProc);  /* Move node */
+    }
+
+    IF_GRAN_DEBUG(pack,
+                  sprintf(str1, "0x%x", PROCS(closure)); strcat(str, str1));
+    IF_GRAN_DEBUG(pack, belch(str));
+
+  } while (bufptr<bufsize) ;   /* (parent != NULL); */
+
+  /* In GrAnSim we allocate pack buffers dynamically!
-- HWL */ + free(buffer->buffer); + free(buffer); + + IF_GRAN_DEBUG(pack, + belch("PrintGraph of %p is:", graphroot); PrintGraph(graphroot,0)); + + return (graphroot); +} +#endif /* GRAN */ +#endif + +//@node Index, , GranSim Code, Unpacking closures +//@subsection Index + +//@index +//* CommonUp:: @cindex\s-+CommonUp +//* InitPendingGABuffer:: @cindex\s-+InitPendingGABuffer +//* PendingGABuffer:: @cindex\s-+PendingGABuffer +//* UnpackGraph:: @cindex\s-+UnpackGraph +//@end index diff --git a/ghc/rts/parallel/FetchMe.h b/ghc/rts/parallel/FetchMe.h new file mode 100644 index 0000000..ebbb8dd --- /dev/null +++ b/ghc/rts/parallel/FetchMe.h @@ -0,0 +1,22 @@ +/* ----------------------------------------------------------------------------- + * $Id: FetchMe.h,v 1.2 2000/01/13 14:34:06 hwloidl Exp $ + * + * Closure types for the parallel system. + * + * ---------------------------------------------------------------------------*/ + +EI_(FETCH_ME_info); +EF_(FETCH_ME_entry); + +EI_(FETCH_ME_BQ_info); +EF_(FETCH_ME_BQ_entry); + +EI_(BLOCKED_FETCH_info); +EF_(BLOCKED_FETCH_entry); + +EI_(RBH_Save_0_info); +EF_(RBH_Save_0_entry); +EI_(RBH_Save_1_info); +EF_(RBH_Save_1_entry); +EI_(RBH_Save_2_info); +EF_(RBH_Save_2_entry); diff --git a/ghc/rts/parallel/FetchMe.hc b/ghc/rts/parallel/FetchMe.hc new file mode 100644 index 0000000..01f1f14 --- /dev/null +++ b/ghc/rts/parallel/FetchMe.hc @@ -0,0 +1,214 @@ +/* ---------------------------------------------------------------------------- + Time-stamp: + $Id: FetchMe.hc,v 1.2 2000/01/13 14:34:06 hwloidl Exp $ + + Entry code for a FETCH_ME closure + + This module defines routines for handling remote pointers (@FetchMe@s) + in GUM. It is threaded (@.hc@) because @FetchMe_entry@ will be + called during evaluation. + + * --------------------------------------------------------------------------*/ + +#ifdef PAR /* all of it */ + +//@menu +//* Includes:: +//* Info tables:: +//* Index:: +//@end menu + +//@node Includes, Info tables +//@subsection Includes + +#include "Rts.h" +#include "RtsFlags.h" +#include "RtsUtils.h" +#include "Storage.h" +#include "GranSim.h" +#include "GranSimRts.h" +#include "Parallel.h" +#include "ParallelRts.h" +#include "FetchMe.h" +#include "HLC.h" +#include "StgRun.h" /* for StgReturn and register saving */ + +/* -------------------------------------------------------------------------- + FETCH_ME closures. + + A FETCH_ME closure represents data that currently resides on + another PE. We issue a fetch message, and wait for the data to be + retrieved. 
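+
+   Layout, as inferred from the INFO_TABLE and entry code below (0 pointer
+   fields, 2 non-pointer words, the first of which holds the globalAddr of
+   the remote copy):
+
+       [ fixed header | ga | ... ]      // read as ((StgFetchMe *)R1.p)->ga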
+
+   About the difference between std and PAR in returning to the RTS:
+   in PAR we call RTS functions from within the entry code (see also
+   BLACKHOLE_entry and friends in StgMiscClosures.hc); therefore, we
+   have to save the thread state before calling these functions ---
+   this is done via SAVE_THREAD_STATE; we then just load the return
+   code into R1 before jumping into the RTS --- this is done via
+   THREAD_RETURN; so, in short we have something like
+   SAVE_THREAD_STATE + THREAD_RETURN = BLOCK_NP
+
+   ------------------------------------------------------------------------ */
+
+//@node Info tables, Index, Includes
+//@subsection Info tables
+
+//@cindex FETCH_ME_info
+INFO_TABLE(FETCH_ME_info, FETCH_ME_entry, 0,2, FETCH_ME, const, EF_,0,0);
+//@cindex FETCH_ME_entry
+STGFUN(FETCH_ME_entry)
+{
+  extern globalAddr *rga_GLOBAL;
+  extern globalAddr *lga_GLOBAL;
+  extern globalAddr fmbqga_GLOBAL;
+  extern StgClosure *p_GLOBAL;
+  /*
+  globalAddr *rga;
+  globalAddr *lga;
+  globalAddr fmbqga;
+  StgClosure *p;
+  */
+
+  rga_GLOBAL = ((StgFetchMe *)R1.p)->ga;
+  ASSERT(rga_GLOBAL->payload.gc.gtid != mytid);
+
+  /* Turn the FETCH_ME into a FETCH_ME_BQ, and place the current thread
+   * on the blocking queue.
+   */
+  // R1.cl->header.info = FETCH_ME_BQ_info;
+  SET_INFO((StgClosure *)R1.cl, &FETCH_ME_BQ_info);
+
+  CurrentTSO->link = END_BQ_QUEUE;
+  ((StgFetchMeBlockingQueue *)R1.cl)->blocking_queue = (StgBlockingQueueElement *)CurrentTSO;
+
+  /* record on which closure the current thread is blocking */
+  CurrentTSO->block_info.closure = R1.cl;
+  //recordMutable((StgMutClosure *)R1.cl);
+  p_GLOBAL = R1.cl;
+
+  /* Save the Thread State here, before calling RTS routines below! */
+  //BLOCK_NP_NO_JUMP(1);
+  SAVE_THREAD_STATE(1);
+
+  /* unknown junk... needed? --SDM  yes, want to see what's happening -- HWL */
+  if (RtsFlags.ParFlags.ParStats.Full) {
+    /* Note that CURRENT_TIME may perform an unsafe call */
+    //rtsTime now = CURRENT_TIME; /* Now */
+    CurrentTSO->par.exectime += CURRENT_TIME - CurrentTSO->par.blockedat;
+    CurrentTSO->par.fetchcount++;
+    /* TSO_QUEUE(CurrentTSO) = Q_FETCHING; */
+    CurrentTSO->par.blockedat = CURRENT_TIME;
+    /* we are about to send off a FETCH message, so dump a FETCH event */
+    DumpRawGranEvent(CURRENT_PROC, taskIDtoPE(rga_GLOBAL->payload.gc.gtid),
+                     GR_FETCH, CurrentTSO, (StgClosure *)R1.p, 0);
+  }
+
+  /* Phil T. claims that this was a workaround for a hard-to-find
+   * bug, hence I'm leaving it out for now --SDM
+   */
+  /* Assign a brand-new global address to the newly created FMBQ */
+  lga_GLOBAL = makeGlobal(p_GLOBAL, rtsFalse);
+  splitWeight(&fmbqga_GLOBAL, lga_GLOBAL);
+  ASSERT(fmbqga_GLOBAL.weight == 1L << (BITS_IN(unsigned) - 1));
+
+  /* I *hope* it's ok to call this from STG land.
--SDM */ + STGCALL3(sendFetch, rga_GLOBAL, &fmbqga_GLOBAL, 0/*load*/); + + // sendFetch now called from processTheRealFetch, to make SDM happy + //theGlobalFromGA.payload.gc.gtid = rga->payload.gc.gtid; + //theGlobalFromGA.payload.gc.slot = rga->payload.gc.slot; + //theGlobalFromGA.weight = rga->weight; + //theGlobalToGA.payload.gc.gtid = fmbqga.payload.gc.gtid; + //theGlobalToGA.payload.gc.slot = fmbqga.payload.gc.slot; + //theGlobalToGA.weight = fmbqga.weight; + + // STGCALL6(fprintf,stderr,"%% Fetching %p from remote PE ((%x,%d,%x))\n",R1.p,rga->payload.gc.gtid, rga->payload.gc.slot, rga->weight); + + THREAD_RETURN(1); /* back to the scheduler */ + // was: BLOCK_NP(1); + FE_ +} + +/* --------------------------------------------------------------------------- + FETCH_ME_BQ + + On the first entry of a FETCH_ME closure, we turn the closure into + a FETCH_ME_BQ, which behaves just like a BLACKHOLE_BQ. Any thread + entering the FETCH_ME_BQ will be placed in the blocking queue. + When the data arrives from the remote PE, all waiting threads are + woken up and the FETCH_ME_BQ is overwritten with the fetched data. + + FETCH_ME_BQ_entry is a copy of BLACKHOLE_BQ_entry -- HWL + ------------------------------------------------------------------------ */ + +INFO_TABLE(FETCH_ME_BQ_info, FETCH_ME_BQ_entry,0,2,FETCH_ME_BQ,const,EF_,0,0); +//@cindex FETCH_ME_BQ_info +STGFUN(FETCH_ME_BQ_entry) +{ + FB_ + TICK_ENT_BH(); + + /* Put ourselves on the blocking queue for this black hole */ + CurrentTSO->block_info.closure = R1.cl; + CurrentTSO->link = ((StgBlockingQueue *)R1.p)->blocking_queue; + ((StgBlockingQueue *)R1.p)->blocking_queue = CurrentTSO; + +#if defined(PAR) + /* Save the Thread State here, before calling RTS routines below! */ + SAVE_THREAD_STATE(1); + + if (RtsFlags.ParFlags.ParStats.Full) { + /* Note that CURRENT_TIME may perform an unsafe call */ + //rtsTime now = CURRENT_TIME; /* Now */ + CurrentTSO->par.exectime += CURRENT_TIME - CurrentTSO->par.blockedat; + CurrentTSO->par.blockcount++; + CurrentTSO->par.blockedat = CURRENT_TIME; + DumpRawGranEvent(CURRENT_PROC, thisPE, + GR_BLOCK, CurrentTSO, (StgClosure *)R1.p, 0); + } + + THREAD_RETURN(1); /* back to the scheduler */ +#else + /* stg_gen_block is too heavyweight, use a specialised one */ + BLOCK_NP(1); +#endif + FE_ +} + +/* --------------------------------------------------------------------------- + BLOCKED_FETCH_BQ + + A BLOCKED_FETCH closure only ever exists in the blocking queue of a + globally visible closure i.e. one with a GA. A BLOCKED_FETCH closure + indicates that a TSO on another PE is waiting for the result of this + computation. Thus, when updating the closure, the result has to be sent + to that PE. The relevant routines handling that are awaken_blocked_queue + and blockFetch (for putting BLOCKED_FETCH closure into a BQ). 
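+
+   Illustration (a sketch, not from the original text): the blocking queue
+   of a globally visible closure can mix local TSOs and BLOCKED_FETCH
+   entries,
+
+       closure --> TSO 12 --> BLOCKED_FETCH (TSO on PE 3) --> TSO 7
+
+   so an update must both wake the local TSOs and ship the result to PE 3.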
+*/ + +//@cindex BLOCKED_FETCH_info +INFO_TABLE(BLOCKED_FETCH_info, BLOCKED_FETCH_entry,0,2,BLOCKED_FETCH,const,EF_,0,0); +//@cindex BLOCKED_FETCH_entry +STGFUN(BLOCKED_FETCH_entry) +{ + FB_ + /* see NON_ENTERABLE_ENTRY_CODE in StgMiscClosures.hc */ + fprintf(stderr,"Qagh: BLOCKED_FETCH entered!\n"); + STGCALL1(raiseError, errorHandler); + stg_exit(EXIT_FAILURE); /* not executed */ + FE_ +} + +#endif /* PAR */ + +//@node Index, , Info tables +//@subsection Index + +//@index +//* BLOCKED_FETCH_entry:: @cindex\s-+BLOCKED_FETCH_entry +//* BLOCKED_FETCH_info:: @cindex\s-+BLOCKED_FETCH_info +//* FETCH_ME_BQ_info:: @cindex\s-+FETCH_ME_BQ_info +//* FETCH_ME_entry:: @cindex\s-+FETCH_ME_entry +//* FETCH_ME_info:: @cindex\s-+FETCH_ME_info +//@end index diff --git a/ghc/rts/parallel/Global.c b/ghc/rts/parallel/Global.c new file mode 100644 index 0000000..59eda0b --- /dev/null +++ b/ghc/rts/parallel/Global.c @@ -0,0 +1,828 @@ +/* --------------------------------------------------------------------------- + Time-stamp: + $Id: Global.c,v 1.2 2000/01/13 14:34:06 hwloidl Exp $ + + (c) The AQUA/Parade Projects, Glasgow University, 1995 + The GdH/APART 624 Projects, Heriot-Watt University, Edinburgh, 1999 + + Global Address Manipulation. + + The GALA and LAGA tables for mapping global addresses to local addresses + (i.e. heap pointers) are defined here. We use the generic hash tables + defined in Hash.c. + ------------------------------------------------------------------------- */ + +#ifdef PAR /* whole file */ + +//@menu +//* Includes:: +//* Global tables and lists:: +//* Fcts on GALA tables:: +//* Interface to taskId-PE table:: +//* Interface to LAGA table:: +//* Interface to GALA table:: +//* GC functions for GALA tables:: +//* Index:: +//@end menu + +//@node Includes, Global tables and lists, Global Address Manipulation, Global Address Manipulation +//@subsection Includes + +#include "Rts.h" +#include "RtsFlags.h" +#include "RtsUtils.h" +#include "Storage.h" +#include "Hash.h" +#include "ParallelRts.h" + +/* + @globalAddr@ structures are allocated in chunks to reduce malloc overhead. 
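+
+  The scheme is the usual chunked free list (see allocGALA below): GCHUNK
+  entries are malloc'd in one go and threaded onto freeGALAList, so the
+  common case of an allocation is just a pointer pop,
+
+      gl = freeGALAList;
+      freeGALAList = gl->next;
+
+  (fast path only; the slow path carves out a fresh chunk).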
+*/ + +//@node Global tables and lists, Fcts on GALA tables, Includes, Global Address Manipulation +//@subsection Global tables and lists + +//@cindex thisPE +int thisPE; + +//@menu +//* Free lists:: +//* Hash tables:: +//@end menu + +//@node Free lists, Hash tables, Global tables and lists, Global tables and lists +//@subsubsection Free lists + +/* Free list of GALA entries */ +GALA *freeGALAList = NULL; + +/* Number of globalAddr cells to allocate in one go */ +#define GCHUNK (1024 * sizeof(StgWord) / sizeof(GALA)) + +/* Free list of indirections */ + +//@cindex nextIndirection +static StgInt nextIndirection = 0; +//@cindex freeIndirections +GALA *freeIndirections = NULL; + +/* The list of live indirections has to be marked for GC (see makeGlobal) */ +//@cindex liveIndirections +GALA *liveIndirections = NULL; + +/* The list of remote indirections has to be marked for GC (see setRemoteGA) */ +//@cindex liveRemoteGAs +GALA *liveRemoteGAs = NULL; + +//@node Hash tables, , Free lists, Global tables and lists +//@subsubsection Hash tables + +/* Mapping global task ids PEs */ +//@cindex taskIDtoPEtable +HashTable *taskIDtoPEtable = NULL; + +static int nextPE = 0; + +/* LAGA table: StgClosure* -> globalAddr* + (Remember: globalAddr = (GlobalTaskId, Slot, Weight)) + Mapping local to global addresses (see interface below) +*/ + +//@cindex LAtoGALAtable +HashTable *LAtoGALAtable = NULL; + +/* GALA table: globalAddr* -> StgClosure* + (Remember: globalAddr = (GlobalTaskId, Slot, Weight)) + Mapping global to local addresses (see interface below) +*/ + +//@cindex pGAtoGALAtable +HashTable *pGAtoGALAtable = NULL; + +//@node Fcts on GALA tables, Interface to taskId-PE table, Global tables and lists, Global Address Manipulation +//@subsection Fcts on GALA tables + +//@cindex allocGALA +static GALA * +allocGALA(void) +{ + GALA *gl, *p; + + if ((gl = freeGALAList) != NULL) { + freeGALAList = gl->next; + } else { + gl = (GALA *) stgMallocBytes(GCHUNK * sizeof(GALA), "allocGALA"); + + freeGALAList = gl + 1; + for (p = freeGALAList; p < gl + GCHUNK - 1; p++) + p->next = p + 1; + p->next = NULL; + } + return gl; +} + +//@node Interface to taskId-PE table, Interface to LAGA table, Fcts on GALA tables, Global Address Manipulation +//@subsection Interface to taskId-PE table + +/* + We don't really like GLOBAL_TASK_ID, so we keep a table of TASK_ID to + PE mappings. The idea is that a PE identifier will fit in 16 bits, whereas + a TASK_ID may not. +*/ + +//@cindex taskIDtoPE +PEs +taskIDtoPE(GlobalTaskId gtid) +{ + return (PEs) lookupHashTable(taskIDtoPEtable, gtid); +} + +//@cindex registerTask +void +registerTask(gtid) +GlobalTaskId gtid; +{ + if (gtid == mytid) + thisPE = nextPE; + + insertHashTable(taskIDtoPEtable, gtid, (void *) (StgWord) nextPE++); +} + +//@node Interface to LAGA table, Interface to GALA table, Interface to taskId-PE table, Global Address Manipulation +//@subsection Interface to LAGA table + +/* + The local address to global address mapping returns a globalAddr structure + (pe task id, slot, weight) for any closure in the local heap which has a + global identity. Such closures may be copies of normal form objects with + a remote `master' location, @FetchMe@ nodes referencing remote objects, or + globally visible objects in the local heap (for which we are the master). 
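+
+  Typical query (illustrative only), for a closure p in the local heap:
+
+      globalAddr *ga = LAGAlookup(p);   // NULL if p has no global identity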
+*/ + +//@cindex LAGAlookup +globalAddr * +LAGAlookup(addr) +StgClosure *addr; +{ + GALA *gala; + + /* We never look for GA's on indirections */ + ASSERT(IS_INDIRECTION(addr) == NULL); + if ((gala = lookupHashTable(LAtoGALAtable, (StgWord) addr)) == NULL) + return NULL; + else + return &(gala->ga); +} + +//@node Interface to GALA table, GC functions for GALA tables, Interface to LAGA table, Global Address Manipulation +//@subsection Interface to GALA table + +/* + We also manage a mapping of global addresses to local addresses, so that + we can ``common up'' multiple references to the same object as they arrive + in data packets from remote PEs. + + The global address to local address mapping is actually managed via a + ``packed global address'' to GALA hash table. The packed global + address takes the interesting part of the @globalAddr@ structure + (i.e. the pe and slot fields) and packs them into a single word + suitable for hashing. +*/ + +//@cindex GALAlookup +StgClosure * +GALAlookup(ga) +globalAddr *ga; +{ + StgWord pga = PackGA(taskIDtoPE(ga->payload.gc.gtid), ga->payload.gc.slot); + GALA *gala; + + if ((gala = (GALA *) lookupHashTable(pGAtoGALAtable, pga)) == NULL) + return NULL; + else { + /* + * Bypass any indirections when returning a local closure to + * the caller. Note that we do not short-circuit the entry in + * the GALA tables right now, because we would have to do a + * hash table delete and insert in the LAtoGALAtable to keep + * that table up-to-date for preferred GALA pairs. That's + * probably a bit expensive. + */ + return UNWIND_IND((StgClosure *)(gala->la)); + } +} + +/* + External references to our globally-visible closures are managed through an + indirection table. The idea is that the closure may move about as the result + of local garbage collections, but its global identity is determined by its + slot in the indirection table, which never changes. + + The indirection table is maintained implicitly as part of the global + address to local address table. We need only keep track of the + highest numbered indirection index allocated so far, along with a free + list of lower numbered indices no longer in use. +*/ + +/* + Allocate an indirection slot for the closure currently at address @addr@. +*/ + +//@cindex allocIndirection +static GALA * +allocIndirection(StgPtr addr) +{ + GALA *gala; + + if ((gala = freeIndirections) != NULL) { + freeIndirections = gala->next; + } else { + gala = allocGALA(); + gala->ga.payload.gc.gtid = mytid; + gala->ga.payload.gc.slot = nextIndirection++; + } + gala->ga.weight = MAX_GA_WEIGHT; + gala->la = addr; + return gala; +} + +/* + Make a local closure at @addr@ globally visible. We have to allocate an + indirection slot for it, and update both the local address to global address + and global address to local address maps. 
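+
+  Illustrative use (hypothetical caller): to export a local closure p as
+  the preferred copy,
+
+      globalAddr *ga = makeGlobal(p, rtsTrue);
+      ASSERT(GALAlookup(ga) == p);   // the GALA table maps the new GA back to p
+
+  (assuming p is not itself an indirection, since GALAlookup unwinds
+  indirections before returning).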
+*/ + +//@cindex makeGlobal +globalAddr * +makeGlobal(addr, preferred) +StgClosure *addr; +rtsBool preferred; +{ + GALA *oldGALA = lookupHashTable(LAtoGALAtable, (StgWord) addr); + GALA *newGALA = allocIndirection((StgPtr)addr); + StgWord pga = PackGA(thisPE, newGALA->ga.payload.gc.slot); + + ASSERT(HEAP_ALLOCED(addr)); // check that addr might point into the heap + ASSERT(GALAlookup(&(newGALA->ga)) == NULL); + + newGALA->la = addr; + newGALA->preferred = preferred; + + if (preferred) { + /* The new GA is now the preferred GA for the LA */ + if (oldGALA != NULL) { + oldGALA->preferred = rtsFalse; + (void) removeHashTable(LAtoGALAtable, (StgWord) addr, (void *) oldGALA); + } + insertHashTable(LAtoGALAtable, (StgWord) addr, (void *) newGALA); + } + + /* put the new GALA entry on the list of live indirections */ + newGALA->next = liveIndirections; + liveIndirections = newGALA; + + insertHashTable(pGAtoGALAtable, pga, (void *) newGALA); + + return &(newGALA->ga); +} + +/* + Assign an existing remote global address to an existing closure. + We do not retain the @globalAddr@ structure that's passed in as an argument, + so it can be a static in the calling routine. +*/ + +//@cindex setRemoteGA +globalAddr * +setRemoteGA(addr, ga, preferred) +StgClosure *addr; +globalAddr *ga; +rtsBool preferred; +{ + GALA *oldGALA = lookupHashTable(LAtoGALAtable, (StgWord) addr); + GALA *newGALA = allocGALA(); + StgWord pga = PackGA(taskIDtoPE(ga->payload.gc.gtid), ga->payload.gc.slot); + + ASSERT(ga->payload.gc.gtid != mytid); + ASSERT(ga->weight > 0); + ASSERT(GALAlookup(ga) == NULL); + + newGALA->ga = *ga; + newGALA->la = addr; + newGALA->preferred = preferred; + + if (preferred) { + /* The new GA is now the preferred GA for the LA */ + if (oldGALA != NULL) { + oldGALA->preferred = rtsFalse; + (void) removeHashTable(LAtoGALAtable, (StgWord) addr, (void *) oldGALA); + } + insertHashTable(LAtoGALAtable, (StgWord) addr, (void *) newGALA); + } + newGALA->next = liveRemoteGAs; + liveRemoteGAs = newGALA; + + insertHashTable(pGAtoGALAtable, pga, (void *) newGALA); + + ga->weight = 0; + + return &(newGALA->ga); +} + +/* + Give me a bit of weight to give away on a new reference to a particular + global address. If we run down to nothing, we have to assign a new GA. +*/ + +//@cindex splitWeight +void +splitWeight(to, from) +globalAddr *to, *from; +{ + /* Make sure we have enough weight to split */ + if (from->weight == 1) + from = makeGlobal(GALAlookup(from), rtsTrue); + + to->payload = from->payload; + + if (from->weight == 0) + to->weight = 1L << (BITS_IN(unsigned) - 1); + else + to->weight = from->weight / 2; + + from->weight -= to->weight; +} + +/* + Here, I am returning a bit of weight that a remote PE no longer needs. +*/ + +//@cindex addWeight +globalAddr * +addWeight(ga) +globalAddr *ga; +{ + StgWord pga = PackGA(taskIDtoPE(ga->payload.gc.gtid), ga->payload.gc.slot); + GALA *gala = (GALA *) lookupHashTable(pGAtoGALAtable, pga); + + IF_PAR_DEBUG(weight, + fprintf(stderr, "@* Adding weight %x to ", ga->weight); + printGA(&(gala->ga)); + fputc('\n', stderr)); + + gala->ga.weight += ga->weight; + ga->weight = 0; + + return &(gala->ga); +} + +/* + Initialize all of the global address structures: the task ID to PE id + map, the local address to global address map, the global address to + local address map, and the indirection table. 
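+
+  A plausible start-up sequence (illustrative, not from the original):
+
+      initGAtables();        // allocate the three hash tables
+      registerTask(mytid);   // enter our own task id; also sets thisPE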
+*/ + +//@cindex initGAtables +void +initGAtables(void) +{ + taskIDtoPEtable = allocHashTable(); + LAtoGALAtable = allocHashTable(); + pGAtoGALAtable = allocHashTable(); +} + +//@cindex PackGA +StgWord +PackGA (pe, slot) +StgWord pe; +int slot; +{ + int pe_shift = (BITS_IN(StgWord)*3)/4; + int pe_bits = BITS_IN(StgWord) - pe_shift; + + if ( pe_bits < 8 || slot >= (1L << pe_shift) ) { /* big trouble */ + fflush(stdout); + fprintf(stderr, "PackGA: slot# too big (%d) or not enough pe_bits (%d)\n", + slot,pe_bits); + stg_exit(EXIT_FAILURE); + } + + return((((StgWord)(pe)) << pe_shift) | ((StgWord)(slot))); + + /* the idea is to use 3/4 of the bits (e.g., 24) for indirection- + table "slot", and 1/4 for the pe# (e.g., 8). + + We check for too many bits in "slot", and double-check (at + compile-time?) that we have enough bits for "pe". We *don't* + check for too many bits in "pe", because SysMan enforces a + MAX_PEs limit at the very very beginning. + + Phil & Will 95/08 + */ +} + +//@node GC functions for GALA tables, Debugging routines, Interface to GALA table, Global Address Manipulation +//@subsection GC functions for GALA tables + +/* + When we do a copying collection, we want to evacuate all of the local + entries in the GALA table for which there are outstanding remote + pointers (i.e. for which the weight is not MAX_GA_WEIGHT.) +*/ +//@cindex markLocalGAs +void +markLocalGAs(rtsBool full) +{ + GALA *gala; + GALA *next; + GALA *prev = NULL; + StgPtr old_la, new_la; + nat n=0, m=0; // debugging only + + IF_DEBUG(gc, + belch("@@ markLocalGAs: Marking LIVE INDIRECTIONS in GALA table starting with GALA at %p\n", + liveIndirections); + printLAGAtable()); + + for (gala = liveIndirections, m=0; gala != NULL; gala = next, m++) { + IF_DEBUG(gc, + printGA(&(gala->ga)); + fprintf(stderr, ";@ %d: LA: %p (%s) ", + m, gala->la, info_type(gala->la))); + next = gala->next; + old_la = gala->la; + ASSERT(gala->ga.payload.gc.gtid == mytid); /* it's supposed to be local */ + if (get_itbl((StgClosure *)old_la)->type == EVACUATED) { + /* somebody else already evacuated this closure */ + new_la = ((StgEvacuated *)old_la)->evacuee; + IF_DEBUG(gc, + belch(" already evacuated to %p\n", new_la)); + } else { + StgClosure *foo ; // debugging only + n++; + IF_PAR_DEBUG(verbose, + if (IS_INDIRECTION((StgClosure *)old_la)) + belch("{markLocalGAs}Daq ghuH: trying to mark an indirection %p (%s) -> %p (%s); [closure=%p]", + old_la, info_type(old_la), + (foo = UNWIND_IND((StgClosure *)old_la)), info_type(foo), + old_la)); + new_la = MarkRoot(UNWIND_IND((StgClosure *)old_la)); // or just evacuate(old_ga) + IF_DEBUG(gc, + belch(" evacuated %p to %p\n", old_la, new_la)); + } + + gala->la = new_la; + /* remove old LA and replace with new LA */ + //(void) removeHashTable(LAtoGALAtable, (StgWord) old_la, (void *) gala); + //insertHashTable(LAtoGALAtable, (StgWord) new_la, (void *) gala); + + gala->next = prev; + prev = gala; + } + liveIndirections = prev; /* list has been reversed during the marking */ + + IF_PAR_DEBUG(verbose, + belch("@@ markLocalGAs: %d of %d GALAs marked on PE %x", + n, m, mytid)); + + /* -------------------------------------------------------------------- */ + + n=0; m=0; // debugging only + + IF_DEBUG(gc, + belch("@@ markLocalGAs: Marking LIVE REMOTE GAs in GALA table starting with GALA at %p\n", + liveRemoteGAs)); + + for (gala = liveRemoteGAs, prev = NULL; gala != NULL; gala = next) { + IF_DEBUG(gc, + printGA(&(gala->ga))); + next = gala->next; + old_la = gala->la; + ASSERT(gala->ga.payload.gc.gtid != 
mytid); /* it's supposed to be remote */ + if (get_itbl((StgClosure *)old_la)->type == EVACUATED) { + /* somebody else already evacuated this closure */ + new_la = ((StgEvacuated *)old_la)->evacuee; + } else { + n++; + new_la = MarkRoot((StgClosure *)old_la); // or just evacuate(old_ga) + } + + gala->la = new_la; + /* remove old LA and replace with new LA */ + //(void) removeHashTable(LAtoGALAtable, (StgWord) old_la, (void *) gala); + //insertHashTable(LAtoGALAtable, (StgWord) new_la, (void *) gala); + + gala->next = prev; + prev = gala; + } + liveRemoteGAs = prev; /* list is reversed during marking */ + + /* If we have any remaining FREE messages to send off, do so now */ + // sendFreeMessages(); + + IF_DEBUG(gc, + belch("@@ markLocalGAs: GALA after marking"); + printLAGAtable(); + belch("--------------------------------------")); + +} + +void +OLDmarkLocalGAs(rtsBool full) +{ + extern StgClosure *MarkRootHWL(StgClosure *root); + + GALA *gala; + GALA *next; + GALA *prev = NULL; + StgPtr new_la; + nat n=0, m=0; // debugging only + + IF_DEBUG(gc, + belch("@@ markLocalGAs: Marking entries in GALA table starting with GALA at %p", + liveIndirections); + printLAGAtable()); + + for (gala = liveIndirections; gala != NULL; gala = next) { + IF_DEBUG(gc, + printGA(&(gala->ga)); + fprintf(stderr, " LA: %p (%s) ", + gala->la, info_type(gala->la))); + next = gala->next; + ASSERT(gala->ga.payload.gc.gtid == mytid); /* it's supposed to be local */ + if (gala->ga.weight != MAX_GA_WEIGHT) { + /* Remote references exist, so we must evacuate the local closure */ + StgPtr old_la = gala->la; + + if (get_itbl((StgClosure *)old_la)->type != EVACUATED) { // track evacuee!?? + n++; + IF_DEBUG(gc, + fprintf(stderr, " marking as root\n")); + new_la = MarkRoot((StgClosure *)old_la); // or just evacuate(old_ga) + //IF_DEBUG(gc, + // fprintf(stderr, " new LA is %p ", new_la)); + if (!full && gala->preferred && new_la != old_la) { + IF_DEBUG(gc, + fprintf(stderr, " replacing %p with %p in LAGA table\n", + old_la, new_la)); + (void) removeHashTable(LAtoGALAtable, (StgWord) old_la, (void *) gala); + insertHashTable(LAtoGALAtable, (StgWord) new_la, (void *) gala); + } + } else { + IF_DEBUG(gc, + fprintf(stderr, " EVAC ")); + new_la = ((StgEvacuated *)old_la)->evacuee; + IF_DEBUG(gc, + fprintf(stderr, " replacing %p with %p in LAGA table\n", + old_la, new_la)); + (void) removeHashTable(LAtoGALAtable, (StgWord) old_la, (void *) gala); + insertHashTable(LAtoGALAtable, (StgWord) new_la, (void *) gala); + } + gala->next = prev; + prev = gala; + } else { + /* Since we have all of the weight, this GA is no longer needed */ + StgWord pga = PackGA(thisPE, gala->ga.payload.gc.slot); + + m++; + IF_DEBUG(gc, + fprintf(stderr, " freeing slot %d", + gala->ga.payload.gc.slot)); + + /* put the now redundant GALA onto the free list */ + gala->next = freeIndirections; + freeIndirections = gala; + /* remove the GALA from the GALA table; now it's just local */ + (void) removeHashTable(pGAtoGALAtable, pga, (void *) gala); + if (!full && gala->preferred) + (void) removeHashTable(LAtoGALAtable, (StgWord) gala->la, (void *) gala); + +#ifdef DEBUG + gala->ga.weight = 0x0d0d0d0d; + gala->la = (StgWord) 0x0bad0bad; +#endif + } + } + liveIndirections = prev; /* list has been reversed during the marking */ + + IF_PAR_DEBUG(verbose, + belch("@@ markLocalGAs: %d GALAs marked, %d GALAs nuked on PE %x", + n, m, mytid)); + +} + +//@cindex RebuildGAtables +void +RebuildGAtables(rtsBool full) +{ + GALA *gala; + GALA *next; + GALA *prev; + StgClosure 
*closure, *last, *new_closure; + + //prepareFreeMsgBuffers(); + + if (full) + RebuildLAGAtable(); + + IF_DEBUG(gc, + belch("@@ RebuildGAtables: After ReBuilding GALA table starting with GALA at %p", + liveRemoteGAs); + printLAGAtable()); +} + +void +OLDRebuildGAtables(rtsBool full) +{ + GALA *gala; + GALA *next; + GALA *prev; + StgClosure *closure, *last, *new_closure; + + prepareFreeMsgBuffers(); + + for (gala = liveRemoteGAs, prev = NULL; gala != NULL; gala = next) { + IF_DEBUG(gc, + printGA(&(gala->ga))); + next = gala->next; + ASSERT(gala->ga.payload.gc.gtid != mytid); /* it's supposed to be remote */ + + closure = (StgClosure *) (gala->la); + + /* + * If the old closure has not been forwarded, we let go. Note that this + * approach also drops global aliases for PLCs. + */ + + if (!full && gala->preferred) + (void) removeHashTable(LAtoGALAtable, (StgWord) gala->la, (void *) gala); + + /* Follow indirection chains to the end, just in case */ + closure = UNWIND_IND(closure); + + /* + if (get_itbl(closure)->type != EVACUATED) { // (new_closure = isAlive(closure)) == NULL) { // (W_) Forward_Ref_info) + // closure is not alive any more, thus remove GA + int pe = taskIDtoPE(gala->ga.payload.gc.gtid); + StgWord pga = PackGA(pe, gala->ga.payload.gc.slot); + + IF_DEBUG(gc, + fprintf(stderr, " (LA: %p (%s)) is unused on this PE -> sending free\n", + closure, info_type(closure))); + + (void) removeHashTable(pGAtoGALAtable, pga, (void *) gala); + freeRemoteGA(pe, &(gala->ga)); + gala->next = freeGALAList; + freeGALAList = gala; + } else { + */ + if (get_itbl(closure)->type == EVACUATED) { + IF_DEBUG(gc, + fprintf(stderr, " EVAC %p (%s)\n", + closure, info_type(closure))); + closure = ((StgEvacuated *)closure)->evacuee; + } else { + IF_DEBUG(gc, + fprintf(stderr, " !EVAC %p (%s)\n", + closure, info_type(closure))); + } + gala->la = closure; + if (!full && gala->preferred) + insertHashTable(LAtoGALAtable, (StgWord) gala->la, (void *) gala); + gala->next = prev; + prev = gala; + } + //} + liveRemoteGAs = prev; /* list is reversed during marking */ + + /* If we have any remaining FREE messages to send off, do so now */ + sendFreeMessages(); + + if (full) + RebuildLAGAtable(); + + IF_DEBUG(gc, + belch("@@ RebuildGAtables: After ReBuilding GALA table starting with GALA at %p", + liveRemoteGAs); + printLAGAtable()); +} + +/* + Rebuild the LA->GA table, assuming that the addresses in the GALAs are + correct. 
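+
+  (As RebuildGAtables above shows, this is only done on a full collection,
+  when any closure may have moved:
+
+      if (full)
+        RebuildLAGAtable();
+  )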
+*/ + +//@cindex RebuildLAGAtable +void +RebuildLAGAtable(void) +{ + GALA *gala; + nat n=0, m=0; // debugging + + /* The old LA->GA table is worthless */ + freeHashTable(LAtoGALAtable, NULL); + LAtoGALAtable = allocHashTable(); + + IF_DEBUG(gc, + belch("@@ RebuildLAGAtable: new LAGA table at %p", + LAtoGALAtable)); + + for (gala = liveIndirections; gala != NULL; gala = gala->next) { + n++; + if (gala->preferred) + insertHashTable(LAtoGALAtable, (StgWord) gala->la, (void *) gala); + } + + for (gala = liveRemoteGAs; gala != NULL; gala = gala->next) { + m++; + if (gala->preferred) + insertHashTable(LAtoGALAtable, (StgWord) gala->la, (void *) gala); + } + + IF_DEBUG(gc, + belch("@@ RebuildLAGAtable: inserted %d entries from liveIndirections and %d entries from liveRemoteGAs", + n,m)); + +} + +//@node Debugging routines, Index, GC functions for GALA tables, Global Address Manipulation +//@subsection Debugging routines + +//@cindex printGA +void +printGA (globalAddr *ga) +{ + fprintf(stderr, "((%x, %d, %x))", + ga->payload.gc.gtid, + ga->payload.gc.slot, + ga->weight); +} + +//@cindex printGALA +void +printGALA (GALA *gala) +{ + printGA(&(gala->ga)); + fprintf(stderr, " -> %p (%s)", (StgPtr)gala->la, info_type(gala->la)); + fprintf(stderr, " %s", (gala->preferred) ? "PREF" : "____"); +} + +/* + Printing the LA->GA table. +*/ + +//@cindex DebugPrintLAGAtable +void +printLAGAtable(void) +{ + GALA *gala; + nat n=0, m=0; // debugging + + belch("@@ LAGAtable (%p) with liveIndirections=%p, liveRemoteGAs=%p:", + LAtoGALAtable, liveIndirections, liveRemoteGAs); + + for (gala = liveIndirections; gala != NULL; gala = gala->next) { + n++; + printGALA(gala); + fputc('\n', stderr); + } + + for (gala = liveRemoteGAs; gala != NULL; gala = gala->next) { + m++; + printGALA(gala); + fputc('\n', stderr); + } + belch("@@ LAGAtable has %d liveIndirections entries and %d liveRemoteGAs entries", + n, m); +} + +#endif /* PAR -- whole file */ + +//@node Index, , Debugging routines, Global Address Manipulation +//@subsection Index + +//@index +//* GALAlookup:: @cindex\s-+GALAlookup +//* LAGAlookup:: @cindex\s-+LAGAlookup +//* LAtoGALAtable:: @cindex\s-+LAtoGALAtable +//* PackGA:: @cindex\s-+PackGA +//* RebuildGAtables:: @cindex\s-+RebuildGAtables +//* RebuildLAGAtable:: @cindex\s-+RebuildLAGAtable +//* addWeight:: @cindex\s-+addWeight +//* allocGALA:: @cindex\s-+allocGALA +//* allocIndirection:: @cindex\s-+allocIndirection +//* freeIndirections:: @cindex\s-+freeIndirections +//* initGAtables:: @cindex\s-+initGAtables +//* liveIndirections:: @cindex\s-+liveIndirections +//* liveRemoteGAs:: @cindex\s-+liveRemoteGAs +//* makeGlobal:: @cindex\s-+makeGlobal +//* markLocalGAs:: @cindex\s-+markLocalGAs +//* nextIndirection:: @cindex\s-+nextIndirection +//* pGAtoGALAtable:: @cindex\s-+pGAtoGALAtable +//* registerTask:: @cindex\s-+registerTask +//* setRemoteGA:: @cindex\s-+setRemoteGA +//* splitWeight:: @cindex\s-+splitWeight +//* taskIDtoPE:: @cindex\s-+taskIDtoPE +//* taskIDtoPEtable:: @cindex\s-+taskIDtoPEtable +//* thisPE:: @cindex\s-+thisPE +//@end index diff --git a/ghc/rts/parallel/GranSim.c b/ghc/rts/parallel/GranSim.c new file mode 100644 index 0000000..8d08fb6 --- /dev/null +++ b/ghc/rts/parallel/GranSim.c @@ -0,0 +1,3005 @@ +/* + Time-stamp: + $Id: GranSim.c,v 1.2 2000/01/13 14:34:06 hwloidl Exp $ + + Variables and functions specific to GranSim the parallelism simulator + for GPH. 
+*/ + +//@node GranSim specific code, , , +//@section GranSim specific code + +/* + Macros for dealing with the new and improved GA field for simulating + parallel execution. Based on @CONCURRENT@ package. The GA field now + contains a mask, where the n-th bit stands for the n-th processor, where + this data can be found. In case of multiple copies, several bits are + set. The total number of processors is bounded by @MAX_PROC@, which + should be <= the length of a word in bits. -- HWL +*/ + +//@menu +//* Includes:: +//* Prototypes and externs:: +//* Constants and Variables:: +//* Initialisation:: +//* Global Address Operations:: +//* Global Event Queue:: +//* Spark queue functions:: +//* Scheduling functions:: +//* Thread Queue routines:: +//* GranSim functions:: +//* GranSimLight routines:: +//* Code for Fetching Nodes:: +//* Idle PEs:: +//* Routines directly called from Haskell world:: +//* Emiting profiling info for GrAnSim:: +//* Dumping routines:: +//* Index:: +//@end menu + +//@node Includes, Prototypes and externs, GranSim specific code, GranSim specific code +//@subsection Includes + +#include "Rts.h" +#include "RtsFlags.h" +#include "RtsUtils.h" +#include "StgMiscClosures.h" +#include "StgTypes.h" +#include "Schedule.h" +#include "SchedAPI.h" // for pushClosure +#include "GC.h" +#include "GranSimRts.h" +#include "GranSim.h" +#include "ParallelRts.h" +#include "ParallelDebug.h" +#include "Storage.h" // for recordMutable + + +//@node Prototypes and externs, Constants and Variables, Includes, GranSim specific code +//@subsection Prototypes and externs + +#if defined(GRAN) + +/* Prototypes */ +static inline PEs ga_to_proc(StgWord); +static inline rtsBool any_idle(void); +static inline nat idlers(void); + PEs where_is(StgClosure *node); + +static rtsBool stealSomething(PEs proc, rtsBool steal_spark, rtsBool steal_thread); +static inline rtsBool stealSpark(PEs proc); +static inline rtsBool stealThread(PEs proc); +static rtsBool stealSparkMagic(PEs proc); +static rtsBool stealThreadMagic(PEs proc); +/* subsumed by stealSomething +static void stealThread(PEs proc); +static void stealSpark(PEs proc); +*/ +static rtsTime sparkStealTime(void); +static nat natRandom(nat from, nat to); +static PEs findRandomPE(PEs proc); +static void sortPEsByTime (PEs proc, PEs *pes_by_time, + nat *firstp, nat *np); + +void GetRoots(void); + +#endif /* GRAN */ + +//@node Constants and Variables, Initialisation, Prototypes and externs, GranSim specific code +//@subsection Constants and Variables + +#if defined(GRAN) || defined(PAR) +/* See GranSim.h for the definition of the enum gran_event_types */ +char *gran_event_names[] = { + "START", "START(Q)", + "STEALING", "STOLEN", "STOLEN(Q)", + "FETCH", "REPLY", "BLOCK", "RESUME", "RESUME(Q)", + "SCHEDULE", "DESCHEDULE", + "END", + "SPARK", "SPARKAT", "USED", "PRUNED", "EXPORTED", "ACQUIRED", + "ALLOC", + "TERMINATE", + "SYSTEM_START", "SYSTEM_END", /* only for debugging */ + "??" +}; +#endif + +#if defined(GRAN) /* whole file */ +char *proc_status_names[] = { + "Idle", "Sparking", "Starting", "Fetching", "Fishing", "Busy", + "UnknownProcStatus" +}; + +/* For internal use (event statistics) only */ +char *event_names[] = + { "ContinueThread", "StartThread", "ResumeThread", + "MoveSpark", "MoveThread", "FindWork", + "FetchNode", "FetchReply", + "GlobalBlock", "UnblockThread" + }; + +//@cindex CurrentProc +PEs CurrentProc = 0; + +/* + ToDo: Create a structure for the processor status and put all the + arrays below into it. 
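+
+   One possible shape for such a structure (purely illustrative; the
+   per-PE arrays below are the actual representation):
+
+       typedef struct PEState_ {
+         rtsTime       clock;                 // cf CurrentTime[proc]
+         nat           outstandingFetches;    // cf OutstandingFetches[proc]
+         nat           outstandingFishes;     // cf OutstandingFishes[proc]
+         rtsProcStatus status;                // cf procStatus[proc]
+         rtsSparkQ     sparks_hd, sparks_tl;  // cf pending_sparks_hds/_tls
+       } PEState;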
+   -- HWL */
+
+//@cindex CurrentTime
+/* One clock for each PE */
+rtsTime CurrentTime[MAX_PROC];
+
+/* Useful to restrict communication; cf fishing model in GUM */
+nat OutstandingFetches[MAX_PROC], OutstandingFishes[MAX_PROC];
+
+/* Status of each PE (new since but independent of GranSim Light) */
+rtsProcStatus procStatus[MAX_PROC];
+
+# if defined(GRAN) && defined(GRAN_CHECK)
+/* To check if the RTS ever tries to run a thread that should be blocked
+   because of fetching remote data */
+StgTSO *BlockedOnFetch[MAX_PROC];
+# define FETCH_MASK_TSO 0x08000000 /* only bits 0, 1, 2 should be used */
+# endif
+
+nat SparksAvail = 0;     /* How many sparks are available */
+nat SurplusThreads = 0;  /* How many excess threads are there */
+
+/* Do we need to reschedule following a fetch? */
+rtsBool NeedToReSchedule = rtsFalse, IgnoreEvents = rtsFalse, IgnoreYields = rtsFalse;
+rtsTime TimeOfNextEvent, TimeOfLastEvent, EndOfTimeSlice; /* checked from the threaded world! */
+
+//@cindex spark queue
+/* GranSim: a globally visible array of spark queues */
+rtsSparkQ pending_sparks_hds[MAX_PROC];
+rtsSparkQ pending_sparks_tls[MAX_PROC];
+
+nat sparksIgnored = 0, sparksCreated = 0;
+
+GlobalGranStats globalGranStats;
+
+nat gran_arith_cost, gran_branch_cost, gran_load_cost,
+    gran_store_cost, gran_float_cost;
+
+/*
+Old comment from 0.29. ToDo: Check and update -- HWL
+
+The following variables control the behaviour of GrAnSim. In general, there
+is one RTS option for enabling each of these features. In getting the
+desired setup of GranSim the following questions have to be answered:
+\begin{itemize}
+\item {\em Which scheduling algorithm} to use (@RtsFlags.GranFlags.DoFairSchedule@)?
+      Currently only unfair scheduling is supported.
+\item What to do when remote data is fetched (@RtsFlags.GranFlags.DoAsyncFetch@)?
+      Either block and wait for the data or reschedule and do some other work.
+      Thus, if this variable is true, asynchronous communication is
+      modelled. Block on fetch mainly makes sense for incremental fetching.
+
+      There is also a simplified fetch variant available
+      (@RtsFlags.GranFlags.SimplifiedFetch@). This variant does not use events to model
+      communication. It is faster but the results will be less accurate.
+\item How aggressive to be in getting work after a reschedule on fetch
+      (@RtsFlags.GranFlags.FetchStrategy@)?
+      This is determined by the so-called {\em fetching strategy\/}.
+      Currently, there are four possibilities:
+      \begin{enumerate}
+      \item Only run a runnable thread.
+      \item Turn a spark into a thread, if necessary.
+      \item Steal a remote spark, if necessary.
+      \item Steal a runnable thread from another processor, if necessary.
+      \end{enumerate}
+      The variable @RtsFlags.GranFlags.FetchStrategy@ determines how far to go in this list
+      when rescheduling on a fetch.
+\item Should sparks or threads be stolen first when looking for work
+      (@RtsFlags.GranFlags.DoStealThreadsFirst@)?
+      The default is to steal sparks first (much cheaper).
+\item Should the RTS use a lazy thread creation scheme
+      (@RtsFlags.GranFlags.DoAlwaysCreateThreads@)? By default yes, i.e.\ sparks are only
+      turned into threads when work is needed. Also note that sparks
+      can be discarded by the RTS (this is done in the case of an overflow
+      of the spark pool). Setting @RtsFlags.GranFlags.DoAlwaysCreateThreads@ to @True@ forces
+      the creation of threads at the next possibility (i.e.\ when new work
+      is demanded the next time).
+\item Should data be fetched closure-by-closure or in packets
+      (@RtsFlags.GranFlags.DoBulkFetching@)? The default strategy is a GRIP-like incremental
+      (i.e.\ closure-by-closure) strategy. This makes sense in a
+      low-latency setting but is bad in a high-latency system. Setting
+      @RtsFlags.GranFlags.DoBulkFetching@ to @True@ enables bulk (packet) fetching. Other
+      parameters determine the size of the packets (@pack_buffer_size@) and the number of
+      thunks that should be put into one packet (@RtsFlags.GranFlags.ThunksToPack@).
+\item If there is no other possibility to find work, should runnable threads
+      be moved to an idle processor (@RtsFlags.GranFlags.DoThreadMigration@)? In any case, the
+      RTS tries to get sparks (either local or remote ones) first. Thread
+      migration is very expensive, since a whole TSO has to be transferred
+      and probably data locality becomes worse in the process. Note that
+      the closure which will be evaluated next by that TSO is not
+      transferred together with the TSO (that might block another thread).
+\item Should the RTS distinguish between sparks created by local nodes and
+      stolen sparks (@RtsFlags.GranFlags.PreferSparksOfLocalNodes@)? The idea is to improve
+      data locality by preferring sparks of local nodes (it is more likely
+      that the data for those sparks is already on the local processor).
+      However, such a distinction also imposes an overhead on the spark
+      queue management, and typically a large number of sparks are
+      generated during execution. By default this variable is set to @False@.
+\item Should the RTS use granularity control mechanisms? The idea of a
+      granularity control mechanism is to make use of granularity
+      information provided via annotation of the @par@ construct in order
+      to prefer bigger threads when either turning a spark into a thread or
+      when choosing the next thread to schedule. Currently, three such
+      mechanisms are implemented:
+      \begin{itemize}
+      \item Cut-off: The granularity information is interpreted as a
+            priority. If a threshold priority is given to the RTS, then
+            only those sparks with a higher priority than the threshold
+            are actually created. Other sparks are immediately discarded.
+            This is similar to a usual cut-off mechanism often used in
+            parallel programs, where parallelism is only created if the
+            input data is large enough. With this option, the choice is
+            hidden in the RTS and only the threshold value has to be
+            provided as a parameter to the runtime system.
+      \item Priority Sparking: This mechanism keeps priorities for sparks
+            and chooses the spark with the highest priority when turning
+            a spark into a thread. After that the priority information is
+            discarded. The overhead of this mechanism comes from
+            maintaining a sorted spark queue.
+      \item Priority Scheduling: This mechanism keeps the granularity
+            information for threads, too. Thus, on each reschedule the
+            largest thread is chosen. This mechanism has a higher
+            overhead, as the thread queue is sorted, too.
+ \end{itemize} +\end{itemize} +*/ + +//@node Initialisation, Global Address Operations, Constants and Variables, GranSim specific code +//@subsection Initialisation + +void +init_gr_stats (void) { + memset(&globalGranStats, '\0', sizeof(GlobalGranStats)); +#if 0 + /* event stats */ + globalGranStats.noOfEvents = 0; + for (i=0; i not unique + else + unique = rtsTrue; // found 1st instance + ASSERT(unique); // otherwise returned from within loop + return (unique); +} + +//@cindex any_idle +static inline rtsBool +any_idle(void) { /* any (map (\ i -> procStatus[i] == Idle)) [0,..,MAX_PROC] */ + PEs i; + rtsBool any_idle; + for(i=0, any_idle=rtsFalse; + !any_idle && ievttype]++; + } + + entry = EventHd; + + IF_GRAN_DEBUG(event_trace, + print_event(entry)); + + EventHd = EventHd->next; + return(entry); +} + +/* When getting the time of the next event we ignore CONTINUETHREAD events: + we don't want to be interrupted before the end of the current time slice + unless there is something important to handle. +*/ +//@cindex get_time_of_next_event +rtsTime +get_time_of_next_event(void) +{ + rtsEventQ event = EventHd; + + while (event != NULL && event->evttype==ContinueThread) { + event = event->next; + } + if(event == NULL) + return ((rtsTime) 0); + else + return (event->time); +} + +/* ToDo: replace malloc/free with a free list */ +//@cindex insert_event +void +insert_event(newentry) +rtsEvent *newentry; +{ + rtsEventType evttype = newentry->evttype; + rtsEvent *event, **prev; + + /* if(evttype >= CONTINUETHREAD1) evttype = CONTINUETHREAD; */ + + /* Search the queue and insert at the right point: + FINDWORK before everything, CONTINUETHREAD after everything. + + This ensures that we find any available work after all threads have + executed the current cycle. This level of detail would normally be + irrelevant, but matters for ridiculously low latencies... + */ + + /* Changed the ordering: Now FINDWORK comes after everything but + CONTINUETHREAD. This makes sure that a MOVESPARK comes before a + FINDWORK. This is important when a GranSimSparkAt happens and + DoAlwaysCreateThreads is turned on. Also important if a GC occurs + when trying to build a new thread (see much_spark) -- HWL 02/96 */ + + if(EventHd == NULL) + EventHd = newentry; + else { + for (event = EventHd, prev=(rtsEvent**)&EventHd; + event != NULL; + prev = (rtsEvent**)&(event->next), event = event->next) { + switch (evttype) { + case FindWork: if ( event->time < newentry->time || + ( (event->time == newentry->time) && + (event->evttype != ContinueThread) ) ) + continue; + else + break; + case ContinueThread: if ( event->time <= newentry->time ) + continue; + else + break; + default: if ( event->time < newentry->time || + ((event->time == newentry->time) && + (event->evttype == newentry->evttype)) ) + continue; + else + break; + } + /* Insert newentry here (i.e. 
before event) */ + *prev = newentry; + newentry->next = event; + break; + } + if (event == NULL) + *prev = newentry; + } +} + +//@cindex new_event +void +new_event(proc,creator,time,evttype,tso,node,spark) +PEs proc, creator; +rtsTime time; +rtsEventType evttype; +StgTSO *tso; +StgClosure *node; +rtsSpark *spark; +{ + rtsEvent *newentry = (rtsEvent *) stgMallocBytes(sizeof(rtsEvent), "new_event"); + + newentry->proc = proc; + newentry->creator = creator; + newentry->time = time; + newentry->evttype = evttype; + newentry->tso = tso; + newentry->node = node; + newentry->spark = spark; + newentry->gc_info = 0; + newentry->next = NULL; + + insert_event(newentry); + + IF_DEBUG(gran, + fprintf(stderr, "GRAN: new_event: \n"); + print_event(newentry)) +} + +//@cindex prepend_event +void +prepend_event(event) /* put event at beginning of EventQueue */ +rtsEvent *event; +{ /* only used for GC! */ + event->next = EventHd; + EventHd = event; +} + +//@cindex grab_event +rtsEventQ +grab_event(void) /* undo prepend_event i.e. get the event */ +{ /* at the head of EventQ but don't free anything */ + rtsEventQ event = EventHd; + + if (EventHd == NULL) { + barf("No next event (in grab_event). This may be caused by a circular data dependency in the program."); + } + + EventHd = EventHd->next; + return (event); +} + +//@cindex traverse_eventq_for_gc +void +traverse_eventq_for_gc(void) +{ + rtsEventQ event = EventHd; + StgWord bufsize; + StgClosure *closurep; + StgTSO *tsop; + StgPtr buffer, bufptr; + PEs proc, creator; + + /* Traverse eventq and replace every FETCHREPLY by a FETCHNODE for the + orig closure (root of packed graph). This means that a graph, which is + between processors at the time of GC is fetched again at the time when + it would have arrived, had there been no GC. Slightly inaccurate but + safe for GC. + This is only needed for GUM style fetchng. 
-- HWL */
+  if (!RtsFlags.GranFlags.DoBulkFetching)
+    return;
+
+  for(event = EventHd; event!=NULL; event=event->next) {
+    if (event->evttype==FetchReply) {
+      buffer = stgCast(StgPtr,event->node);
+      ASSERT(buffer[PACK_FLAG_LOCN]==MAGIC_PACK_FLAG);  /* It's a pack buffer */
+      bufsize = buffer[PACK_SIZE_LOCN];
+      closurep = stgCast(StgClosure*,buffer[PACK_HDR_SIZE]);
+      tsop = stgCast(StgTSO*,buffer[PACK_TSO_LOCN]);
+      proc = event->proc;
+      creator = event->creator;               /* similar to unpacking */
+      for (bufptr=buffer+PACK_HDR_SIZE;
+           bufptr<(buffer+bufsize);
+           bufptr++) {
+        // if ( (INFO_TYPE(INFO_PTR(*bufptr)) == INFO_SPEC_RBH_TYPE) ||
+        //      (INFO_TYPE(INFO_PTR(*bufptr)) == INFO_GEN_RBH_TYPE) ) {
+        if ( GET_INFO(stgCast(StgClosure*,bufptr)) ) {
+          convertFromRBH(stgCast(StgClosure *,bufptr));
+        }
+      }
+      free(buffer);
+      event->evttype = FetchNode;
+      event->proc = creator;
+      event->creator = proc;
+      event->node = closurep;
+      event->tso = tsop;
+      event->gc_info = 0;
+    }
+  }
+}
+
+void
+markEventQueue(void)
+{
+  StgClosure *MarkRoot(StgClosure *root); // prototype
+
+  rtsEventQ event = EventHd;
+  nat len;
+
+  /* iterate over eventq and register relevant fields in event as roots */
+  for(event = EventHd, len = 0; event!=NULL; event=event->next, len++) {
+    switch (event->evttype) {
+      case ContinueThread:
+        event->tso = (StgTSO *)MarkRoot((StgClosure *)event->tso);
+        break;
+      case StartThread:
+        event->tso = (StgTSO *)MarkRoot((StgClosure *)event->tso);
+        event->node = (StgClosure *)MarkRoot((StgClosure *)event->node);
+        break;
+      case ResumeThread:
+        event->tso = (StgTSO *)MarkRoot((StgClosure *)event->tso);
+        event->node = (StgClosure *)MarkRoot((StgClosure *)event->node);
+        break;
+      case MoveSpark:
+        event->spark->node = (StgClosure *)MarkRoot((StgClosure *)event->spark->node);
+        break;
+      case MoveThread:
+        event->tso = (StgTSO *)MarkRoot((StgClosure *)event->tso);
+        break;
+      case FindWork:
+        break;
+      case FetchNode:
+        event->tso = (StgTSO *)MarkRoot((StgClosure *)event->tso);
+        event->node = (StgClosure *)MarkRoot((StgClosure *)event->node);
+        break;
+      case FetchReply:
+        event->tso = (StgTSO *)MarkRoot((StgClosure *)event->tso);
+        if (RtsFlags.GranFlags.DoBulkFetching)
+          // ToDo: traverse_eventq_for_gc if GUM-Fetching!!! HWL
+          belch("ghuH: packets in BulkFetching not marked as roots; may be fatal");
+        else
+          event->node = (StgClosure *)MarkRoot((StgClosure *)event->node);
+        break;
+      case GlobalBlock:
+        event->tso = (StgTSO *)MarkRoot((StgClosure *)event->tso);
+        event->node = (StgClosure *)MarkRoot((StgClosure *)event->node);
+        break;
+      case UnblockThread:
+        event->tso = (StgTSO *)MarkRoot((StgClosure *)event->tso);
+        event->node = (StgClosure *)MarkRoot((StgClosure *)event->node);
+        break;
+      default:
+        barf("markEventQueue: trying to mark unknown event @ %p", event);
+    }}
+  IF_DEBUG(gc,
+           belch("GC: markEventQueue: %d events in queue", len));
+}
+
+/*
+   Prune all ContinueThread events related to tso or node in the eventq.
+   Currently used if a thread leaves STG land with ThreadBlocked status,
+   i.e. it blocked on a closure and has been put on its blocking queue.  It
+   will be reawakened via a call to awaken_blocked_queue. Until then no
+   event affecting this tso should appear in the eventq.  A bit of a hack,
+   because ideally we shouldn't generate such spurious ContinueThread events
+   in the first place.
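+
+   Given the assertions in the body below, a typical (hypothetical) call
+   site passes a valid TSO and no node:
+
+     prune_eventq(tso, (StgClosure*)NULL);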
+*/ +//@cindex prune_eventq +void +prune_eventq(tso, node) +StgTSO *tso; +StgClosure *node; +{ rtsEventQ prev = (rtsEventQ)NULL, event = EventHd; + + /* node unused for now */ + ASSERT(node==NULL); + /* tso must be valid, then */ + ASSERT(tso!=END_TSO_QUEUE); + while (event != NULL) { + if (event->evttype==ContinueThread && + (event->tso==tso)) { + IF_GRAN_DEBUG(event_trace, // ToDo: use another debug flag + belch("prune_eventq: pruning ContinueThread event for TSO %d (%p) on PE %d @ %lx (%p)", + event->tso->id, event->tso, event->proc, event->time, event)); + if (prev==(rtsEventQ)NULL) { // beginning of eventq + EventHd = event->next; + free(event); + event = EventHd; + } else { + prev->next = event->next; + free(event); + event = prev->next; + } + } else { // no pruning necessary; go to next event + prev = event; + event = event->next; + } + } +} + +//@cindex print_event +void +print_event(event) +rtsEvent *event; +{ + char str_tso[16], str_node[16]; + StgThreadID tso_id; + + if (event->tso==END_TSO_QUEUE) { + strcpy(str_tso, "______"); + tso_id = 0; + } else { + sprintf(str_tso, "%p", event->tso); + tso_id = (event->tso==NULL) ? 0 : event->tso->id; + } + if (event->node==(StgClosure*)NULL) { + strcpy(str_node, "______"); + } else { + sprintf(str_node, "%p", event->node); + } + // HWL: shouldn't be necessary; ToDo: nuke + //str_tso[6]='\0'; + //str_node[6]='\0'; + + if (event==NULL) + fprintf(stderr,"Evt: NIL\n"); + else + fprintf(stderr, "Evt: %s (%u), PE %u [%u], Time %lu, TSO %d (%s), Node %s\n", //"Evt: %s (%u), PE %u [%u], Time %u, TSO %s (%#l), Node %s\n", + event_names[event->evttype], event->evttype, + event->proc, event->creator, event->time, + tso_id, str_tso, str_node + /*, event->spark, event->next */ ); + +} + +//@cindex print_eventq +void +print_eventq(hd) +rtsEvent *hd; +{ + rtsEvent *x; + + fprintf(stderr,"Event Queue with root at %p:\n", hd); + for (x=hd; x!=NULL; x=x->next) { + print_event(x); + } +} + +/* + Spark queue functions are now all in Sparks.c!! +*/ +//@node Scheduling functions, Thread Queue routines, Spark queue functions, GranSim specific code +//@subsection Scheduling functions + +/* + These functions are variants of thread initialisation and therefore + related to initThread and friends in Schedule.c. However, they are + specific to a GranSim setup in storing more info in the TSO's statistics + buffer and sorting the thread queues etc. +*/ + +/* + A large portion of startThread deals with maintaining a sorted thread + queue, which is needed for the Priority Sparking option. Without that + complication the code boils down to FIFO handling. +*/ +//@cindex insertThread +void +insertThread(tso, proc) +StgTSO* tso; +PEs proc; +{ + StgTSO *prev = NULL, *next = NULL; + nat count = 0; + rtsBool found = rtsFalse; + + ASSERT(CurrentProc==proc); + ASSERT(!is_on_queue(tso,proc)); + /* Idle proc: put the thread on the run queue + same for pri spark and basic version */ + if (run_queue_hds[proc] == END_TSO_QUEUE) + { + /* too strong! + ASSERT((CurrentProc==MainProc && + CurrentTime[MainProc]==0 && + procStatus[MainProc]==Idle) || + procStatus[proc]==Starting); + */ + run_queue_hds[proc] = run_queue_tls[proc] = tso; + + CurrentTime[proc] += RtsFlags.GranFlags.Costs.threadqueuetime; + + /* new_event of ContinueThread has been moved to do_the_startthread */ + + /* too strong! 
+ ASSERT(procStatus[proc]==Idle || + procStatus[proc]==Fishing || + procStatus[proc]==Starting); + procStatus[proc] = Busy; + */ + return; + } + + if (RtsFlags.GranFlags.Light) + GranSimLight_insertThread(tso, proc); + + /* Only for Pri Scheduling: find place where to insert tso into queue */ + if (RtsFlags.GranFlags.DoPriorityScheduling && tso->gran.pri!=0) + /* {add_to_spark_queue}vo' jInIHta'; Qu' wa'DIch yIleghQo' */ + for (prev = run_queue_hds[proc], next = run_queue_hds[proc]->link, count=0; + (next != END_TSO_QUEUE) && + !(found = tso->gran.pri >= next->gran.pri); + prev = next, next = next->link, count++) + { + ASSERT((prev!=(StgTSO*)NULL || next==run_queue_hds[proc]) && + (prev==(StgTSO*)NULL || prev->link==next)); + } + + ASSERT(!found || next != END_TSO_QUEUE); + ASSERT(procStatus[proc]!=Idle); + + if (found) { + /* found can only be rtsTrue if pri scheduling enabled */ + ASSERT(RtsFlags.GranFlags.DoPriorityScheduling); + if (RtsFlags.GranFlags.GranSimStats.Global) + globalGranStats.non_end_add_threads++; + /* Add tso to ThreadQueue between prev and next */ + tso->link = next; + if ( next == (StgTSO*)END_TSO_QUEUE ) { + run_queue_tl = tso; + } else { + /* no back link for TSO chain */ + } + + if ( prev == (StgTSO*)END_TSO_QUEUE ) { + /* Never add TSO as first elem of thread queue; the first */ + /* element should be the one that is currently running -- HWL */ + IF_DEBUG(gran, + belch("GRAN: Qagh: NewThread (w/ PriorityScheduling): Trying to add TSO %p (PRI=%d) as first elem of threadQ (%p) on proc %u (@ %u)\n", + tso, tso->gran.pri, run_queue_hd, proc, + CurrentTime[proc])); + } else { + prev->link = tso; + } + } else { /* !found */ /* or not pri sparking! */ + /* Add TSO to the end of the thread queue on that processor */ + run_queue_tls[proc]->link = tso; + run_queue_tls[proc] = tso; + } + ASSERT(RtsFlags.GranFlags.DoPriorityScheduling || count==0); + CurrentTime[proc] += count * RtsFlags.GranFlags.Costs.pri_sched_overhead + + RtsFlags.GranFlags.Costs.threadqueuetime; + + /* ToDo: check if this is still needed -- HWL + if (RtsFlags.GranFlags.DoThreadMigration) + ++SurplusThreads; + + if (RtsFlags.GranFlags.GranSimStats.Full && + !(( event_type == GR_START || event_type == GR_STARTQ) && + RtsFlags.GranFlags.labelling) ) + DumpRawGranEvent(proc, creator, event_type+1, tso, node, + tso->gran.sparkname, spark_queue_len(proc)); + */ + +# if defined(GRAN_CHECK) + /* Check if thread queue is sorted. Only for testing, really! HWL */ + if ( RtsFlags.GranFlags.DoPriorityScheduling && + (RtsFlags.GranFlags.Debug.sortedQ) ) { + rtsBool sorted = rtsTrue; + StgTSO *prev, *next; + + if (run_queue_hds[proc]==END_TSO_QUEUE || + run_queue_hds[proc]->link==END_TSO_QUEUE) { + /* just 1 elem => ok */ + } else { + /* Qu' wa'DIch yIleghQo' (ignore first elem)! */ + for (prev = run_queue_hds[proc]->link, next = prev->link; + (next != END_TSO_QUEUE) ; + prev = next, next = prev->link) { + ASSERT((prev!=(StgTSO*)NULL || next==run_queue_hds[proc]) && + (prev==(StgTSO*)NULL || prev->link==next)); + sorted = sorted && + (prev->gran.pri >= next->gran.pri); + } + } + if (!sorted) { + fprintf(stderr,"Qagh: THREADQ on PE %d is not sorted:\n", + CurrentProc); + G_THREADQ(run_queue_hd,0x1); + } + } +# endif +} + +/* + insertThread, which is only used for GranSim Light, is similar to + startThread in that it adds a TSO to a thread queue. However, it assumes + that the thread queue is sorted by local clocks and it inserts the TSO at + the right place in the queue. Don't create any event, just insert. 
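+
+  Condensed sketch of the insertion logic used below (illustrative only):
+
+    // walk the queue up to the first TSO with a later local clock
+    for (prev = NULL, next = run_queue_hds[proc];
+         next != END_TSO_QUEUE && tso->gran.clock >= next->gran.clock;
+         prev = next, next = next->link) ;
+    // then link tso in between prev and next (plain append if none is later)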
+*/ +//@cindex GranSimLight_insertThread +rtsBool +GranSimLight_insertThread(tso, proc) +StgTSO* tso; +PEs proc; +{ + StgTSO *prev, *next; + nat count = 0; + rtsBool found = rtsFalse; + + ASSERT(RtsFlags.GranFlags.Light); + + /* In GrAnSim-Light we always have an idle `virtual' proc. + The semantics of the one-and-only thread queue is different here: + all threads in the queue are running (each on its own virtual processor); + the queue is only needed internally in the simulator to interleave the + reductions of the different processors. + The one-and-only thread queue is sorted by the local clocks of the TSOs. + */ + ASSERT(run_queue_hds[proc] != END_TSO_QUEUE); + ASSERT(tso->link == END_TSO_QUEUE); + + /* If only one thread in queue so far we emit DESCHEDULE in debug mode */ + if (RtsFlags.GranFlags.GranSimStats.Full && + (RtsFlags.GranFlags.Debug.checkLight) && + (run_queue_hd->link == END_TSO_QUEUE)) { + DumpRawGranEvent(proc, proc, GR_DESCHEDULE, + run_queue_hds[proc], (StgClosure*)NULL, + tso->gran.sparkname, spark_queue_len(proc)); // ToDo: check spar_queue_len + // resched = rtsTrue; + } + + /* this routine should only be used in a GrAnSim Light setup */ + /* && CurrentProc must be 0 in GrAnSim Light setup */ + ASSERT(RtsFlags.GranFlags.Light && CurrentProc==0); + + /* Idle proc; same for pri spark and basic version */ + if (run_queue_hd==END_TSO_QUEUE) + { + run_queue_hd = run_queue_tl = tso; + /* MAKE_BUSY(CurrentProc); */ + return rtsTrue; + } + + for (prev = run_queue_hds[proc], next = run_queue_hds[proc]->link, count = 0; + (next != END_TSO_QUEUE) && + !(found = (tso->gran.clock < next->gran.clock)); + prev = next, next = next->link, count++) + { + ASSERT((prev!=(StgTSO*)NULL || next==run_queue_hds[proc]) && + (prev==(StgTSO*)NULL || prev->link==next)); + } + + /* found can only be rtsTrue if pri sparking enabled */ + if (found) { + /* Add tso to ThreadQueue between prev and next */ + tso->link = next; + if ( next == END_TSO_QUEUE ) { + run_queue_tls[proc] = tso; + } else { + /* no back link for TSO chain */ + } + + if ( prev == END_TSO_QUEUE ) { + run_queue_hds[proc] = tso; + } else { + prev->link = tso; + } + } else { /* !found */ /* or not pri sparking! */ + /* Add TSO to the end of the thread queue on that processor */ + run_queue_tls[proc]->link = tso; + run_queue_tls[proc] = tso; + } + + if ( prev == END_TSO_QUEUE ) { /* new head of queue */ + new_event(proc, proc, CurrentTime[proc], + ContinueThread, + tso, (StgClosure*)NULL, (rtsSpark*)NULL); + } + /* + if (RtsFlags.GranFlags.GranSimStats.Full && + !(( event_type == GR_START || event_type == GR_STARTQ) && + RtsFlags.GranFlags.labelling) ) + DumpRawGranEvent(proc, creator, gr_evttype, tso, node, + tso->gran.sparkname, spark_queue_len(proc)); + */ + return rtsTrue; +} + +/* + endThread is responsible for general clean-up after the thread tso has + finished. This includes emitting statistics into the profile etc. 
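+
+  A sketch of the expected call site in the scheduler (assuming the
+  scheduler inspects whatNext after returning from STG land):
+
+    if (CurrentTSO->whatNext == ThreadComplete)
+      endThread(CurrentTSO, CurrentProc);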
+*/ +void +endThread(StgTSO *tso, PEs proc) +{ + ASSERT(procStatus[proc]==Busy); // coming straight out of STG land + ASSERT(tso->whatNext==ThreadComplete); + // ToDo: prune ContinueThreads for this TSO from event queue + DumpEndEvent(proc, tso, rtsFalse /* not mandatory */); + + /* if this was the last thread on this PE then make it Idle */ + if (run_queue_hds[proc]==END_TSO_QUEUE) { + procStatus[CurrentProc] = Idle; + } +} + +//@node Thread Queue routines, GranSim functions, Scheduling functions, GranSim specific code +//@subsection Thread Queue routines + +/* + Check whether given tso resides on the run queue of the current processor. + Only used for debugging. +*/ + +//@cindex is_on_queue +rtsBool +is_on_queue (StgTSO *tso, PEs proc) +{ + StgTSO *t; + rtsBool found; + + for (t=run_queue_hds[proc], found=rtsFalse; + t!=END_TSO_QUEUE && !(found = t==tso); + t=t->link) + /* nothing */ ; + + return found; +} + +/* This routine is only used for keeping a statistics of thread queue + lengths to evaluate the impact of priority scheduling. -- HWL + {spark_queue_len}vo' jInIHta' +*/ +//@cindex thread_queue_len +nat +thread_queue_len(PEs proc) +{ + StgTSO *prev, *next; + nat len; + + for (len = 0, prev = END_TSO_QUEUE, next = run_queue_hds[proc]; + next != END_TSO_QUEUE; + len++, prev = next, next = prev->link) + {} + + return (len); +} + +//@node GranSim functions, GranSimLight routines, Thread Queue routines, GranSim specific code +//@subsection GranSim functions + +/* ----------------------------------------------------------------- */ +/* The main event handling functions; called from Schedule.c (schedule) */ +/* ----------------------------------------------------------------- */ + +//@cindex do_the_globalblock + +void +do_the_globalblock(rtsEvent* event) +{ + PEs proc = event->proc; /* proc that requested node */ + StgTSO *tso = event->tso; /* tso that requested node */ + StgClosure *node = event->node; /* requested, remote node */ + + IF_DEBUG(gran, fprintf(stderr, "GRAN: doing the GlobalBlock\n")); + /* There should be no GLOBALBLOCKs in GrAnSim Light setup */ + ASSERT(!RtsFlags.GranFlags.Light); + /* GlobalBlock events only valid with GUM fetching */ + ASSERT(RtsFlags.GranFlags.DoBulkFetching); + + IF_GRAN_DEBUG(bq, // globalBlock, + if (IS_LOCAL_TO(PROCS(node),proc)) { + belch("## Qagh: GlobalBlock: Blocking TSO %d (%p) on LOCAL node %p (PE %d).\n", + tso->id, tso, node, proc); + }); + + /* CurrentTime[CurrentProc] += RtsFlags.GranFlags.Costs.munpacktime; */ + if ( blockFetch(tso,proc,node) != 0 ) + return; /* node has become local by now */ + +#if 0 + ToDo: check whether anything has to be done at all after blockFetch -- HWL + + if (!RtsFlags.GranFlags.DoAsyncFetch) { /* head of queue is next thread */ + StgTSO* tso = run_queue_hds[proc]; /* awaken next thread */ + if (tso != (StgTSO*)NULL) { + new_event(proc, proc, CurrentTime[proc], + ContinueThread, + tso, (StgClosure*)NULL, (rtsSpark*)NULL); + CurrentTime[proc] += RtsFlags.GranFlags.Costs.threadcontextswitchtime; + if (RtsFlags.GranFlags.GranSimStats.Full) + DumpRawGranEvent(proc, CurrentProc, GR_SCHEDULE, tso, + (StgClosure*)NULL, tso->gran.sparkname, spark_queue_len(CurrentProc)); // ToDo: check sparkname and spar_queue_len + procStatus[proc] = Busy; /* might have been fetching */ + } else { + procStatus[proc] = Idle; /* no work on proc now */ + } + } else { /* RtsFlags.GranFlags.DoAsyncFetch i.e. 
block-on-fetch */ + /* other thread is already running */ + /* 'oH 'utbe' 'e' vIHar ; I think that's not needed -- HWL + new_event(proc,proc,CurrentTime[proc], + CONTINUETHREAD,EVENT_TSO(event), + (RtsFlags.GranFlags.DoBulkFetching ? closure : + EVENT_NODE(event)),NULL); + */ + } +#endif +} + +//@cindex do_the_unblock + +void +do_the_unblock(rtsEvent* event) +{ + PEs proc = event->proc, /* proc that requested node */ + creator = event->creator; /* proc that requested node */ + StgTSO* tso = event->tso; /* tso that requested node */ + StgClosure* node = event->node; /* requested, remote node */ + + IF_DEBUG(gran, fprintf(stderr, "GRAN: doing the UnBlock\n")) + /* There should be no UNBLOCKs in GrAnSim Light setup */ + ASSERT(!RtsFlags.GranFlags.Light); + /* UnblockThread means either FetchReply has arrived or + a blocking queue has been awakened; + ToDo: check with assertions + ASSERT(procStatus[proc]==Fetching || IS_BLACK_HOLE(event->node)); + */ + if (!RtsFlags.GranFlags.DoAsyncFetch) { /* block-on-fetch */ + /* We count block-on-fetch as normal block time */ + tso->gran.blocktime += CurrentTime[proc] - tso->gran.blockedat; + /* Dumping now done when processing the event + No costs for contextswitch or thread queueing in this case + if (RtsFlags.GranFlags.GranSimStats.Full) + DumpRawGranEvent(proc, CurrentProc, GR_RESUME, tso, + (StgClosure*)NULL, tso->gran.sparkname, spark_queue_len(CurrentProc)); + */ + /* Maybe do this in FetchReply already + if (procStatus[proc]==Fetching) + procStatus[proc] = Busy; + */ + /* + new_event(proc, proc, CurrentTime[proc], + ContinueThread, + tso, node, (rtsSpark*)NULL); + */ + } else { + /* Asynchr comm causes additional costs here: */ + /* Bring the TSO from the blocked queue into the threadq */ + } + /* In all cases, the UnblockThread causes a ResumeThread to be scheduled */ + new_event(proc, proc, + CurrentTime[proc]+RtsFlags.GranFlags.Costs.threadqueuetime, + ResumeThread, + tso, node, (rtsSpark*)NULL); +} + +//@cindex do_the_fetchnode + +void +do_the_fetchnode(rtsEvent* event) +{ + PEs proc = event->proc, /* proc that holds the requested node */ + creator = event->creator; /* proc that requested node */ + StgTSO* tso = event->tso; + StgClosure* node = event->node; /* requested, remote node */ + rtsFetchReturnCode rc; + + ASSERT(CurrentProc==proc); + /* There should be no FETCHNODEs in GrAnSim Light setup */ + ASSERT(!RtsFlags.GranFlags.Light); + + IF_DEBUG(gran, fprintf(stderr, "GRAN: doing the FetchNode\n")); + + CurrentTime[proc] += RtsFlags.GranFlags.Costs.munpacktime; + + /* ToDo: check whether this is the right place for dumping the event */ + if (RtsFlags.GranFlags.GranSimStats.Full) + DumpRawGranEvent(creator, proc, GR_FETCH, tso, node, (StgInt)0, 0); + + do { + rc = handleFetchRequest(node, proc, creator, tso); + if (rc == OutOfHeap) { /* trigger GC */ +# if defined(GRAN_CHECK) && defined(GRAN) + if (RtsFlags.GcFlags.giveStats) + fprintf(RtsFlags.GcFlags.statsFile,"***** veQ boSwI' PackNearbyGraph(node %p, tso %p (%d))\n", + node, tso, tso->id); +# endif + prepend_event(event); + GarbageCollect(GetRoots); + // HWL: ToDo: check whether a ContinueThread has to be issued + // HWL old: ReallyPerformThreadGC(PACK_HEAP_REQUIRED, rtsFalse); +# if defined(GRAN_CHECK) && defined(GRAN) + if (RtsFlags.GcFlags.giveStats) { + fprintf(RtsFlags.GcFlags.statsFile,"***** SAVE_Hp=%p, SAVE_HpLim=%p, PACK_HEAP_REQUIRED=%d\n", + Hp, HpLim, 0) ; // PACK_HEAP_REQUIRED); ??? + fprintf(stderr,"***** No. 
of packets so far: %d (total size: %d)\n", + globalGranStats.tot_packets, globalGranStats.tot_packet_size); + } +# endif + event = grab_event(); + // Hp -= PACK_HEAP_REQUIRED; // ??? + + /* GC knows that events are special and follows the pointer i.e. */ + /* events are valid even if they moved. An EXIT is triggered */ + /* if there is not enough heap after GC. */ + } + } while (rc == OutOfHeap); +} + +//@cindex do_the_fetchreply +void +do_the_fetchreply(rtsEvent* event) +{ + PEs proc = event->proc, /* proc that requested node */ + creator = event->creator; /* proc that holds the requested node */ + StgTSO* tso = event->tso; + StgClosure* node = event->node; /* requested, remote node */ + StgClosure* closure=(StgClosure*)NULL; + + ASSERT(CurrentProc==proc); + ASSERT(RtsFlags.GranFlags.DoAsyncFetch || procStatus[proc]==Fetching); + + IF_DEBUG(gran, fprintf(stderr, "GRAN: doing the FetchReply\n")); + /* There should be no FETCHREPLYs in GrAnSim Light setup */ + ASSERT(!RtsFlags.GranFlags.Light); + + /* assign message unpack costs *before* dumping the event */ + CurrentTime[proc] += RtsFlags.GranFlags.Costs.munpacktime; + + /* ToDo: check whether this is the right place for dumping the event */ + if (RtsFlags.GranFlags.GranSimStats.Full) + DumpRawGranEvent(proc, creator, GR_REPLY, tso, node, + tso->gran.sparkname, spark_queue_len(proc)); + + /* THIS SHOULD NEVER HAPPEN + If tso is in the BQ of node this means that it actually entered the + remote closure, due to a missing GranSimFetch at the beginning of the + entry code; therefore, this is actually a faked fetch, triggered from + within GranSimBlock; + since tso is both in the EVQ and the BQ for node, we have to take it out + of the BQ first before we can handle the FetchReply; + ToDo: special cases in awaken_blocked_queue, since the BQ magically moved. + */ + if (tso->blocked_on!=(StgClosure*)NULL) { + IF_GRAN_DEBUG(bq, + belch("## ghuH: TSO %d (%p) in FetchReply is blocked on node %p (shouldn't happen AFAIK)", + tso->id, tso, node)); + // unlink_from_bq(tso, node); + } + + if (RtsFlags.GranFlags.DoBulkFetching) { /* bulk (packet) fetching */ + rtsPackBuffer *buffer = (rtsPackBuffer*)node; + nat size = buffer->size; + + /* NB: Fetch misses can't occur with GUM fetching, as */ + /* updatable closure are turned into RBHs and therefore locked */ + /* for other processors that try to grab them. */ + + closure = UnpackGraph(buffer); + CurrentTime[proc] += size * RtsFlags.GranFlags.Costs.munpacktime; + } else // incremental fetching + /* Copy or move node to CurrentProc */ + if (fetchNode(node, creator, proc)) { + /* Fetch has failed i.e. node has been grabbed by another PE */ + PEs p = where_is(node); + rtsTime fetchtime; + + if (RtsFlags.GranFlags.GranSimStats.Global) + globalGranStats.fetch_misses++; + + IF_GRAN_DEBUG(thunkStealing, + belch("== Qu'vatlh! fetch miss @ %u: node %p is at proc %u (rather than proc %u)\n", + CurrentTime[proc],node,p,creator)); + + CurrentTime[CurrentProc] += RtsFlags.GranFlags.Costs.mpacktime; + + /* Count fetch again !? 
*/ + ++(tso->gran.fetchcount); + tso->gran.fetchtime += RtsFlags.GranFlags.Costs.fetchtime; + + fetchtime = stg_max(CurrentTime[CurrentProc],CurrentTime[p]) + + RtsFlags.GranFlags.Costs.latency; + + /* Chase the grabbed node */ + new_event(p, proc, fetchtime, + FetchNode, + tso, node, (rtsSpark*)NULL); + +# if 0 && defined(GRAN_CHECK) && defined(GRAN) /* Just for testing */ + IF_GRAN_DEBUG(blockOnFetch, + BlockedOnFetch[CurrentProc] = tso;) /*-rtsTrue;-*/ + + IF_GRAN_DEBUG(blockOnFetch_sanity, + tso->type |= FETCH_MASK_TSO;) +# endif + + CurrentTime[proc] += RtsFlags.GranFlags.Costs.mtidytime; + + return; /* NB: no REPLy has been processed; tso still sleeping */ + } + + /* -- Qapla'! Fetch has been successful; node is here, now */ + ++(event->tso->gran.fetchcount); + event->tso->gran.fetchtime += RtsFlags.GranFlags.Costs.fetchtime; + + /* this is now done at the beginning of this routine + if (RtsFlags.GranFlags.GranSimStats.Full) + DumpRawGranEvent(proc,event->creator, GR_REPLY, event->tso, + (RtsFlags.GranFlags.DoBulkFetching ? + closure : + event->node), + tso->gran.sparkname, spark_queue_len(proc)); + */ + + --OutstandingFetches[proc]; + ASSERT(OutstandingFetches[proc] >= 0); + new_event(proc, proc, CurrentTime[proc], + ResumeThread, + event->tso, (RtsFlags.GranFlags.DoBulkFetching ? + closure : + event->node), + (rtsSpark*)NULL); +} + +//@cindex do_the_movethread + +void +do_the_movethread(rtsEvent* event) { + PEs proc = event->proc, /* proc that requested node */ + creator = event->creator; /* proc that holds the requested node */ + StgTSO* tso = event->tso; + + IF_DEBUG(gran, fprintf(stderr, "GRAN: doing the MoveThread\n")); + + ASSERT(CurrentProc==proc); + /* There should be no MOVETHREADs in GrAnSim Light setup */ + ASSERT(!RtsFlags.GranFlags.Light); + /* MOVETHREAD events should never occur without -bM */ + ASSERT(RtsFlags.GranFlags.DoThreadMigration); + /* Bitmask of moved thread should be 0 */ + ASSERT(PROCS(tso)==0); + ASSERT(procStatus[proc] == Fishing || + RtsFlags.GranFlags.DoAsyncFetch); + ASSERT(OutstandingFishes[proc]>0); + + /* ToDo: exact costs for unpacking the whole TSO */ + CurrentTime[proc] += 5l * RtsFlags.GranFlags.Costs.munpacktime; + + /* ToDo: check whether this is the right place for dumping the event */ + if (RtsFlags.GranFlags.GranSimStats.Full) + DumpRawGranEvent(proc, creator, + GR_STOLEN, tso, (StgClosure*)NULL, (StgInt)0, 0); + + // ToDo: check cost functions + --OutstandingFishes[proc]; + SET_GRAN_HDR(tso, ThisPE); // adjust the bitmask for the TSO + insertThread(tso, proc); + + if (procStatus[proc]==Fishing) + procStatus[proc] = Idle; + + if (RtsFlags.GranFlags.GranSimStats.Global) + globalGranStats.tot_TSOs_migrated++; +} + +//@cindex do_the_movespark + +void +do_the_movespark(rtsEvent* event) { + PEs proc = event->proc, /* proc that requested spark */ + creator = event->creator; /* proc that holds the requested spark */ + StgTSO* tso = event->tso; + rtsSparkQ spark = event->spark; + + IF_DEBUG(gran, fprintf(stderr, "GRAN: doing the MoveSpark\n")) + + ASSERT(CurrentProc==proc); + ASSERT(spark!=NULL); + ASSERT(procStatus[proc] == Fishing || + RtsFlags.GranFlags.DoAsyncFetch); + ASSERT(OutstandingFishes[proc]>0); + + CurrentTime[proc] += RtsFlags.GranFlags.Costs.munpacktime; + + /* record movement of spark only if spark profiling is turned on */ + if (RtsFlags.GranFlags.GranSimStats.Sparks) + DumpRawGranEvent(proc, creator, + SP_ACQUIRED, + tso, spark->node, spark->name, spark_queue_len(proc)); + + /* global statistics */ + if ( 
RtsFlags.GranFlags.GranSimStats.Global &&
+      !closure_SHOULD_SPARK(spark->node))
+    globalGranStats.withered_sparks++;
+    /* Not adding the spark to the spark queue would be the right */
+    /* thing here, but it also would be cheating, as this info can't be */
+    /* available in a real system. -- HWL */
+
+  --OutstandingFishes[proc];
+
+  add_to_spark_queue(spark);
+
+  IF_GRAN_DEBUG(randomSteal, // ToDo: spark-distribution flag
+                print_sparkq_stats());
+
+  /* Should we treat stolen sparks specially? Currently, we don't. */
+
+  if (procStatus[proc]==Fishing)
+    procStatus[proc] = Idle;
+
+  /* add_to_spark_queue will increase the time of the current proc. */
+  /*
+    If proc was fishing, it is Idle now with the new spark in its spark
+    pool. This means that the next time handleIdlePEs is called, a local
+    FindWork will be created on this PE to turn the spark into a thread. Of
+    course another PE might steal the spark in the meantime (that's why we
+    are using events rather than inlining all the operations in the first
+    place). */
+}
+
+/*
+  In the Constellation class version of GranSim the semantics of StartThread
+  events have changed. Now, StartThread has to perform 3 basic operations:
+   - create a new thread (previously this was done in ActivateSpark);
+   - insert the thread into the run queue of the current processor;
+   - generate a new event for actually running the new thread.
+  Note that insertThread is called via createThread.
+*/
+
+//@cindex do_the_startthread
+
+void
+do_the_startthread(rtsEvent *event)
+{
+  PEs proc          = event->proc;        /* proc that requested node */
+  StgTSO *tso       = event->tso;         /* tso that requested node */
+  StgClosure  *node = event->node;        /* requested, remote node */
+  rtsSpark *spark   = event->spark;
+  GranEventType gr_evttype;
+
+  ASSERT(CurrentProc==proc);
+  ASSERT(!RtsFlags.GranFlags.Light || CurrentProc==0);
+  ASSERT(event->evttype == ResumeThread || event->evttype == StartThread);
+  /* if this was called via StartThread: */
+  ASSERT(event->evttype!=StartThread || tso == END_TSO_QUEUE); // not yet created
+  // ToDo: check: ASSERT(event->evttype!=StartThread || procStatus[proc]==Starting);
+  /* if this was called via ResumeThread: */
+  ASSERT(event->evttype!=ResumeThread ||
+         RtsFlags.GranFlags.DoAsyncFetch ||!is_on_queue(tso,proc));
+
+  /* startThread may have been called from the main event handler upon
+     finding either a ResumeThread or a StartThread event; set the
+     gr_evttype (needed for writing to .gr file) accordingly */
+  // gr_evttype = (event->evttype == ResumeThread) ? GR_RESUME : GR_START;
+
+  if ( event->evttype == StartThread ) {
+    GranEventType gr_evttype = (run_queue_hds[proc]==END_TSO_QUEUE) ?
+                                 GR_START : GR_STARTQ;
+
+    tso = createThread(BLOCK_SIZE_W, spark->gran_info);// implicit insertThread!
+    pushClosure(tso, node);
+
+    // ToDo: fwd info on local/global spark to thread -- HWL
+    // tso->gran.exported =  spark->exported;
+    // tso->gran.locked   = !spark->global;
+    tso->gran.sparkname = spark->name;
+
+    ASSERT(CurrentProc==proc);
+    if (RtsFlags.GranFlags.GranSimStats.Full)
+      DumpGranEvent(gr_evttype,tso);
+
+    CurrentTime[proc] += RtsFlags.GranFlags.Costs.threadcreatetime;
+  } else { // event->evttype == ResumeThread
+    GranEventType gr_evttype = (run_queue_hds[proc]==END_TSO_QUEUE) ?
+ GR_RESUME : GR_RESUMEQ; + + insertThread(tso, proc); + + ASSERT(CurrentProc==proc); + if (RtsFlags.GranFlags.GranSimStats.Full) + DumpGranEvent(gr_evttype,tso); + } + + ASSERT(run_queue_hds[proc]!=END_TSO_QUEUE); // non-empty run queue + procStatus[proc] = Busy; + /* make sure that this thread is actually run */ + new_event(proc, proc, + CurrentTime[proc], + ContinueThread, + tso, node, (rtsSpark*)NULL); + + /* A wee bit of statistics gathering */ + if (RtsFlags.GranFlags.GranSimStats.Global) { + globalGranStats.tot_add_threads++; + globalGranStats.tot_tq_len += thread_queue_len(CurrentProc); + } + +} + +//@cindex do_the_findwork +void +do_the_findwork(rtsEvent* event) +{ + PEs proc = event->proc, /* proc to search for work */ + creator = event->creator; /* proc that requested work */ + rtsSparkQ spark = event->spark; + /* ToDo: check that this size is safe -- HWL */ + nat req_heap = sizeofW(StgTSO) + MIN_STACK_WORDS; + // add this? -- HWL:RtsFlags.ConcFlags.stkChunkSize; + + IF_DEBUG(gran, fprintf(stderr, "GRAN: doing the Findwork\n")); + + /* If GUM style fishing is enabled, the contents of the spark field says + what to steal (spark(1) or thread(2)); */ + ASSERT(!(RtsFlags.GranFlags.Fishing && event->spark==(rtsSpark*)0)); + + /* Make sure that we have enough heap for creating a new + thread. This is a conservative estimate of the required heap. + This eliminates special checks for GC around NewThread within + ActivateSpark. */ + + if (Hp + req_heap > HpLim ) { + IF_DEBUG(gc, + belch("GC: Doing GC from within Findwork handling (that's bloody dangerous if you ask me)");) + GarbageCollect(GetRoots); + // ReallyPerformThreadGC(req_heap, rtsFalse); old -- HWL + Hp -= req_heap; + if (procStatus[CurrentProc]==Sparking) + procStatus[CurrentProc]=Idle; + return; + } + + if ( RtsFlags.GranFlags.DoAlwaysCreateThreads || + RtsFlags.GranFlags.Fishing || + ((procStatus[proc]==Idle || procStatus[proc]==Sparking) && + (RtsFlags.GranFlags.FetchStrategy >= 2 || + OutstandingFetches[proc] == 0)) ) + { + rtsBool found; + rtsSparkQ prev, spark; + + /* ToDo: check */ + ASSERT(procStatus[proc]==Sparking || + RtsFlags.GranFlags.DoAlwaysCreateThreads || + RtsFlags.GranFlags.Fishing); + + /* SImmoHwI' yInej! Search spark queue! */ + /* gimme_spark (event, &found, &spark); */ + findLocalSpark(event, &found, &spark); + + if (!found) { /* pagh vumwI' */ + /* + If no spark has been found this can mean 2 things: + 1/ The FindWork was a fish (i.e. a message sent by another PE) and + the spark pool of the receiver is empty + --> the fish has to be forwarded to another PE + 2/ The FindWork was local to this PE (i.e. no communication; in this + case creator==proc) and the spark pool of the PE is not empty + contains only sparks of closures that should not be sparked + (note: if the spark pool were empty, handleIdlePEs wouldn't have + generated a FindWork in the first place) + --> the PE has to be made idle to trigger stealing sparks the next + time handleIdlePEs is performed + */ + + ASSERT(pending_sparks_hds[proc]==(rtsSpark*)NULL); + if (creator==proc) { + /* local FindWork */ + if (procStatus[proc]==Busy) { + belch("ghuH: PE %d in Busy state while processing local FindWork (spark pool is empty!) @ %lx", + proc, CurrentTime[proc]); + procStatus[proc] = Idle; + } + } else { + /* global FindWork i.e. 
a Fish */
+        ASSERT(RtsFlags.GranFlags.Fishing);
+        /* actually this generates another request from the originating PE */
+        ASSERT(OutstandingFishes[creator]>0);
+        OutstandingFishes[creator]--;
+        /* ToDo: assign costs for sending fish to proc not to creator */
+        stealSpark(creator);   /* might steal from same PE; ToDo: fix */
+        ASSERT(RtsFlags.GranFlags.maxFishes!=1 || procStatus[creator] == Fishing);
+        /* any assertions on state of proc possible here? */
+      }
+    } else {
+      /* DaH chu' Qu' yIchen! Now create new work! */
+      IF_GRAN_DEBUG(findWork,
+                    belch("+-  munching spark %p; creating thread for node %p",
+                          spark, spark->node));
+      activateSpark (event, spark);
+      ASSERT(spark != (rtsSpark*)NULL);
+      spark = delete_from_sparkq (spark, proc, rtsTrue);
+    }
+
+    IF_GRAN_DEBUG(findWork,
+                  belch("+- Contents of spark queues at the end of FindWork @ %lx",
+                        CurrentTime[proc]);
+                  print_sparkq_stats());
+
+    /* ToDo: check ; not valid if GC occurs in ActivateSpark */
+    ASSERT(!found ||
+            /* forward fish  or */
+            (proc!=creator ||
+            /* local spark  or */
+            (proc==creator && procStatus[proc]==Starting)) ||
+           //(!found && procStatus[proc]==Idle) ||
+           RtsFlags.GranFlags.DoAlwaysCreateThreads);
+  } else {
+    IF_GRAN_DEBUG(findWork,
+                  belch("+- RTS refuses to findWork on PE %d @ %lx",
+                        proc, CurrentTime[proc]);
+                  belch("  procStatus[%d]=%s, fetch strategy=%d, outstanding fetches[%d]=%d",
+                        proc, proc_status_names[procStatus[proc]],
+                        RtsFlags.GranFlags.FetchStrategy,
+                        proc, OutstandingFetches[proc]));
+  }
+}
+
+//@node GranSimLight routines, Code for Fetching Nodes, GranSim functions, GranSim specific code
+//@subsection GranSimLight routines
+
+/*
+   This code is called from the central scheduler after having grabbed a
+   new event and is only needed for GranSim-Light. It mainly adjusts the
+   ActiveTSO so that all costs that have to be assigned from within the
+   scheduler are assigned to the right TSO. The choice of ActiveTSO depends
+   on the type of event that has been found.
+*/
+
+void
+GranSimLight_enter_system(event, ActiveTSOp)
+rtsEvent *event;
+StgTSO **ActiveTSOp;
+{
+  StgTSO *ActiveTSO = *ActiveTSOp;
+
+  ASSERT (RtsFlags.GranFlags.Light);
+
+  /* Restore local clock of the virtual processor attached to CurrentTSO.
+     All costs will be associated to the `virt. proc' on which the tso
+     is living. */
+  if (ActiveTSO != NULL) {                     /* already in system area */
+    ActiveTSO->gran.clock = CurrentTime[CurrentProc];
+    if (RtsFlags.GranFlags.DoFairSchedule)
+      {
+        if (RtsFlags.GranFlags.GranSimStats.Full &&
+            RtsFlags.GranFlags.Debug.checkLight)
+          DumpGranEvent(GR_SYSTEM_END,ActiveTSO);
+      }
+  }
+  switch (event->evttype)
+    {
+    case ContinueThread:
+    case FindWork:       /* inaccurate this way */
+      ActiveTSO = run_queue_hd;
+      break;
+    case ResumeThread:
+    case StartThread:
+    case MoveSpark:      /* has tso of virt proc in tso field of event */
+      ActiveTSO = event->tso;
+      break;
+    default: barf("Illegal event type %s (%d) in GrAnSim Light setup\n",
+                  event_names[event->evttype],event->evttype);
+    }
+  CurrentTime[CurrentProc] = ActiveTSO->gran.clock;
+  if (RtsFlags.GranFlags.DoFairSchedule) {
+    if (RtsFlags.GranFlags.GranSimStats.Full &&
+        RtsFlags.GranFlags.Debug.checkLight)
+      DumpGranEvent(GR_SYSTEM_START,ActiveTSO);
+  }
+}
+
+void
+GranSimLight_leave_system(event, ActiveTSOp)
+rtsEvent *event;
+StgTSO **ActiveTSOp;
+{
+  StgTSO *ActiveTSO = *ActiveTSOp;
+
+  ASSERT(RtsFlags.GranFlags.Light);
+
+  /* Save time of `virt. proc' which was active since last getevent and
+     restore time of `virt. proc' on which CurrentTSO is living.
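+
+     The enter/leave pair is meant to bracket the scheduler's work on one
+     event, roughly like this (sketch; the surrounding loop and the
+     get_next_event accessor are assumptions, not code in this file):
+
+       event = get_next_event();
+       if (RtsFlags.GranFlags.Light)
+         GranSimLight_enter_system(event, &ActiveTSO);  // charge costs to the right TSO
+       ... dispatch on event->evttype ...
+       if (RtsFlags.GranFlags.Light)
+         GranSimLight_leave_system(event, &ActiveTSO);  // write virtual clock back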
+  */
+  if(RtsFlags.GranFlags.DoFairSchedule) {
+    if (RtsFlags.GranFlags.GranSimStats.Full &&
+        RtsFlags.GranFlags.Debug.checkLight)  // ToDo: clean up flags
+      DumpGranEvent(GR_SYSTEM_END,ActiveTSO);
+  }
+  ActiveTSO->gran.clock = CurrentTime[CurrentProc];
+  ActiveTSO = (StgTSO*)NULL;
+  CurrentTime[CurrentProc] = CurrentTSO->gran.clock;
+  if (RtsFlags.GranFlags.DoFairSchedule /*  && resched  */ ) {
+      // resched = rtsFalse;
+      if (RtsFlags.GranFlags.GranSimStats.Full &&
+          RtsFlags.GranFlags.Debug.checkLight)
+        DumpGranEvent(GR_SCHEDULE,run_queue_hd);
+  }
+  /*
+     if (TSO_LINK(ThreadQueueHd)!=PrelBase_Z91Z93_closure &&
+         (TimeOfNextEvent == 0 ||
+          TSO_CLOCK(TSO_LINK(ThreadQueueHd))+1000<TimeOfNextEvent)) {
+       ...
+     }
+  */
+}
+
+//@node Code for Fetching Nodes, Idle PEs, GranSimLight routines, GranSim specific code
+//@subsection Code for Fetching Nodes
+
+/*
+   The following GrAnSim routines simulate the fetching of nodes from a
+   remote processor.  A one-word bitmask in the closure header records on
+   which processors a node is lying; moving or copying a node between
+   processors amounts to an appropriate change in this bitmask, plus an
+   update of the simulated clocks.
+*/
+
+//@cindex fetchNode
+
+rtsFetchReturnCode
+fetchNode(node,from,to)
+StgClosure* node;
+PEs from, to;
+{
+  ASSERT(to==CurrentProc);
+  /* Should never be entered in GrAnSim Light setup */
+  ASSERT(!RtsFlags.GranFlags.Light);
+  /* fetchNode should never be entered with bulk fetching enabled */
+  ASSERT(!RtsFlags.GranFlags.DoBulkFetching);
+
+  /* Now fetch the node */
+  if (!IS_LOCAL_TO(PROCS(node),from) &&
+      !IS_LOCAL_TO(PROCS(node),to) )
+    return NodeHasMoved;
+
+  if (closure_HNF(node))                /* node already in head normal form? */
+    node->header.gran.procs |= PE_NUMBER(to);           /* Copy node */
+  else
+    node->header.gran.procs  = PE_NUMBER(to);           /* Move node */
+
+  return Ok;
+}
+
+/*
+   Process a fetch request.
+
+   Cost of sending a packet of size n = C + P*n
+   where C = packet construction constant,
+         P = cost of packing one word into a packet
+   [Should also account for multiple packets].
+*/
+
+//@cindex handleFetchRequest
+
+rtsFetchReturnCode
+handleFetchRequest(node,to,from,tso)
+StgClosure* node;   // the node which is requested
+PEs to, from;       // fetch request: from -> to
+StgTSO* tso;        // the tso which needs the node
+{
+  ASSERT(!RtsFlags.GranFlags.Light);
+  /* ToDo: check assertion */
+  ASSERT(OutstandingFetches[from]>0);
+
+  /* probably wrong place; */
+  ASSERT(CurrentProc==to);
+
+  if (IS_LOCAL_TO(PROCS(node), from)) /* Somebody else moved node already => */
+    {                                 /* start tso */
+      IF_GRAN_DEBUG(thunkStealing,
+                    fprintf(stderr,"ghuH: handleFetchRequest entered with local node %p (%s) (PE %d)\n",
+                            node, info_type(node), from));
+
+      if (RtsFlags.GranFlags.DoBulkFetching) {
+        nat size;
+        rtsPackBuffer *graph;
+
+        /* Create a 1-node-buffer and schedule a FETCHREPLY now */
+        graph = PackOneNode(node, tso, &size);
+        new_event(from, to, CurrentTime[to],
+                  FetchReply,
+                  tso, graph, (rtsSpark*)NULL);
+      } else {
+        new_event(from, to, CurrentTime[to],
+                  FetchReply,
+                  tso, node, (rtsSpark*)NULL);
+      }
+      IF_GRAN_DEBUG(thunkStealing,
+                    belch("== majQa'! closure %p is local on PE %d already (this is a good thing)", node, from));
+      return (NodeIsLocal);
+    }
+  else if (IS_LOCAL_TO(PROCS(node), to) )   /* Is node still here? */
+    {
+      if (RtsFlags.GranFlags.DoBulkFetching) { /* {GUM}vo' ngoqvam vInIHta' */
+        nat size;                              /* (code from GUM) */
+        StgClosure* graph;
+
+        if (IS_BLACK_HOLE(node)) {   /* block on BH or RBH */
+          new_event(from, to, CurrentTime[to],
+                    GlobalBlock,
+                    tso, node, (rtsSpark*)NULL);
+          /* Note: blockFetch is done when handling GLOBALBLOCK event;
+                   make sure the TSO stays out of the run queue */
+          /* When this thread is reawoken it does the usual: it tries to
+             enter the updated node and issues a fetch if it's remote.
+             It has forgotten that it has sent a fetch already (i.e. a
+             FETCHNODE is swallowed by a BH, leaving the thread in a BQ) */
+          --OutstandingFetches[from];
+
+          IF_GRAN_DEBUG(thunkStealing,
+                        belch("== majQa'!
closure %p on PE %d is a BH (demander=PE %d); faking a FMBQ", + node, to, from)); + if (RtsFlags.GranFlags.GranSimStats.Global) { + globalGranStats.tot_FMBQs++; + } + return (NodeIsBH); + } + + /* The tso requesting the node is blocked and cannot be on a run queue */ + ASSERT(!is_on_queue(tso, from)); + + if ((graph = PackNearbyGraph(node, tso, &size)) == NULL) + return (OutOfHeap); /* out of heap */ + + /* Actual moving/copying of node is done on arrival; see FETCHREPLY */ + /* Send a reply to the originator */ + /* ToDo: Replace that by software costs for doing graph packing! */ + CurrentTime[to] += size * RtsFlags.GranFlags.Costs.mpacktime; + + new_event(from, to, + CurrentTime[to]+RtsFlags.GranFlags.Costs.latency, + FetchReply, + tso, (StgClosure *)graph, (rtsSpark*)NULL); + + CurrentTime[to] += RtsFlags.GranFlags.Costs.mtidytime; + return (Ok); + } else { /* incremental (single closure) fetching */ + /* Actual moving/copying of node is done on arrival; see FETCHREPLY */ + /* Send a reply to the originator */ + CurrentTime[to] += RtsFlags.GranFlags.Costs.mpacktime; + + new_event(from, to, + CurrentTime[to]+RtsFlags.GranFlags.Costs.latency, + FetchReply, + tso, node, (rtsSpark*)NULL); + + CurrentTime[to] += RtsFlags.GranFlags.Costs.mtidytime; + return (Ok); + } + } + else /* Qu'vatlh! node has been grabbed by another proc => forward */ + { + PEs node_loc = where_is(node); + rtsTime fetchtime; + + IF_GRAN_DEBUG(thunkStealing, + belch("== Qu'vatlh! node %p has been grabbed by PE %d from PE %d (demander=%d) @ %d\n", + node,node_loc,to,from,CurrentTime[to])); + if (RtsFlags.GranFlags.GranSimStats.Global) { + globalGranStats.fetch_misses++; + } + + /* Prepare FORWARD message to proc p_new */ + CurrentTime[to] += RtsFlags.GranFlags.Costs.mpacktime; + + fetchtime = stg_max(CurrentTime[to], CurrentTime[node_loc]) + + RtsFlags.GranFlags.Costs.latency; + + new_event(node_loc, from, fetchtime, + FetchNode, + tso, node, (rtsSpark*)NULL); + + CurrentTime[to] += RtsFlags.GranFlags.Costs.mtidytime; + + return (NodeHasMoved); + } +} + +/* + blockFetch blocks a BlockedFetch node on some kind of black hole. + + Taken from gum/HLComms.lc. [find a better place for that ?] -- HWL + + {\bf Note:} In GranSim we don't have @FETCHME@ nodes and therefore don't + create @FMBQ@'s (FetchMe blocking queues) to cope with global + blocking. Instead, non-local TSO are put into the BQ in the same way as + local TSOs. However, we have to check if a TSO is local or global in + order to account for the latencies involved and for keeping track of the + number of fetches that are really going on. +*/ + +//@cindex blockFetch + +rtsFetchReturnCode +blockFetch(tso, proc, bh) +StgTSO* tso; /* TSO which gets blocked */ +PEs proc; /* PE where that tso was running */ +StgClosure* bh; /* closure to block on (BH, RBH, BQ) */ +{ + StgInfoTable *info; + + IF_GRAN_DEBUG(bq, + fprintf(stderr,"## blockFetch: blocking TSO %p (%d)[PE %d] on node %p (%s) [PE %d]. No graph is packed!\n", + tso, tso->id, proc, bh, info_type(bh), where_is(bh))); + + if (!IS_BLACK_HOLE(bh)) { /* catches BHs and RBHs */ + IF_GRAN_DEBUG(bq, + fprintf(stderr,"## blockFetch: node %p (%s) is not a BH => awakening TSO %p (%d) [PE %u]\n", + bh, info_type(bh), tso, tso->id, proc)); + + /* No BH anymore => immediately unblock tso */ + new_event(proc, proc, CurrentTime[proc], + UnblockThread, + tso, bh, (rtsSpark*)NULL); + + /* Is this always a REPLY to a FETCH in the profile ? 
*/ + if (RtsFlags.GranFlags.GranSimStats.Full) + DumpRawGranEvent(proc, proc, GR_REPLY, tso, bh, (StgInt)0, 0); + return (NodeIsNoBH); + } + + /* DaH {BQ}Daq Qu' Suq 'e' wISov! + Now we know that we have to put the tso into the BQ. + 2 cases: If block-on-fetch, tso is at head of threadq => + => take it out of threadq and into BQ + If reschedule-on-fetch, tso is only pointed to be event + => just put it into BQ + + ngoq ngo'!! + if (!RtsFlags.GranFlags.DoAsyncFetch) { + GranSimBlock(tso, proc, bh); + } else { + if (RtsFlags.GranFlags.GranSimStats.Full) + DumpRawGranEvent(proc, where_is(bh), GR_BLOCK, tso, bh, (StgInt)0, 0); + ++(tso->gran.blockcount); + tso->gran.blockedat = CurrentTime[proc]; + } + */ + + /* after scheduling the GlobalBlock event the TSO is not put into the + run queue again; it is only pointed to via the event we are + processing now; in GranSim 4.xx there is no difference between + synchr and asynchr comm here */ + ASSERT(!is_on_queue(tso, proc)); + ASSERT(tso->link == END_TSO_QUEUE); + + GranSimBlock(tso, proc, bh); /* GranSim statistics gathering */ + + /* Now, put tso into BQ (similar to blocking entry codes) */ + info = get_itbl(bh); + switch (info -> type) { + case RBH: + case BLACKHOLE: + case CAF_BLACKHOLE: // ToDo: check whether this is a possibly ITBL here + case SE_BLACKHOLE: // ToDo: check whether this is a possibly ITBL here + case SE_CAF_BLACKHOLE:// ToDo: check whether this is a possibly ITBL here + /* basically an inlined version of BLACKHOLE_entry -- HWL */ + /* Change the BLACKHOLE into a BLACKHOLE_BQ */ + ((StgBlockingQueue *)bh)->header.info = &BLACKHOLE_BQ_info; + /* Put ourselves on the blocking queue for this black hole */ + // tso->link=END_TSO_QUEUE; not necessary; see assertion above + ((StgBlockingQueue *)bh)->blocking_queue = (StgBlockingQueueElement *)tso; + tso->blocked_on = bh; + recordMutable((StgMutClosure *)bh); + break; + + case BLACKHOLE_BQ: + /* basically an inlined version of BLACKHOLE_BQ_entry -- HWL */ + tso->link = (StgTSO *) (((StgBlockingQueue*)bh)->blocking_queue); + ((StgBlockingQueue*)bh)->blocking_queue = (StgBlockingQueueElement *)tso; + recordMutable((StgMutClosure *)bh); + +# if 0 && defined(GC_MUT_REQUIRED) + ToDo: check whether recordMutable is necessary -- HWL + /* + * If we modify a black hole in the old generation, we have to make + * sure it goes on the mutables list + */ + + if (bh <= StorageMgrInfo.OldLim) { + MUT_LINK(bh) = (W_) StorageMgrInfo.OldMutables; + StorageMgrInfo.OldMutables = bh; + } else + MUT_LINK(bh) = MUT_NOT_LINKED; +# endif + break; + + case FETCH_ME_BQ: + barf("Qagh: FMBQ closure (%p) found in GrAnSim (TSO=%p (%d))\n", + bh, tso, tso->id); + + default: + { + G_PRINT_NODE(bh); + barf("Qagh: thought %p was a black hole (IP %p (%s))", + bh, info, info_type(get_itbl(bh))); + } + } + return (Ok); +} + + +//@node Idle PEs, Routines directly called from Haskell world, Code for Fetching Nodes, GranSim specific code +//@subsection Idle PEs + +/* + Export work to idle PEs. This function is called from @ReSchedule@ + before dispatching on the current event. @HandleIdlePEs@ iterates over + all PEs, trying to get work for idle PEs. Note, that this is a + simplification compared to GUM's fishing model. We try to compensate for + that by making the cost for stealing work dependent on the number of + idle processors and thereby on the probability with which a randomly + sent fish would find work. 
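+
+   As a worked example of that scaling (using the formula in sparkStealTime
+   below): with proc = 4 PEs, fishdelay = 4/2 = 2; if 3 PEs are idle,
+   sparkdelay = 2 - ((2-1)/3)*3 = 1, so the modelled steal cost drops to a
+   single latency unit when almost everybody is idle.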
+*/
+
+//@cindex handleIdlePEs
+
+void
+handleIdlePEs(void)
+{
+  PEs p;
+
+  IF_DEBUG(gran, fprintf(stderr, "GRAN: handling Idle PEs\n"))
+
+  /* Should never be entered in GrAnSim Light setup */
+  ASSERT(!RtsFlags.GranFlags.Light);
+
+  /* Could check whether there are idle PEs if it's a cheap check */
+  for (p = 0; p < RtsFlags.GranFlags.proc; p++)
+    if (procStatus[p]==Idle) /*  && IS_SPARKING(p) && IS_STARTING(p) */
+      /* First look for local work i.e. examine local spark pool! */
+      if (pending_sparks_hds[p]!=(rtsSpark *)NULL) {
+        new_event(p, p, CurrentTime[p],
+                  FindWork,
+                  (StgTSO*)NULL, (StgClosure*)NULL, (rtsSpark*)NULL);
+        procStatus[p] = Sparking;
+      } else if ((RtsFlags.GranFlags.maxFishes==0 ||
+                  OutstandingFishes[p]<RtsFlags.GranFlags.maxFishes)) {
+        /* If no local work then try to get remote work! */
+        if (RtsFlags.GranFlags.DoStealThreadsFirst &&
+            (RtsFlags.GranFlags.FetchStrategy >= 4 || OutstandingFetches[p] == 0))
+          {
+            if (SurplusThreads > 0l)                    /* Steal a thread */
+              stealThread(p);
+
+            if (procStatus[p]!=Idle)
+              break;
+          }
+
+        if (SparksAvail > 0 &&
+            (RtsFlags.GranFlags.FetchStrategy >= 3 || OutstandingFetches[p] == 0)) /* Steal a spark */
+          stealSpark(p);
+
+        if (SurplusThreads > 0 &&
+            (RtsFlags.GranFlags.FetchStrategy >= 4 || OutstandingFetches[p] == 0)) /* Steal a thread */
+          stealThread(p);
+      }
+}
+
+/*
+  Steal a spark and schedule moving it to proc. We want to look at PEs in
+  clock order -- most retarded first. Currently sparks are only stolen from
+  the @ADVISORY_POOL@ never from the @REQUIRED_POOL@. Eventually, this
+  should be changed to first steal from the former then from the latter.
+
+  We model a sort of fishing mechanism by counting the number of sparks
+  and threads we are currently stealing. */
+
+/*
+  Return a random nat value in the interval [from, to)
+*/
+static nat
+natRandom(from, to)
+nat from, to;
+{
+  nat r, d;
+
+  ASSERT(from<=to);
+  d = to - from;
+  /* random returns a value in [0, RAND_MAX] */
+  r = (nat) ((float)from + ((float)random()*(float)d)/(float)RAND_MAX);
+  r = (r==to) ? from : r;
+  ASSERT(from<=r && (r<to || from==to));
+  return r;
+}
+
+/*
+  Find any PE other than proc. Used for GUM style fishing only.
+*/
+static PEs
+findRandomPE (proc)
+PEs proc;
+{
+  nat p;
+
+  ASSERT(RtsFlags.GranFlags.Fishing);
+  if (RtsFlags.GranFlags.RandomSteal) {
+    p = natRandom(0,RtsFlags.GranFlags.proc);  /* full range of PEs */
+  } else {
+    p = 0;
+  }
+  IF_GRAN_DEBUG(randomSteal,
+                belch("^^ RANDOM_STEAL (fishing): stealing from PE %d (current proc is %d)",
+                      p, proc));
+
+  return (PEs)p;
+}
+
+/*
+  Magic code for stealing sparks/threads makes use of global knowledge on
+  spark queues.
+*/
+static void
+sortPEsByTime (proc, pes_by_time, firstp, np)
+PEs proc;
+PEs *pes_by_time;
+nat *firstp, *np;
+{
+  PEs p, n, i, j;
+  nat first;
+
+  /* pes_by_time shall contain processors from which we may steal sparks */
+  for(n=0, p=0; p < RtsFlags.GranFlags.proc; ++p)
+    if ((p!=proc) &&                                  /* not the current proc */
+        (pending_sparks_hds[p]!=(rtsSpark *)NULL) &&  /* non-empty spark pool */
+        (CurrentTime[p] <= CurrentTime[CurrentProc]))
+      pes_by_time[n++] = p;
+
+  /* sort pes_by_time by local clock; a plain bubble sort, n is small */
+  for(i=0; i<n; ++i)
+    for(j=i+1; j<n; ++j)
+      if (CurrentTime[pes_by_time[i]] > CurrentTime[pes_by_time[j]]) {
+        rtsTime temp = pes_by_time[i];
+        pes_by_time[i] = pes_by_time[j];
+        pes_by_time[j] = temp;
+      }
+
+  /* Choose random processor to steal spark from; first look at processors */
+  /* that are earlier than the current one (i.e. proc) */
+  for(first=0;
+      (first < n) && (CurrentTime[pes_by_time[first]] <= CurrentTime[proc]);
+      ++first)
+    /* nothing */ ;
+
+  /* if the assertion below is true we can get rid of first */
+  /* ASSERT(first==n); */
+  /* ToDo: check if first is really needed; find cleaner solution */
+
+  *firstp = first;
+  *np = n;
+}
+
+/*
+  Steal a spark (piece of work) from any processor and bring it to proc.
+*/
+//@cindex stealSpark
+static inline rtsBool
+stealSpark(PEs proc) { return stealSomething(proc, rtsTrue, rtsFalse); }
+
+/*
+  Steal a thread from any processor and bring it to proc i.e. thread migration
+*/
+//@cindex stealThread
+static inline rtsBool
+stealThread(PEs proc) { return stealSomething(proc, rtsFalse, rtsTrue); }
+
+/*
+  Steal a spark or a thread and schedule moving it to proc.
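+
+  The request is encoded in the spark field of the FindWork event; as a
+  sketch (illustrative only, mirroring the new_event call in stealSomething
+  below):
+
+    new_event(p, proc, t, FindWork, NULL, NULL, (rtsSpark*)1);  // ask for a spark
+    new_event(p, proc, t, FindWork, NULL, NULL, (rtsSpark*)2);  // ask for a thread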
+*/
+//@cindex stealSomething
+static rtsBool
+stealSomething(proc, steal_spark, steal_thread)
+PEs proc;                           // PE that needs work (stealer)
+rtsBool steal_spark, steal_thread;  // should a spark and/or thread be stolen
+{
+  PEs p;
+  rtsTime fish_arrival_time;
+  rtsSpark *spark, *prev, *next;
+  rtsBool stolen = rtsFalse;
+
+  ASSERT(steal_spark || steal_thread);
+
+  /* Should never be entered in GrAnSim Light setup */
+  ASSERT(!RtsFlags.GranFlags.Light);
+  ASSERT(!steal_thread || RtsFlags.GranFlags.DoThreadMigration);
+
+  if (!RtsFlags.GranFlags.Fishing) {
+    // ToDo: check if stealing threads is preferred over stealing sparks
+    if (steal_spark) {
+      if (stealSparkMagic(proc))
+        return rtsTrue;
+      else             // no spark found
+        if (steal_thread)
+          return stealThreadMagic(proc);
+        else           // no thread found
+          return rtsFalse;
+    } else {           // ASSERT(steal_thread);
+      return stealThreadMagic(proc);
+    }
+    barf("stealSomething: never reached");
+  }
+
+  /* The rest of this function does GUM style fishing */
+
+  p = findRandomPE(proc); /* find a random PE other than proc */
+
+  /* Message packing costs for sending a Fish; qeq jabbI'ID */
+  CurrentTime[proc] += RtsFlags.GranFlags.Costs.mpacktime;
+
+  /* use another GranEvent for requesting a thread? */
+  if (steal_spark && RtsFlags.GranFlags.GranSimStats.Sparks)
+    DumpRawGranEvent(p, proc, SP_REQUESTED,
+                     (StgTSO*)NULL, (StgClosure *)NULL, (StgInt)0, 0);
+
+  /* time of the fish arrival on the remote PE */
+  fish_arrival_time = CurrentTime[proc] + RtsFlags.GranFlags.Costs.latency;
+
+  /* Perhaps use a separate Fish event for that? */
+  /* The contents of the spark component is a HACK:
+      1 means give me a spark;
+      2 means give me a thread;
+      0 means give me nothing (this should never happen)
+  */
+  new_event(p, proc, fish_arrival_time,
+            FindWork,
+            (StgTSO*)NULL, (StgClosure*)NULL,
+            (steal_spark ? (rtsSpark*)1 : steal_thread ? (rtsSpark*)2 : (rtsSpark*)0));
+
+  ++OutstandingFishes[proc];
+  /* only with Async fetching? */
+  if (procStatus[proc]==Idle)
+    procStatus[proc]=Fishing;
+
+  /* time needed to clean up buffers etc after sending a message */
+  CurrentTime[proc] += RtsFlags.GranFlags.Costs.mtidytime;
+
+  /* With GUM style fishing, stealing always succeeds because it only
+     consists of sending out a fish; of course, the fish may return
+     empty-handed! */
+  return rtsTrue;
+}
+
+/*
+   This version of stealing a spark makes use of the global info on all
+   spark pools etc which is not available in a real parallel system.
+   This could be extended to test e.g. the impact of perfect load information.
+*/
+//@cindex stealSparkMagic
+static rtsBool
+stealSparkMagic(proc)
+PEs proc;
+{
+  PEs p, i, j, n, first, upb;
+  rtsSpark *spark, *next;
+  PEs pes_by_time[MAX_PROC];
+  rtsBool stolen = rtsFalse;
+  rtsTime stealtime;
+
+  /* Should never be entered in GrAnSim Light setup */
+  ASSERT(!RtsFlags.GranFlags.Light);
+
+  sortPEsByTime(proc, pes_by_time, &first, &n);
+
+  while (!stolen && n>0) {
+    upb = (first==0) ?
n : first;
+    i = natRandom(0,upb);                /* choose a random eligible PE */
+    p = pes_by_time[i];
+
+    IF_GRAN_DEBUG(randomSteal,
+                  belch("^^ stealSparkMagic (random_steal, not fishing): stealing spark from PE %d (current proc is %d)",
+                        p, proc));
+
+    ASSERT(pending_sparks_hds[p]!=(rtsSpark *)NULL); /* non-empty spark pool */
+
+    /* Now go through rtsSparkQ and steal the first eligible spark */
+
+    spark = pending_sparks_hds[p];
+    while (!stolen && spark != (rtsSpark*)NULL)
+      {
+        /* NB: no prev pointer is needed here because all sparks that are not
+           chosen are pruned
+        */
+        if ((procStatus[p]==Idle || procStatus[p]==Sparking || procStatus[p] == Fishing) &&
+            spark->next==(rtsSpark*)NULL)
+          {
+            /* Be social! Don't steal the only spark of an idle processor
+               not {spark} neH yInIH !! */
+            break; /* next PE */
+          }
+        else if (closure_SHOULD_SPARK(spark->node))
+          {
+            /* Don't Steal local sparks;
+               ToDo: optionally prefer local over global sparks
+            if (!spark->global) {
+              prev=spark;
+              continue;             next spark
+            }
+            */
+            /* found a spark! */
+
+            /* Prepare message for sending spark */
+            CurrentTime[p] += RtsFlags.GranFlags.Costs.mpacktime;
+
+            if (RtsFlags.GranFlags.GranSimStats.Sparks)
+              DumpRawGranEvent(p, (PEs)0, SP_EXPORTED,
+                               (StgTSO*)NULL, spark->node,
+                               spark->name, spark_queue_len(p));
+
+            stealtime = (CurrentTime[p] > CurrentTime[proc] ?
+                           CurrentTime[p] :
+                           CurrentTime[proc])
+                        + sparkStealTime();
+
+            new_event(proc, p /* CurrentProc */, stealtime,
+                      MoveSpark,
+                      (StgTSO*)NULL, spark->node, spark);
+
+            stolen = rtsTrue;
+            ++OutstandingFishes[proc]; /* no. of sparks currently on the fly */
+            if (procStatus[proc]==Idle)
+              procStatus[proc] = Fishing;
+            ++(spark->global);   /* record that this is a global spark */
+            ASSERT(SparksAvail>0);
+            --SparksAvail;       /* on-the-fly sparks are not available */
+            next = delete_from_sparkq(spark, p, rtsFalse); // don't dispose!
+            CurrentTime[p] += RtsFlags.GranFlags.Costs.mtidytime;
+          }
+        else /* !(closure_SHOULD_SPARK(SPARK_NODE(spark))) */
+          {
+            IF_GRAN_DEBUG(checkSparkQ,
+                          belch("^^ pruning spark %p (node %p) in stealSparkMagic",
+                                spark, spark->node));
+
+            /* if the spark points to a node that should not be sparked,
+               prune the spark queue at this point */
+            if (RtsFlags.GranFlags.GranSimStats.Sparks)
+              DumpRawGranEvent(p, (PEs)0, SP_PRUNED,
+                               (StgTSO*)NULL, spark->node,
+                               spark->name, spark_queue_len(p));
+            if (RtsFlags.GranFlags.GranSimStats.Global)
+              globalGranStats.pruned_sparks++;
+
+            ASSERT(SparksAvail>0);
+            --SparksAvail;
+            spark = delete_from_sparkq(spark, p, rtsTrue);
+          }
+        /* unlink spark (may have been freed!) from sparkq;
+           if (prev == NULL) // spark was head of spark queue
+             pending_sparks_hds[p] = spark->next;
+           else
+             prev->next = spark->next;
+           if (spark->next == NULL)
+             pending_sparks_tls[p] = prev;
+           else
+             next->prev = prev;
+        */
+      } /* while ... iterating over sparkq */
+
+    /* ToDo: assert that PE p still has work left after stealing the spark */
+
+    if (!stolen && (n>0)) {  /* nothing stealable from proc p :( */
+      ASSERT(pes_by_time[i]==p);
+
+      /* remove p from the list (at pos i) */
+      for (j=i; j+1<n; j++)
+        pes_by_time[j] = pes_by_time[j+1];
+      n--;
+
+      /* update index to first proc which is later (or equal) than proc */
+      for ( ;
+            (first>0) &&
+              (CurrentTime[pes_by_time[first-1]]>CurrentTime[proc]);
+            first--)
+        /* nothing */ ;
+    }
+  } /* while ...
iterating over PEs in pes_by_time */ + + IF_GRAN_DEBUG(randomSteal, + if (stolen) + belch("^^ stealSparkMagic: spark %p (node=%p) stolen by PE %d from PE %d (SparksAvail=%d; idlers=%d)", + spark, spark->node, proc, p, + SparksAvail, idlers()); + else + belch("^^ stealSparkMagic: nothing stolen by PE %d (sparkq len after pruning=%d)(SparksAvail=%d; idlers=%d)", + proc, SparksAvail, idlers())); + + if (RtsFlags.GranFlags.GranSimStats.Global && + stolen && (i!=0)) { /* only for statistics */ + globalGranStats.rs_sp_count++; + globalGranStats.ntimes_total += n; + globalGranStats.fl_total += first; + globalGranStats.no_of_steals++; + } + + return stolen; +} + +/* + The old stealThread code, which makes use of global info and does not + send out fishes. + NB: most of this is the same as in stealSparkMagic; + only the pieces specific to processing thread queues are different; + long live polymorphism! +*/ + +//@cindex stealThreadMagic +static rtsBool +stealThreadMagic(proc) +PEs proc; +{ + PEs p, i, j, n, first, upb; + StgTSO *tso; + PEs pes_by_time[MAX_PROC]; + rtsBool stolen = rtsFalse; + rtsTime stealtime; + + /* Should never be entered in GrAnSim Light setup */ + ASSERT(!RtsFlags.GranFlags.Light); + + sortPEsByTime(proc, pes_by_time, &first, &n); + + while (!stolen && n>0) { + upb = (first==0) ? n : first; + i = natRandom(0,upb); /* choose a random eligible PE */ + p = pes_by_time[i]; + + IF_GRAN_DEBUG(randomSteal, + belch("^^ stealThreadMagic (random_steal, not fishing): stealing thread from PE %d (current proc is %d)", + p, proc)); + + /* Steal the first exportable thread in the runnable queue but + never steal the first in the queue for social reasons; + not Qu' wa'DIch yInIH !! + */ + /* Would be better to search through queue and have options which of + the threads to pick when stealing */ + if (run_queue_hds[p] == END_TSO_QUEUE) { + IF_GRAN_DEBUG(randomSteal, + belch("^^ stealThreadMagic: No thread to steal from PE %d (stealer=PE %d)", + p, proc)); + } else { + tso = run_queue_hds[p]->link; /* tso is *2nd* thread in thread queue */ + /* Found one */ + stolen = rtsTrue; + + /* update links in queue */ + run_queue_hds[p]->link = tso->link; + if (run_queue_tls[p] == tso) + run_queue_tls[p] = run_queue_hds[p]; + + /* ToDo: Turn magic constants into params */ + + CurrentTime[p] += 5l * RtsFlags.GranFlags.Costs.mpacktime; + + stealtime = (CurrentTime[p] > CurrentTime[proc] ? 
+                     CurrentTime[p] :
+                     CurrentTime[proc])
+                  + sparkStealTime()
+                  + 4l * RtsFlags.GranFlags.Costs.additional_latency
+                  + 5l * RtsFlags.GranFlags.Costs.munpacktime;
+
+      /* Move the thread; set bitmask to 0 while TSO is `on-the-fly' */
+      SET_GRAN_HDR(tso,Nowhere /* PE_NUMBER(proc) */);
+
+      /* Move from one queue to another */
+      new_event(proc, p, stealtime,
+                MoveThread,
+                tso, (StgClosure*)NULL, (rtsSpark*)NULL);
+
+      /* MAKE_BUSY(proc);  not yet; only when thread is in threadq */
+      ++OutstandingFishes[proc];
+      if (procStatus[proc])
+        procStatus[proc] = Fishing;
+      --SurplusThreads;
+
+      if(RtsFlags.GranFlags.GranSimStats.Full)
+        DumpRawGranEvent(p, proc,
+                         GR_STEALING,
+                         tso, (StgClosure*)NULL, (StgInt)0, 0);
+
+      /* costs for tidying up buffer after having sent it */
+      CurrentTime[p] += 5l * RtsFlags.GranFlags.Costs.mtidytime;
+    }
+
+    /* ToDo: assert that PE p still has work left after stealing the spark */
+
+    if (!stolen && (n>0)) {  /* nothing stealable from proc p :( */
+      ASSERT(pes_by_time[i]==p);
+
+      /* remove p from the list (at pos i) */
+      for (j=i; j+1<n; j++)
+        pes_by_time[j] = pes_by_time[j+1];
+      n--;
+
+      /* update index to first proc which is later (or equal) than proc */
+      for ( ;
+            (first>0) &&
+              (CurrentTime[pes_by_time[first-1]]>CurrentTime[proc]);
+            first--)
+        /* nothing */ ;
+    }
+  } /* while ... iterating over PEs in pes_by_time */
+
+  IF_GRAN_DEBUG(randomSteal,
+                if (stolen)
+                  belch("^^ stealThreadMagic: stolen TSO %d (%p) by PE %d from PE %d (SparksAvail=%d; idlers=%d)",
+                        tso->id, tso, proc, p,
+                        SparksAvail, idlers());
+                else
+                  belch("stealThreadMagic: nothing stolen by PE %d (SparksAvail=%d; idlers=%d)",
+                        proc, SparksAvail, idlers()));
+
+  if (RtsFlags.GranFlags.GranSimStats.Global &&
+      stolen && (i!=0)) { /* only for statistics */
+    /* ToDo: more statistics on avg thread queue length etc */
+    globalGranStats.rs_t_count++;
+    globalGranStats.no_of_migrates++;
+  }
+
+  return stolen;
+}
+
+//@cindex sparkStealTime
+static rtsTime
+sparkStealTime(void)
+{
+  double fishdelay, sparkdelay, latencydelay;
+  fishdelay =  (double)RtsFlags.GranFlags.proc/2;
+  sparkdelay = fishdelay -
+          ((fishdelay-1)/(double)(RtsFlags.GranFlags.proc-1))*(double)idlers();
+  latencydelay = sparkdelay*((double)RtsFlags.GranFlags.Costs.latency);
+
+  return((rtsTime)latencydelay);
+}
+
+//@node Routines directly called from Haskell world, Emiting profiling info for GrAnSim, Idle PEs, GranSim specific code
+//@subsection Routines directly called from Haskell world
+/*
+The @GranSim...@ routines in here are directly called via macros from the
+threaded world.
+
+First some auxiliary routines.
+*/
+
+/* Take the current thread off the thread queue and thereby activate the
+   next thread. It's assumed that the next ReSchedule after this uses
+   NEW_THREAD as param.
+   This fct is called from GranSimBlock and GranSimFetch
+*/
+
+//@cindex ActivateNextThread
+
+void
+ActivateNextThread (proc)
+PEs proc;
+{
+  StgTSO *t;
+  /*
+    This routine is entered either via GranSimFetch or via GranSimBlock.
+    It has to prepare the CurrentTSO for being blocked and update the
+    run queue and other statistics on PE proc. The actual enqueuing to the
+    blocking queue (if coming from GranSimBlock) is done in the entry code
+    of the BLACKHOLE and BLACKHOLE_BQ closures (see StgMiscClosures.hc).
+  */
+  /* ToDo: add assertions here!!
*/ + //ASSERT(run_queue_hds[proc]!=END_TSO_QUEUE); + + // Only necessary if the running thread is at front of the queue + // run_queue_hds[proc] = run_queue_hds[proc]->link; + ASSERT(CurrentProc==proc); + ASSERT(!is_on_queue(CurrentTSO,proc)); + if (run_queue_hds[proc]==END_TSO_QUEUE) { + /* NB: this routine is only entered with asynchr comm (see assertion) */ + procStatus[proc] = Idle; + } else { + /* ToDo: check cost assignment */ + CurrentTime[proc] += RtsFlags.GranFlags.Costs.threadcontextswitchtime; + if (RtsFlags.GranFlags.GranSimStats.Full && + (!RtsFlags.GranFlags.Light || RtsFlags.GranFlags.Debug.checkLight)) + /* right flag !?? ^^^ */ + DumpRawGranEvent(proc, 0, GR_SCHEDULE, run_queue_hds[proc], + (StgClosure*)NULL, (StgInt)0, 0); + } +} + +/* + The following GranSim fcts are stg-called from the threaded world. +*/ + +/* Called from HP_CHK and friends (see StgMacros.h) */ +//@cindex GranSimAllocate +void +GranSimAllocate(n) +StgInt n; +{ + CurrentTSO->gran.allocs += n; + ++(CurrentTSO->gran.basicblocks); + + if (RtsFlags.GranFlags.GranSimStats.Heap) { + DumpRawGranEvent(CurrentProc, 0, GR_ALLOC, CurrentTSO, + (StgClosure*)NULL, (StgInt)0, n); + } + + CurrentTSO->gran.exectime += RtsFlags.GranFlags.Costs.heapalloc_cost; + CurrentTime[CurrentProc] += RtsFlags.GranFlags.Costs.heapalloc_cost; +} + +/* + Subtract the values added above, if a heap check fails and + so has to be redone. +*/ +//@cindex GranSimUnallocate +void +GranSimUnallocate(n) +StgInt n; +{ + CurrentTSO->gran.allocs -= n; + --(CurrentTSO->gran.basicblocks); + + CurrentTSO->gran.exectime -= RtsFlags.GranFlags.Costs.heapalloc_cost; + CurrentTime[CurrentProc] -= RtsFlags.GranFlags.Costs.heapalloc_cost; +} + +/* NB: We now inline this code via GRAN_EXEC rather than calling this fct */ +//@cindex GranSimExec +void +GranSimExec(ariths,branches,loads,stores,floats) +StgWord ariths,branches,loads,stores,floats; +{ + StgWord cost = RtsFlags.GranFlags.Costs.arith_cost*ariths + + RtsFlags.GranFlags.Costs.branch_cost*branches + + RtsFlags.GranFlags.Costs.load_cost * loads + + RtsFlags.GranFlags.Costs.store_cost*stores + + RtsFlags.GranFlags.Costs.float_cost*floats; + + CurrentTSO->gran.exectime += cost; + CurrentTime[CurrentProc] += cost; +} + +/* + Fetch the node if it isn't local + -- result indicates whether fetch has been done. + + This is GRIP-style single item fetching. 
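+
+   Timing, as implemented in the code below: packing costs (mpacktime)
+   are charged to the fetching PE up front, the FetchNode event is then
+   scheduled on the owning PE p at
+
+     fetchtime = stg_max(CurrentTime[CurrentProc], CurrentTime[p])
+                 + RtsFlags.GranFlags.Costs.latency;
+
+   and the fetch itself is only counted on arrival of the FetchReply.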
+*/
+
+//@cindex GranSimFetch
+StgInt
+GranSimFetch(node /* , liveness_mask */ )
+StgClosure *node;
+/* StgInt liveness_mask; */
+{
+  /* reset the return value (to be checked within STG land) */
+  NeedToReSchedule = rtsFalse;
+
+  if (RtsFlags.GranFlags.Light) {
+     /* Always reschedule in GrAnSim-Light to prevent one TSO from
+	running off too far
+     new_event(CurrentProc,CurrentProc,CurrentTime[CurrentProc],
+	      ContinueThread,CurrentTSO,node,NULL);
+     */
+     return(0);
+  }
+
+  /* Faking an RBH closure:
+     If the bitmask of the closure is 0 then this node is a fake RBH;
+  */
+  if (node->header.gran.procs == Nowhere) {
+    IF_GRAN_DEBUG(bq,
+		  belch("## Found fake RBH (node %p); delaying TSO %d (%p)",
+			node, CurrentTSO->id, CurrentTSO));
+
+    new_event(CurrentProc, CurrentProc, CurrentTime[CurrentProc]+10000,
+	      ContinueThread, CurrentTSO, node, (rtsSpark*)NULL);
+
+    /* Rescheduling (GranSim internal) is necessary */
+    NeedToReSchedule = rtsTrue;
+
+    return(1);
+  }
+
+  /* Note: once a node has been fetched, this test will be passed */
+  if (!IS_LOCAL_TO(PROCS(node),CurrentProc))
+    {
+      PEs p = where_is(node);
+      rtsTime fetchtime;
+
+      IF_GRAN_DEBUG(thunkStealing,
+		    if (p==CurrentProc)
+		      belch("GranSimFetch: Trying to fetch from own processor %u\n", p););
+
+      CurrentTime[CurrentProc] += RtsFlags.GranFlags.Costs.mpacktime;
+      /* NB: Fetch is counted on arrival (FetchReply) */
+
+      fetchtime = stg_max(CurrentTime[CurrentProc],CurrentTime[p]) +
+		  RtsFlags.GranFlags.Costs.latency;
+
+      new_event(p, CurrentProc, fetchtime,
+		FetchNode, CurrentTSO, node, (rtsSpark*)NULL);
+
+      if (fetchtime<TimeOfNextEvent)
+	TimeOfNextEvent = fetchtime;
+
+      CurrentTSO->gran.blockedat = CurrentTime[CurrentProc];
+
+      ++OutstandingFetches[CurrentProc];
+
+      if (RtsFlags.GranFlags.DoAsyncFetch)
+	/* if asynchr comm is turned on, activate the next thread in the q */
+	ActivateNextThread(CurrentProc);
+      else
+	procStatus[CurrentProc] = Fetching;
+
+#if 0
+      /* ToDo: nuke the entire if (anything special for fair schedule?) 
*/
+      if (RtsFlags.GranFlags.DoAsyncFetch)
+	{
+	  /* Remove CurrentTSO from the queue -- assumes head of queue == CurrentTSO */
+	  if(!RtsFlags.GranFlags.DoFairSchedule)
+	    {
+	      /* now done in do_the_fetchnode
+	      if (RtsFlags.GranFlags.GranSimStats.Full)
+		DumpRawGranEvent(CurrentProc, p, GR_FETCH, CurrentTSO,
+				 node, (StgInt)0, 0);
+	      */
+	      ActivateNextThread(CurrentProc);
+
+# if 0 && defined(GRAN_CHECK)
+	      if (RtsFlags.GranFlags.Debug.blockOnFetch_sanity) {
+		if (TSO_TYPE(CurrentTSO) & FETCH_MASK_TSO) {
+		  fprintf(stderr,"FetchNode: TSO 0x%x has fetch-mask set @ %d\n",
+			  CurrentTSO,CurrentTime[CurrentProc]);
+		  stg_exit(EXIT_FAILURE);
+		} else {
+		  TSO_TYPE(CurrentTSO) |= FETCH_MASK_TSO;
+		}
+	      }
+# endif
+	      CurrentTSO->link = END_TSO_QUEUE;
+	      /* CurrentTSO = END_TSO_QUEUE; */
+
+	      /* CurrentTSO is pointed to by the FetchNode event; it is
+		 on no run queue any more */
+	  } else {  /* fair scheduling currently not supported -- HWL */
+	    barf("Asynchr communication is not yet compatible with fair scheduling\n");
+	  }
+	} else {                /* !RtsFlags.GranFlags.DoAsyncFetch */
+	  procStatus[CurrentProc] = Fetching; // ToDo: BlockedOnFetch;
+	  /* now done in do_the_fetchnode
+	  if (RtsFlags.GranFlags.GranSimStats.Full)
+	    DumpRawGranEvent(CurrentProc, p,
+			     GR_FETCH, CurrentTSO, node, (StgInt)0, 0);
+	  */
+	  IF_GRAN_DEBUG(blockOnFetch,
+			BlockedOnFetch[CurrentProc] = CurrentTSO;); /*- rtsTrue; -*/
+	}
+#endif /* 0 */
+
+      CurrentTime[CurrentProc] += RtsFlags.GranFlags.Costs.mtidytime;
+
+      /* Rescheduling (GranSim internal) is necessary */
+      NeedToReSchedule = rtsTrue;
+
+      return(1);
+    }
+  return(0);
+}
+
+//@cindex GranSimSpark
+void
+GranSimSpark(local,node)
+StgInt local;
+StgClosure *node;
+{
+  /* ++SparksAvail;  Nope; do that in add_to_spark_queue */
+  if (RtsFlags.GranFlags.GranSimStats.Sparks)
+    DumpRawGranEvent(CurrentProc, (PEs)0, SP_SPARK,
+		     END_TSO_QUEUE, node, (StgInt)0, spark_queue_len(CurrentProc)-1);
+
+  /* Force the PE to take notice of the spark */
+  if(RtsFlags.GranFlags.DoAlwaysCreateThreads) {
+    new_event(CurrentProc,CurrentProc,CurrentTime[CurrentProc],
+	      FindWork,
+	      END_TSO_QUEUE, (StgClosure*)NULL, (rtsSpark*)NULL);
+    if (CurrentTime[CurrentProc]<TimeOfNextEvent)
+      TimeOfNextEvent = CurrentTime[CurrentProc];
+  }
+
+  if (local)
+    ++CurrentTSO->gran.localsparks;
+  else
+    ++CurrentTSO->gran.globalsparks;
+}
+
+//@cindex GranSimSparkAt
+void
+GranSimSparkAt(spark,where,identifier)
+rtsSpark *spark;
+StgClosure *where;    /* This should be a node; alternatively could be a GA */
+StgInt identifier;
+{
+  PEs p = where_is(where);
+  GranSimSparkAtAbs(spark,p,identifier);
+}
+
+//@cindex GranSimSparkAtAbs
+void
+GranSimSparkAtAbs(spark,proc,identifier)
+rtsSpark *spark;
+PEs proc;
+StgInt identifier;
+{
+  rtsTime exporttime;
+
+  if (spark == (rtsSpark *)NULL) /* Note: Granularity control might have */
+    return;          /* turned a spark into a NULL. */
+
+  /* ++SparksAvail; Nope; do that in add_to_spark_queue */
+  if(RtsFlags.GranFlags.GranSimStats.Sparks)
+    DumpRawGranEvent(proc,0,SP_SPARKAT,
+		     END_TSO_QUEUE, spark->node, (StgInt)0, spark_queue_len(proc));
+
+  if (proc!=CurrentProc) {
+    CurrentTime[CurrentProc] += RtsFlags.GranFlags.Costs.mpacktime;
+    exporttime = (CurrentTime[proc] > CurrentTime[CurrentProc]? 
+		  CurrentTime[proc]: CurrentTime[CurrentProc])
+		 + RtsFlags.GranFlags.Costs.latency;
+  } else {
+    exporttime = CurrentTime[CurrentProc];
+  }
+
+  if ( RtsFlags.GranFlags.Light )
+    /* Need CurrentTSO in event field to associate costs with creating
+       spark even in a GrAnSim Light setup */
+    new_event(proc, CurrentProc, exporttime,
+	      MoveSpark,
+	      CurrentTSO, spark->node, spark);
+  else
+    new_event(proc, CurrentProc, exporttime,
+	      MoveSpark, (StgTSO*)NULL, spark->node, spark);
+  /* Bit of a hack to treat placed sparks the same as stolen sparks */
+  ++OutstandingFishes[proc];
+
+  /* Force the PE to take notice of the spark (FINDWORK is put after a
+     MoveSpark into the sparkq!) */
+  if (RtsFlags.GranFlags.DoAlwaysCreateThreads) {
+    new_event(CurrentProc,CurrentProc,exporttime+1,
+	      FindWork,
+	      (StgTSO*)NULL, (StgClosure*)NULL, (rtsSpark*)NULL);
+  }
+
+  if (exporttime<TimeOfNextEvent)
+    TimeOfNextEvent = exporttime;
+
+  if (proc!=CurrentProc) {
+    CurrentTime[CurrentProc] += RtsFlags.GranFlags.Costs.mtidytime;
+    ++CurrentTSO->gran.globalsparks;
+  } else {
+    ++CurrentTSO->gran.localsparks;
+  }
+}
+
+/*
+   This function handles local and global blocking.  It's called either
+   from threaded code (RBH_entry, BH_entry etc) or from blockFetch when
+   trying to fetch a BH or RBH
+*/
+
+//@cindex GranSimBlock
+void
+GranSimBlock(tso, proc, node)
+StgTSO *tso;
+PEs proc;
+StgClosure *node;
+{
+  PEs node_proc = where_is(node), tso_proc = where_is(tso);
+
+  ASSERT(tso_proc==CurrentProc);
+  // ASSERT(node_proc==CurrentProc);
+  IF_GRAN_DEBUG(bq,
+		if (node_proc!=CurrentProc)
+		  belch("## ghuH: TSO %d (%lx) [PE %d] blocks on non-local node %p [PE %d] (no simulation of FETCHMEs)",
+			tso->id, tso, tso_proc, node, node_proc));
+  ASSERT(tso->link==END_TSO_QUEUE);
+  ASSERT(!is_on_queue(tso,proc)); // tso must not be on run queue already!
+  //ASSERT(tso==run_queue_hds[proc]);
+
+  IF_DEBUG(gran,
+	   belch("GRAN: TSO %d (%p) [PE %d] blocks on closure %p @ %lx",
+		 tso->id, tso, proc, node, CurrentTime[proc]);)
+
+
+  /* THIS SHOULD NEVER HAPPEN!
+     If tso tries to block on a remote node (i.e. node_proc!=CurrentProc)
+     we have missed a GranSimFetch before entering this closure;
+     we hack around it for now, faking a FetchNode;
+     because GranSimBlock is entered via a BLACKHOLE(_BQ) closure,
+     tso will be blocked on this closure until the FetchReply occurs.
+
+     ngoq Dogh!
+
+  if (node_proc!=CurrentProc) {
+    StgInt ret;
+    ret = GranSimFetch(node);
+    IF_GRAN_DEBUG(bq,
+		  if (ret)
+		    belch(".. 
GranSimBlock: faking a FetchNode of node %p from %d to %d", + node, node_proc, CurrentProc);); + return; + } + */ + + if (RtsFlags.GranFlags.GranSimStats.Full) + DumpRawGranEvent(proc,node_proc,GR_BLOCK,tso,node,(StgInt)0,0); + + ++(tso->gran.blockcount); + /* Distinction between local and global block is made in blockFetch */ + tso->gran.blockedat = CurrentTime[proc]; + + CurrentTime[proc] += RtsFlags.GranFlags.Costs.threadqueuetime; + ActivateNextThread(proc); + /* tso->link = END_TSO_QUEUE; not really necessary; only for testing */ +} + +#endif /* GRAN */ + +//@node Index, , Dumping routines, GranSim specific code +//@subsection Index + +//@index +//* ActivateNextThread:: @cindex\s-+ActivateNextThread +//* CurrentProc:: @cindex\s-+CurrentProc +//* CurrentTime:: @cindex\s-+CurrentTime +//* GranSimAllocate:: @cindex\s-+GranSimAllocate +//* GranSimBlock:: @cindex\s-+GranSimBlock +//* GranSimExec:: @cindex\s-+GranSimExec +//* GranSimFetch:: @cindex\s-+GranSimFetch +//* GranSimLight_insertThread:: @cindex\s-+GranSimLight_insertThread +//* GranSimSpark:: @cindex\s-+GranSimSpark +//* GranSimSparkAt:: @cindex\s-+GranSimSparkAt +//* GranSimSparkAtAbs:: @cindex\s-+GranSimSparkAtAbs +//* GranSimUnallocate:: @cindex\s-+GranSimUnallocate +//* any_idle:: @cindex\s-+any_idle +//* blockFetch:: @cindex\s-+blockFetch +//* do_the_fetchnode:: @cindex\s-+do_the_fetchnode +//* do_the_fetchreply:: @cindex\s-+do_the_fetchreply +//* do_the_findwork:: @cindex\s-+do_the_findwork +//* do_the_globalblock:: @cindex\s-+do_the_globalblock +//* do_the_movespark:: @cindex\s-+do_the_movespark +//* do_the_movethread:: @cindex\s-+do_the_movethread +//* do_the_startthread:: @cindex\s-+do_the_startthread +//* do_the_unblock:: @cindex\s-+do_the_unblock +//* fetchNode:: @cindex\s-+fetchNode +//* ga_to_proc:: @cindex\s-+ga_to_proc +//* get_next_event:: @cindex\s-+get_next_event +//* get_time_of_next_event:: @cindex\s-+get_time_of_next_event +//* grab_event:: @cindex\s-+grab_event +//* handleFetchRequest:: @cindex\s-+handleFetchRequest +//* handleIdlePEs:: @cindex\s-+handleIdlePEs +//* idlers:: @cindex\s-+idlers +//* insertThread:: @cindex\s-+insertThread +//* insert_event:: @cindex\s-+insert_event +//* is_on_queue:: @cindex\s-+is_on_queue +//* is_unique:: @cindex\s-+is_unique +//* new_event:: @cindex\s-+new_event +//* prepend_event:: @cindex\s-+prepend_event +//* print_event:: @cindex\s-+print_event +//* print_eventq:: @cindex\s-+print_eventq +//* prune_eventq :: @cindex\s-+prune_eventq +//* spark queue:: @cindex\s-+spark queue +//* sparkStealTime:: @cindex\s-+sparkStealTime +//* stealSomething:: @cindex\s-+stealSomething +//* stealSpark:: @cindex\s-+stealSpark +//* stealSparkMagic:: @cindex\s-+stealSparkMagic +//* stealThread:: @cindex\s-+stealThread +//* stealThreadMagic:: @cindex\s-+stealThreadMagic +//* thread_queue_len:: @cindex\s-+thread_queue_len +//* traverse_eventq_for_gc:: @cindex\s-+traverse_eventq_for_gc +//* where_is:: @cindex\s-+where_is +//@end index diff --git a/ghc/rts/parallel/GranSimRts.h b/ghc/rts/parallel/GranSimRts.h new file mode 100644 index 0000000..585291a --- /dev/null +++ b/ghc/rts/parallel/GranSimRts.h @@ -0,0 +1,261 @@ +/* -------------------------------------------------------------------------- + Time-stamp: + $Id: GranSimRts.h,v 1.2 2000/01/13 14:34:07 hwloidl Exp $ + + Variables and functions specific to GranSim. 
+ ----------------------------------------------------------------------- */ + +#ifndef GRANSIM_RTS_H +#define GRANSIM_RTS_H + +//@node Headers for GranSim objs used only in the RTS internally, , , +//@section Headers for GranSim objs used only in the RTS internally + +//@menu +//* Event queue:: +//* Spark handling routines:: +//* Processor related stuff:: +//* Local types:: +//* Statistics gathering:: +//* Prototypes:: +//@end menu + +//@node Event queue, Spark handling routines, Headers for GranSim objs used only in the RTS internally, Headers for GranSim objs used only in the RTS internally +//@subsection Event queue + +#if defined(GRAN) || defined(PAR) +/* Granularity event types for output (see DumpGranEvent) */ +typedef enum GranEventType_ { + GR_START = 0, GR_STARTQ, + GR_STEALING, GR_STOLEN, GR_STOLENQ, + GR_FETCH, GR_REPLY, GR_BLOCK, GR_RESUME, GR_RESUMEQ, + GR_SCHEDULE, GR_DESCHEDULE, + GR_END, + SP_SPARK, SP_SPARKAT, SP_USED, SP_PRUNED, SP_EXPORTED, SP_ACQUIRED, SP_REQUESTED, + GR_ALLOC, + GR_TERMINATE, + GR_SYSTEM_START, GR_SYSTEM_END, /* only for debugging */ + GR_EVENT_MAX +} GranEventType; + +extern char *gran_event_names[]; +#endif + +#if defined(GRAN) /* whole file */ + +/* Event Types (internal use only) */ +typedef enum rtsEventType_ { + ContinueThread = 0, /* Continue running the first thread in the queue */ + StartThread, /* Start a newly created thread */ + ResumeThread, /* Resume a previously running thread */ + MoveSpark, /* Move a spark from one PE to another */ + MoveThread, /* Move a thread from one PE to another */ + FindWork, /* Search for work */ + FetchNode, /* Fetch a node */ + FetchReply, /* Receive a node */ + GlobalBlock, /* Block a TSO on a remote node */ + UnblockThread /* Make a TSO runnable */ +} rtsEventType; + +/* Number of last event type */ +#define MAX_EVENT 9 + +typedef struct rtsEvent_ { + PEs proc; /* Processor id */ + PEs creator; /* Processor id of PE that created the event */ + rtsEventType evttype; /* rtsEvent type */ + rtsTime time; /* Time at which event happened */ + StgTSO *tso; /* Associated TSO, if relevant */ + StgClosure *node; /* Associated node, if relevant */ + rtsSpark *spark; /* Associated SPARK, if relevant */ + StgInt gc_info; /* Counter of heap objects to mark (used in GC only)*/ + struct rtsEvent_ *next; + } rtsEvent; + +typedef rtsEvent *rtsEventQ; + +extern rtsEventQ EventHd; + +/* Interface for ADT of Event Queue */ +rtsEvent *get_next_event(void); +rtsTime get_time_of_next_event(void); +void insert_event(rtsEvent *newentry); +void new_event(PEs proc, PEs creator, rtsTime time, + rtsEventType evttype, StgTSO *tso, + StgClosure *node, rtsSpark *spark); +void print_event(rtsEvent *event); +void print_eventq(rtsEvent *hd); +void prepend_event(rtsEvent *event); +rtsEventQ grab_event(void); +void prune_eventq(StgTSO *tso, StgClosure *node); + +void traverse_eventq_for_gc(void); +void markEventQueue(void); + +//@node Spark handling routines, Processor related stuff, Event queue, Headers for GranSim objs used only in the RTS internally +//@subsection Spark handling routines + +/* These functions are only used in the RTS internally; see GranSim.h for rest */ +void disposeSpark(rtsSpark *spark); +void disposeSparkQ(rtsSparkQ spark); +void print_spark(rtsSpark *spark); +void print_sparkq(PEs proc); +void print_sparkq_stats(void); +nat spark_queue_len(PEs proc); +rtsSpark *delete_from_sparkq (rtsSpark *spark, PEs p, rtsBool dispose_too); +void markSparkQueue(void); + +//@node Processor related stuff, Local types, Spark handling 
routines, Headers for GranSim objs used only in the RTS internally
+//@subsection Processor related stuff
+
+typedef enum rtsProcStatus_ {
+  Idle = 0,     /* empty threadq */
+  Sparking,     /* non-empty sparkq; FINDWORK has been issued */
+  Starting,     /* STARTTHREAD has been issued */
+  Fetching,     /* waiting for remote data (only if block-on-fetch) */
+  Fishing,      /* waiting for remote spark/thread */
+  Busy          /* non-empty threadq, with head of queue active */
+} rtsProcStatus;
+
+/*
+#define IS_IDLE(proc)        (procStatus[proc] == Idle)
+#define IS_SPARKING(proc)    (procStatus[proc] == Sparking)
+#define IS_STARTING(proc)    (procStatus[proc] == Starting)
+#define IS_FETCHING(proc)    (procStatus[proc] == Fetching)
+#define IS_FISHING(proc)     (procStatus[proc] == Fishing)
+#define IS_BUSY(proc)        (procStatus[proc] == Busy)
+#define ANY_IDLE             (any_idle())
+#define MAKE_IDLE(proc)      procStatus[proc] = Idle
+#define MAKE_SPARKING(proc)  procStatus[proc] = Sparking
+#define MAKE_STARTING(proc)  procStatus[proc] = Starting
+#define MAKE_FETCHING(proc)  procStatus[proc] = Fetching
+#define MAKE_FISHING(proc)   procStatus[proc] = Fishing
+#define MAKE_BUSY(proc)      procStatus[proc] = Busy
+*/
+
+//@node Local types, Statistics gathering, Processor related stuff, Headers for GranSim objs used only in the RTS internally
+//@subsection Local types
+
+/* Return codes of HandleFetchRequest:
+    0 ... ok (FETCHREPLY event with a buffer containing addresses of the
+          nearby graph has been scheduled)
+    1 ... node is already local (fetched by somebody else; no event is
+          scheduled in here)
+    2 ... fetch request has been forwarded to the PE that now contains the
+          node
+    3 ... node is a black hole (BH, BQ or RBH); no event is scheduled, and
+          the current TSO is put into the blocking queue of that node
+    4 ... out of heap in PackNearbyGraph; GC should be triggered in calling
+          function to guarantee that the tso and node inputs are valid
+          (they may be moved during GC).
+  Return codes of blockFetch:
+    0 ... ok; tso is now at beginning of BQ attached to the bh closure
+    1 ... the bh closure is no BH any more; tso is immediately unblocked
+*/
+
+typedef enum rtsFetchReturnCode_ {
+  Ok = 0,
+  NodeIsLocal,
+  NodeHasMoved,
+  NodeIsBH,
+  NodeIsNoBH,
+  OutOfHeap
+} rtsFetchReturnCode;
+
+//@node Statistics gathering, Prototypes, Local types, Headers for GranSim objs used only in the RTS internally
+//@subsection Statistics gathering
+
+extern unsigned int /* nat */ OutstandingFetches[], OutstandingFishes[];
+extern rtsProcStatus procStatus[];
+extern StgTSO *BlockedOnFetch[];
+
+/* global structure for collecting statistics */
+typedef struct GlobalGranStats_ {
+  /* event stats */
+  nat noOfEvents;
+  nat event_counts[MAX_EVENT];
+
+  /* communication stats */
+  nat fetch_misses;
+  nat tot_fake_fetches;     // GranSim internal; faked Fetches are a kludge!! 
+ nat tot_low_pri_sparks; + + /* load distribution statistics */ + nat rs_sp_count, rs_t_count, ntimes_total, fl_total, + no_of_steals, no_of_migrates; + + /* spark queue stats */ + nat tot_sq_len, tot_sq_probes, tot_sparks; + nat tot_add_threads, tot_tq_len, non_end_add_threads; + + /* packet statistics */ + nat tot_packets, tot_packet_size, tot_cuts, tot_thunks; + + /* thread stats */ + nat tot_threads_created, threads_created_on_PE[MAX_PROC], + tot_TSOs_migrated; + + /* spark stats */ + nat pruned_sparks, withered_sparks; + nat tot_sparks_created, sparks_created_on_PE[MAX_PROC]; + + /* scheduling stats */ + nat tot_yields; + + /* blocking queue statistics */ + rtsTime tot_bq_processing_time; + nat tot_bq_len, tot_bq_len_local, tot_awbq, tot_FMBQs; +} GlobalGranStats; + +extern GlobalGranStats globalGranStats; + +//@node Prototypes, , Statistics gathering, Headers for GranSim objs used only in the RTS internally +//@subsection Prototypes + +/* Generally useful fcts */ +PEs where_is(StgClosure *node); +rtsBool is_unique(StgClosure *node); + +/* Prototypes of event handling functions; needed in Schedule.c:ReSchedule() */ +void do_the_globalblock (rtsEvent* event); +void do_the_unblock (rtsEvent* event); +void do_the_fetchnode (rtsEvent* event); +void do_the_fetchreply (rtsEvent* event); +void do_the_movethread (rtsEvent* event); +void do_the_movespark (rtsEvent* event); +void do_the_startthread(rtsEvent *event); +void do_the_findwork(rtsEvent* event); +void gimme_spark (rtsEvent *event, rtsBool *found_res, rtsSparkQ *spark_res); +rtsBool munch_spark (rtsEvent *event, rtsSparkQ spark); + +/* GranSimLight routines */ +void GranSimLight_enter_system(rtsEvent *event, StgTSO **ActiveTSOp); +void GranSimLight_leave_system(rtsEvent *event, StgTSO **ActiveTSOp); + +/* Communication related routines */ +rtsFetchReturnCode fetchNode(StgClosure* node, PEs from, PEs to); +rtsFetchReturnCode handleFetchRequest(StgClosure* node, PEs curr_proc, PEs p, StgTSO* tso); +rtsFetchReturnCode blockFetch(StgTSO* tso, PEs proc, StgClosure* bh); +void handleIdlePEs(void); + +long int random(void); /* used in stealSpark() and stealThread() in GranSim.c */ + +/* Scheduling fcts defined in GranSim.c */ +void insertThread(StgTSO *tso, PEs proc); +void endThread(StgTSO *tso, PEs proc); +rtsBool GranSimLight_insertThread(StgTSO *tso, PEs proc); +nat thread_queue_len(PEs proc); + +/* For debugging */ +rtsBool is_on_queue (StgTSO *tso, PEs proc); + +/* Interface for dumping routines (i.e. writing to log file) */ +void DumpGranEvent(GranEventType name, StgTSO *tso); +void DumpEndEvent(PEs proc, StgTSO *tso, rtsBool mandatory_thread); +void DumpTSO(StgTSO *tso); +void DumpRawGranEvent(PEs proc, PEs p, GranEventType name, + StgTSO *tso, StgClosure *node, StgInt sparkname, StgInt len); + +#endif + +#endif /* GRANSIM_RTS_H */ diff --git a/ghc/rts/parallel/HLC.h b/ghc/rts/parallel/HLC.h new file mode 100644 index 0000000..f2d98d4 --- /dev/null +++ b/ghc/rts/parallel/HLC.h @@ -0,0 +1,59 @@ +/* -------------------------------------------------------------------------- + Time-stamp: + $Id: HLC.h,v 1.2 2000/01/13 14:34:07 hwloidl Exp $ + + High Level Communications Header (HLC.h) + + Contains the high-level definitions (i.e. communication + subsystem independent) used by GUM + Phil Trinder, Glasgow University, 12 December 1994 + H-W. 
Loidl, Heriot-Watt, November 1999 + ----------------------------------------------------------------------- */ + +#ifndef __HLC_H +#define __HLC_H + +#ifdef PAR + +#include "LLC.h" + +#define NEW_FISH_AGE 0 +#define NEW_FISH_HISTORY 0 +#define NEW_FISH_HUNGER 0 +#define FISH_LIFE_EXPECTANCY 10 + + +//@node GUM Message Sending and Unpacking Functions +//@subsection GUM Message Sending and Unpacking Functions + +rtsBool initMoreBuffers(void); + +void sendFetch (globalAddr *ga, globalAddr *bqga, int load); +void sendResume(globalAddr *rga, int nelem, rtsPackBuffer *data); +void sendAck (GlobalTaskId task, int ngas, globalAddr *gagamap); +void sendFish (GlobalTaskId destPE, GlobalTaskId origPE, int age, int history, int hunger); +void sendFree (GlobalTaskId destPE, int nelem, P_ data); +void sendSchedule(GlobalTaskId origPE, int nelem, rtsPackBuffer *data); + +//@node Message-Processing Functions +//@subsection Message-Processing Functions + +void processMessages(void); +void processFetches(void); +void processTheRealFetches(void); + +//@node Miscellaneous Functions +//@subsection Miscellaneous Functions + +void prepareFreeMsgBuffers(void); +void freeRemoteGA (int pe, globalAddr *ga); +void sendFreeMessages(void); + +GlobalTaskId choosePE(void); +StgClosure *createBlockedFetch (globalAddr ga, globalAddr rga); +void waitForTermination(void); + +void DebugPrintGAGAMap (globalAddr *gagamap, int nGAs); + +#endif /* PAR */ +#endif /* __HLC_H */ diff --git a/ghc/rts/parallel/HLComms.c b/ghc/rts/parallel/HLComms.c new file mode 100644 index 0000000..bce0de7 --- /dev/null +++ b/ghc/rts/parallel/HLComms.c @@ -0,0 +1,1305 @@ +/* ---------------------------------------------------------------------------- + * Time-stamp: + * $Id: HLComms.c,v 1.2 2000/01/13 14:34:07 hwloidl Exp $ + * + * High Level Communications Routines (HLComms.lc) + * + * Contains the high-level routines (i.e. communication + * subsystem independent) used by GUM + * + * Phil Trinder, Glasgow University, 12 December 1994 + * Adapted for new RTS + * Phil Trinder, Simon Marlow July 1998 + * H-W. Loidl, Heriot-Watt University, November 1999 + * + * ------------------------------------------------------------------------- */ + +#ifdef PAR /* whole file */ + +//@node High Level Communications Routines, , , +//@section High Level Communications Routines + +//@menu +//* Macros etc:: +//* Includes:: +//* GUM Message Sending and Unpacking Functions:: +//* Message-Processing Functions:: +//* GUM Message Processor:: +//* Miscellaneous Functions:: +//* Index:: +//@end menu + +//@node Macros etc, Includes, High Level Communications Routines, High Level Communications Routines +//@subsection Macros etc + +# ifndef _AIX +# define NON_POSIX_SOURCE /* so says Solaris */ +# endif + +//@node Includes, GUM Message Sending and Unpacking Functions, Macros etc, High Level Communications Routines +//@subsection Includes + +#include "Rts.h" +#include "RtsUtils.h" +#include "RtsFlags.h" +#include "Storage.h" // for recordMutable +#include "HLC.h" +#include "Parallel.h" +#include "GranSimRts.h" +#include "ParallelRts.h" +#include "FetchMe.h" // for BLOCKED_FETCH_info etc +#if defined(DEBUG) +# include "ParallelDebug.h" +#endif +#include "StgMacros.h" // inlined IS_... 
fcts + +//@node GUM Message Sending and Unpacking Functions, Message-Processing Functions, Includes, High Level Communications Routines +//@subsection GUM Message Sending and Unpacking Functions + +/* + * GUM Message Sending and Unpacking Functions + */ + +/* + * Allocate space for message processing + */ + +//@cindex gumPackBuffer +static rtsPackBuffer *gumPackBuffer; + +//@cindex initMoreBuffers +rtsBool +initMoreBuffers(void) +{ + if ((gumPackBuffer = (rtsPackBuffer *)stgMallocWords(RtsFlags.ParFlags.packBufferSize, + "initMoreBuffers")) == NULL) + return rtsFalse; + return rtsTrue; +} + +/* + * SendFetch packs the two global addresses and a load into a message + + * sends it. + +//@cindex FETCH + + Structure of a FETCH message: + + | GA 1 | GA 2 | + +------------------------------------+------+ + | gtid | slot | weight | gtid | slot | load | + +------------------------------------+------+ + */ + +//@cindex sendFetch +void +sendFetch(globalAddr *rga, globalAddr *lga, int load) +{ + ASSERT(rga->weight > 0 && lga->weight > 0); + IF_PAR_DEBUG(fetch, + belch("** [%x] Sending Fetch for ((%x, %d, 0)); locally ((%x, %d, %x)), load = %d", + mytid, + rga->payload.gc.gtid, rga->payload.gc.slot, + lga->payload.gc.gtid, lga->payload.gc.slot, lga->weight, + load)); + + + /* ToDo: Dump event + DumpRawGranEvent(CURRENT_PROC, taskIDtoPE(rga->payload.gc.gtid), + GR_FETCH, CurrentTSO, (StgClosure *)(lga->payload.gc.slot), + 0, spark_queue_len(ADVISORY_POOL)); + */ + + sendOpV(PP_FETCH, rga->payload.gc.gtid, 6, + (StgWord) rga->payload.gc.gtid, (StgWord) rga->payload.gc.slot, + (StgWord) lga->weight, (StgWord) lga->payload.gc.gtid, + (StgWord) lga->payload.gc.slot, (StgWord) load); +} + +/* + * unpackFetch unpacks a FETCH message into two Global addresses and a load + * figure. +*/ + +//@cindex unpackFetch +static void +unpackFetch(globalAddr *lga, globalAddr *rga, int *load) +{ + long buf[6]; + + GetArgs(buf, 6); + + IF_PAR_DEBUG(fetch, + belch("** [%x] Unpacking Fetch for ((%x, %d, 0)) to ((%x, %d, %x)), load = %d", + mytid, + (GlobalTaskId) buf[0], (int) buf[1], + (GlobalTaskId) buf[3], (int) buf[4], buf[2], buf[5])); + + lga->weight = 1; + lga->payload.gc.gtid = (GlobalTaskId) buf[0]; + lga->payload.gc.slot = (int) buf[1]; + + rga->weight = (unsigned) buf[2]; + rga->payload.gc.gtid = (GlobalTaskId) buf[3]; + rga->payload.gc.slot = (int) buf[4]; + + *load = (int) buf[5]; + + ASSERT(rga->weight > 0); +} + +/* + * SendResume packs the remote blocking queue's GA and data into a message + * and sends it. + +//@cindex RESUME + + Structure of a RESUME message: + + ------------------------------- + | weight | slot | n | data ... + ------------------------------- + + data is a packed graph represented as an rtsPackBuffer + n is the size of the graph (as returned by PackNearbyGraph) + packet hdr size + */ + +//@cindex sendResume +void +sendResume(globalAddr *rga, int nelem, rtsPackBuffer *data) // StgPtr data) +{ + IF_PAR_DEBUG(resume, + PrintPacket(data); + belch("[] [%x] Sending Resume for ((%x, %d, %x))", + mytid, + rga->payload.gc.gtid, rga->payload.gc.slot, rga->weight)); + + sendOpNV(PP_RESUME, rga->payload.gc.gtid, + nelem + PACK_BUFFER_HDR_SIZE, (StgPtr)data, + 2, (rtsWeight) rga->weight, (StgWord) rga->payload.gc.slot); +} + +/* + * unpackResume unpacks a Resume message into two Global addresses and + * a data array. 
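+ *
+ * Note (from the code below): the element count shipped in the message
+ * includes PACK_BUFFER_HDR_SIZE; unpackResume reads the whole buffer and
+ * only then subtracts the header size, so on return *nelem counts the
+ * packed graph alone.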
+ */
+
+//@cindex unpackResume
+static void
+unpackResume(globalAddr *lga, int *nelem, rtsPackBuffer *data)
+{
+  long buf[3];
+
+  GetArgs(buf, 3);
+
+  IF_PAR_DEBUG(resume,
+	       belch("[] [%x] Unpacking Resume for ((%x, %d, %x))",
+		     mytid, mytid,
+		     (int) buf[1], (unsigned) buf[0]));
+
+  /*
+    RESUME event is written in awaken_blocked_queue
+  DumpRawGranEvent(CURRENT_PROC, taskIDtoPE(lga->payload.gc.gtid),
+		   GR_RESUME, END_TSO_QUEUE, (StgClosure *)NULL, 0, 0);
+  */
+
+  lga->weight = (unsigned) buf[0];
+  lga->payload.gc.gtid = mytid;
+  lga->payload.gc.slot = (int) buf[1];
+
+  *nelem = (int) buf[2]; // includes PACK_BUFFER_HDR_SIZE;
+  GetArgs(data, *nelem);
+  *nelem -= PACK_BUFFER_HDR_SIZE;
+}
+
+/*
+ * SendAck packs the global address being acknowledged, together with
+ * an array of global addresses for any closures shipped and sends them.
+
+//@cindex ACK
+
+  Structure of an ACK message:
+
+             |        GA 1          |        GA 2          |
+             +---------------------------------------------+-------
+             | weight | gtid | slot | weight | gtid | slot |  .....  ngas times
+             +---------------------------------------------+-------
+
+ */
+
+//@cindex sendAck
+void
+sendAck(GlobalTaskId task, int ngas, globalAddr *gagamap)
+{
+  static long *buffer;
+  long *p;
+  int i;
+
+  buffer = (long *) gumPackBuffer;
+
+  for(i = 0, p = buffer; i < ngas; i++, p += 6) {
+    ASSERT(gagamap[1].weight > 0);
+    p[0] = (long) gagamap->weight;
+    p[1] = (long) gagamap->payload.gc.gtid;
+    p[2] = (long) gagamap->payload.gc.slot;
+    gagamap++;
+    p[3] = (long) gagamap->weight;
+    p[4] = (long) gagamap->payload.gc.gtid;
+    p[5] = (long) gagamap->payload.gc.slot;
+    gagamap++;
+  }
+  IF_PAR_DEBUG(ack,
+	       belch(",, [%x] Sending Ack (%d pairs) to PE %x\n",
+		     mytid, ngas, task));
+
+  sendOpN(PP_ACK, task, p - buffer, buffer);
+}
+
+/*
+ * unpackAck unpacks an Acknowledgement message into a Global address,
+ * a count of the number of global addresses following and a map of
+ * Global addresses
+ */
+
+//@cindex unpackAck
+static void
+unpackAck(int *ngas, globalAddr *gagamap)
+{
+  long GAarraysize;
+  long buf[6];
+
+  GetArgs(&GAarraysize, 1);
+
+  *ngas = GAarraysize / 6;
+
+  IF_PAR_DEBUG(ack,
+	       belch(",, [%x] Unpacking Ack (%d pairs) on %x\n",
+		     mytid, *ngas, mytid));
+
+  while (GAarraysize > 0) {
+    GetArgs(buf, 6);
+    gagamap->weight = (rtsWeight) buf[0];
+    gagamap->payload.gc.gtid = (GlobalTaskId) buf[1];
+    gagamap->payload.gc.slot = (int) buf[2];
+    gagamap++;
+    gagamap->weight = (rtsWeight) buf[3];
+    gagamap->payload.gc.gtid = (GlobalTaskId) buf[4];
+    gagamap->payload.gc.slot = (int) buf[5];
+    ASSERT(gagamap->weight > 0);
+    gagamap++;
+    GAarraysize -= 6;
+  }
+}
+
+/*
+ * SendFish sends a FISH message hunting for spare work to the PE destPE;
+ * the message records the originating PE together with the age, history
+ * and hunger of the fish.
+
+//@cindex FISH
+
+  Structure of a FISH message:
+
+       +----------------------------------+
+       | orig PE | age | history | hunger |
+       +----------------------------------+
+ */
+
+//@cindex sendFish
+void
+sendFish(GlobalTaskId destPE, GlobalTaskId origPE,
+	 int age, int history, int hunger)
+{
+  IF_PAR_DEBUG(fish,
+	       belch("$$ [%x] Sending Fish to %x (%d outstanding fishes)",
+		     mytid, destPE, outstandingFishes));
+
+  sendOpV(PP_FISH, destPE, 4,
+	  (StgWord) origPE, (StgWord) age, (StgWord) history, (StgWord) hunger);
+
+  if (origPE == mytid) {
+    //fishing = rtsTrue;
+    outstandingFishes++;
+  }
+}
+
+/*
+ * unpackFish unpacks a FISH message into the global task id of the
+ * originating PE and 3 data fields: the age, history and hunger of the
+ * fish.
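+ * (Life-cycle, as implemented in processFish below: a fish arriving back
+ * at its origin, origPE == mytid, merely decrements outstandingFishes;
+ * otherwise the receiving PE replies with a SCHEDULE if it can pack a
+ * spark, forwards the fish with age+1 while age < FISH_LIFE_EXPECTANCY,
+ * and sends it home to die after that.)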
+ * The history + hunger are not currently used.
+
+ */
+
+//@cindex unpackFish
+static void
+unpackFish(GlobalTaskId *origPE, int *age, int *history, int *hunger)
+{
+  long buf[4];
+
+  GetArgs(buf, 4);
+
+  IF_PAR_DEBUG(fish,
+	       belch("$$ [%x] Unpacking Fish from PE %x (age=%d)",
+		     mytid, (GlobalTaskId) buf[0], (int) buf[1]));
+
+  *origPE = (GlobalTaskId) buf[0];
+  *age = (int) buf[1];
+  *history = (int) buf[2];
+  *hunger = (int) buf[3];
+}
+
+/*
+ * SendFree sends (weight, slot) pairs for GAs that we no longer need
+ * references to.
+
+//@cindex FREE
+
+  Structure of a FREE message:
+
+      +-----------------------------
+      | n | weight_1 | slot_1 | ...
+      +-----------------------------
+ */
+//@cindex sendFree
+void
+sendFree(GlobalTaskId pe, int nelem, StgPtr data)
+{
+  IF_PAR_DEBUG(free,
+	       belch("!! [%x] Sending Free (%d GAs) to %x",
+		     mytid, nelem/2, pe));
+
+  sendOpN(PP_FREE, pe, nelem, data);
+}
+
+/*
+ * unpackFree unpacks a FREE message into the amount of data shipped and
+ * a data block.
+ */
+//@cindex unpackFree
+static void
+unpackFree(int *nelem, rtsPackBuffer *data)
+{
+  long buf[1];
+
+  GetArgs(buf, 1);
+  *nelem = (int) buf[0];
+
+  IF_PAR_DEBUG(free,
+	       belch("!! [%x] Unpacking Free (%d GAs)",
+		     mytid, *nelem/2));
+
+  GetArgs(data, *nelem);
+}
+
+/*
+ * SendSchedule sends a closure to be evaluated in response to a Fish
+ * message. The message is directed to the PE that originated the Fish
+ * (origPE), and includes the packed closure (data) along with its size
+ * (nelem).
+
+//@cindex SCHEDULE
+
+  Structure of a SCHEDULE message:
+
+      +------------------------------------
+      | PE | n | pack buffer of a graph ...
+      +------------------------------------
+ */
+//@cindex sendSchedule
+void
+sendSchedule(GlobalTaskId origPE, int nelem, rtsPackBuffer *data) // StgPtr data)
+{
+  IF_PAR_DEBUG(schedule,
+	       PrintPacket(data);
+	       belch("-- [%x] Sending Schedule (%d elems) to %x\n",
+		     mytid, nelem, origPE));
+
+  sendOpN(PP_SCHEDULE, origPE, nelem + PACK_BUFFER_HDR_SIZE, (StgPtr)data);
+}
+
+/*
+ * unpackSchedule unpacks a SCHEDULE message into the Global address of
+ * the closure shipped, the amount of data shipped (nelem) and the data
+ * block (data).
+ */
+
+//@cindex unpackSchedule
+static void
+unpackSchedule(int *nelem, rtsPackBuffer *data)
+{
+  long buf[1];
+
+  GetArgs(buf, 1);
+  /* no. of elems, not counting the header of the pack buffer */
+  *nelem = (int) buf[0] - PACK_BUFFER_HDR_SIZE;
+
+  IF_PAR_DEBUG(schedule,
+	       belch("-- [%x] Unpacking Schedule (%d elems) on %x\n",
+		     mytid, *nelem, mytid));
+
+  /* automatic cast of flat pvm-data to rtsPackBuffer */
+  GetArgs(data, *nelem + PACK_BUFFER_HDR_SIZE);
+}
+
+//@node Message-Processing Functions, GUM Message Processor, GUM Message Sending and Unpacking Functions, High Level Communications Routines
+//@subsection Message-Processing Functions
+
+/*
+ * Message-Processing Functions
+ *
+ * The following routines process incoming GUM messages, often reissuing
+ * messages in response.
+ *
+ * processFish unpacks a fish message, reissuing it if it's our own,
+ * sending work if we have it or sending it onwards otherwise.
+ */
+
+/*
+ * blockFetch blocks a BlockedFetch node on some kind of black hole.
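+ * The black hole may be a plain BLACKHOLE, a BLACKHOLE_BQ, a FETCH_ME_BQ
+ * or an RBH (see the switch below); a plain BLACKHOLE is first turned
+ * into a BLACKHOLE_BQ so that it has a blocking queue to hang bf on, and
+ * in each case the modified closure is recorded as mutable afterwards.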
+ */ +//@cindex blockFetch +static void +blockFetch(StgBlockedFetch *bf, StgClosure *bh) { + bf->node = bh; + switch (get_itbl(bh)->type) { + case BLACKHOLE: + bf->link = END_BQ_QUEUE; + //((StgBlockingQueue *)bh)->header.info = &BLACKHOLE_BQ_info; + SET_INFO(bh, &BLACKHOLE_BQ_info); // turn closure into a blocking queue + ((StgBlockingQueue *)bh)->blocking_queue = (StgBlockingQueueElement *)bf; + + // put bh on the mutables list + recordMutable((StgMutClosure *)bh); + +# if 0 + /* + * If we modify a black hole in the old generation, we have to + * make sure it goes on the mutables list + */ + + if (bh <= StorageMgrInfo.OldLim) { + MUT_LINK(bh) = (StgWord) StorageMgrInfo.OldMutables; + StorageMgrInfo.OldMutables = bh; + } else + MUT_LINK(bh) = MUT_NOT_LINKED; +# endif + break; + + case BLACKHOLE_BQ: + /* enqueue bf on blocking queue of closure bh */ + bf->link = ((StgBlockingQueue *)bh)->blocking_queue; + ((StgBlockingQueue *)bh)->blocking_queue = (StgBlockingQueueElement *)bf; + + // put bh on the mutables list; ToDo: check + recordMutable((StgMutClosure *)bh); + break; + + case FETCH_ME_BQ: + /* enqueue bf on blocking queue of closure bh */ + bf->link = ((StgFetchMeBlockingQueue *)bh)->blocking_queue; + ((StgFetchMeBlockingQueue *)bh)->blocking_queue = (StgBlockingQueueElement *)bf; + + // put bh on the mutables list; ToDo: check + recordMutable((StgMutClosure *)bh); + break; + + case RBH: + /* enqueue bf on blocking queue of closure bh */ + bf->link = ((StgRBH *)bh)->blocking_queue; + ((StgRBH *)bh)->blocking_queue = (StgBlockingQueueElement *)bf; + + // put bh on the mutables list; ToDo: check + recordMutable((StgMutClosure *)bh); + break; + + default: + barf("Panic (blockFetch): thought %p was a black hole (IP %#lx, %s)", + (StgClosure *)bh, get_itbl((StgClosure *)bh), + info_type((StgClosure *)bh)); + } + IF_PAR_DEBUG(verbose, + belch("## blockFetch: after block the BQ of %p (%s) is:", + bh, info_type(bh)); + print_bq(bh)); +} + + +/* + * processFetches constructs and sends resume messages for every + * BlockedFetch which is ready to be awakened. + * awaken_blocked_queue (in Schedule.c) is responsible for moving + * BlockedFetches from a blocking queue to the PendingFetches queue. + */ +void GetRoots(void); +extern StgBlockedFetch *PendingFetches; + +nat +pending_fetches_len(void) +{ + StgBlockedFetch *bf; + nat n; + + for (n=0, bf=PendingFetches; bf != END_BF_QUEUE; n++, bf = (StgBlockedFetch *)(bf->link)) { + ASSERT(get_itbl(bf)->type==BLOCKED_FETCH); + } + return n; +} + +//@cindex processFetches +void +processFetches(void) { + StgBlockedFetch *bf; + StgClosure *closure, *next; + StgInfoTable *ip; + globalAddr rga; + static rtsPackBuffer *packBuffer; + + IF_PAR_DEBUG(verbose, + belch("__ processFetches: %d pending fetches", + pending_fetches_len())); + + for (bf = PendingFetches; + bf != END_BF_QUEUE; + bf=(StgBlockedFetch *)(bf->link)) { + /* the PendingFetches list contains only BLOCKED_FETCH closures */ + ASSERT(get_itbl(bf)->type==BLOCKED_FETCH); + + /* + * Find the target at the end of the indirection chain, and + * process it in much the same fashion as the original target + * of the fetch. Though we hope to find graph here, we could + * find a black hole (of any flavor) or even a FetchMe. 
+     */
+    closure = bf->node;
+    /*
+       HACK 312: bf->node may have been evacuated since filling it; follow
+       the evacuee in this case; the proper way to handle this is to
+       traverse the blocking queue and update the node fields of
+       BLOCKED_FETCH entries when evacuating a BLACKHOLE_BQ, FETCH_ME_BQ
+       or RBH (but it's late and I'm tired)
+    */
+    if (get_itbl(closure)->type == EVACUATED)
+      closure = ((StgEvacuated *)closure)->evacuee;
+
+    while ((next = IS_INDIRECTION(closure)) != NULL) { closure = next; }
+
+    ip = get_itbl(closure);
+    if (ip->type == FETCH_ME) {
+      /* Forward the Fetch to someone else */
+      rga.payload.gc.gtid = bf->ga.payload.gc.gtid;
+      rga.payload.gc.slot = bf->ga.payload.gc.slot;
+      rga.weight = bf->ga.weight;
+
+      sendFetch(((StgFetchMe *)closure)->ga, &rga, 0 /* load */);
+
+      IF_PAR_DEBUG(forward,
+		   belch("__ processFetches: Forwarding fetch from %lx to %lx",
+			 mytid, rga.payload.gc.gtid));
+
+    } else if (IS_BLACK_HOLE(closure)) {
+      IF_PAR_DEBUG(verbose,
+		   belch("__ processFetches: trying to send a BLACK_HOLE => doing a blockFetch on closure %p (%s)",
+			 closure, info_type(closure)));
+      bf->node = closure;
+      blockFetch(bf, closure);
+    } else {
+      /* We now have some local graph to send back */
+      nat size;
+
+      packBuffer = gumPackBuffer;
+      IF_PAR_DEBUG(verbose,
+		   belch("__ processFetches: PackNearbyGraph of closure %p (%s)",
+			 closure, info_type(closure)));
+
+      if ((packBuffer = PackNearbyGraph(closure, END_TSO_QUEUE, &size)) == NULL) {
+	// Put current BF back on list
+	bf->link = (StgBlockingQueueElement *)PendingFetches;
+	PendingFetches = (StgBlockedFetch *)bf;
+	// ToDo: check that nothing more has to be done to prepare for GC!
+	GarbageCollect(GetRoots);
+	bf = PendingFetches;
+	PendingFetches = (StgBlockedFetch *)(bf->link);
+	closure = bf->node;
+	packBuffer = PackNearbyGraph(closure, END_TSO_QUEUE, &size);
+	ASSERT(packBuffer != (rtsPackBuffer *)NULL);
+      }
+      rga.payload.gc.gtid = bf->ga.payload.gc.gtid;
+      rga.payload.gc.slot = bf->ga.payload.gc.slot;
+      rga.weight = bf->ga.weight;
+
+      sendResume(&rga, size, packBuffer);
+    }
+  }
+  PendingFetches = END_BF_QUEUE;
+}
+
+#if 0
+/*
+  Alternatively to sending fetch messages directly from the FETCH_ME_entry
+  code we could just store the data about the remote data in a global
+  variable and send the fetch request from the main scheduling loop (similar
+  to processFetches above). This would save an expensive STGCALL in the entry
+  code because we have to go back to the scheduler anyway.
+*/
+//@cindex processFetches
+void
+processTheRealFetches(void) {
+  StgBlockedFetch *bf;
+  StgClosure *closure, *next;
+
+  IF_PAR_DEBUG(verbose,
+	       belch("__ processTheRealFetches: ");
+	       printGA(&theGlobalFromGA);
+	       printGA(&theGlobalToGA));
+
+  ASSERT(theGlobalFromGA.payload.gc.gtid != 0 &&
+	 theGlobalToGA.payload.gc.gtid != 0);
+
+  /* the old version did this in the FETCH_ME entry code */
+  sendFetch(&theGlobalFromGA, &theGlobalToGA, 0/*load*/);
+
+#if DEBUG
+  theGlobalFromGA.payload.gc.gtid = 0;
+  theGlobalToGA.payload.gc.gtid = 0;
+#endif /* DEBUG */
+}
+#endif
+
+
+/*
+ * processFish unpacks a fish message, reissuing it if it's our own,
+ * sending work if we have it or sending it onwards otherwise.
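+ *
+ * In outline (see the body below):
+ *   fish came home (origPE == mytid)      -> outstandingFishes--, done;
+ *   a local spark can be packed           -> sendSchedule(origPE, ...);
+ *   no spark, age < FISH_LIFE_EXPECTANCY  -> sendFish(choosePE(), origPE, age+1, ...);
+ *   no spark, fish too old                -> sendFish(origPE, origPE, age+1, ...).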
+ */
+//@cindex processFish
+static void
+processFish(void)
+{
+  GlobalTaskId origPE;
+  int age, history, hunger;
+  rtsSpark spark;
+  static rtsPackBuffer *packBuffer;
+
+  unpackFish(&origPE, &age, &history, &hunger);
+
+  if (origPE == mytid) {
+    //fishing = rtsFalse;                   // fish has come home
+    outstandingFishes--;
+    last_fish_arrived_at = CURRENT_TIME;  // remember time (see schedule fct)
+    return;                               // that's all
+  }
+
+  ASSERT(origPE != mytid);
+  IF_PAR_DEBUG(fish,
+	       belch("$$ [%x] processing fish; %d sparks available",
+		     mytid, spark_queue_len(ADVISORY_POOL)));
+  while ((spark = findLocalSpark(rtsTrue)) != NULL) {
+      nat size;
+      // StgClosure *graph;
+
+      packBuffer = gumPackBuffer;
+      ASSERT(closure_SHOULD_SPARK((StgClosure *)spark));
+      if ((packBuffer = PackNearbyGraph(spark, END_TSO_QUEUE, &size)) == NULL) {
+	IF_PAR_DEBUG(fish,
+		     belch("$$ GC while trying to satisfy FISH via PackNearbyGraph of node %p",
+			   (StgClosure *)spark));
+	GarbageCollect(GetRoots);
+	/* Now go back and try again */
+      } else {
+	IF_PAR_DEBUG(fish,
+		     belch("$$ [%x] Replying to FISH from %x by sending graph @ %p (%s)",
+			   mytid, origPE,
+			   (StgClosure *)spark, info_type((StgClosure *)spark)));
+	sendSchedule(origPE, size, packBuffer);
+	disposeSpark(spark);
+	break;
+      }
+  }
+  if (spark == (rtsSpark)NULL) {
+    IF_PAR_DEBUG(fish,
+		 belch("$$ [%x] No sparks available for FISH from %x",
+		       mytid, origPE));
+    /* We have no sparks to give */
+    if (age < FISH_LIFE_EXPECTANCY)
+      /* and the fish is still young, send it to another PE to look for work */
+      sendFish(choosePE(), origPE,
+	       (age + 1), NEW_FISH_HISTORY, NEW_FISH_HUNGER);
+
+    /* otherwise, send it home to die */
+    else
+      sendFish(origPE, origPE, (age + 1), NEW_FISH_HISTORY, NEW_FISH_HUNGER);
+  }
+}  /* processFish */
+
+/*
+ * processFetch either returns the requested data (if available)
+ * or blocks the remote blocking queue on a black hole (if not).
+ */
+
+//@cindex processFetch
+static void
+processFetch(void)
+{
+  globalAddr ga, rga;
+  int load;
+  StgClosure *closure;
+  StgInfoTable *ip;
+
+  unpackFetch(&ga, &rga, &load);
+  IF_PAR_DEBUG(fetch,
+	       belch("%% [%x] Rcvd Fetch for ((%x, %d, 0)), Resume ((%x, %d, %x)) (load %d) from %x",
+		     mytid,
+		     ga.payload.gc.gtid, ga.payload.gc.slot,
+		     rga.payload.gc.gtid, rga.payload.gc.slot, rga.weight, load,
+		     rga.payload.gc.gtid));
+
+  closure = GALAlookup(&ga);
+  ASSERT(closure != (StgClosure *)NULL);
+  ip = get_itbl(closure);
+  if (ip->type == FETCH_ME) {
+    /* Forward the Fetch to someone else */
+    sendFetch(((StgFetchMe *)closure)->ga, &rga, load);
+  } else if (rga.payload.gc.gtid == mytid) {
+    /* Our own FETCH forwarded back around to us */
+    StgFetchMeBlockingQueue *fmbq = (StgFetchMeBlockingQueue *)GALAlookup(&rga);
+
+    IF_PAR_DEBUG(fetch,
+		 belch("%% [%x] Fetch returned to sending PE; closure=%p (%s); receiver=%p (%s)",
+		       mytid, closure, info_type(closure), fmbq, info_type(fmbq)));
+    /* We may have already discovered that the fetch target is our own. */
+    if ((StgClosure *)fmbq != closure)
+      CommonUp((StgClosure *)fmbq, closure);
+    (void) addWeight(&rga);
+  } else if (IS_BLACK_HOLE(closure)) {
+    /* This includes RBH's and FMBQ's */
+    StgBlockedFetch *bf;
+
+    ASSERT(GALAlookup(&rga) == NULL);
+
+    /* If we're hitting a BH or RBH or FMBQ we have to put a BLOCKED_FETCH
+       closure into the BQ in order to denote that when updating this node
+       the result should be sent to the originator of this fetch message.
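+       The BLOCKED_FETCH closure is allocated by createBlockedFetch (see
+       below) and remembers the requester's GA (rga); once the node is
+       finally updated, awaken_blocked_queue moves the entry to
+       PendingFetches and processFetches turns it into a RESUME message.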
*/ + bf = (StgBlockedFetch *)createBlockedFetch(ga, rga); + blockFetch(bf, closure); + + IF_PAR_DEBUG(fetch, + belch("%% [%x] Blocking Fetch ((%x, %d, %x)) on %p (%s)", + mytid, + rga.payload.gc.gtid, rga.payload.gc.slot, rga.weight, + closure, info_type(closure))); + } else { + /* The target of the FetchMe is some local graph */ + nat size; + // StgClosure *graph; + rtsPackBuffer *buffer = (rtsPackBuffer *)NULL; + + if ((buffer = PackNearbyGraph(closure, END_TSO_QUEUE, &size)) == NULL) { + GarbageCollect(GetRoots); + closure = GALAlookup(&ga); + buffer = PackNearbyGraph(closure, END_TSO_QUEUE, &size); + ASSERT(buffer != (rtsPackBuffer *)NULL); + } + sendResume(&rga, size, buffer); + } +} + +/* + * processFree unpacks a FREE message and adds the weights to our GAs. + */ +//@cindex processFree +static void +processFree(void) +{ + int nelem; + static StgWord *buffer; + int i; + globalAddr ga; + + buffer = (StgWord *)gumPackBuffer; + unpackFree(&nelem, buffer); + IF_PAR_DEBUG(free, + belch("!! [%x] Rcvd Free (%d GAs)", mytid, nelem / 2)); + + ga.payload.gc.gtid = mytid; + for (i = 0; i < nelem;) { + ga.weight = (rtsWeight) buffer[i++]; + ga.payload.gc.slot = (int) buffer[i++]; + IF_PAR_DEBUG(free, + fprintf(stderr, "!! [%x] Processing free ", mytid); + printGA(&ga); + fputc('\n', stderr); + ); + (void) addWeight(&ga); + } +} + +/* + * processResume unpacks a RESUME message into the graph, filling in + * the LA -> GA, and GA -> LA tables. Threads blocked on the original + * FetchMe (now a blocking queue) are awakened, and the blocking queue + * is converted into an indirection. Finally it sends an ACK in response + * which contains any newly allocated GAs. + */ + +//@cindex processResume +static void +processResume(GlobalTaskId sender) +{ + int nelem; + nat nGAs; + static rtsPackBuffer *packBuffer; + StgClosure *newGraph, *old; + globalAddr lga; + globalAddr *gagamap; + + packBuffer = gumPackBuffer; + unpackResume(&lga, &nelem, (StgPtr)packBuffer); + + IF_PAR_DEBUG(resume, + fprintf(stderr, "[] [%x] Rcvd Resume for ", mytid); + printGA(&lga); + fputc('\n', stderr); + PrintPacket((rtsPackBuffer *)packBuffer)); + + /* + * We always unpack the incoming graph, even if we've received the + * requested node in some other data packet (and already awakened + * the blocking queue). + if (SAVE_Hp + packBuffer[0] >= SAVE_HpLim) { + ReallyPerformThreadGC(packBuffer[0], rtsFalse); + SAVE_Hp -= packBuffer[0]; + } + */ + + // ToDo: Check for GC here !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + /* Do this *after* GC; we don't want to release the object early! */ + + if (lga.weight > 0) + (void) addWeight(&lga); + + old = GALAlookup(&lga); + + if (RtsFlags.ParFlags.ParStats.Full) { + // StgTSO *tso = END_TSO_QUEUE; + StgBlockingQueueElement *bqe; + + /* Write REPLY events to the log file, indicating that the remote + data has arrived */ + if (get_itbl(old)->type == FETCH_ME_BQ || + get_itbl(old)->type == RBH) + for (bqe = ((StgFetchMeBlockingQueue *)old)->blocking_queue; + bqe->link != END_BQ_QUEUE; + bqe = bqe->link) + if (get_itbl((StgClosure *)bqe)->type == TSO) + DumpRawGranEvent(CURRENT_PROC, taskIDtoPE(sender), + GR_REPLY, ((StgTSO *)bqe), ((StgTSO *)bqe)->block_info.closure, + 0, spark_queue_len(ADVISORY_POOL)); + } + + newGraph = UnpackGraph(packBuffer, &gagamap, &nGAs); + ASSERT(newGraph != NULL); + + /* + * Sometimes, unpacking will common up the resumee with the + * incoming graph, but if it hasn't, we'd better do so now. 
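+   *
+   * (CommonUp turns the old FETCH_ME_BQ into, in effect, an indirection
+   * to newGraph, waking the threads that were blocked on it; cf. the
+   * comment at the head of processResume above.)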
+ */ + + if (get_itbl(old)->type == FETCH_ME_BQ) + CommonUp(old, newGraph); + + IF_PAR_DEBUG(resume, + DebugPrintGAGAMap(gagamap, nGAs)); + + sendAck(sender, nGAs, gagamap); +} + +/* + * processSchedule unpacks a SCHEDULE message into the graph, filling + * in the LA -> GA, and GA -> LA tables. The root of the graph is added to + * the local spark queue. Finally it sends an ACK in response + * which contains any newly allocated GAs. + */ +//@cindex processSchedule +static void +processSchedule(GlobalTaskId sender) +{ + nat nelem, space_required, nGAs; + rtsBool success; + static rtsPackBuffer *packBuffer; + StgClosure *newGraph; + globalAddr *gagamap; + + packBuffer = gumPackBuffer; /* HWL */ + unpackSchedule(&nelem, packBuffer); + + IF_PAR_DEBUG(schedule, + belch("-- [%x] Rcvd Schedule (%d elems)", mytid, nelem); + PrintPacket(packBuffer)); + + /* + * For now, the graph is a closure to be sparked as an advisory + * spark, but in future it may be a complete spark with + * required/advisory status, priority etc. + */ + + /* + space_required = packBuffer[0]; + if (SAVE_Hp + space_required >= SAVE_HpLim) { + ReallyPerformThreadGC(space_required, rtsFalse); + SAVE_Hp -= space_required; + } + */ + // ToDo: check whether GC is necessary !!!!!!!!!!!!!!!!!!!!!1 + newGraph = UnpackGraph(packBuffer, &gagamap, &nGAs); + ASSERT(newGraph != NULL); + success = add_to_spark_queue(newGraph, rtsFalse); + + IF_PAR_DEBUG(pack, + if (success) + belch("+* added spark to unpacked graph %p; %d sparks available on [%x]", + newGraph, spark_queue_len(ADVISORY_POOL), mytid); + else + belch("+* received non-sparkable closure %p; nothing added to spark pool; %d sparks available on [%x]", + newGraph, spark_queue_len(ADVISORY_POOL), mytid); + belch("-* Unpacked graph with root at %p (%s):", + newGraph, info_type(newGraph)); + PrintGraph(newGraph, 0)); + + IF_PAR_DEBUG(pack, + DebugPrintGAGAMap(gagamap, nGAs)); + + if (nGAs > 0) + sendAck(sender, nGAs, gagamap); + + //fishing = rtsFalse; + ASSERT(outstandingFishes>0); + outstandingFishes--; +} + +/* + * processAck unpacks an ACK, and uses the GAGA map to convert RBH's + * (which represent shared thunks that have been shipped) into fetch-mes + * to remote GAs. + */ +//@cindex processAck +static void +processAck(void) +{ + nat nGAs; + globalAddr *gaga; + globalAddr gagamap[256]; // ToDo: elim magic constant!! MAX_GAS * 2];?? + + unpackAck(&nGAs, gagamap); + + IF_PAR_DEBUG(ack, + belch(",, [%x] Rcvd Ack (%d pairs)", mytid, nGAs); + DebugPrintGAGAMap(gagamap, nGAs)); + + /* + * For each (oldGA, newGA) pair, set the GA of the corresponding + * thunk to the newGA, convert the thunk to a FetchMe, and return + * the weight from the oldGA. + */ + for (gaga = gagamap; gaga < gagamap + nGAs * 2; gaga += 2) { + StgClosure *old_closure = GALAlookup(gaga); + StgClosure *new_closure = GALAlookup(gaga + 1); + + ASSERT(old_closure != NULL); + if (new_closure == NULL) { + /* We don't have this closure, so we make a fetchme for it */ + globalAddr *ga = setRemoteGA(old_closure, gaga + 1, rtsTrue); + + /* convertToFetchMe should be done unconditionally here. + Currently, we assign GAs to CONSTRs, too, (a bit of a hack), + so we have to check whether it is an RBH before converting + + ASSERT(get_itbl(old_closure)==RBH); + */ + if (get_itbl(old_closure)->type==RBH) + convertToFetchMe(old_closure, ga); + } else { + /* + * Oops...we've got this one already; update the RBH to + * point to the object we already know about, whatever it + * happens to be. 
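+       * (Again done via CommonUp, which also wakes up anything that
+       * blocked on old_closure in the meantime.)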
+       */
+      CommonUp(old_closure, new_closure);
+
+      /*
+       * Increase the weight of the object by the amount just
+       * received in the second part of the ACK pair.
+       */
+      (void) addWeight(gaga + 1);
+    }
+    (void) addWeight(gaga);
+  }
+}
+
+//@node GUM Message Processor, Miscellaneous Functions, Message-Processing Functions, High Level Communications Routines
+//@subsection GUM Message Processor
+
+/*
+ * GUM Message Processor
+
+ * processMessages processes any messages that have arrived, calling
+ * appropriate routines depending on the message tag
+ * (opcode). N.B. Unless profiling it assumes that there {\em ARE} messages
+ * present and performs a blocking receive! During profiling it
+ * busy-waits in order to record idle time.
+ */
+
+//@cindex processMessages
+void
+processMessages(void)
+{
+  rtsPacket packet;
+  OpCode opcode;
+  GlobalTaskId task;
+
+  do {
+    packet = GetPacket();  /* Get next message; block until one available */
+    getOpcodeAndSender(packet, &opcode, &task);
+
+    switch (opcode) {
+    case PP_FINISH:
+      IF_PAR_DEBUG(verbose,
+		   belch("== [%x] received FINISH", mytid));
+      /* setting this global variable eventually terminates the main
+         scheduling loop for this PE and causes a shut-down, sending
+         PP_FINISH to SysMan */
+      GlobalStopPending = rtsTrue;
+      break;
+
+    case PP_FETCH:
+      processFetch();
+      break;
+
+    case PP_RESUME:
+      processResume(task);
+      break;
+
+    case PP_ACK:
+      processAck();
+      break;
+
+    case PP_FISH:
+      processFish();
+      break;
+
+    case PP_FREE:
+      processFree();
+      break;
+
+    case PP_SCHEDULE:
+      processSchedule(task);
+      break;
+
+    default:
+      /* Anything we're not prepared to deal with. */
+      barf("Task %x: Unexpected opcode %x from %x",
+	   mytid, opcode, task);
+    } /* switch */
+
+  } while (PacketsWaiting());	/* While there are messages: process them */
+}  /* processMessages */
+
+//@node Miscellaneous Functions, Index, GUM Message Processor, High Level Communications Routines
+//@subsection Miscellaneous Functions
+
+/*
+ * ChoosePE selects a GlobalTaskId from the array of PEs 'at random'.
+ * Important properties: + * - it varies during execution, even if the PE is idle + * - it's different for each PE + * - we never send a fish to ourselves + */ +extern long lrand48 (void); + +//@cindex choosePE +GlobalTaskId +choosePE(void) +{ + long temp; + + temp = lrand48() % nPEs; + if (allPEs[temp] == mytid) { /* Never send a FISH to yourself */ + temp = (temp + 1) % nPEs; + } + return allPEs[temp]; +} + +/* + * allocate a BLOCKED_FETCH closure and fill it with the relevant fields + * of the ga argument; called from processFetch when the local closure is + * under evaluation + */ +//@cindex createBlockedFetch +StgClosure * +createBlockedFetch (globalAddr ga, globalAddr rga) +{ + StgBlockedFetch *bf; + StgClosure *closure; + + closure = GALAlookup(&ga); + if ((bf = (StgBlockedFetch *)allocate(FIXED_HS + sizeofW(StgBlockedFetch))) == NULL) { + GarbageCollect(GetRoots); + closure = GALAlookup(&ga); + bf = (StgBlockedFetch *)allocate(FIXED_HS + sizeofW(StgBlockedFetch)); + // ToDo: check whether really guaranteed to succeed 2nd time around + } + + ASSERT(bf != (StgClosure *)NULL); + SET_INFO((StgClosure *)bf, &BLOCKED_FETCH_info); + // ToDo: check whether other header info is needed + bf->node = closure; + bf->ga.payload.gc.gtid = rga.payload.gc.gtid; + bf->ga.payload.gc.slot = rga.payload.gc.slot; + bf->ga.weight = rga.weight; + // bf->link = NULL; debugging + + IF_PAR_DEBUG(fetch, + fprintf(stderr, "%% [%x] created BF: closure=%p (%s), GA: ", + mytid, closure, info_type(closure)); + printGA(&(bf->ga)); + fputc('\n',stderr)); + return bf; +} + +/* + * waitForTermination enters a loop ignoring spurious messages while + * waiting for the termination sequence to be completed. + */ +//@cindex waitForTermination +void +waitForTermination(void) +{ + do { + rtsPacket p = GetPacket(); + processUnexpected(p); + } while (rtsTrue); +} + +#ifdef DEBUG +//@cindex DebugPrintGAGAMap +void +DebugPrintGAGAMap(globalAddr *gagamap, int nGAs) +{ + int i; + + for (i = 0; i < nGAs; ++i, gagamap += 2) + fprintf(stderr, "gagamap[%d] = ((%x, %d, %x)) -> ((%x, %d, %x))\n", i, + gagamap[0].payload.gc.gtid, gagamap[0].payload.gc.slot, gagamap[0].weight, + gagamap[1].payload.gc.gtid, gagamap[1].payload.gc.slot, gagamap[1].weight); +} +#endif + +//@cindex freeMsgBuffer +static StgWord **freeMsgBuffer = NULL; +//@cindex freeMsgIndex +static int *freeMsgIndex = NULL; + +//@cindex prepareFreeMsgBuffers +void +prepareFreeMsgBuffers(void) +{ + int i; + + /* Allocate the freeMsg buffers just once and then hang onto them. 
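+     Each PE other than ourselves gets its own buffer of
+     RtsFlags.ParFlags.packBufferSize words; freeRemoteGA below appends
+     (weight, slot) pairs to it and flushes it with sendFree when it fills
+     up, and sendFreeMessages flushes whatever is left at the end.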
*/
+ if (freeMsgIndex == NULL) {
+ freeMsgIndex = (int *) stgMallocBytes(nPEs * sizeof(int),
+ "prepareFreeMsgBuffers (Index)");
+ freeMsgBuffer = (StgWord **) stgMallocBytes(nPEs * sizeof(StgWord *),
+ "prepareFreeMsgBuffers (Buffer)");
+
+ for(i = 0; i < nPEs; i++)
+ if (i != thisPE)
+ freeMsgBuffer[i] = (StgWord *) stgMallocWords(RtsFlags.ParFlags.packBufferSize,
+ "prepareFreeMsgBuffers (Buffer #i)");
+ }
+
+ /* Initialize the freeMsg buffer pointers to point to the start of their
+ buffers */
+ for (i = 0; i < nPEs; i++)
+ freeMsgIndex[i] = 0;
+}
+
+//@cindex freeRemoteGA
+void
+freeRemoteGA(int pe, globalAddr *ga)
+{
+ int i;
+
+ ASSERT(GALAlookup(ga) == NULL);
+
+ if ((i = freeMsgIndex[pe]) + 2 >= RtsFlags.ParFlags.packBufferSize) {
+ IF_PAR_DEBUG(free,
+ belch("Filled a free message buffer (sending remaining messages individually)"));
+
+ sendFree(ga->payload.gc.gtid, i, freeMsgBuffer[pe]);
+ i = 0;
+ }
+ freeMsgBuffer[pe][i++] = (StgWord) ga->weight;
+ freeMsgBuffer[pe][i++] = (StgWord) ga->payload.gc.slot;
+ freeMsgIndex[pe] = i;
+
+#ifdef DEBUG
+ ga->weight = 0x0f0f0f0f;
+ ga->payload.gc.gtid = 0x666;
+ ga->payload.gc.slot = 0xdeaddead;
+#endif
+}
+
+//@cindex sendFreeMessages
+void
+sendFreeMessages(void)
+{
+ int i;
+
+ for (i = 0; i < nPEs; i++)
+ if (freeMsgIndex[i] > 0)
+ sendFree(allPEs[i], freeMsgIndex[i], freeMsgBuffer[i]);
+}
+
+#endif /* PAR -- whole file */
+
+//@node Index, , Miscellaneous Functions, High Level Communications Routines
+//@subsection Index
+
+//@index
+//* ACK:: @cindex\s-+ACK
+//* DebugPrintGAGAMap:: @cindex\s-+DebugPrintGAGAMap
+//* FETCH:: @cindex\s-+FETCH
+//* FISH:: @cindex\s-+FISH
+//* FREE:: @cindex\s-+FREE
+//* RESUME:: @cindex\s-+RESUME
+//* SCHEDULE:: @cindex\s-+SCHEDULE
+//* blockFetch:: @cindex\s-+blockFetch
+//* choosePE:: @cindex\s-+choosePE
+//* freeMsgBuffer:: @cindex\s-+freeMsgBuffer
+//* freeMsgIndex:: @cindex\s-+freeMsgIndex
+//* freeRemoteGA:: @cindex\s-+freeRemoteGA
+//* gumPackBuffer:: @cindex\s-+gumPackBuffer
+//* initMoreBuffers:: @cindex\s-+initMoreBuffers
+//* prepareFreeMsgBuffers:: @cindex\s-+prepareFreeMsgBuffers
+//* processAck:: @cindex\s-+processAck
+//* processFetch:: @cindex\s-+processFetch
+//* processFetches:: @cindex\s-+processFetches
+//* processFish:: @cindex\s-+processFish
+//* processFree:: @cindex\s-+processFree
+//* processMessages:: @cindex\s-+processMessages
+//* processResume:: @cindex\s-+processResume
+//* processSchedule:: @cindex\s-+processSchedule
+//* sendAck:: @cindex\s-+sendAck
+//* sendFetch:: @cindex\s-+sendFetch
+//* sendFish:: @cindex\s-+sendFish
+//* sendFree:: @cindex\s-+sendFree
+//* sendFreeMessages:: @cindex\s-+sendFreeMessages
+//* sendResume:: @cindex\s-+sendResume
+//* sendSchedule:: @cindex\s-+sendSchedule
+//* unpackAck:: @cindex\s-+unpackAck
+//* unpackFetch:: @cindex\s-+unpackFetch
+//* unpackFish:: @cindex\s-+unpackFish
+//* unpackFree:: @cindex\s-+unpackFree
+//* unpackResume:: @cindex\s-+unpackResume
+//* unpackSchedule:: @cindex\s-+unpackSchedule
+//* waitForTermination:: @cindex\s-+waitForTermination
+//@end index
diff --git a/ghc/rts/parallel/LLC.h b/ghc/rts/parallel/LLC.h
new file mode 100644
index 0000000..eb63366
--- /dev/null
+++ b/ghc/rts/parallel/LLC.h
@@ -0,0 +1,128 @@
+/* --------------------------------------------------------------------------
+ Time-stamp:
+ $Id: LLC.h,v 1.2 2000/01/13 14:34:07 hwloidl Exp $
+
+ Low Level Communications Header (LLC.h)
+
+ Contains the definitions used by the Low-level Communications
+ module of the GUM Haskell runtime environment. 
+ Based on the Graph for PVM implementation. + + Phil Trinder, Glasgow University, 13th Dec 1994 + Adapted for the 4.xx RTS + H-W. Loidl, Heriot-Watt, November 1999 + ----------------------------------------------------------------------- */ + +#ifndef __LLC_H +#define __LLC_H + +#ifdef PAR + +//@node Low Level Communications Header, , , +//@section Low Level Communications Header + +//@menu +//* Includes:: +//* Macros and Constants:: +//* PVM macros:: +//* Externs:: +//@end menu + +//@node Includes, Macros and Constants, Low Level Communications Header, Low Level Communications Header +//@subsection Includes + +#include "Rts.h" +#include "Parallel.h" + +#include "PEOpCodes.h" +#include "pvm3.h" + +//@node Macros and Constants, PVM macros, Includes, Low Level Communications Header +//@subsection Macros and Constants + +#define ANY_TASK (-1) /* receive messages from any task */ +#define ANY_GLOBAL_TASK ANY_TASK +#define ANY_OPCODE (-1) /* receive any opcode */ +#define ALL_GROUP (-1) /* wait for barrier from every group member */ + +#define PEGROUP "PE" + +#define MGRGROUP "MGR" +#define PECTLGROUP "PECTL" + + +#define PETASK "PE" + +//@node PVM macros, Externs, Macros and Constants, Low Level Communications Header +//@subsection PVM macros + +#define sync(gp,op) do { \ + broadcast(gp,op); \ + pvm_barrier(gp,ALL_GROUP); \ + } while(0) + +#define broadcast(gp,op) do { \ + pvm_initsend(PvmDataDefault); \ + pvm_bcast(gp,op); \ + } while(0) + +#define checkComms(c,s) do { \ + if ((c)<0) { \ + pvm_perror(s); \ + stg_exit(EXIT_FAILURE); \ + }} while(0) + +#define _my_gtid pvm_mytid() +#define GetPacket() pvm_recv(ANY_TASK,ANY_OPCODE) +#define PacketsWaiting() (pvm_probe(ANY_TASK,ANY_OPCODE) != 0) + +#define SPARK_THREAD_DESCRIPTOR 1 +#define GLOBAL_THREAD_DESCRIPTOR 2 + +#define _extract_jump_field(v) (v) + +#define MAX_DATA_WORDS_IN_PACKET 1024 + +/* basic PVM packing */ +#define PutArg1(a) pvm_pklong(&(a),1,1) +#define PutArg2(a) pvm_pklong(&(a),1,1) +#define PutArgN(n,a) pvm_pklong(&(a),1,1) +#define PutArgs(b,n) pvm_pklong(b,n,1) + +#define PutLit(l) { int a = l; PutArgN(?,a); } + +/* basic PVM unpacking */ +#define GetArg1(a) pvm_upklong(&(a),1,1) +#define GetArg2(a) pvm_upklong(&(a),1,1) +#define GetArgN(n,a) pvm_upklong(&(a),1,1) +#define GetArgs(b,n) pvm_upklong(b,n,1) + +//@node Externs, , PVM macros, Low Level Communications Header +//@subsection Externs + +/* basic message passing routines */ +extern void sendOp (OpCode,GlobalTaskId), + sendOp1 (OpCode,GlobalTaskId,StgWord), + sendOp2 (OpCode,GlobalTaskId,StgWord,StgWord), + sendOpV (OpCode,GlobalTaskId,int,...), + sendOpN (OpCode,GlobalTaskId,int,StgPtr), + sendOpNV (OpCode,GlobalTaskId,int,StgPtr,int,...); + +/* extracting data out of a packet */ +OpCode getOpcode (rtsPacket p); +void getOpcodeAndSender (rtsPacket p, OpCode *popcode, + GlobalTaskId *psender_id); +GlobalTaskId senderTask (rtsPacket p); +rtsPacket waitForPEOp (OpCode op, GlobalTaskId who); + +/* Init and shutdown routines */ +GlobalTaskId *startUpPE (unsigned nPEs); +void shutDownPE(void); + +/* aux functions */ +char *getOpName (unsigned op); // returns string of opcode +void processUnexpected (rtsPacket); +//void NullException(void); + +#endif /*PAR */ +#endif /*defined __LLC_H */ diff --git a/ghc/rts/parallel/LLComms.c b/ghc/rts/parallel/LLComms.c new file mode 100644 index 0000000..c40ae33 --- /dev/null +++ b/ghc/rts/parallel/LLComms.c @@ -0,0 +1,476 @@ +/* ---------------------------------------------------------------------------- + * Time-stamp: + * $Id: 
LLComms.c,v 1.2 2000/01/13 14:34:07 hwloidl Exp $ + * + * GUM Low-Level Inter-Task Communication + * + * This module defines PVM Routines for PE-PE communication. + * P. Trinder, December 5th. 1994. + * Adapted for the new RTS + * P. Trinder, July 1998 + * H-W. Loidl, November 1999 + --------------------------------------------------------------------------- */ + +#ifdef PAR /* whole file */ + +//@node GUM Low-Level Inter-Task Communication, , , +//@section GUM Low-Level Inter-Task Communication + +/* + *This module defines the routines which communicate between PEs. The + *code is based on Kevin Hammond's GRIP RTS. (OpCodes.h defines + *PEOp1 etc. in terms of sendOp1 etc.). + * + *Routine & Arguments + * & + *sendOp & 0 \\ + *sendOp1 & 1 \\ + *sendOp2 & 2 \\ + *sendOpN & vector \\ + *sendOpV & variable \\ + *sendOpNV & variable+ vector \\ + * + *First the standard include files. + */ + +//@menu +//* Macros etc:: +//* Includes:: +//* Auxiliary functions:: +//* Index:: +//@end menu + +//@node Macros etc, Includes, GUM Low-Level Inter-Task Communication, GUM Low-Level Inter-Task Communication +//@subsection Macros etc + +#define NON_POSIX_SOURCE /* so says Solaris */ +#define UNUSED /* nothing */ + +//@node Includes, Auxiliary functions, Macros etc, GUM Low-Level Inter-Task Communication +//@subsection Includes + +#include "Rts.h" +#include "RtsFlags.h" +#include "RtsUtils.h" +#include "Parallel.h" +#include "ParallelRts.h" +#if defined(DEBUG) +# include "ParallelDebug.h" +#endif +#include "LLC.h" + +#ifdef __STDC__ +#include +#else +#include +#endif + +/* Cannot use std macro when compiling for SysMan */ +/* debugging enabled */ +// #define IF_PAR_DEBUG(c,s) { s; } +/* debugging disabled */ +#define IF_PAR_DEBUG(c,s) /* nothing */ + +//@node Auxiliary functions, Index, Includes, GUM Low-Level Inter-Task Communication +//@subsection Auxiliary functions + +/* + * heapChkCounter tracks the number of heap checks since the last probe. + * Not currently used! We check for messages when a thread is resheduled. + */ +int heapChkCounter = 0; + +/* + * Then some miscellaneous functions. + * getOpName returns the character-string name of any OpCode. + */ + +char *UserPEOpNames[] = { PEOP_NAMES }; + +//@cindex getOpName +char * +getOpName(nat op) +{ + if (op >= MIN_PEOPS && op <= MAX_PEOPS) + return (UserPEOpNames[op - MIN_PEOPS]); + else + return ("Unknown PE OpCode"); +} + +/* + * traceSendOp handles the tracing of messages. + */ + +//@cindex traceSendOp +static void +traceSendOp(OpCode op, GlobalTaskId dest UNUSED, + unsigned int data1 UNUSED, unsigned int data2 UNUSED) +{ + char *OpName; + + OpName = getOpName(op); + IF_PAR_DEBUG(trace, + fprintf(stderr," %s [%x,%x] sent from %x to %x", + OpName, data1, data2, mytid, dest)); +} + +/* + * sendOp sends a 0-argument message with OpCode {\em op} to + * the global task {\em task}. + */ + +//@cindex sendOp +void +sendOp(OpCode op, GlobalTaskId task) +{ + traceSendOp(op, task,0,0); + + pvm_initsend(PvmDataRaw); + pvm_send(task, op); +} + +/* + * sendOp1 sends a 1-argument message with OpCode {\em op} + * to the global task {\em task}. + */ + +//@cindex sendOp1 +void +sendOp1(OpCode op, GlobalTaskId task, StgWord arg1) +{ + traceSendOp(op, task, arg1,0); + + pvm_initsend(PvmDataRaw); + PutArg1(arg1); + pvm_send(task, op); +} + + +/* + * sendOp2 is used by the FP code only. 
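+ *
+ * A hypothetical call site (the OpCode and arguments here are invented
+ * for illustration, not taken from the FP code) might read:
+ *
+ *   sendOp2(PP_STATS_ON, SysManTask, (StgWord)start_time, (StgWord)stop_time);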
+ */ + +//@cindex sendOp2 +void +sendOp2(OpCode op, GlobalTaskId task, StgWord arg1, StgWord arg2) +{ + traceSendOp(op, task, arg1, arg2); + + pvm_initsend(PvmDataRaw); + PutArg1(arg1); + PutArg2(arg2); + pvm_send(task, op); +} + +/* + * + * sendOpV takes a variable number of arguments, as specified by {\em n}. + * For example, + * + * sendOpV( PP_STATS, StatsTask, 3, start_time, stop_time, sparkcount); + */ + +//@cindex sendOpV +void +sendOpV(OpCode op, GlobalTaskId task, int n, ...) +{ + va_list ap; + int i; + StgWord arg; + + va_start(ap, n); + + traceSendOp(op, task, 0, 0); + + pvm_initsend(PvmDataRaw); + + for (i = 0; i < n; ++i) { + arg = va_arg(ap, StgWord); + PutArgN(i, arg); + } + va_end(ap); + + pvm_send(task, op); +} + +/* + * + * sendOpNV takes a variable-size datablock, as specified by {\em + * nelem} and a variable number of arguments, as specified by {\em + * narg}. N.B. The datablock and the additional arguments are contiguous + * and are copied over together. For example, + * + * sendOpNV(PP_RESUME, tsoga.pe, 6, nelem, data, + * (W_) ga.weight, (W_) ga.loc.gc.gtid, (W_) ga.loc.gc.slot, + * (W_) tsoga.weight, (W_) tsoga.loc.gc.gtid, (W_) tsoga.loc.gc.slot); + * + * Important: The variable arguments must all be StgWords. + + sendOpNV(_, tid, m, n, data, x1, ..., xm): + + | n elems + +------------------------------ + | x1 | ... | xm | n | data .... + +------------------------------ + */ + +//@cindex sendOpNV +void +sendOpNV(OpCode op, GlobalTaskId task, int nelem, + StgWord *datablock, int narg, ...) +{ + va_list ap; + int i; + StgWord arg; + + va_start(ap, narg); + + traceSendOp(op, task, 0, 0); + IF_PAR_DEBUG(trace, + fprintf(stderr,"sendOpNV: op = %x (%s), task = %x, narg = %d, nelem = %d", + op, getOpName(op), task, narg, nelem)); + + pvm_initsend(PvmDataRaw); + + for (i = 0; i < narg; ++i) { + arg = va_arg(ap, StgWord); + IF_PAR_DEBUG(trace, + fprintf(stderr,"sendOpNV: arg = %d\n",arg)); + PutArgN(i, arg); + } + arg = (StgWord) nelem; + PutArgN(narg, arg); + +/* for (i=0; i < nelem; ++i) fprintf(stderr, "%d ",datablock[i]); */ +/* fprintf(stderr," in sendOpNV\n");*/ + + PutArgs(datablock, nelem); + va_end(ap); + + pvm_send(task, op); +} + +/* + * sendOpN take a variable size array argument, whose size is given by + * {\em n}. For example, + * + * sendOpN( PP_STATS, StatsTask, 3, stats_array); + */ + +//@cindex sendOpN +void +sendOpN(OpCode op, GlobalTaskId task, int n, StgPtr args) +{ + long arg; + + traceSendOp(op, task, 0, 0); + + pvm_initsend(PvmDataRaw); + arg = (long) n; + PutArgN(0, arg); + PutArgs(args, n); + pvm_send(task, op); +} + +/* + * waitForPEOp waits for a packet from global task {\em who} with the + * OpCode {\em op}. Other OpCodes are handled by processUnexpected. 
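+ *
+ * For example, the PE startup code at the end of this file waits for
+ * the table of PE task ids from SysMan with:
+ *
+ *   addr = waitForPEOp(PP_PETIDS, ANY_GLOBAL_TASK);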
+ */
+//@cindex waitForPEOp
+rtsPacket
+waitForPEOp(OpCode op, GlobalTaskId who)
+{
+ rtsPacket p;
+ int nbytes;
+ OpCode opCode;
+ GlobalTaskId sender_id;
+ rtsBool match;
+
+ do {
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"waitForPEOp: op = %x (%s), who = %x\n",
+ op, getOpName(op), who));
+
+ while((p = pvm_recv(ANY_TASK,ANY_OPCODE)) < 0)
+ pvm_perror("waitForPEOp: Waiting for PEOp");
+
+ pvm_bufinfo( p, &nbytes, &opCode, &sender_id );
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"waitForPEOp: received: OpCode = %x (%s), sender_id = %x\n",
+ opCode, getOpName(opCode), sender_id));
+
+ match = (op == ANY_OPCODE || op == opCode) &&
+ (who == ANY_TASK || who == sender_id);
+
+ if (match)
+ return(p);
+
+ /* Handle the unexpected OpCodes */
+ processUnexpected(p);
+
+ } while(rtsTrue);
+}
+
+/*
+ * processUnexpected processes unexpected messages. If the message is a
+ * FINISH it exits the program, and PVM, gracefully.
+ */
+//@cindex processUnexpected
+void
+processUnexpected(rtsPacket packet)
+{
+ OpCode opCode = getOpcode(packet);
+
+ IF_PAR_DEBUG(verbose,
+ GlobalTaskId sender = senderTask(packet);
+ fprintf(stderr,"== [%x] processUnexpected: Received %x (%s), sender %x\n",
+ mytid, opCode, getOpName(opCode), sender));
+
+ switch (opCode) {
+ case PP_FINISH:
+ stg_exit(EXIT_SUCCESS);
+ break;
+
+ /* Anything we're not prepared to deal with. Note that ALL OpCodes
+ are discarded during termination -- this helps prevent bizarre
+ race conditions. */
+ default:
+ if (!GlobalStopPending) {
+ GlobalTaskId errorTask = senderTask(packet);
+
+ fprintf(stderr,"Task %x: Unexpected OpCode %x from %x in processUnexpected",
+ mytid, opCode, errorTask );
+
+ stg_exit(EXIT_FAILURE);
+ }
+ }
+}
+
+//@cindex getOpcode
+OpCode
+getOpcode(rtsPacket p)
+{
+ int nbytes;
+ OpCode opCode;
+ GlobalTaskId sender_id;
+ pvm_bufinfo(p, &nbytes, &opCode, &sender_id);
+ return(opCode);
+}
+
+//@cindex getOpcodeAndSender
+void
+getOpcodeAndSender(rtsPacket p, OpCode *opCodep, GlobalTaskId *senderIdp)
+{
+ int nbytes;
+ pvm_bufinfo(p, &nbytes, opCodep, senderIdp);
+}
+
+//@cindex senderTask
+GlobalTaskId
+senderTask(rtsPacket p)
+{
+ int nbytes;
+ OpCode opCode;
+ GlobalTaskId sender_id;
+ pvm_bufinfo(p, &nbytes, &opCode, &sender_id);
+ return(sender_id);
+}
+
+/*
+ * PEStartUp does the low-level comms-specific startup stuff for a
+ * PE. It initialises the comms system, joins the appropriate groups,
+ * synchronises with the other PEs. Receives and records in a global
+ * variable the task-id of SysMan. If this is the main thread (discovered
+ * in main.lc), identifies itself to SysMan. Finally it receives
+ * from SysMan an array of the Global Task Ids of each PE, which is
+ * returned as the value of the function.
+ */
+
+//@cindex startUpPE
+GlobalTaskId *
+startUpPE(nat nPEs)
+{
+ int i;
+ rtsPacket addr;
+ long *buffer = (long *) stgMallocBytes(sizeof(long) * nPEs,
+ "PEStartUp (buffer)");
+ GlobalTaskId *thePEs = (GlobalTaskId *)
+ stgMallocBytes(sizeof(GlobalTaskId) * nPEs,
+ "PEStartUp (PEs)");
+
+ mytid = _my_gtid; /* Initialise PVM and get task id into global var.*/
+
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"== [%x] PEStartup: Task id = [%x], No. 
PEs = %d \n", + mytid, mytid, nPEs)); + checkComms(pvm_joingroup(PEGROUP), "PEStartup"); + IF_PAR_DEBUG(verbose, + fprintf(stderr,"== [%x] PEStartup: Joined PEGROUP\n", mytid)); + checkComms(pvm_joingroup(PECTLGROUP), "PEStartup"); + IF_PAR_DEBUG(verbose, + fprintf(stderr,"== [%x] PEStartup: Joined PECTLGROUP\n", mytid)); + checkComms(pvm_barrier(PECTLGROUP, nPEs+1), "PEStartup"); + IF_PAR_DEBUG(verbose, + fprintf(stderr,"== [%x] PEStartup, Passed PECTLGROUP barrier\n", mytid)); + + addr = waitForPEOp(PP_SYSMAN_TID, ANY_GLOBAL_TASK); + SysManTask = senderTask(addr); + if (IAmMainThread) { /* Main Thread Identifies itself to SysMan */ + pvm_initsend(PvmDataDefault); + pvm_send(SysManTask, PP_MAIN_TASK); + } + IF_PAR_DEBUG(verbose, + fprintf(stderr,"== [%x] Thread waits for %s\n", + mytid, getOpName(PP_PETIDS))); + addr = waitForPEOp(PP_PETIDS, ANY_GLOBAL_TASK); + GetArgs(buffer, nPEs); + for (i = 0; i < nPEs; ++i) { + thePEs[i] = (GlobalTaskId) buffer[i]; + IF_PAR_DEBUG(verbose, + fprintf(stderr,"== [%x] PEStartup: PEs[%d] = %x \n", + mytid, i, thePEs[i])); + } + free(buffer); + return thePEs; +} + +/* + * PEShutdown does the low-level comms-specific shutdown stuff for a + * single PE. It leaves the groups and then exits from pvm. + */ +//@cindex shutDownPE +void +shutDownPE(void) +{ + IF_PAR_DEBUG(verbose, + fprintf(stderr, "== [%x] PEshutdown\n", mytid)); + + checkComms(pvm_lvgroup(PEGROUP),"PEShutDown"); + checkComms(pvm_lvgroup(PECTLGROUP),"PEShutDown"); + checkComms(pvm_exit(),"PEShutDown"); +} + +#endif /* PAR -- whole file */ + +//@node Index, , Auxiliary functions, GUM Low-Level Inter-Task Communication +//@subsection Index + +//@index +//* getOpName:: @cindex\s-+getOpName +//* traceSendOp:: @cindex\s-+traceSendOp +//* sendOp:: @cindex\s-+sendOp +//* sendOp1:: @cindex\s-+sendOp1 +//* sendOp2:: @cindex\s-+sendOp2 +//* sendOpV:: @cindex\s-+sendOpV +//* sendOpNV:: @cindex\s-+sendOpNV +//* sendOpN:: @cindex\s-+sendOpN +//* waitForPEOp:: @cindex\s-+waitForPEOp +//* processUnexpected:: @cindex\s-+processUnexpected +//* getOpcode:: @cindex\s-+getOpcode +//* getOpcodeAndSender:: @cindex\s-+getOpcodeAndSender +//* senderTask:: @cindex\s-+senderTask +//* startUpPE:: @cindex\s-+startUpPE +//* shutDownPE:: @cindex\s-+shutDownPE +//@end index diff --git a/ghc/rts/parallel/PEOpCodes.h b/ghc/rts/parallel/PEOpCodes.h new file mode 100644 index 0000000..8380f46 --- /dev/null +++ b/ghc/rts/parallel/PEOpCodes.h @@ -0,0 +1,52 @@ +#ifndef PEOPCODES_H +#define PEOPCODES_H + +/************************************************************************ +* PEOpCodes.h * +* * +* This file contains definitions for all the GUM PE Opcodes * +* It's based on the GRAPH for PVM version * +* Phil Trinder, Glasgow University 8th December 1994 * +* * +************************************************************************/ + +#define REPLY_OK 0x00 + +/*Startup + Shutdown*/ +#define PP_SYSMAN_TID 0x50 +#define PP_MAIN_TASK 0x51 +#define PP_FINISH 0x52 +#define PP_PETIDS 0x53 + +/* Stats stuff */ +#define PP_STATS 0x54 +#define PP_STATS_ON 0x55 +#define PP_STATS_OFF 0x56 + +#define PP_FAIL 0x57 + +/*Garbage Collection*/ +#define PP_GC_INIT 0x58 +#define PP_FULL_SYSTEM 0x59 +#define PP_GC_POLL 0x5a + +/*GUM Messages*/ +#define PP_FETCH 0x5b +#define PP_RESUME 0x5c +#define PP_ACK 0x5d +#define PP_FISH 0x5e +#define PP_SCHEDULE 0x5f +#define PP_FREE 0x60 + +#define MIN_PEOPS 0x50 +#define MAX_PEOPS 0x60 + +#define PEOP_NAMES "Init", "IOInit", \ + "Finish", "PETIDS", \ + "Stats", "Stats_On", "Stats_Off", \ + "Fail", \ + 
"GCInit", "FullSystem", "GCPoll", \ + "Fetch","Resume","ACK","Fish","Schedule", \ + "Free" + +#endif /* PEOPCODES_H */ diff --git a/ghc/rts/parallel/Pack.c b/ghc/rts/parallel/Pack.c new file mode 100644 index 0000000..b5484a1 --- /dev/null +++ b/ghc/rts/parallel/Pack.c @@ -0,0 +1,2614 @@ +/* + Time-stamp: + $Id: Pack.c,v 1.2 2000/01/13 14:34:08 hwloidl Exp $ + + Graph packing and unpacking code for sending it to another processor + and retrieving the original graph structure from the packet. + In the old RTS the code was split into Pack.c and Unpack.c (now deceased) + Used in GUM and GrAnSim. + + The GrAnSim version of the code defines routines for *simulating* the + packing of closures in the same way it is done in the parallel runtime + system. Basically GrAnSim only puts the addresses of the closures to be + transferred into a buffer. This buffer will then be associated with the + event of transferring the graph. When this event is scheduled, the + @UnpackGraph@ routine is called and the buffer can be discarded + afterwards. + + Note that in GranSim we need many buffers, not just one per PE. */ + +//@node Graph packing, , , +//@section Graph packing + +#if defined(PAR) || defined(GRAN) /* whole file */ + +#define _HS (sizeofW(StgHeader)) + +//@menu +//* Includes:: +//* Prototypes:: +//* Global variables:: +//* ADT of Closure Queues:: +//* Initialisation for packing:: +//* Packing Functions:: +//* Low level packing routines:: +//* Unpacking routines:: +//* Aux fcts for packing:: +//* Printing Packet Contents:: +//* End of file:: +//@end menu +//*/ + +//@node Includes, Prototypes, Graph packing, Graph packing +//@subsection Includes + +#include "Rts.h" +#include "RtsFlags.h" +#include "RtsUtils.h" +#include "ClosureTypes.h" +#include "Storage.h" +#include "Hash.h" +#include "Parallel.h" +#include "GranSimRts.h" +#include "ParallelRts.h" +# if defined(DEBUG) +# include "ParallelDebug.h" +# endif +#include "FetchMe.h" + +/* Which RTS flag should be used to get the size of the pack buffer ? */ +# if defined(PAR) +# define RTS_PACK_BUFFER_SIZE RtsFlags.ParFlags.packBufferSize +# else /* GRAN */ +# define RTS_PACK_BUFFER_SIZE RtsFlags.GranFlags.packBufferSize +# endif + +//@node Prototypes, Global variables, Includes, Graph packing +//@subsection Prototypes +/* + Code declarations. 
+*/ + +//@node ADT of closure queues, Init for packing, Prototypes, Prototypes +//@subsubsection ADT of closure queues + +static inline void AllocClosureQueue(nat size); +static inline void InitClosureQueue(void); +static inline rtsBool QueueEmpty(void); +static inline void QueueClosure(StgClosure *closure); +static inline StgClosure *DeQueueClosure(void); + +//@node Init for packing, Packing routines, ADT of closure queues, Prototypes +//@subsubsection Init for packing + +static void initPacking(void); +# if defined(PAR) +rtsBool initPackBuffer(void); +# elif defined(GRAN) +rtsPackBuffer *InstantiatePackBuffer (void); +static void reallocPackBuffer (void); +# endif + +//@node Packing routines, Low level packing fcts, Init for packing, Prototypes +//@subsubsection Packing routines + +static void PackClosure (StgClosure *closure); + +//@node Low level packing fcts, Unpacking routines, Packing routines, Prototypes +//@subsubsection Low level packing fcts + +# if defined(GRAN) +static inline void Pack (StgClosure *data); +# else +static inline void Pack (StgWord data); + +static void PackPLC (StgPtr addr); +static void PackOffset (int offset); +static void GlobaliseAndPackGA (StgClosure *closure); +# endif + +//@node Unpacking routines, Aux fcts for packing, Low level packing fcts, Prototypes +//@subsubsection Unpacking routines + +# if defined(PAR) +void InitPendingGABuffer(nat size); +void CommonUp(StgClosure *src, StgClosure *dst); +StgClosure *UnpackGraph(rtsPackBuffer *packBuffer, + globalAddr **gamap, + nat *nGAs); +# elif defined(GRAN) +void CommonUp(StgClosure *src, StgClosure *dst); +StgClosure *UnpackGraph(rtsPackBuffer* buffer); +#endif + +//@node Aux fcts for packing, , Unpacking routines, Prototypes +//@subsubsection Aux fcts for packing + +# if defined(PAR) +static void DonePacking(void); +static void AmPacking(StgClosure *closure); +static int OffsetFor(StgClosure *closure); +static rtsBool NotYetPacking(int offset); +static rtsBool RoomToPack (nat size, nat ptrs); + rtsBool isOffset(globalAddr *ga); + rtsBool isFixed(globalAddr *ga); +# elif defined(GRAN) +static void DonePacking(void); +static rtsBool NotYetPacking(StgClosure *closure); +# endif + +//@node Global variables, ADT of Closure Queues, Prototypes, Graph packing +//@subsection Global variables +/* + Static data declarations +*/ + +static nat pack_locn, /* ptr to first free loc in pack buffer */ + clq_size, clq_pos, + buf_id = 1; /* identifier for buffer */ +static nat unpacked_size; +static nat reservedPAsize; /* Space reserved for primitive arrays */ +static rtsBool RoomInBuffer; + +# if defined(GRAN) +/* + The pack buffer + To be pedantic: in GrAnSim we're packing *addresses* of closures, + not the closures themselves. +*/ +static rtsPackBuffer *Bonzo = NULL; /* size: can be set via option */ +# else +static rtsPackBuffer *Bonzo = NULL; /* size: can be set via option */ +# endif + +/* + Bit of a hack for testing if a closure is the root of the graph. This is + set in @PackNearbyGraph@ and tested in @PackClosure@. +*/ + +static nat packed_thunks = 0; +static StgClosure *graph_root; + +# if defined(PAR) +/* + The offset hash table is used during packing to record the location in + the pack buffer of each closure which is packed. 
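+
+  As a sketch of how the table is used by the packing code below (see
+  AmPacking, OffsetFor and NotYetPacking for the real thing):
+
+    clpack_locn = OffsetFor(closure);    /* previous location, if any   */
+    if (!NotYetPacking(clpack_locn)) {
+      PackOffset(clpack_locn);           /* seen before: emit offset GA */
+      return;
+    }
+    AmPacking(closure);                  /* record current pack_locn    */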
+*/ +//@cindex offsetTable +static HashTable *offsetTable; + +//@cindex PendingGABuffer +static globalAddr *PendingGABuffer; +/* is initialised in main; */ +# endif /* PAR */ + +//@node ADT of Closure Queues, Initialisation for packing, Global variables, Graph packing +//@subsection ADT of Closure Queues + +//@menu +//* Closure Queues:: +//* Init routines:: +//* Basic routines:: +//@end menu + +//@node Closure Queues, Init routines, ADT of Closure Queues, ADT of Closure Queues +//@subsubsection Closure Queues +/* + Closure Queues + + These routines manage the closure queue. +*/ + +static nat clq_pos, clq_size; + +static StgClosure **ClosureQueue = NULL; /* HWL: init in main */ + +//@node Init routines, Basic routines, Closure Queues, ADT of Closure Queues +//@subsubsection Init routines + +/* @InitClosureQueue@ initialises the closure queue. */ + +//@cindex AllocClosureQueue +static inline void +AllocClosureQueue(size) +nat size; +{ + ASSERT(ClosureQueue == NULL); + ClosureQueue = (StgClosure**) stgMallocWords(size, "AllocClosureQueue"); +} + +//@cindex InitClosureQueue +static inline void +InitClosureQueue(void) +{ + clq_pos = clq_size = 0; + + if ( ClosureQueue == NULL ) + AllocClosureQueue(RTS_PACK_BUFFER_SIZE); +} + +//@node Basic routines, , Init routines, ADT of Closure Queues +//@subsubsection Basic routines + +/* + QueueEmpty returns rtsTrue if the closure queue is empty; rtsFalse otherwise. +*/ + +//@cindex QueueEmpty +static inline rtsBool +QueueEmpty(void) +{ + return(clq_pos >= clq_size); +} + +/* QueueClosure adds its argument to the closure queue. */ + +//@cindex QueueClosure +static inline void +QueueClosure(closure) +StgClosure *closure; +{ + if(clq_size < RTS_PACK_BUFFER_SIZE ) + ClosureQueue[clq_size++] = closure; + else + barf("Closure Queue Overflow (EnQueueing %p (%s))", + closure, info_type(closure)); +} + +/* DeQueueClosure returns the head of the closure queue. */ + +//@cindex DeQueueClosure +static inline StgClosure * +DeQueueClosure(void) +{ + if(!QueueEmpty()) + return(ClosureQueue[clq_pos++]); + else + return((StgClosure*)NULL); +} + +//@node Initialisation for packing, Packing Functions, ADT of Closure Queues, Graph packing +//@subsection Initialisation for packing +/* + Simple Packing Routines + + About packet sizes in GrAnSim: In GrAnSim we use a malloced block of + gransim_pack_buffer_size words to simulate a packet of pack_buffer_size + words. In the simulated PackBuffer we only keep the addresses of the + closures that would be packed in the parallel system (see Pack). To + decide if a packet overflow occurs pack_buffer_size must be compared + versus unpacked_size (see RoomToPack). Currently, there is no multi + packet strategy implemented, so in the case of an overflow we just stop + adding closures to the closure queue. If an overflow of the simulated + packet occurs, we just realloc some more space for it and carry on as + usual. 
-- HWL */ + +# if defined(GRAN) +rtsPackBuffer * +InstantiatePackBuffer (void) { + extern rtsPackBuffer *Bonzo; + + Bonzo = (rtsPackBuffer *) stgMallocWords(sizeofW(rtsPackBuffer), + "InstantiatePackBuffer: failed to alloc packBuffer"); + Bonzo->size = RtsFlags.GranFlags.packBufferSize_internal; + Bonzo->buffer = (StgWord **) stgMallocWords(RtsFlags.GranFlags.packBufferSize_internal, + "InstantiatePackBuffer: failed to alloc GranSim internal packBuffer"); + /* NB: gransim_pack_buffer_size instead of pack_buffer_size -- HWL */ + /* stgMallocWords is now simple allocate in Storage.c */ + + return (Bonzo); +} + +/* + Reallocate the GranSim internal pack buffer to make room for more closure + pointers. This is independent of the check for packet overflow as in GUM +*/ +static void +reallocPackBuffer (void) { + + ASSERT(pack_locn >= (int)Bonzo->size+sizeofW(rtsPackBuffer)); + + IF_GRAN_DEBUG(packBuffer, + belch("** Increasing size of PackBuffer %p to %d words (PE %u @ %d)\n", + Bonzo, Bonzo->size+REALLOC_SZ, + CurrentProc, CurrentTime[CurrentProc])); + + Bonzo = (rtsPackBuffer*)realloc(Bonzo, + sizeof(StgClosure*)*(REALLOC_SZ + + (int)Bonzo->size + + sizeofW(rtsPackBuffer))) ; + if (Bonzo==(rtsPackBuffer*)NULL) + barf("Failing to realloc %d more words for PackBuffer %p (PE %u @ %d)\n", + REALLOC_SZ, Bonzo, CurrentProc, CurrentTime[CurrentProc]); + + Bonzo->size += REALLOC_SZ; + + ASSERT(pack_locn < Bonzo->size+sizeofW(rtsPackBuffer)); +} +# endif + +# if defined(PAR) +/* @initPacking@ initialises the packing buffer etc. */ +//@cindex initPackBuffer +rtsBool +initPackBuffer(void) +{ + if (Bonzo == NULL) { /* not yet allocated */ + + if ((Bonzo = (rtsPackBuffer *) + stgMallocWords(sizeofW(rtsPackBuffer)+RtsFlags.ParFlags.packBufferSize, + "initPackBuffer")) == NULL) + return rtsFalse; + + InitPendingGABuffer(RtsFlags.ParFlags.packBufferSize); + AllocClosureQueue(RtsFlags.ParFlags.packBufferSize); + } + return rtsTrue; +} +# endif + +static void +initPacking(void) +{ +# if defined(GRAN) + Bonzo = InstantiatePackBuffer(); /* for GrAnSim only -- HWL */ + /* NB: free in UnpackGraph */ +# endif + + Bonzo->id = buf_id++; /* buffer id are only used for debugging! */ + pack_locn = 0; /* the index into the actual pack buffer */ + unpacked_size = 0; /* the size of the whole graph when unpacked */ + reservedPAsize = 0; + RoomInBuffer = rtsTrue; + InitClosureQueue(); + packed_thunks = 0; /* total number of thunks packed so far */ +# if defined(PAR) + offsetTable = allocHashTable(); +# endif +} + +//@node Packing Functions, Low level packing routines, Initialisation for packing, Graph packing +//@subsection Packing Functions + +//@menu +//* Packing Sections of Nearby Graph:: +//* Packing Closures:: +//@end menu + +//@node Packing Sections of Nearby Graph, Packing Closures, Packing Functions, Packing Functions +//@subsubsection Packing Sections of Nearby Graph +/* + Packing Sections of Nearby Graph + + @PackNearbyGraph@ packs a closure and associated graph into a static + buffer (@PackBuffer@). It returns the address of this buffer and the + size of the data packed into the buffer (in its second parameter, + @packBufferSize@). The associated graph is packed in a depth first + manner, hence it uses an explicit queue of closures to be packed rather + than simply using a recursive algorithm. Once the packet is full, + closures (other than primitive arrays) are packed as FetchMes, and their + children are not queued for packing. 
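+
+  A minimal usage sketch (the result points at the static pack buffer
+  owned by this module, so it has to be consumed before the next call):
+
+    nat size;
+    rtsPackBuffer *buffer = PackNearbyGraph(closure, tso, &size);
+    /* buffer->buffer now holds size words of packed graph */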
*/ + +//@cindex PackNearbyGraph + +/* NB: this code is shared between GranSim and GUM; + tso only used in GranSim */ +rtsPackBuffer * +PackNearbyGraph(closure, tso, packBufferSize) +StgClosure* closure; +StgTSO* tso; +nat *packBufferSize; +{ + extern rtsPackBuffer *Bonzo; + /* Ensure enough heap for all possible RBH_Save closures */ + + ASSERT(RTS_PACK_BUFFER_SIZE > 0); + + /* ToDo: check that we have enough heap for the packet + ngoq ngo' + if (Hp + PACK_HEAP_REQUIRED > HpLim) + return NULL; + */ + + initPacking(); +# if defined(GRAN) + graph_root = closure; +# endif + + IF_GRAN_DEBUG(pack, + belch(">>> Packing <<%d>> (buffer @ %p); graph root @ %p [PE %d]\n demanded by TSO %d (%p) [PE %u]", + Bonzo->id, Bonzo, closure, where_is(closure), + tso->id, tso, where_is((StgClosure*)tso))); + + IF_GRAN_DEBUG(pack, + belch("** PrintGraph of %p is:", closure); + PrintGraph(closure,0)); + + IF_PAR_DEBUG(pack, + belch(">>> Packing <<%d>> (buffer @ %p); graph root @ %p [%x]\n demanded by TSO %d (%p)", + Bonzo->id, Bonzo, closure, mytid, + tso->id, tso)); + + IF_PAR_DEBUG(pack, + belch("** PrintGraph of %p is:", closure); + belch("** pack_locn=%d", pack_locn); + PrintGraph(closure,0)); + + QueueClosure(closure); + do { + PackClosure(DeQueueClosure()); + } while (!QueueEmpty()); + +# if defined(PAR) + + /* Record how much space is needed to unpack the graph */ + Bonzo->tso = tso; // ToDo: check: used in GUM or only for debugging? + Bonzo->unpacked_size = unpacked_size; + Bonzo->size = pack_locn; + + /* Set the size parameter */ + ASSERT(pack_locn <= RtsFlags.ParFlags.packBufferSize); + *packBufferSize = pack_locn; + +# else /* GRAN */ + + /* Record how much space is needed to unpack the graph */ + // PackBuffer[PACK_FLAG_LOCN] = (P_) MAGIC_PACK_FLAG; for testing + Bonzo->tso = tso; + Bonzo->unpacked_size = unpacked_size; + + // ASSERT(pack_locn <= PackBuffer[PACK_SIZE_LOCN]+PACK_HDR_SIZE); + /* ToDo: Print an earlier, more meaningful message */ + if (pack_locn==0) /* i.e. packet is empty */ + barf("EMPTY PACKET! 
Can't transfer closure %p at all!!\n", + closure); + Bonzo->size = pack_locn; + *packBufferSize = pack_locn; + +# endif + + DonePacking(); /* {GrAnSim}vaD 'ut'Ha' */ + +# if defined(GRAN) + IF_GRAN_DEBUG(pack , + belch("** Finished <<%d>> packing graph %p; closures packed: %d; thunks packed: %d; size of graph: %d", + Bonzo->id, closure, Bonzo->size, packed_thunks, Bonzo->unpacked_size)); + if (RtsFlags.GranFlags.GranSimStats.Global) { + globalGranStats.tot_packets++; + globalGranStats.tot_packet_size += pack_locn; + } + + IF_GRAN_DEBUG(pack, PrintPacket(Bonzo)); +# elif defined(PAR) + IF_GRAN_DEBUG(pack , + belch("** Finished <<%d>> packing graph %p; closures packed: %d; thunks packed: %d; size of graph: %d", + Bonzo->id, closure, Bonzo->size, packed_thunks, Bonzo->unpacked_size); + PrintPacket(Bonzo)); +# endif /* GRAN */ + + return (Bonzo); +} + +//@cindex PackOneNode + +# if defined(GRAN) +/* This version is used when the node is already local */ + +rtsPackBuffer * +PackOneNode(closure, tso, packBufferSize) +StgClosure* closure; +StgTSO* tso; +nat *packBufferSize; +{ + extern rtsPackBuffer *Bonzo; + int i, clpack_locn; + + initPacking(); + + IF_GRAN_DEBUG(pack, + belch("** PackOneNode: %p (%s)[PE %d] requested by TSO %d (%p) [PE %d]", + closure, info_type(closure), + where_is(closure), tso->id, tso, where_is((StgClosure *)tso))); + + Pack(closure); + + /* Record how much space is needed to unpack the graph */ + Bonzo->tso = tso; + Bonzo->unpacked_size = unpacked_size; + + /* Set the size parameter */ + ASSERT(pack_locn <= RTS_PACK_BUFFER_SIZE); + Bonzo->size = pack_locn; + *packBufferSize = pack_locn; + + if (RtsFlags.GranFlags.GranSimStats.Global) { + globalGranStats.tot_packets++; + globalGranStats.tot_packet_size += pack_locn; + } + IF_GRAN_DEBUG(pack, + PrintPacket(Bonzo)); + + return (Bonzo); +} +# endif /* GRAN */ + +#if defined(GRAN) + +/* + PackTSO and PackStkO are entry points for two special kinds of closure + which are used in the parallel RTS. Compared with other closures they + are rather awkward to pack because they don't follow the normal closure + layout (where all pointers occur before all non-pointers). Luckily, + they're only needed when migrating threads between processors. */ + +//@cindex PackTSO +rtsPackBuffer* +PackTSO(tso, packBufferSize) +StgTSO *tso; +nat *packBufferSize; +{ + extern rtsPackBuffer *Bonzo; + IF_GRAN_DEBUG(pack, + belch("** Packing TSO %d (%p)", tso->id, tso)); + *packBufferSize = 0; + // PackBuffer[0] = PackBuffer[1] = 0; ??? 
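+ // GranSim only simulates thread movement, so no TSO contents are
+ // packed here; we just return the (empty) buffer with size 0.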
+ return(Bonzo); +} + +//@cindex PackStkO +rtsPackBuffer* +PackStkO(stko, packBufferSize) +StgPtr stko; +nat *packBufferSize; +{ + extern rtsPackBuffer *Bonzo; + IF_GRAN_DEBUG(pack, + belch("** Packing STKO %p", stko)); + *packBufferSize = 0; + // PackBuffer[0] = PackBuffer[1] = 0; + return(Bonzo); +} + +void +PackFetchMe(StgClosure *closure) +{ + barf("{PackFetchMe}Daq Qagh: no FetchMe closures in GRAN!"); +} + +#elif defined(PAR) + +rtsPackBuffer* +PackTSO(tso, packBufferSize) +StgTSO *tso; +nat *packBufferSize; +{ + barf("{PackTSO}Daq Qagh: trying to pack a TSO; thread migrations not supported, yet"); +} + +rtsPackBuffer* +PackStkO(stko, packBufferSize) +StgPtr stko; +nat *packBufferSize; +{ + barf("{PackStkO}Daq Qagh: trying to pack a STKO; thread migrations not supported, yet"); +} + +//@cindex PackFetchMe +void +PackFetchMe(StgClosure *closure) +{ + StgInfoTable *ip; + nat i; + +#if defined(GRAN) + barf("{PackFetchMe}Daq Qagh: no FetchMe closures in GRAN!"); +#else + /* Pack a FetchMe closure instead of closure */ + ip = &FETCH_ME_info; + /* this assumes that the info ptr is always the first word in a closure*/ + Pack((StgWord)ip); + for (i = 1; i < _HS; ++i) // pack rest of fixed header + Pack((StgWord)*(((StgPtr)closure)+i)); + + unpacked_size += _HS; // ToDo: check +#endif +} + +#endif + +//@node Packing Closures, , Packing Sections of Nearby Graph, Packing Functions +//@subsubsection Packing Closures +/* + Packing Closures + + @PackClosure@ is the heart of the normal packing code. It packs a single + closure into the pack buffer, skipping over any indirections and + globalising it as necessary, queues any child pointers for further + packing, and turns it into a @FetchMe@ or revertible black hole (@RBH@) + locally if it was a thunk. Before the actual closure is packed, a + suitable global address (GA) is inserted in the pack buffer. There is + always room to pack a fetch-me to the closure (guaranteed by the + RoomToPack calculation), and this is packed if there is no room for the + entire closure. + + Space is allocated for any primitive array children of a closure, and + hence a primitive array can always be packed along with it's parent + closure. */ + +//@cindex PackClosure + +# if defined(PAR) + +void +PackClosure(closure) +StgClosure *closure; +{ + StgInfoTable *info; + StgClosure *indirectee, *rbh; + nat size, ptrs, nonptrs, vhs, i, clpack_locn; + rtsBool is_CONSTR = rtsFalse; + char str[80]; + + ASSERT(closure!=NULL); + indirectee = closure; + do { + /* Don't pack indirection closures */ + closure = indirectee; + indirectee = IS_INDIRECTION(closure); + IF_PAR_DEBUG(pack, + if (indirectee) + belch("** Shorted an indirection (%s) at %p (-> %p)", + info_type(closure), closure, indirectee)); + } while (indirectee); + + clpack_locn = OffsetFor(closure); + + /* If the closure has been packed already, just pack an indirection to it + to guarantee that the graph doesn't become a tree when unpacked */ + if (!NotYetPacking(clpack_locn)) { + StgInfoTable *info; + + PackOffset(clpack_locn); + return; + } + + /* + * PLCs reside on all of the PEs already. Just pack the + * address as a GA (a bit of a kludge, since an address may + * not fit in *any* of the individual GA fields). Const, + * charlike and small intlike closures are converted into + * PLCs. 
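+ *
+ * For example, a CONSTR_CHARLIKE closure is shipped by packing just the
+ * address of the matching static closure (cf. the case below):
+ *
+ *   PackPLC(CHARLIKE_CLOSURE(((StgIntCharlikeClosure*)closure)->data));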
+ */ + switch (get_itbl(closure)->type) { + +# ifdef DEBUG + // check error cases only in a debugging setup + case RET_BCO: + case RET_SMALL: + case RET_VEC_SMALL: + case RET_BIG: + case RET_VEC_BIG: + case RET_DYN: + barf("** {Pack}Daq Qagh: found return vector %p (%s) when packing (thread migration not implemented)", + closure, info_type(closure)); + /* never reached */ + + case UPDATE_FRAME: + case STOP_FRAME: + case CATCH_FRAME: + case SEQ_FRAME: + barf("** {Pack}Daq Qagh: found stack frame %p (%s) when packing (thread migration not implemented)", + closure, info_type(closure)); + /* never reached */ + + case TSO: + case BLOCKED_FETCH: + case EVACUATED: + /* something's very wrong */ + barf("** {Pack}Daq Qagh: found %s (%p) when packing", + info_type(closure), closure); + /* never reached */ +# endif + + case CONSTR_CHARLIKE: + IF_PAR_DEBUG(pack, + belch("** Packing a charlike closure %d", + ((StgIntCharlikeClosure*)closure)->data)); + + PackPLC(CHARLIKE_CLOSURE(((StgIntCharlikeClosure*)closure)->data)); + return; + + case CONSTR_INTLIKE: + { + StgInt val = ((StgIntCharlikeClosure*)closure)->data; + + if ((val <= MAX_INTLIKE) && (val >= MIN_INTLIKE)) { + IF_PAR_DEBUG(pack, + belch("** Packing a small intlike %d as a PLC", val)); + PackPLC(INTLIKE_CLOSURE(val)); + return; + } else { + IF_PAR_DEBUG(pack, + belch("** Packing a big intlike %d as a normal closure", + val)); + break; + } + } + + case CONSTR: + case CONSTR_1_0: + case CONSTR_0_1: + case CONSTR_2_0: + case CONSTR_1_1: + case CONSTR_0_2: + /* it's a constructor (i.e. plain data) but we don't know + how many ptrs, non-ptrs there are => use generic code */ + IF_PAR_DEBUG(pack, + belch("** Packing a CONSTR %p (%s) using generic packing with GA", + closure, info_type(closure))); + // is_CONSTR = rtsTrue; + break; + /* fall through to generic packing code */ + + case CONSTR_STATIC: + case CONSTR_NOCAF_STATIC:// For now we ship indirections to CAFs: They are + // evaluated on each PE if needed + IF_PAR_DEBUG(pack, + belch("** Packing a %p (%s) as a PLC", + closure, info_type(closure))); + + PackPLC(closure); + return; + + case MVAR: + /* MVARs may not be copied; they are sticky objects in the new RTS */ + /* therefore we treat them just as RBHs etc (what a great system!) */ + IF_PAR_DEBUG(pack, + belch("** Found an MVar at %p (%s)", + closure, info_type(closure))); + /* fall through !! */ + + case THUNK_SELECTOR: // ToDo: fix packing of this strange beast + IF_PAR_DEBUG(pack, + belch("** Found an THUNK_SELECTORE at %p (%s)", + closure, info_type(closure))); + /* fall through !! */ + + case CAF_BLACKHOLE: + case SE_CAF_BLACKHOLE: + case SE_BLACKHOLE: + case BLACKHOLE: + case RBH: + case FETCH_ME: + case FETCH_ME_BQ: + + /* If it's a (revertible) black-hole, pack a FetchMe closure to it */ + //ASSERT(pack_locn > PACK_HDR_SIZE); + + IF_PAR_DEBUG(pack, + belch("** Packing a BH or FM at %p (%s) of (fixed size %d)", + closure, info_type(closure), _HS)); + + /* Need a GA even when packing a constructed FETCH_ME (cruel world!) */ + GlobaliseAndPackGA(closure); + + PackFetchMe(closure); + return; + + default: +/* IF_PAR_DEBUG(pack, */ +/* belch("** Not a PLC or BH ... 
")); */ + } /* switch */ + + /* get info about basic layout of the closure */ + info = get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str); + + ASSERT(!IS_BLACK_HOLE(closure)); + + IF_PAR_DEBUG(pack, + fprintf(stderr, "** packing %p (%s) (size=%d, ptrs=%d, nonptrs=%d)\n", + closure, info_type(closure), size, ptrs, nonptrs)); + + /* + * Now peek ahead to see whether the closure has any primitive array + * children + */ + /* + ToDo: fix this code -- HWL + for (i = 0; i < ptrs; ++i) { + StgInfoTable * childInfo; + nat childSize, childPtrs, childNonPtrs, childVhs; + + // extract i-th pointer out of closure + childInfo = get_closure_info(((PP_) (closure))[i + FIXED_HS + vhs], + &childSize, &childPtrs, &childNonPtrs, &childVhs, str); + if (IS_BIG_MOTHER(childInfo)) { + reservedPAsize += PACK_GA_SIZE + FIXED_HS + childVhs + childNonPtrs + + childPtrs * PACK_FETCHME_SIZE; + } + } + */ + /* Record the location of the GA */ + AmPacking(closure); + + /* Pack the global address */ + if (!is_CONSTR) { + GlobaliseAndPackGA(closure); + } else { + IF_PAR_DEBUG(pack, + belch("** No GA allocated for CONSTR %p (%s)", + closure, info_type(closure))); + } + + /* + * Pack a fetchme to the closure if it's a black hole, or the buffer is full + * and it isn't a primitive array. N.B. Primitive arrays are always packed + * (because their parents index into them directly) + */ + + // ToDo: pack FMs if no more room available in packet (see below) + if (!(RoomToPack(PACK_GA_SIZE + _HS + vhs + nonptrs, ptrs))) + barf("** Qagh: Pack: not enough room in packet to pack closure %p (%s)", + closure, info_type(closure)); + + /* + Has been moved into the switch statement + + if (IS_BLACK_HOLE(closure)) + !(RoomToPack(PACK_GA_SIZE + FIXED_HS + vhs + nonptrs, ptrs) + || IS_BIG_MOTHER(info))) + { + + ASSERT(pack_locn > PACK_HDR_SIZE); + + info = FetchMe_info; + for (i = 0; i < FIXED_HS; ++i) { + if (i == INFO_HDR_POSN) + Pack((StgWord) FetchMe_info); + else + Pack(closure[i]); + } + + unpacked_size += FIXED_HS + FETCHME_CLOSURE_SIZE(dummy); + + } else { + */ + if (info->type == ARR_WORDS || info->type == MUT_ARR_PTRS || + info->type == MUT_ARR_PTRS_FROZEN || info->type == MUT_VAR) + belch("** ghuH: found %s; packing of primitive arrays not yet implemented", + info_type(closure)); + + /* At last! A closure we can actually pack! */ + if (ip_MUTABLE(info) && (info->type != FETCH_ME)) + fprintf(stderr, "** ghuH: Replicated a Mutable closure!\n"); + + /* + Remember, the generic closure layout is as follows: + +-------------------------------------------------+ + | FIXED HEADER | VARIABLE HEADER | PTRS | NON-PRS | + +-------------------------------------------------+ + */ + /* pack fixed and variable header */ + for (i = 0; i < _HS + vhs; ++i) + Pack((StgWord)*(((StgPtr)closure)+i)); + + /* register all ptrs for further packing */ + for (i = 0; i < ptrs; ++i) + QueueClosure(((StgClosure *) *(((StgPtr)closure)+(i+_HS+vhs)))); + + /* pack non-ptrs */ + for (i = 0; i < nonptrs; ++i) + Pack((StgWord)*(((StgPtr)closure)+(i+_HS+vhs+ptrs))); + + unpacked_size += _HS + (size < MIN_UPD_SIZE ? MIN_UPD_SIZE : size); + + /* + * Record that this is a revertable black hole so that we can fill in + * its address from the fetch reply. Problem: unshared thunks may cause + * space leaks this way, their GAs should be deallocated following an + * ACK. + */ + + // IS_UPDATABLE(closure) == !closure_UNPOINTED(closure) !? 
HWL + if (closure_THUNK(closure) && !closure_UNPOINTED(closure)) { + rbh = convertToRBH(closure); + ASSERT(rbh == closure); // rbh at the same position (minced version) + packed_thunks++; + } +} + +# else /* GRAN */ + +/* Fake the packing of a closure */ + +void +PackClosure(closure) +StgClosure *closure; +{ + StgInfoTable *info, *childInfo; + nat size, ptrs, nonptrs, vhs; + char info_hdr_ty[80]; + nat i; + StgClosure *indirectee, *rbh; + char str[80]; + rtsBool is_mutable, will_be_rbh, no_more_thunks_please; + + is_mutable = rtsFalse; + + /* In GranSim we don't pack and unpack closures -- we just simulate + packing by updating the bitmask. So, the graph structure is unchanged + i.e. we don't short out indirections here. -- HWL */ + + /* Nothing to do with packing but good place to (sanity) check closure; + if the closure is a thunk, it must be unique; otherwise we have copied + work at some point before that which violates one of our main global + assertions in GranSim/GUM */ + ASSERT(!closure_THUNK(closure) || is_unique(closure)); + + IF_GRAN_DEBUG(pack, + belch("** Packing closure %p (%s)", + closure, info_type(closure))); + + if (where_is(closure) != where_is(graph_root)) { + IF_GRAN_DEBUG(pack, + belch("** faking a FETCHME [current PE: %d, closure's PE: %d]", + where_is(graph_root), where_is(closure))); + + /* GUM would pack a FETCHME here; simulate that by increasing the */ + /* unpacked size accordingly but don't pack anything -- HWL */ + unpacked_size += _HS + 2 ; // sizeofW(StgFetchMe); + return; + } + + /* If the closure's not already being packed */ + if (!NotYetPacking(closure)) + /* Don't have to do anything in GrAnSim if closure is already */ + /* packed -- HWL */ + { + IF_GRAN_DEBUG(pack, + belch("** Closure %p is already packed and omitted now!", + closure)); + return; + } + + switch (get_itbl(closure)->type) { + /* ToDo: check for sticky bit here? */ + /* BH-like closures which must not be moved to another PE */ + case CAF_BLACKHOLE: /* # of ptrs, nptrs: 0,2 */ + case SE_BLACKHOLE: /* # of ptrs, nptrs: 0,2 */ + case SE_CAF_BLACKHOLE: /* # of ptrs, nptrs: 0,2 */ + case BLACKHOLE: /* # of ptrs, nptrs: 0,2 */ + case BLACKHOLE_BQ: /* # of ptrs, nptrs: 1,1 */ + case RBH: /* # of ptrs, nptrs: 1,1 */ + /* same for these parallel specific closures */ + case BLOCKED_FETCH: + case FETCH_ME: + case FETCH_ME_BQ: + IF_GRAN_DEBUG(pack, + belch("** Avoid packing BH-like closures (%p, %s)!", + closure, info_type(closure))); + /* Just ignore RBHs i.e. they stay where they are */ + return; + + case THUNK_SELECTOR: + { + StgClosure *sel = ((StgSelector *)closure)->selectee; + + IF_GRAN_DEBUG(pack, + belch("** Avoid packing THUNK_SELECTOR (%p, %s) but queuing %p (%s)!", + closure, info_type(closure), sel, info_type(sel))); + QueueClosure(sel); + IF_GRAN_DEBUG(pack, + belch("** [%p (%s) (Queueing closure) ....]", + sel, info_type(sel))); + } + return; + + case CONSTR_STATIC: + case CONSTR_NOCAF_STATIC: + /* For now we ship indirections to CAFs: + * They are evaluated on each PE if needed */ + IF_GRAN_DEBUG(pack, + belch("** Nothing to pack for %p (%s)!", + closure, info_type(closure))); + // Pack(closure); GUM only + return; + + case CONSTR_CHARLIKE: + case CONSTR_INTLIKE: + IF_GRAN_DEBUG(pack, + belch("** Nothing to pack for %s (%p)!", + closure, info_type(closure))); + // PackPLC(((StgIntCharlikeClosure *)closure)->data); GUM only + return; + + case AP_UPD: + case PAP: + /* partial applications; special treatment necessary? 
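For now they drop through to the generic packing code at the end of this switch. 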
*/ + break; + + case CAF_UNENTERED: /* # of ptrs, nptrs: 1,3 */ + case CAF_ENTERED: /* # of ptrs, nptrs: 0,4 (allegedly bogus!!) */ + /* CAFs; special treatment necessary? */ + break; + + case MVAR: + barf("{PackClosure}Daq Qagh: found an MVAR (%p, %s); ToDo: implement proper treatment of MVARs", + closure, info_type(closure)); + + case ARR_WORDS: + case MUT_VAR: + case MUT_ARR_PTRS: + case MUT_ARR_PTRS_FROZEN: + /* Mutable objects; require special treatment to ship all data */ + is_mutable = rtsTrue; + break; + + case WEAK: + case FOREIGN: + case STABLE_NAME: + /* weak pointers and other FFI objects */ + barf("{PackClosure}Daq Qagh: found an FFI object (%p, %s); FFI not yet supported by GranSim, sorry", + closure, info_type(closure)); + + case TSO: + /* parallel objects */ + barf("{PackClosure}Daq Qagh: found a TSO when packing (%p, %s); thread migration not yet implemented, sorry", + closure, info_type(closure)); + + case BCO: + /* Hugs objects (i.e. closures used by the interpreter) */ + barf("{PackClosure}Daq Qagh: found a Hugs closure when packing (%p, %s); GranSim not yet integrated with Hugs, sorry", + closure, info_type(closure)); + + case IND: /* # of ptrs, nptrs: 1,0 */ + case IND_STATIC: /* # of ptrs, nptrs: 1,0 */ + case IND_PERM: /* # of ptrs, nptrs: 1,1 */ + case IND_OLDGEN: /* # of ptrs, nptrs: 1,1 */ + case IND_OLDGEN_PERM: /* # of ptrs, nptrs: 1,1 */ + /* we shouldn't find an indirection here, because we have shorted them + out at the beginning of this functions already. + */ + break; + /* should be: + barf("{PackClosure}Daq Qagh: found indirection when packing (%p, %s)", + closure, info_type(closure)); + */ + + case UPDATE_FRAME: + case CATCH_FRAME: + case SEQ_FRAME: + case STOP_FRAME: + /* stack frames; should never be found when packing for now; + once we support thread migration these have to be covered properly + */ + barf("{PackClosure}Daq Qagh: found stack frame when packing (%p, %s)", + closure, info_type(closure)); + + case RET_BCO: + case RET_SMALL: + case RET_VEC_SMALL: + case RET_BIG: + case RET_VEC_BIG: + case RET_DYN: + /* vectored returns; should never be found when packing; */ + barf("{PackClosure}Daq Qagh: found vectored return (%p, %s)", + closure, info_type(closure)); + + case INVALID_OBJECT: + barf("{PackClosure}Daq Qagh: found Invalid object (%p, %s)", + closure, info_type(closure)); + + default: + /* + Here we know that the closure is a CONSTR, FUN or THUNK (maybe + a specialised version with wired in #ptr/#nptr info; currently + we treat these specialised versions like the generic version) + */ + } /* switch */ + + /* Otherwise it's not Fixed */ + + info = get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str); + will_be_rbh = closure_THUNK(closure) && !closure_UNPOINTED(closure); + + IF_GRAN_DEBUG(pack, + belch("** Info on closure %p (%s): size=%d; ptrs=%d", + closure, info_type(closure), + size, ptrs, + (will_be_rbh) ? 
"will become RBH" : "will NOT become RBH")); + + // check whether IS_UPDATABLE(closure) == !closure_UNPOINTED(closure) -- HWL + no_more_thunks_please = + (RtsFlags.GranFlags.ThunksToPack>0) && + (packed_thunks>=RtsFlags.GranFlags.ThunksToPack); + + /* + should be covered by get_closure_info + if (info->type == FETCH_ME || info->type == FETCH_ME_BQ || + info->type == BLACKHOLE || info->type == RBH ) + size = ptrs = nonptrs = vhs = 0; + */ + /* Now peek ahead to see whether the closure has any primitive */ + /* array children */ + /* + ToDo: fix this code + for (i = 0; i < ptrs; ++i) { + P_ childInfo; + W_ childSize, childPtrs, childNonPtrs, childVhs; + + childInfo = get_closure_info(((StgPtrPtr) (closure))[i + FIXED_HS + vhs], + &childSize, &childPtrs, &childNonPtrs, + &childVhs, junk_str); + if (IS_BIG_MOTHER(childInfo)) { + reservedPAsize += PACK_GA_SIZE + FIXED_HS + + childVhs + childNonPtrs + + childPtrs * PACK_FETCHME_SIZE; + PAsize += PACK_GA_SIZE + FIXED_HS + childSize; + PAptrs += childPtrs; + } + } + */ + /* Don't pack anything (GrAnSim) if it's a black hole, or the buffer + * is full and it isn't a primitive array. N.B. Primitive arrays are + * always packed (because their parents index into them directly) */ + + if (IS_BLACK_HOLE(closure)) + /* + ToDo: fix this code + || + !(RoomToPack(PACK_GA_SIZE + FIXED_HS + vhs + nonptrs, ptrs) + || IS_BIG_MOTHER(info))) + */ + return; + + /* At last! A closure we can actually pack! */ + + if (closure_MUTABLE(closure)) // not nec. && (info->type != FETCHME)) + belch("ghuH: Replicated a Mutable closure!"); + + if (RtsFlags.GranFlags.GranSimStats.Global && + no_more_thunks_please && will_be_rbh) { + globalGranStats.tot_cuts++; + if ( RtsFlags.GranFlags.Debug.pack ) + belch("** PackClosure (w/ ThunksToPack=%d): Cutting tree with root at %#x\n", + RtsFlags.GranFlags.ThunksToPack, closure); + } else if (will_be_rbh || (closure==graph_root) ) { + packed_thunks++; + globalGranStats.tot_thunks++; + } + + if (no_more_thunks_please && will_be_rbh) + return; /* don't pack anything */ + + /* actual PACKING done here -- HWL */ + Pack(closure); + for (i = 0; i < ptrs; ++i) { + /* extract i-th pointer from closure */ + QueueClosure((StgClosure *)payloadPtr(closure,i)); + IF_GRAN_DEBUG(pack, + belch("** [%p (%s) (Queueing closure) ....]", + payloadPtr(closure,i), info_type(payloadPtr(closure,i)))); + } + + /* + for packing words (GUM only) do something like this: + + for (i = 0; i < ptrs; ++i) { + Pack(payloadWord(obj,i+j)); + } + */ + /* Turn thunk into a revertible black hole. */ + if (will_be_rbh) { + rbh = convertToRBH(closure); + ASSERT(rbh != NULL); + } +} +# endif /* PAR */ + +//@node Low level packing routines, Unpacking routines, Packing Functions, Graph packing +//@subsection Low level packing routines + +/* + @Pack@ is the basic packing routine. It just writes a word of data into + the pack buffer and increments the pack location. */ + +//@cindex Pack + +# if defined(PAR) +static inline void +Pack(data) +StgWord data; +{ + ASSERT(pack_locn < RtsFlags.ParFlags.packBufferSize); + Bonzo->buffer[pack_locn++] = data; +} +#endif + +#if defined(GRAN) +static inline void +Pack(closure) +StgClosure *closure; +{ + StgInfoTable *info; + nat size, ptrs, nonptrs, vhs; + char str[80]; + + /* This checks the size of the GrAnSim internal pack buffer. 
The simulated + pack buffer is checked via RoomToPack (as in GUM) */ + if (pack_locn >= (int)Bonzo->size+sizeofW(rtsPackBuffer)) + reallocPackBuffer(); + + if (closure==(StgClosure*)NULL) + belch("Qagh {Pack}Daq: Trying to pack 0"); + Bonzo->buffer[pack_locn++] = closure; + /* ASSERT: Data is a closure in GrAnSim here */ + info = get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str); + unpacked_size += _HS + (size < MIN_UPD_SIZE ? + MIN_UPD_SIZE : + size); +} +# endif /* GRAN */ + +/* + If a closure is local, make it global. Then, divide its weight for + export. The GA is then packed into the pack buffer. */ + +# if defined(PAR) + +static void +GlobaliseAndPackGA(closure) +StgClosure *closure; +{ + globalAddr *ga; + globalAddr packGA; + + if ((ga = LAGAlookup(closure)) == NULL) + ga = makeGlobal(closure, rtsTrue); + splitWeight(&packGA, ga); + ASSERT(packGA.weight > 0); + + IF_PAR_DEBUG(pack, + fprintf(stderr, "** Globalising closure %p (%s) with GA", + closure, info_type(closure)); + printGA(&packGA); + fputc('\n', stderr)); + + + Pack((StgWord) packGA.weight); + Pack((StgWord) packGA.payload.gc.gtid); + Pack((StgWord) packGA.payload.gc.slot); +} + +/* + @PackPLC@ makes up a bogus GA for a PLC. Weight 0 implies that a PLC + address follows instead of PE, slot. */ + +//@cindex PackPLC + +static void +PackPLC(addr) +StgPtr addr; +{ + Pack(0L); /* weight */ + Pack((StgWord) addr); /* address */ +} + +/* + @PackOffset@ packs a special GA value that will be interpreted as an + offset to a closure in the pack buffer. This is used to avoid unfolding + the graph structure into a tree. */ + +static void +PackOffset(offset) +int offset; +{ + IF_PAR_DEBUG(pack, + belch("** Packing Offset %d at pack location %u", + offset, pack_locn)); + Pack(1L); /* weight */ + Pack(0L); /* pe */ + Pack(offset); /* slot/offset */ +} +# endif /* PAR */ + +//@node Unpacking routines, Aux fcts for packing, Low level packing routines, Graph packing +//@subsection Unpacking routines + +/* + This was formerly in the (now deceased) module Unpack.c + + Unpacking closures which have been exported to remote processors + + This module defines routines for unpacking closures in the parallel + runtime system (GUM). + + In the case of GrAnSim, this module defines routines for *simulating* the + unpacking of closures as it is done in the parallel runtime system. +*/ + +//@node GUM code, Local Definitions, Unpacking routines, Unpacking routines +//@subsubsection GUM code + +#if defined(PAR) + +//@cindex InitPendingGABuffer +void +InitPendingGABuffer(size) +nat size; +{ + PendingGABuffer = (globalAddr *) + stgMallocBytes(size*2*sizeof(globalAddr), + "InitPendingGABuffer"); +} + +/* + @CommonUp@ commons up two closures which we have discovered to be + variants of the same object. One is made an indirection to the other. 
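(In the code below src is overwritten with an indirection to dst via
+  UPD_IND, which also wakes up any queue blocked on src.) 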
*/ + +//@cindex CommonUp +void +CommonUp(StgClosure *src, StgClosure *dst) +{ + StgBlockingQueueElement *bqe; + + ASSERT(src != (StgClosure *)NULL && dst != (StgClosure *)NULL); + ASSERT(src != dst); + + IF_PAR_DEBUG(verbose, + belch("__ CommonUp %p (%s) with %p (%s)", + src, info_type(src), dst, info_type(dst))); + + switch (get_itbl(src)->type) { + case BLACKHOLE_BQ: + bqe = ((StgBlockingQueue *)src)->blocking_queue; + break; + + case FETCH_ME_BQ: + bqe = ((StgFetchMeBlockingQueue *)src)->blocking_queue; + break; + + case RBH: + bqe = ((StgRBH *)src)->blocking_queue; + break; + + case BLACKHOLE: + case FETCH_ME: + bqe = END_BQ_QUEUE; + break; + + default: + /* Don't common up anything else */ + return; + } + /* NB: this also awakens the blocking queue for src */ + UPD_IND(src, dst); + // updateWithIndirection(src, dst); + /* + ASSERT(!IS_BIG_MOTHER(INFO_PTR(dst))); + if (bqe != END_BQ_QUEUE) + awaken_blocked_queue(bqe, src); + */ +} + +/* + @UnpackGraph@ unpacks the graph contained in a message buffer. It + returns a pointer to the new graph. The @gamap@ parameter is set to + point to an array of (oldGA,newGA) pairs which were created as a result + of unpacking the buffer; @nGAs@ is set to the number of GA pairs which + were created. + + The format of graph in the pack buffer is as defined in @Pack.lc@. */ + +//@cindex UnpackGraph +StgClosure * +UnpackGraph(packBuffer, gamap, nGAs) +rtsPackBuffer *packBuffer; +globalAddr **gamap; +nat *nGAs; +{ + nat size, ptrs, nonptrs, vhs; + StgWord **buffer, **bufptr, **slotptr; + globalAddr ga, *gaga; + StgClosure *closure, *existing, + *graphroot, *graph, *parent; + StgInfoTable *ip, *oldip; + nat bufsize, i, + pptr = 0, pptrs = 0, pvhs; + rtsBool hasGA; + char str[80]; + + initPackBuffer(); /* in case it isn't already init'd */ + graphroot = (StgClosure *)NULL; + + gaga = PendingGABuffer; + + InitClosureQueue(); + + /* Unpack the header */ + bufsize = packBuffer->size; + buffer = packBuffer->buffer; + bufptr = buffer; + + /* allocate heap */ + if (bufsize > 0) { + graph = allocate(bufsize); + ASSERT(graph != NULL); + } + + parent = (StgClosure *)NULL; + + do { + /* This is where we will ultimately save the closure's address */ + slotptr = bufptr; + + /* First, unpack the next GA or PLC */ + ga.weight = (rtsWeight) *bufptr++; + + if (ga.weight > 0) { + ga.payload.gc.gtid = (GlobalTaskId) *bufptr++; + ga.payload.gc.slot = (int) *bufptr++; + } else { + ga.payload.plc = (StgPtr) *bufptr++; + } + + /* Now unpack the closure body, if there is one */ + if (isFixed(&ga)) { + /* No more to unpack; just set closure to local address */ + IF_PAR_DEBUG(pack, + belch("_* Unpacked PLC at %x", ga.payload.plc)); + hasGA = rtsFalse; + closure = ga.payload.plc; + } else if (isOffset(&ga)) { + /* No more to unpack; just set closure to cached address */ + IF_PAR_DEBUG(pack, + belch("_* Unpacked indirection to %p (was offset %x)", + (StgClosure *) buffer[ga.payload.gc.slot], + ga.payload.gc.slot)); + ASSERT(parent != (StgClosure *)NULL); + hasGA = rtsFalse; + closure = (StgClosure *) buffer[ga.payload.gc.slot]; + } else { + /* Now we have to build something. */ + hasGA = rtsTrue; + + ASSERT(bufsize > 0); + + /* + * Close your eyes. You don't want to see where we're looking. You + * can't get closure info until you've unpacked the variable header, + * but you don't know how big it is until you've got closure info. 
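+   * (Concretely: get_closure_info is applied to the raw bufptr below,
+   * which is only sound because the first _HS words of a packed closure
+   * coincide with the fixed header of a heap closure.)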
+ * So...we trust that the closure in the buffer is organized the + * same way as they will be in the heap...at least up through the + * end of the variable header. + */ + ip = get_closure_info(bufptr, &size, &ptrs, &nonptrs, &vhs, str); + + /* + Remember, the generic closure layout is as follows: + +-------------------------------------------------+ + | FIXED HEADER | VARIABLE HEADER | PTRS | NON-PRS | + +-------------------------------------------------+ + */ + /* Fill in the fixed header */ + for (i = 0; i < _HS; i++) + ((StgPtr)graph)[i] = (StgWord)*bufptr++; + + if (ip->type == FETCH_ME) + size = ptrs = nonptrs = vhs = 0; + + /* Fill in the packed variable header */ + for (i = 0; i < vhs; i++) + ((StgPtr)graph)[_HS + i] = (StgWord)*bufptr++; + + /* Pointers will be filled in later */ + + /* Fill in the packed non-pointers */ + for (i = 0; i < nonptrs; i++) + ((StgPtr)graph)[_HS + i + vhs + ptrs] = (StgWord)*bufptr++; + + /* Indirections are never packed */ + // ASSERT(INFO_PTR(graph) != (W_) Ind_info_TO_USE); + + /* Add to queue for processing */ + QueueClosure(graph); + + /* + * Common up the new closure with any existing closure having the same + * GA + */ + + if ((existing = GALAlookup(&ga)) == NULL) { + globalAddr *newGA; + /* Just keep the new object */ + IF_PAR_DEBUG(pack, + belch("_* Unpacking new GA ((%x, %d, %x))", + ga.payload.gc.gtid, ga.payload.gc.slot, ga.weight)); + + closure = graph; + newGA = setRemoteGA(graph, &ga, rtsTrue); + if (ip->type == FETCH_ME) + // FETCHME_GA(closure) = newGA; + ((StgFetchMe *)closure)->ga = newGA; + } else { + /* Two closures, one global name. Someone loses */ + oldip = get_itbl(existing); + + if ((oldip->type == FETCH_ME || + // ToDo: don't pack a GA for these in the first place + oldip->type == CONSTR || + oldip->type == CONSTR_1_0 || + oldip->type == CONSTR_0_1 || + oldip->type == CONSTR_2_0 || + oldip->type == CONSTR_1_1 || + oldip->type == CONSTR_0_2 || + IS_BLACK_HOLE(existing)) && + ip->type != FETCH_ME) { + + /* What we had wasn't worth keeping */ + closure = graph; + CommonUp(existing, graph); + } else { + StgWord ty; + + /* + * Either we already had something worthwhile by this name or + * the new thing is just another FetchMe. However, the thing we + * just unpacked has to be left as-is, or the child unpacking + * code will fail. Remember that the way pointer words are + * filled in depends on the info pointers of the parents being + * the same as when they were packed. + */ + IF_PAR_DEBUG(pack, + belch("_* Unpacking old GA ((%x, %d, %x)), keeping %#lx", + ga.payload.gc.gtid, ga.payload.gc.slot, ga.weight, + existing)); + + closure = existing; + // HACK + ty = get_itbl(closure)->type; + if (ty == CONSTR || + ty == CONSTR_1_0 || + ty == CONSTR_0_1 || + ty == CONSTR_2_0 || + ty == CONSTR_1_1 || + ty == CONSTR_0_2) + CommonUp(closure, graph); + + } + /* Pool the total weight in the stored ga */ + (void) addWeight(&ga); + } + + /* Sort out the global address mapping */ + if (hasGA || // (ip_THUNK(ip) && !ip_UNPOINTED(ip)) || + (ip_MUTABLE(ip) && ip->type != FETCH_ME)) { + /* Make up new GAs for single-copy closures */ + globalAddr *newGA = makeGlobal(closure, rtsTrue); + + // keep this assertion! + // ASSERT(closure == graph); + + /* Create an old GA to new GA mapping */ + *gaga++ = ga; + splitWeight(gaga, newGA); + ASSERT(gaga->weight == 1L << (BITS_IN(unsigned) - 1)); + gaga++; + } + graph += _HS + (size < MIN_UPD_SIZE ? MIN_UPD_SIZE : size); + } + + /* + * Set parent pointer to point to chosen closure. 
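+   * (The slot filled in is ((StgPtr)parent)[_HS + pvhs + pptr], i.e. the
+   * pptr-th pointer word after the parent's fixed and variable headers;
+   * see the assignment below.)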
If we're at the top of + * the graph (our parent is NULL), then we want to arrange to return the + * chosen closure to our caller (possibly in place of the allocated graph + * root.) + */ + if (parent == NULL) + graphroot = closure; + else + ((StgPtr)parent)[_HS + pvhs + pptr] = (StgWord) closure; + + /* Save closure pointer for resolving offsets */ + *slotptr = (StgWord) closure; + + /* Locate next parent pointer */ + pptr++; + while (pptr + 1 > pptrs) { + parent = DeQueueClosure(); + + if (parent == NULL) + break; + else { + (void) get_closure_info(parent, &size, &pptrs, &nonptrs, + &pvhs, str); + pptr = 0; + } + } + } while (parent != NULL); + + //ASSERT(bufsize == 0 || graph - 1 <= SAVE_Hp); + + *gamap = PendingGABuffer; + *nGAs = (gaga - PendingGABuffer) / 2; + + /* ToDo: are we *certain* graphroot has been set??? WDP 95/07 */ + ASSERT(graphroot!=NULL); + return (graphroot); +} +#endif /* PAR */ + +//@node GranSim Code, , Local Definitions, Unpacking routines +//@subsubsection GranSim Code + +/* + For GrAnSim: No actual unpacking should be necessary. We just + have to walk over the graph and set the bitmasks appropriately. + Since we use RBHs similarly to GUM but without an ACK message/event + we have to revert the RBH from within the UnpackGraph routine (good luck!) + -- HWL +*/ + +#if defined(GRAN) +void +CommonUp(StgClosure *src, StgClosure *dst) +{ + barf("CommonUp: should never be entered in a GranSim setup"); +} + +StgClosure* +UnpackGraph(buffer) +rtsPackBuffer* buffer; +{ + nat size, ptrs, nonptrs, vhs, + bufptr = 0; + StgClosure *closure, *graphroot, *graph; + StgInfoTable *ip; + StgWord bufsize, unpackedsize, + pptr = 0, pptrs = 0, pvhs; + StgTSO* tso; + char str[240], str1[80]; + int i; + + bufptr = 0; + graphroot = buffer->buffer[0]; + + tso = buffer->tso; + + /* Unpack the header */ + unpackedsize = buffer->unpacked_size; + bufsize = buffer->size; + + IF_GRAN_DEBUG(pack, + belch("<<< Unpacking <<%d>> (buffer @ %p):\n (root @ %p, PE %d,size=%d), demanded by TSO %d (%p)[PE %d]", + buffer->id, buffer, graphroot, where_is(graphroot), + bufsize, tso->id, tso, + where_is((StgClosure *)tso))); + + do { + closure = buffer->buffer[bufptr++]; /* that's all we need for GrAnSim -- HWL */ + + /* Actually only ip is needed; rest is useful for TESTING -- HWL */ + ip = get_closure_info(closure, + &size, &ptrs, &nonptrs, &vhs, str); + + IF_GRAN_DEBUG(pack, + sprintf(str, "** (%p): Changing bitmask[%s]: 0x%x ", + closure, (closure_HNF(closure) ? 
"NF" : "__"), + PROCS(closure))); + + if (get_itbl(closure)->type == RBH) { + /* if it's an RBH, we have to revert it into a normal closure, thereby + awakening the blocking queue; not that this is code currently not + needed in GUM, but it should be added with the new features in + GdH (and the implementation of an NACK message) + */ + // closure->header.gran.procs = PE_NUMBER(CurrentProc); + SET_GRAN_HDR(closure, PE_NUMBER(CurrentProc)); /* Move node */ + + IF_GRAN_DEBUG(pack, + strcat(str, " (converting RBH) ")); + + convertFromRBH(closure); /* In GUM that's done by convertToFetchMe */ + + IF_GRAN_DEBUG(pack, + belch(":: closure %p (%s) is a RBH; after reverting: IP=%p", + closure, info_type(closure), get_itbl(closure))); + } else if (IS_BLACK_HOLE(closure)) { + IF_GRAN_DEBUG(pack, + belch(":: closure %p (%s) is a BH; copying node to %d", + closure, info_type(closure), CurrentProc)); + closure->header.gran.procs |= PE_NUMBER(CurrentProc); /* Copy node */ + } else if ( (closure->header.gran.procs & PE_NUMBER(CurrentProc)) == 0 ) { + if (closure_HNF(closure)) { + IF_GRAN_DEBUG(pack, + belch(":: closure %p (%s) is a HNF; copying node to %d", + closure, info_type(closure), CurrentProc)); + closure->header.gran.procs |= PE_NUMBER(CurrentProc); /* Copy node */ + } else { + IF_GRAN_DEBUG(pack, + belch(":: closure %p (%s) is no (R)BH or HNF; moving node to %d", + closure, info_type(closure), CurrentProc)); + closure->header.gran.procs = PE_NUMBER(CurrentProc); /* Move node */ + } + } + + IF_GRAN_DEBUG(pack, + sprintf(str1, "0x%x", PROCS(closure)); strcat(str, str1)); + IF_GRAN_DEBUG(pack, belch(str)); + + } while (bufptrsize) ; /* (parent != NULL); */ + + /* In GrAnSim we allocate pack buffers dynamically! -- HWL */ + free(buffer->buffer); + free(buffer); + + IF_GRAN_DEBUG(pack, + belch("PrintGraph of %p is:", graphroot); PrintGraph(graphroot,0)); + + return (graphroot); +} +#endif /* GRAN */ + +//@node Aux fcts for packing, Printing Packet Contents, Unpacking routines, Graph packing +//@subsection Aux fcts for packing + +//@menu +//* Offset table:: +//* Packet size:: +//* Types of Global Addresses:: +//* Closure Info:: +//@end menu + +//@node Offset table, Packet size, Aux fcts for packing, Aux fcts for packing +//@subsubsection Offset table + +/* + DonePacking is called when we've finished packing. It releases memory + etc. */ + +//@cindex DonePacking + +# if defined(PAR) + +static void +DonePacking(void) +{ + freeHashTable(offsetTable, NULL); + offsetTable = NULL; +} + +/* + AmPacking records that the closure is being packed. Note the abuse of + the data field in the hash table -- this saves calling @malloc@! */ + +//@cindex AmPacking + +static void +AmPacking(closure) +StgClosure *closure; +{ +/* IF_PAR_DEBUG(pack, */ +/* fprintf(stderr, "** AmPacking %p (%s)(IP %p) at %u\n", */ +/* closure, info_type(closure), get_itbl(closure), pack_locn)); */ + + insertHashTable(offsetTable, (StgWord) closure, (void *) (StgWord) pack_locn); +} + +/* + OffsetFor returns an offset for a closure which is already being packed. */ + +//@cindex OffsetFor + +static int +OffsetFor(closure) +StgClosure *closure; +{ + return (int) (StgWord) lookupHashTable(offsetTable, (StgWord) closure); +} + +/* + NotYetPacking determines whether the closure's already being packed. + Offsets $<$ @PACK_HDR_SIZE@ (e.g. 0) mean no. */ + +//@cindex NotYetPacking + +static rtsBool +NotYetPacking(offset) +int offset; +{ + return(offset == 0); // ToDo: what if root is found again?? 
+}
+
+# else /* GRAN */
+
+static void
+DonePacking(void)
+{
+  /* nothing */
+}
+
+/*
+   NotYetPacking searches through the whole pack buffer for closure.  */
+
+static rtsBool
+NotYetPacking(closure)
+StgClosure *closure;
+{ nat i;
+  rtsBool found = rtsFalse;
+
+  for (i=0; (i<pack_locn) && !found; i++)
+    found = Bonzo->buffer[i]==closure;
+
+  return (!found);
+}
+# endif
+
+//@node Packet size, Types of Global Addresses, Offset table, Aux fcts for packing
+//@subsubsection Packet size
+
+/*
+  RoomToPack determines whether there's room to pack the closure into
+  the pack buffer based on
+
+  o how full the buffer is already,
+  o the closure's size and number of pointers (which must be packed as GAs),
+  o the size and number of pointers held by any primitive arrays that it
+    points to
+
+  It has a *side-effect* (naughty, naughty) in assigning RoomInBuffer
+  to rtsFalse.
+*/
+
+//@cindex RoomToPack
+static rtsBool
+RoomToPack(size, ptrs)
+nat size, ptrs;
+{
+# if defined(PAR)
+  if (RoomInBuffer &&
+      (pack_locn + reservedPAsize + size +
+       ((clq_size - clq_pos) + ptrs) * PACK_FETCHME_SIZE >= RTS_PACK_BUFFER_SIZE))
+    {
+      IF_PAR_DEBUG(pack,
+		   fprintf(stderr, "Buffer full\n"));
+
+      RoomInBuffer = rtsFalse;
+    }
+# else /* GRAN */
+  if (RoomInBuffer &&
+      (unpacked_size + reservedPAsize + size +
+       ((clq_size - clq_pos) + ptrs) * PACK_FETCHME_SIZE >= RTS_PACK_BUFFER_SIZE))
+    {
+      IF_GRAN_DEBUG(packBuffer,
+		    fprintf(stderr, "Buffer full\n"));
+      RoomInBuffer = rtsFalse;
+    }
+# endif
+  return (RoomInBuffer);
+}
+
+//@node Types of Global Addresses, Closure Info, Packet size, Aux fcts for packing
+//@subsubsection Types of Global Addresses
+
+/*
+  Types of Global Addresses
+
+  These routines determine whether a GA is one of a number of special types
+  of GA.
+*/
+
+# if defined(PAR)
+//@cindex isOffset
+rtsBool
+isOffset(ga)
+globalAddr *ga;
+{
+    return (ga->weight == 1 && ga->payload.gc.gtid == 0);
+}
+
+//@cindex isFixed
+rtsBool
+isFixed(ga)
+globalAddr *ga;
+{
+    return (ga->weight == 0);
+}
+# endif
+
+//@node Closure Info, , Types of Global Addresses, Aux fcts for packing
+//@subsubsection Closure Info
+
+/*
+   Closure Info
+
+   @get_closure_info@ determines the size, number of pointers etc. for this
+   type of closure -- see @SMInfoTables.lh@ for the legal info. types etc.
+
+[Can someone please keep this function up to date.  I keep needing it
+ (or something similar) for interpretive code, and it keeps
+ bit-rotting.  {\em It really belongs somewhere else too}.
KH @@ 17/2/95] */ + +#if 0 + +// {Parallel.h}Daq ngoqvam vIroQpu' + +# if defined(GRAN) || defined(PAR) +/* extracting specific info out of closure; currently only used in GRAN -- HWL */ +//@cindex get_closure_info +StgInfoTable* +get_closure_info(node, size, ptrs, nonptrs, vhs, info_hdr_ty) +StgClosure* node; +nat *size, *ptrs, *nonptrs, *vhs; +char *info_hdr_ty; +{ + StgInfoTable *info; + + info = get_itbl(node); + /* the switch shouldn't be necessary, really; just use default case */ + switch (info->type) { +#if 0 + case CONSTR_1_0: + case THUNK_1_0: + case FUN_1_0: + *size = sizeW_fromITBL(info); + *ptrs = (nat) 1; // (info->layout.payload.ptrs); + *nonptrs = (nat) 0; // (info->layout.payload.nptrs); + *vhs = (nat) 0; // unknown + info_hdr_type(node, info_hdr_ty); + return info; + + case CONSTR_0_1: + case THUNK_0_1: + case FUN_0_1: + *size = sizeW_fromITBL(info); + *ptrs = (nat) 0; // (info->layout.payload.ptrs); + *nonptrs = (nat) 1; // (info->layout.payload.nptrs); + *vhs = (nat) 0; // unknown + info_hdr_type(node, info_hdr_ty); + return info; + + case CONSTR_2_0: + case THUNK_2_0: + case FUN_2_0: + *size = sizeW_fromITBL(info); + *ptrs = (nat) 2; // (info->layout.payload.ptrs); + *nonptrs = (nat) 0; // (info->layout.payload.nptrs); + *vhs = (nat) 0; // unknown + info_hdr_type(node, info_hdr_ty); + return info; + + case CONSTR_1_1: + case THUNK_1_1: + case FUN_1_1: + *size = sizeW_fromITBL(info); + *ptrs = (nat) 1; // (info->layout.payload.ptrs); + *nonptrs = (nat) 1; // (info->layout.payload.nptrs); + *vhs = (nat) 0; // unknown + info_hdr_type(node, info_hdr_ty); + return info; + + case CONSTR_0_2: + case THUNK_0_2: + case FUN_0_2: + *size = sizeW_fromITBL(info); + *ptrs = (nat) 0; // (info->layout.payload.ptrs); + *nonptrs = (nat) 2; // (info->layout.payload.nptrs); + *vhs = (nat) 0; // unknown + info_hdr_type(node, info_hdr_ty); + return info; +#endif + case RBH: + { + StgInfoTable *rip = REVERT_INFOPTR(info); // closure to revert to + *size = sizeW_fromITBL(rip); + *ptrs = (nat) (rip->layout.payload.ptrs); + *nonptrs = (nat) (rip->layout.payload.nptrs); + *vhs = (nat) 0; // unknown + info_hdr_type(node, info_hdr_ty); + return rip; // NB: we return the reverted info ptr for a RBH!!!!!! + } + + default: + *size = sizeW_fromITBL(info); + *ptrs = (nat) (info->layout.payload.ptrs); + *nonptrs = (nat) (info->layout.payload.nptrs); + *vhs = (nat) 0; // unknown + info_hdr_type(node, info_hdr_ty); + return info; + } +} + +//@cindex IS_BLACK_HOLE +rtsBool +IS_BLACK_HOLE(StgClosure* node) +{ + StgInfoTable *info; + info = get_itbl(node); + return ((info->type == BLACKHOLE || info->type == RBH) ? rtsTrue : rtsFalse); +} + +//@cindex IS_INDIRECTION +StgClosure * +IS_INDIRECTION(StgClosure* node) +{ + StgInfoTable *info; + info = get_itbl(node); + switch (info->type) { + case IND: + case IND_OLDGEN: + case IND_PERM: + case IND_OLDGEN_PERM: + case IND_STATIC: + /* relies on indirectee being at same place for all these closure types */ + return (((StgInd*)node) -> indirectee); + default: + return NULL; + } +} + +/* +rtsBool +IS_THUNK(StgClosure* node) +{ + StgInfoTable *info; + info = get_itbl(node); + return ((info->type == THUNK || + info->type == THUNK_STATIC || + info->type == THUNK_SELECTOR) ? 
rtsTrue : rtsFalse); +} +*/ + +# endif /* GRAN */ +#endif /* 0 */ + +# if 0 +/* ngoq ngo' */ + +P_ +get_closure_info(closure, size, ptrs, nonptrs, vhs, type) +P_ closure; +W_ *size, *ptrs, *nonptrs, *vhs; +char *type; +{ + P_ ip = (P_) INFO_PTR(closure); + + if (closure==NULL) { + fprintf(stderr, "Qagh {get_closure_info}Daq: NULL closure\n"); + *size = *ptrs = *nonptrs = *vhs = 0; + strcpy(type,"ERROR in get_closure_info"); + return; + } else if (closure==PrelBase_Z91Z93_closure) { + /* fprintf(stderr, "Qagh {get_closure_info}Daq: PrelBase_Z91Z93_closure closure\n"); */ + *size = *ptrs = *nonptrs = *vhs = 0; + strcpy(type,"PrelBase_Z91Z93_closure"); + return; + }; + + ip = (P_) INFO_PTR(closure); + + switch (INFO_TYPE(ip)) { + case INFO_SPEC_U_TYPE: + case INFO_SPEC_S_TYPE: + case INFO_SPEC_N_TYPE: + *size = SPEC_CLOSURE_SIZE(closure); + *ptrs = SPEC_CLOSURE_NoPTRS(closure); + *nonptrs = SPEC_CLOSURE_NoNONPTRS(closure); + *vhs = 0 /*SPEC_VHS*/; + strcpy(type,"SPEC"); + break; + + case INFO_GEN_U_TYPE: + case INFO_GEN_S_TYPE: + case INFO_GEN_N_TYPE: + *size = GEN_CLOSURE_SIZE(closure); + *ptrs = GEN_CLOSURE_NoPTRS(closure); + *nonptrs = GEN_CLOSURE_NoNONPTRS(closure); + *vhs = GEN_VHS; + strcpy(type,"GEN"); + break; + + case INFO_DYN_TYPE: + *size = DYN_CLOSURE_SIZE(closure); + *ptrs = DYN_CLOSURE_NoPTRS(closure); + *nonptrs = DYN_CLOSURE_NoNONPTRS(closure); + *vhs = DYN_VHS; + strcpy(type,"DYN"); + break; + + case INFO_TUPLE_TYPE: + *size = TUPLE_CLOSURE_SIZE(closure); + *ptrs = TUPLE_CLOSURE_NoPTRS(closure); + *nonptrs = TUPLE_CLOSURE_NoNONPTRS(closure); + *vhs = TUPLE_VHS; + strcpy(type,"TUPLE"); + break; + + case INFO_DATA_TYPE: + *size = DATA_CLOSURE_SIZE(closure); + *ptrs = DATA_CLOSURE_NoPTRS(closure); + *nonptrs = DATA_CLOSURE_NoNONPTRS(closure); + *vhs = DATA_VHS; + strcpy(type,"DATA"); + break; + + case INFO_IMMUTUPLE_TYPE: + case INFO_MUTUPLE_TYPE: + *size = MUTUPLE_CLOSURE_SIZE(closure); + *ptrs = MUTUPLE_CLOSURE_NoPTRS(closure); + *nonptrs = MUTUPLE_CLOSURE_NoNONPTRS(closure); + *vhs = MUTUPLE_VHS; + strcpy(type,"(IM)MUTUPLE"); + break; + + case INFO_STATIC_TYPE: + *size = STATIC_CLOSURE_SIZE(closure); + *ptrs = STATIC_CLOSURE_NoPTRS(closure); + *nonptrs = STATIC_CLOSURE_NoNONPTRS(closure); + *vhs = STATIC_VHS; + strcpy(type,"STATIC"); + break; + + case INFO_CAF_TYPE: + case INFO_IND_TYPE: + *size = IND_CLOSURE_SIZE(closure); + *ptrs = IND_CLOSURE_NoPTRS(closure); + *nonptrs = IND_CLOSURE_NoNONPTRS(closure); + *vhs = IND_VHS; + strcpy(type,"CAF|IND"); + break; + + case INFO_CONST_TYPE: + *size = CONST_CLOSURE_SIZE(closure); + *ptrs = CONST_CLOSURE_NoPTRS(closure); + *nonptrs = CONST_CLOSURE_NoNONPTRS(closure); + *vhs = CONST_VHS; + strcpy(type,"CONST"); + break; + + case INFO_SPEC_RBH_TYPE: + *size = SPEC_RBH_CLOSURE_SIZE(closure); + *ptrs = SPEC_RBH_CLOSURE_NoPTRS(closure); + *nonptrs = SPEC_RBH_CLOSURE_NoNONPTRS(closure); + if (*ptrs <= 2) { + *nonptrs -= (2 - *ptrs); + *ptrs = 1; + } else + *ptrs -= 1; + *vhs = SPEC_RBH_VHS; + strcpy(type,"SPEC_RBH"); + break; + + case INFO_GEN_RBH_TYPE: + *size = GEN_RBH_CLOSURE_SIZE(closure); + *ptrs = GEN_RBH_CLOSURE_NoPTRS(closure); + *nonptrs = GEN_RBH_CLOSURE_NoNONPTRS(closure); + if (*ptrs <= 2) { + *nonptrs -= (2 - *ptrs); + *ptrs = 1; + } else + *ptrs -= 1; + *vhs = GEN_RBH_VHS; + strcpy(type,"GEN_RBH"); + break; + + case INFO_CHARLIKE_TYPE: + *size = CHARLIKE_CLOSURE_SIZE(closure); + *ptrs = CHARLIKE_CLOSURE_NoPTRS(closure); + *nonptrs = CHARLIKE_CLOSURE_NoNONPTRS(closure); + *vhs = CHARLIKE_VHS; + strcpy(type,"CHARLIKE"); + break; 
+ + case INFO_INTLIKE_TYPE: + *size = INTLIKE_CLOSURE_SIZE(closure); + *ptrs = INTLIKE_CLOSURE_NoPTRS(closure); + *nonptrs = INTLIKE_CLOSURE_NoNONPTRS(closure); + *vhs = INTLIKE_VHS; + strcpy(type,"INTLIKE"); + break; + +# if !defined(GRAN) + case INFO_FETCHME_TYPE: + *size = FETCHME_CLOSURE_SIZE(closure); + *ptrs = FETCHME_CLOSURE_NoPTRS(closure); + *nonptrs = FETCHME_CLOSURE_NoNONPTRS(closure); + *vhs = FETCHME_VHS; + strcpy(type,"FETCHME"); + break; + + case INFO_FMBQ_TYPE: + *size = FMBQ_CLOSURE_SIZE(closure); + *ptrs = FMBQ_CLOSURE_NoPTRS(closure); + *nonptrs = FMBQ_CLOSURE_NoNONPTRS(closure); + *vhs = FMBQ_VHS; + strcpy(type,"FMBQ"); + break; +# endif + + case INFO_BQ_TYPE: + *size = BQ_CLOSURE_SIZE(closure); + *ptrs = BQ_CLOSURE_NoPTRS(closure); + *nonptrs = BQ_CLOSURE_NoNONPTRS(closure); + *vhs = BQ_VHS; + strcpy(type,"BQ"); + break; + + case INFO_BH_TYPE: + *size = BH_CLOSURE_SIZE(closure); + *ptrs = BH_CLOSURE_NoPTRS(closure); + *nonptrs = BH_CLOSURE_NoNONPTRS(closure); + *vhs = BH_VHS; + strcpy(type,"BH"); + break; + + case INFO_TSO_TYPE: + *size = 0; /* TSO_CLOSURE_SIZE(closure); */ + *ptrs = 0; /* TSO_CLOSURE_NoPTRS(closure); */ + *nonptrs = 0; /* TSO_CLOSURE_NoNONPTRS(closure); */ + *vhs = TSO_VHS; + strcpy(type,"TSO"); + break; + + case INFO_STKO_TYPE: + *size = 0; + *ptrs = 0; + *nonptrs = 0; + *vhs = STKO_VHS; + strcpy(type,"STKO"); + break; + + default: + fprintf(stderr, "get_closure_info: Unexpected closure type (%lu), closure %lx\n", + INFO_TYPE(ip), (StgWord) closure); + EXIT(EXIT_FAILURE); + } + + return ip; +} +# endif + +# if 0 +// Use allocate in Storage.c instead +/* + @AllocateHeap@ will bump the heap pointer by @size@ words if the space + is available, but it will not perform garbage collection. + ToDo: check whether we can use an existing STG allocation routine -- HWL +*/ + + +//@cindex AllocateHeap +StgPtr +AllocateHeap(size) +nat size; +{ + StgPtr newClosure; + + /* Allocate a new closure */ + if (Hp + size > HpLim) + return NULL; + + newClosure = Hp + 1; + Hp += size; + + return newClosure; +} +# endif + +# if defined(PAR) + +//@cindex doGlobalGC +void +doGlobalGC(void) +{ + fprintf(stderr,"Splat -- we just hit global GC!\n"); + stg_exit(EXIT_FAILURE); + //fishing = rtsFalse; + outstandingFishes--; +} + +# endif /* PAR */ + +//@node Printing Packet Contents, End of file, Aux fcts for packing, Graph packing +//@subsection Printing Packet Contents +/* + Printing Packet Contents + */ + +#if defined(DEBUG) || defined(GRAN_CHECK) + +//@cindex PrintPacket + +#if defined(PAR) +void +PrintPacket(packBuffer) +rtsPackBuffer *packBuffer; +{ + StgClosure *parent, *graphroot, *closure_start; + StgInfoTable *ip, *oldip; + globalAddr ga; + StgWord **buffer, **bufptr, **slotptr; + + nat bufsize; + nat pptr = 0, pptrs = 0, pvhs; + nat unpack_locn = 0; + nat gastart = unpack_locn; + nat closurestart = unpack_locn; + nat i; + nat size, ptrs, nonptrs, vhs; + char str[80]; + + /* NB: this whole routine is more or less a copy of UnpackGraph with all + unpacking components replaced by printing fcts + Long live higher-order fcts! 
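+     A sketch of what such a higher-order refactoring could look like
+     (hypothetical names; not implemented in this module):
+
+       typedef void (*InspectFun)(StgClosure *closure);
+       static StgClosure *TraverseBuffer(rtsPackBuffer *b, InspectFun f);
+
+     UnpackGraph and PrintPacket would then just supply different
+     InspectFuns to the same traversal.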
+ */ + initPackBuffer(); /* in case it isn't already init'd */ + graphroot = (StgClosure *)NULL; + + // gaga = PendingGABuffer; + + InitClosureQueue(); + + /* Unpack the header */ + bufsize = packBuffer->size; + buffer = packBuffer->buffer; + bufptr = buffer; + + /* allocate heap + if (bufsize > 0) { + graph = allocate(bufsize); + ASSERT(graph != NULL); + } + */ + + fprintf(stderr, ".* Printing <<%d>> (buffer @ %p):\n", + packBuffer->id, packBuffer); + fprintf(stderr, ".* size: %d; unpacked_size: %d; tso: %p; buffer: %p\n", + packBuffer->size, packBuffer->unpacked_size, + packBuffer->tso, packBuffer->buffer); + + parent = (StgClosure *)NULL; + + do { + /* This is where we will ultimately save the closure's address */ + slotptr = bufptr; + + /* First, unpack the next GA or PLC */ + ga.weight = (rtsWeight) *bufptr++; + + if (ga.weight > 0) { + ga.payload.gc.gtid = (GlobalTaskId) *bufptr++; + ga.payload.gc.slot = (int) *bufptr++; + } else + ga.payload.plc = (StgPtr) *bufptr++; + + /* Now unpack the closure body, if there is one */ + if (isFixed(&ga)) { + fprintf(stderr, ".* [%u]: PLC @ %p\n", gastart, ga.payload.plc); + // closure = ga.payload.plc; + } else if (isOffset(&ga)) { + fprintf(stderr, ".* [%u]: OFFSET TO [%d]\n", gastart, ga.payload.gc.slot); + // closure = (StgClosure *) buffer[ga.payload.gc.slot]; + } else { + /* Print normal closures */ + + ASSERT(bufsize > 0); + + fprintf(stderr, ".* [%u]: ((%x, %d, %x)) ", gastart, + ga.payload.gc.gtid, ga.payload.gc.slot, ga.weight); + + closure_start = bufptr; + ip = get_closure_info(bufptr, &size, &ptrs, &nonptrs, &vhs, str); + + /* + Remember, the generic closure layout is as follows: + +-------------------------------------------------+ + | FIXED HEADER | VARIABLE HEADER | PTRS | NON-PRS | + +-------------------------------------------------+ + */ + /* Print fixed header */ + fprintf(stderr, "FH ["); + for (i = 0; i < _HS; i++) + fprintf(stderr, " %p", *bufptr++); + + if (ip->type == FETCH_ME) + size = ptrs = nonptrs = vhs = 0; + + /* Print variable header */ + fprintf(stderr, "] VH ["); + for (i = 0; i < vhs; i++) + fprintf(stderr, " %p", *bufptr++); + + fprintf(stderr, "] %d PTRS [", ptrs); + + /* Pointers will be filled in later */ + + fprintf(stderr, " ] %d NON-PTRS [", nonptrs); + /* Print non-pointers */ + for (i = 0; i < nonptrs; i++) + fprintf(stderr, " %p", *bufptr++); + + fprintf(stderr, "] (%s)\n", str); + + /* Indirections are never packed */ + // ASSERT(INFO_PTR(graph) != (W_) Ind_info_TO_USE); + + /* Add to queue for processing + When just printing the packet we do not have an unpacked closure + in hand, so we feed it the packet entry; + again, this assumes that at least the fixed header of the closure + has the same layout in the packet; also we may not overwrite entries + in the packet (done in Unpack), but for printing that's a bad idea + anyway */ + QueueClosure((StgClosure *)closure_start); + + /* No Common up needed for printing */ + + /* No Sort out the global address mapping for printing */ + + } /* normal closure case */ + + /* Locate next parent pointer */ + pptr++; + while (pptr + 1 > pptrs) { + parent = DeQueueClosure(); + + if (parent == NULL) + break; + else { + (void) get_closure_info(parent, &size, &pptrs, &nonptrs, + &pvhs, str); + pptr = 0; + } + } + } while (parent != NULL); + fprintf(stderr, ".* --- End packet <<%d>> ---\n", packBuffer->id); +} +#else /* GRAN */ +void +PrintPacket(buffer) +rtsPackBuffer *buffer; +{ + // extern char *info_hdr_type(P_ infoptr); /* defined in Threads.lc */ + // extern char 
*display_info_type(P_ infoptr);    /* defined in Threads.lc */
+
+  StgInfoTable *info;
+  nat size, ptrs, nonptrs, vhs;
+  char info_hdr_ty[80];
+  char str1[80], str2[80], junk_str[80];
+
+  /* globalAddr ga; */
+
+  nat bufsize, unpacked_size ;
+  StgClosure *parent;
+  nat pptr = 0, pptrs = 0, pvhs;
+
+  nat unpack_locn = 0;
+  nat gastart = unpack_locn;
+  nat closurestart = unpack_locn;
+
+  StgTSO *tso;
+  StgClosure *closure, *p;
+
+  nat i;
+
+  fprintf(stderr, "*** Printing <<%d>> (buffer @ %p):\n", buffer->id, buffer);
+  fprintf(stderr, "  size: %d; unpacked_size: %d; tso: %d (%p); buffer: %p\n",
+	  buffer->size, buffer->unpacked_size, buffer->tso->id, buffer->tso,
+	  buffer->buffer);
+  fputs("  contents: ", stderr);
+  for (unpack_locn=0; unpack_locn<buffer->size; unpack_locn++) {
+    closure = buffer->buffer[unpack_locn];
+    fprintf(stderr, ", %p (%s)",
+	    closure, info_type(closure));
+  }
+  fputc('\n', stderr);
+
+#if 0
+  /* traverse all elements of the graph; omitted for now, but might be useful */
+  InitClosureQueue();
+
+  tso = buffer->tso;
+
+  /* Unpack the header */
+  unpacked_size = buffer->unpacked_size;
+  bufsize = buffer->size;
+
+  fprintf(stderr, "Packet %p, size %u (unpacked size is %u); demanded by TSO %d (%p)[PE %d]\n--- Begin ---\n",
+	  buffer, bufsize, unpacked_size,
+	  tso->id, tso, where_is((StgClosure*)tso));
+
+  do {
+    closurestart = unpack_locn;
+    closure = buffer->buffer[unpack_locn++];
+
+    fprintf(stderr, "[%u]: (%p) ", closurestart, closure);
+
+    info = get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str1);
+    strcpy(str2, str1);
+    fprintf(stderr, "(%s|%s) ", str1, str2);
+
+    if (info->type == FETCH_ME || info->type == FETCH_ME_BQ ||
+	IS_BLACK_HOLE(closure))
+      size = ptrs = nonptrs = vhs = 0;
+
+    if (closure_THUNK(closure)) {
+      if (closure_UNPOINTED(closure))
+	fputs("UNPOINTED ", stderr);
+      else
+	fputs("POINTED ", stderr);
+    }
+    if (IS_BLACK_HOLE(closure)) {
+      fputs("BLACK HOLE\n", stderr);
+    } else {
+      /* Fixed header */
+      fprintf(stderr, "FH [");
+      for (i = 0, p = (StgClosure*)&(closure->header); i < _HS; i++, p++)
+	fprintf(stderr, " %p", *p);
+
+      /* Variable header
+      if (vhs > 0) {
+	fprintf(stderr, "] VH [%p", closure->payload[_HS]);
+
+	for (i = 1; i < vhs; i++)
+	  fprintf(stderr, " %p", closure->payload[_HS+i]);
+      }
+      */
+      fprintf(stderr, "] PTRS %u", ptrs);
+
+      /* Non-pointers */
+      if (nonptrs > 0) {
+	fprintf(stderr, " NPTRS [%p", closure->payload[_HS+vhs]);
+
+	for (i = 1; i < nonptrs; i++)
+	  fprintf(stderr, " %p", closure->payload[_HS+vhs+i]);
+
+	putc(']', stderr);
+      }
+      putc('\n', stderr);
+    }
+  } while (unpack_locn<bufsize) ;  /* (parent != NULL); */
+#endif /* 0 */
+}
+#endif /* PAR */
+
+#endif /* DEBUG || GRAN_CHECK */
diff --git a/ghc/rts/parallel/ParInit.c b/ghc/rts/parallel/ParInit.c
new file mode 100644
--- /dev/null
+++ b/ghc/rts/parallel/ParInit.c
+/* --------------------------------------------------------------------------
+   Time-stamp:
+   $Id: ParInit.c,v 1.2 2000/01/13 14:34:08 hwloidl Exp $
+
+   Initialising the parallel RTS
+
+   An extension based on Kevin Hammond's GRAPH for PVM version
+   P. Trinder, January 17th 1995.
+   Adapted for the new RTS
+   P. Trinder, July 1997.
+   H-W. Loidl, November 1999.
+
+   ------------------------------------------------------------------------ */
+
+#ifdef PAR /* whole file */
+
+#define NON_POSIX_SOURCE /* so says Solaris */
+
+//@menu
+//* Includes::
+//* Global variables::
+//* Initialisation Routines::
+//@end menu
+
+//@node Includes, Global variables
+//@subsection Includes
+
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "ParallelRts.h"
+#include <setjmp.h>
+#include "LLC.h"
+#include "HLC.h"
+
+//@node Global variables, Initialisation Routines, Includes
+//@subsection Global variables
+
+/* Global conditions defined here.
*/ + +rtsBool IAmMainThread = rtsFalse, /* Set for the main thread */ + GlobalStopPending = rtsFalse; /* Terminating */ + +/* Task identifiers for various interesting global tasks. */ + +GlobalTaskId IOTask = 0, /* The IO Task Id */ + SysManTask = 0, /* The System Manager Task Id */ + mytid = 0; /* This PE's Task Id */ + +rtsTime main_start_time; /* When the program started */ +rtsTime main_stop_time; /* When the program finished */ +jmp_buf exit_parallel_system; /* How to abort from the RTS */ + + +//rtsBool fishing = rtsFalse; /* We have no fish out in the stream */ +rtsTime last_fish_arrived_at = 0; /* Time of arrival of most recent fish*/ +nat outstandingFishes = 0; /* Number of active fishes */ + +//@cindex spark queue +/* GranSim: a globally visible array of spark queues */ +rtsSpark *pending_sparks_hd[SPARK_POOLS], /* ptr to start of a spark pool */ + *pending_sparks_tl[SPARK_POOLS], /* ptr to end of a spark pool */ + *pending_sparks_lim[SPARK_POOLS], + *pending_sparks_base[SPARK_POOLS]; + +//@cindex spark_limit +/* max number of sparks permitted on the PE; + see RtsFlags.ParFlags.maxLocalSparks */ +nat spark_limit[SPARK_POOLS]; + +globalAddr theGlobalFromGA, theGlobalToGA; +/* + HAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACK !! see FETCH_ME_entry + Only used within FETCH_ME_entry as local vars, but they shouldn't + be defined locally in there -- that would move %esp and you'll never + return from STG land. + -- HWL +*/ +globalAddr *rga_GLOBAL; +globalAddr *lga_GLOBAL; +globalAddr fmbqga_GLOBAL; +StgClosure *p_GLOBAL; + +//@cindex PendingFetches +/* A list of fetch reply messages not yet processed; this list is filled + by awaken_blocked_queue and processed by processFetches */ +StgBlockedFetch *PendingFetches = END_BF_QUEUE; + +//@cindex allPEs +GlobalTaskId *allPEs; + +//@cindex nPEs +nat nPEs = 0; + +//@cindex sparksIgnored +nat sparksIgnored = 0, sparksCreated = 0, + threadsIgnored = 0, threadsCreated = 0; + +//@cindex advisory_thread_count +nat advisory_thread_count = 0; + +/* Where to write the log file + This is now in Parallel.c +FILE *gr_file = NULL; +char gr_filename[STATS_FILENAME_MAXLEN]; +*/ + +/* Flag handling. */ + +#if 0 +/* that's now all done via RtsFlags.ParFlags... */ +rtsBool TraceSparks = rtsFalse; /* Enable the spark trace mode */ +rtsBool SparkLocally = rtsFalse; /* Use local threads if possible */ +rtsBool DelaySparks = rtsFalse; /* Use delayed sparking */ +rtsBool LocalSparkStrategy = rtsFalse; /* Either delayed threads or local threads*/ +rtsBool GlobalSparkStrategy = rtsFalse; /* Export all threads */ + +rtsBool DeferGlobalUpdates = rtsFalse; /* Defer updating of global nodes */ +#endif + +//@node Initialisation Routines, , Global variables +//@subsection Initialisation Routines + +/* + par_exit defines how to terminate the program. If the exit code is + non-zero (i.e. an error has occurred), the PE should not halt until + outstanding error messages have been processed. Otherwise, messages + might be sent to non-existent Task Ids. The infinite loop will actually + terminate, since STG_Exception will call myexit\tr{(0)} when + it received a PP_FINISH from the system manager task. 
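+
+  In outline, the normal shutdown (n == 0) performed below is:
+
+     sendOp(PP_FINISH, SysManTask);        // tell the system manager
+     waitForPEOp(PP_FINISH, SysManTask);   // wait for the global FINISH
+     shutDownPE();
+     exit(0);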
+*/
+//@cindex par_exit
+void
+shutdownParallelSystem(StgInt n)
+{
+  belch(" entered shutdownParallelSystem ...");
+  GlobalStopPending = rtsTrue;
+  sendOp(PP_FINISH, SysManTask);
+  if (n != 0)
+    waitForTermination();
+  else
+    waitForPEOp(PP_FINISH, SysManTask);
+  shutDownPE();
+  IF_PAR_DEBUG(verbose,
+	       belch("--++ shutting down PE %lx, %ld sparks created, %ld sparks Ignored, %ld threads created, %ld threads Ignored",
+		     (W_) mytid, sparksCreated, sparksIgnored,
+		     threadsCreated, threadsIgnored));
+  exit(n);
+}
+
+/* Some prototypes */
+void srand48 (long);
+time_t time (time_t *);
+
+//@cindex initParallelSystem
+void
+initParallelSystem(void)
+{
+  belch("entered initParallelSystem ...");
+
+  /* Don't buffer standard channels... */
+  setbuf(stdout,NULL);
+  setbuf(stderr,NULL);
+
+  srand48(time(NULL) * getpid()); /* Initialise random-number generator seed */
+				  /* Used to select target of FISH message */
+
+  theGlobalFromGA.payload.gc.gtid = 0;
+  theGlobalToGA.payload.gc.gtid = 0;
+
+  //IF_PAR_DEBUG(verbose,
+  belch("initPackBuffer ...");
+  if (!initPackBuffer())
+    barf("initPackBuffer");
+
+  // IF_PAR_DEBUG(verbose,
+  belch("initMoreBuffers ...");
+  if (!initMoreBuffers())
+    barf("initMoreBuffers");
+
+  // IF_PAR_DEBUG(verbose,
+  belch("initSparkPools ...");
+  if (!initSparkPools())
+    barf("initSparkPools");
+}
+
+/*
+ * SynchroniseSystem synchronises the reduction task with the system
+ * manager, and initialises the Global address tables (LAGA & GALA)
+ */
+
+//@cindex SynchroniseSystem
+void
+SynchroniseSystem(void)
+{
+  int i;
+
+  fprintf(stderr, "SynchroniseSystem: nPEs=%d\n", nPEs);
+
+  initEachPEHook();                  /* HWL: hook to be execed on each PE */
+
+  fprintf(stderr, "SynchroniseSystem: initParallelSystem\n");
+  initParallelSystem();
+  allPEs = startUpPE(nPEs);
+
+  /* Initialize global address tables */
+  initGAtables();
+
+  /* Record the shortened PE identifiers for the LAGA etc.
tables */ + for (i = 0; i < nPEs; ++i) { + fprintf(stderr, "[%x] registering %d-th PE as %x\n", mytid, i, allPEs[i]); + registerTask(allPEs[i]); + } +} + +#endif /* PAR -- whole file */ + +//@index +//* PendingFetches:: @cindex\s-+PendingFetches +//* SynchroniseSystem:: @cindex\s-+SynchroniseSystem +//* allPEs:: @cindex\s-+allPEs +//* initParallelSystem:: @cindex\s-+initParallelSystem +//* nPEs:: @cindex\s-+nPEs +//* par_exit:: @cindex\s-+par_exit +//* spark queue:: @cindex\s-+spark queue +//* sparksIgnored:: @cindex\s-+sparksIgnored +//@end index diff --git a/ghc/rts/parallel/ParInit.h b/ghc/rts/parallel/ParInit.h new file mode 100644 index 0000000..add7ad9 --- /dev/null +++ b/ghc/rts/parallel/ParInit.h @@ -0,0 +1,19 @@ +/* ----------------------------------------------------------------------------- + * ParInit.h,1 + * + * Phil Trinder + * July 1998 + * + * External Parallel Initialisation Interface + * + * ---------------------------------------------------------------------------*/ + +#ifndef PARINIT_H +#define PARINIT_H + +extern void RunParallelSystem (P_); +extern void initParallelSystem(void); +extern void SynchroniseSystem(void); +extern void par_exit(I_); + +#endif PARINIT_H diff --git a/ghc/rts/parallel/ParTypes.h b/ghc/rts/parallel/ParTypes.h new file mode 100644 index 0000000..b280eae --- /dev/null +++ b/ghc/rts/parallel/ParTypes.h @@ -0,0 +1,39 @@ +/* --------------------------------------------------------------------------- + * Time-stamp: + * $Id: ParTypes.h,v 1.2 2000/01/13 14:34:08 hwloidl Exp $ + * + * Runtime system types for GUM + * + * ------------------------------------------------------------------------- */ + +#ifndef PARTYPES_H +#define PARTYPES_H + +#ifdef PAR /* all of it */ + +// now in Parallel.h +//typedef struct hashtable HashTable; +//typedef struct hashlist HashList; + +/* Global addresses now live in Parallel.h (needed in Closures.h) */ +// gaddr + +// now in Parallel.h +/* (GA, LA) pairs +typedef struct gala { + globalAddr ga; + StgPtr la; + struct gala *next; + rtsBool preferred; +} rtsGaLa; +*/ + +#if defined(GRAN) +typedef unsigned long TIME; +typedef unsigned char Proc; +typedef unsigned char EVTTYPE; +#endif + +#endif /* PAR */ + +#endif /* ! PARTYPES_H */ diff --git a/ghc/rts/parallel/Parallel.c b/ghc/rts/parallel/Parallel.c new file mode 100644 index 0000000..8feb516 --- /dev/null +++ b/ghc/rts/parallel/Parallel.c @@ -0,0 +1,776 @@ +/* + Time-stamp: + + Basic functions for use in either GranSim or GUM. +*/ + +#if defined(GRAN) || defined(PAR) /* whole file */ + +//@menu +//* Includes:: +//* Variables and constants:: +//* Writing to the log-file:: +//* Dumping routines:: +//@end menu + +//@node Includes, Variables and constants +//@subsection Includes + +#include "Rts.h" +#include "RtsFlags.h" +#include "RtsUtils.h" +#include "GranSimRts.h" +#include "ParallelRts.h" + + +//@node Variables and constants, Writing to the log-file, Includes +//@subsection Variables and constants + +/* Where to write the log file */ +FILE *gr_file = NULL; +char gr_filename[STATS_FILENAME_MAXLEN]; + +//@node Writing to the log-file, Dumping routines, Variables and constants +//@subsection Writing to the log-file +/* + Writing to the log-file + + These routines dump event-based info to the main log-file. + The code for writing log files is shared between GranSim and GUM. +*/ + +/* + * If you're not using GNUC and you're on a 32-bit machine, you're + * probably out of luck here. However, since CONCURRENT currently + * requires GNUC, I'm not too worried about it. 
--JSM + */ + +//@cindex init_gr_simulation +#if defined(GRAN) +void +init_gr_simulation(rts_argc, rts_argv, prog_argc, prog_argv) +char *prog_argv[], *rts_argv[]; +int prog_argc, rts_argc; +{ + nat i; + char *extension = RtsFlags.GranFlags.GranSimStats.Binary ? "gb" : "gr"; + + if (RtsFlags.GranFlags.GranSimStats.Global) + init_gr_stats(); + + /* init global constants for costs of basic operations */ + gran_arith_cost = RtsFlags.GranFlags.Costs.arith_cost; + gran_branch_cost = RtsFlags.GranFlags.Costs.branch_cost; + gran_load_cost = RtsFlags.GranFlags.Costs.load_cost; + gran_store_cost = RtsFlags.GranFlags.Costs.store_cost; + gran_float_cost = RtsFlags.GranFlags.Costs.float_cost; + + if (RtsFlags.GranFlags.GranSimStats.Suppressed) + return; + + if (!RtsFlags.GranFlags.GranSimStats.Full) + return; + + sprintf(gr_filename, GR_FILENAME_FMT, prog_argv[0], extension); + + if ((gr_file = fopen(gr_filename, "w")) == NULL) { + barf("Can't open granularity simulation report file %s\n", + gr_filename); + } + + setbuf(gr_file, NULL); // for debugging turn buffering off + + /* write header with program name, options and setup to gr_file */ + fputs("Granularity Simulation for ", gr_file); + for (i = 0; i < prog_argc; ++i) { + fputs(prog_argv[i], gr_file); + fputc(' ', gr_file); + } + + if (rts_argc > 0) { + fputs("+RTS ", gr_file); + + for (i = 0; i < rts_argc; ++i) { + fputs(rts_argv[i], gr_file); + fputc(' ', gr_file); + } + } + + fputs("\nStart time: ", gr_file); + fputs(time_str(), gr_file); /* defined in RtsUtils.c */ + fputc('\n', gr_file); + + fputs("\n\n--------------------\n\n", gr_file); + + fputs("General Parameters:\n\n", gr_file); + + if (RtsFlags.GranFlags.Light) + fprintf(gr_file, "GrAnSim-Light\nPEs infinite, %s Scheduler, %sMigrate Threads %s, %s\n", + RtsFlags.GranFlags.DoFairSchedule?"Fair":"Unfair", + RtsFlags.GranFlags.DoThreadMigration?"":"Don't ", + RtsFlags.GranFlags.DoThreadMigration && RtsFlags.GranFlags.DoStealThreadsFirst?" Before Sparks":"", + RtsFlags.GranFlags.DoAsyncFetch ? "Asynchronous Fetch" : + "Block on Fetch"); + else + fprintf(gr_file, "PEs %u, %s Scheduler, %sMigrate Threads %s, %s\n", + RtsFlags.GranFlags.proc,RtsFlags.GranFlags.DoFairSchedule?"Fair":"Unfair", + RtsFlags.GranFlags.DoThreadMigration?"":"Don't ", + RtsFlags.GranFlags.DoThreadMigration && RtsFlags.GranFlags.DoStealThreadsFirst?" Before Sparks":"", + RtsFlags.GranFlags.DoAsyncFetch ? "Asynchronous Fetch" : + "Block on Fetch"); + + if (RtsFlags.GranFlags.DoBulkFetching) + if (RtsFlags.GranFlags.ThunksToPack) + fprintf(gr_file, "Bulk Fetching: Fetch %d Thunks in Each Packet (Packet Size = %d closures)\n", + RtsFlags.GranFlags.ThunksToPack, + RtsFlags.GranFlags.packBufferSize); + else + fprintf(gr_file, "Bulk Fetching: Fetch as many closures as possible (Packet Size = %d closures)\n", + RtsFlags.GranFlags.packBufferSize); + else + fprintf(gr_file, "Incremental Fetching: Fetch Exactly One Closure in Each Packet\n"); + + fprintf(gr_file, "Fetch Strategy(%u):If outstanding fetches %s\n", + RtsFlags.GranFlags.FetchStrategy, + RtsFlags.GranFlags.FetchStrategy==0 ? + " block (block-on-fetch)": + RtsFlags.GranFlags.FetchStrategy==1 ? + "only run runnable threads": + RtsFlags.GranFlags.FetchStrategy==2 ? + "create threads only from local sparks": + RtsFlags.GranFlags.FetchStrategy==3 ? + "create threads from local or global sparks": + RtsFlags.GranFlags.FetchStrategy==4 ? 
+ "create sparks and steal threads if necessary": + "unknown"); + + if (RtsFlags.GranFlags.DoPrioritySparking) + fprintf(gr_file, "Priority Sparking (i.e. keep sparks ordered by priority)\n"); + + if (RtsFlags.GranFlags.DoPriorityScheduling) + fprintf(gr_file, "Priority Scheduling (i.e. keep threads ordered by priority)\n"); + + fprintf(gr_file, "Thread Creation Time %u, Thread Queue Time %u\n", + RtsFlags.GranFlags.Costs.threadcreatetime, + RtsFlags.GranFlags.Costs.threadqueuetime); + fprintf(gr_file, "Thread DeSchedule Time %u, Thread Schedule Time %u\n", + RtsFlags.GranFlags.Costs.threaddescheduletime, + RtsFlags.GranFlags.Costs.threadscheduletime); + fprintf(gr_file, "Thread Context-Switch Time %u\n", + RtsFlags.GranFlags.Costs.threadcontextswitchtime); + fputs("\n\n--------------------\n\n", gr_file); + + fputs("Communication Metrics:\n\n", gr_file); + fprintf(gr_file, + "Latency %u (1st) %u (rest), Fetch %u, Notify %u (Global) %u (Local)\n", + RtsFlags.GranFlags.Costs.latency, + RtsFlags.GranFlags.Costs.additional_latency, + RtsFlags.GranFlags.Costs.fetchtime, + RtsFlags.GranFlags.Costs.gunblocktime, + RtsFlags.GranFlags.Costs.lunblocktime); + fprintf(gr_file, + "Message Creation %u (+ %u after send), Message Read %u\n", + RtsFlags.GranFlags.Costs.mpacktime, + RtsFlags.GranFlags.Costs.mtidytime, + RtsFlags.GranFlags.Costs.munpacktime); + fputs("\n\n--------------------\n\n", gr_file); + + fputs("Instruction Metrics:\n\n", gr_file); + fprintf(gr_file, "Arith %u, Branch %u, Load %u, Store %u, Float %u, Alloc %u\n", + RtsFlags.GranFlags.Costs.arith_cost, + RtsFlags.GranFlags.Costs.branch_cost, + RtsFlags.GranFlags.Costs.load_cost, + RtsFlags.GranFlags.Costs.store_cost, + RtsFlags.GranFlags.Costs.float_cost, + RtsFlags.GranFlags.Costs.heapalloc_cost); + fputs("\n\n++++++++++++++++++++\n\n", gr_file); + +# if 0 + /* binary log files are currently not supported */ + if (RtsFlags.GranFlags.GranSimStats.Binary) + grputw(sizeof(rtsTime)); +# endif + + return (0); +} + +#elif defined(PAR) + +void +init_gr_simulation(rts_argc, rts_argv, prog_argc, prog_argv) +char *prog_argv[], *rts_argv[]; +int prog_argc, rts_argc; +{ + nat i; + char time_string[TIME_STR_LEN], node_str[NODE_STR_LEN]; + char *extension = RtsFlags.ParFlags.ParStats.Binary ? 
"gb" : "gr"; + + sprintf(gr_filename, GR_FILENAME_FMT_GUM, prog_argv[0], thisPE, extension); + + if (!RtsFlags.ParFlags.ParStats.Full) + return; + + if ((gr_file = fopen(gr_filename, "w")) == NULL) + barf("Can't open activity report file %s\n", gr_filename); + + setbuf(gr_file, NULL); // for debugging turn buffering off + + /* write header with program name, options and setup to gr_file */ + for (i = 0; i < prog_argc; ++i) { + fputs(prog_argv[i], gr_file); + fputc(' ', gr_file); + } + + if (rts_argc > 0) { + fputs("+RTS ", gr_file); + + for (i = 0; i < rts_argc; ++i) { + fputs(rts_argv[i], gr_file); + fputc(' ', gr_file); + } + } + fputc('\n', gr_file); + + /* record the absolute start time to allow synchronisation of log-files */ + fputs("Start-Time: ", gr_file); + fputs(time_str(), gr_file); + fputc('\n', gr_file); + + startTime = CURRENT_TIME; + ullong_format_string(CURRENT_TIME, time_string, rtsFalse/*no commas!*/); + fprintf(gr_file, "PE %2u [%s]: TIME\n", thisPE, time_string); + + /* + IF_PAR_DEBUG(verbose, + belch("== Start-time: %ld (%s)", + startTime, time_string)); + */ +# if 0 + ngoq Dogh'q' vImuS + + if (startTime > LL(1000000000)) { + fprintf(gr_file, "PE %2u [%lu%lu]: TIME\n", thisPE, + (rtsTime) (startTime / LL(1000000000)), + (rtsTime) (startTime % LL(1000000000))); + } else { + fprintf(gr_file, "PE %2u [%lu]: TIME\n", thisPE, (TIME) startTime); + } + /* binary log files are currently not supported */ + if (RtsFlags.GranFlags.GranSimStats.Binary) + grputw(sizeof(rtsTime)); +# endif + + return; +} +#endif /* PAR */ + +//@cindex end_gr_simulation +#if defined(GRAN) +void +end_gr_simulation(void) +{ + char time_string[TIME_STR_LEN]; + + ullong_format_string(CURRENT_TIME, time_string, rtsFalse/*no commas!*/); + + if (RtsFlags.GranFlags.GranSimStats.Suppressed) + return; + + /* Print event stats */ + if (RtsFlags.GranFlags.GranSimStats.Global) { + nat i; + + fprintf(stderr,"Total yields: %d\n", + globalGranStats.tot_yields); + + fprintf(stderr,"Total number of threads created: %d ; per PE:\n", + globalGranStats.tot_threads_created); + for (i=0; i0) + fprintf(stderr," Number of closure queue overflows: %u\n", + closure_queue_overflows); + */ + } + } /* RtsFlags.GranFlags.GranSimStats.Global */ + +# if defined(GRAN_COUNT) +# error "GRAN_COUNT not supported; should be parallel ticky profiling, really" + fprintf(stderr,"Update count statistics:\n"); + fprintf(stderr," Total number of updates: %u\n",nUPDs); + fprintf(stderr," Needed to awaken BQ: %u with avg BQ len of: %f\n", + nUPDs_BQ,(float)BQ_lens/(float)nUPDs_BQ); + fprintf(stderr," Number of PAPs: %u\n",nPAPs); +# endif + + fprintf(stderr, "Simulation finished after @ %s @ cycles. %d sparks created, %d sparks ignored. Check %s for details.\n", + time_string, sparksCreated, sparksIgnored, gr_filename); + + if (RtsFlags.GranFlags.GranSimStats.Full) + fclose(gr_file); +} + +#elif defined(PAR) + +/* + Under GUM we print only one line. +*/ +void +end_gr_simulation(void) +{ + char time_string[TIME_STR_LEN]; + + ullong_format_string(CURRENT_TIME-startTime, time_string, rtsFalse/*no commas!*/); + + fprintf(stderr, "Computation finished after @ %s @ ms. %d sparks created, %d sparks ignored. 
Check %s for details.\n", + time_string, sparksCreated, sparksIgnored, gr_filename); + + if (RtsFlags.ParFlags.ParStats.Full) + fclose(gr_file); +} +#endif /* PAR */ + +//@node Dumping routines, , Writing to the log-file +//@subsection Dumping routines + +//@cindex DumpGranEvent +void +DumpGranEvent(name, tso) +GranEventType name; +StgTSO *tso; +{ + DumpRawGranEvent(CURRENT_PROC, (PEs)0, name, tso, END_TSO_QUEUE, (StgInt)0, (StgInt)0); +} + +//@cindex DumpRawGranEvent +void +DumpRawGranEvent(proc, p, name, tso, node, sparkname, len) +PEs proc, p; /* proc ... where it happens; p ... where node lives */ +GranEventType name; +StgTSO *tso; +StgClosure *node; +StgInt sparkname, len; +{ + FILE *output_file; // DEBUGGING ONLY !!!!!!!!!!!!!!!!!!!!!!!!!1 + StgWord id; + char time_string[TIME_STR_LEN], node_str[NODE_STR_LEN]; + ullong_format_string(TIME_ON_PROC(proc), time_string, rtsFalse/*no commas!*/); + + output_file = gr_file; + ASSERT(output_file!=NULL); +# if defined(GRAN) + IF_DEBUG(gran, + fprintf(stderr, "GRAN: Dumping info to file with handle %#x\n", output_file)) + + if (RtsFlags.GranFlags.GranSimStats.Suppressed) + return; +# endif + + id = tso == NULL ? -1 : tso->id; + if (node==stgCast(StgClosure*,&END_TSO_QUEUE_closure)) + strcpy(node_str,"________"); /* "END_TSO_QUEUE"); */ + else + sprintf(node_str,"0x%-6lx",node); + + if (name > GR_EVENT_MAX) + name = GR_EVENT_MAX; + + if (BINARY_STATS) + barf("binary log files not yet supported"); +#if 0 + /* ToDo: fix code for writing binary GrAnSim statistics */ + switch (name) { + case GR_START: + case GR_STARTQ: + grputw(name); + grputw(proc); + abort(); /* die please: a single word */ + /* doesn't represent long long times */ + grputw(TIME_ON_PROC(proc)); + grputw((StgWord)node); + break; + case GR_FETCH: + case GR_REPLY: + case GR_BLOCK: + grputw(name); + grputw(proc); + abort(); /* die please: a single word */ + /* doesn't represent long long times */ + grputw(TIME_ON_PROC(proc)); /* this line is bound to */ + grputw(id); /* do the wrong thing */ + break; + default: + grputw(name); + grputw(proc); + abort(); /* die please: a single word */ + /* doesn't represent long long times */ + grputw(TIME_ON_PROC(proc)); + grputw((StgWord)node); + } +#endif + else /* !BINARY_STATS */ + switch (name) { + case GR_START: + case GR_STARTQ: + fprintf(output_file,"PE %2u [%s]: %-9s\t%lx\t%s\t[SN %u]\t[sparks %u]\n", + proc,time_string,gran_event_names[name], + id,node_str,sparkname,len); + break; + case GR_FETCH: + case GR_REPLY: + case GR_BLOCK: + case GR_STOLEN: + case GR_STOLENQ: + fprintf(output_file, "PE %2u [%s]: %-9s\t%lx \t%s\t(from %2u)\n", + proc, time_string, gran_event_names[name], + id,node_str,p); + break; + case GR_RESUME: + case GR_RESUMEQ: + case GR_SCHEDULE: + case GR_DESCHEDULE: + fprintf(output_file,"PE %2u [%s]: %-9s\t%lx \n", + proc,time_string,gran_event_names[name],id); + break; + case GR_STEALING: + fprintf(output_file,"PE %2u [%s]: %-9s\t%lx\t \t(by %2u)\n", + proc,time_string,gran_event_names[name],id,p); + break; + case GR_ALLOC: + fprintf(output_file,"PE %2u [%s]: %-9s\t%lx\t \tallocating %u words\n", + proc,time_string,gran_event_names[name],id,len); + break; + default: + fprintf(output_file,"PE %2u [%s]: %-9s\t%lx\t%s\t[sparks %u]\n", + proc,time_string,gran_event_names[name],id,node_str,len); + } +} + +//@cindex DumpGranInfo +void +DumpEndEvent(proc, tso, mandatory_thread) +PEs proc; +StgTSO *tso; +rtsBool mandatory_thread; +{ + FILE *output_file; // DEBUGGING ONLY !!!!!!!!!!!!!!!!!!!!!!!!!1 + char time_string[TIME_STR_LEN]; 
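+  /* A sample of the line emitted below in the non-binary case (field
+     values are made up; the format string is in the fprintf below):
+       PE  1 [1234]: END 2a, SN 0, ST 100, EXP F, BB 3, HA 0, RT 17, BT 2 (1), FT 0 (0), LS 1, GS 0, MY T
+  */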
+ ullong_format_string(CURRENT_TIME, time_string, rtsFalse/*no commas!*/); + + output_file = gr_file; + ASSERT(output_file!=NULL); +#if defined(GRAN) + if (RtsFlags.GranFlags.GranSimStats.Suppressed) + return; +#endif + + if (BINARY_STATS) { + barf("binary log files not yet supported"); +#if 0 + grputw(GR_END); + grputw(proc); + abort(); /* die please: a single word doesn't represent long long times */ + grputw(CURRENT_TIME); /* this line is bound to fail */ + grputw(tso->id); +#ifdef PAR + grputw(0); + grputw(0); + grputw(0); + grputw(0); + grputw(0); + grputw(0); + grputw(0); + grputw(0); + grputw(0); + grputw(0); + grputw(0); + grputw(0); +#else + grputw(tso->gran.sparkname); + grputw(tso->gran.startedat); + grputw(tso->gran.exported); + grputw(tso->gran.basicblocks); + grputw(tso->gran.allocs); + grputw(tso->gran.exectime); + grputw(tso->gran.blocktime); + grputw(tso->gran.blockcount); + grputw(tso->gran.fetchtime); + grputw(tso->gran.fetchcount); + grputw(tso->gran.localsparks); + grputw(tso->gran.globalsparks); +#endif + grputw(mandatory_thread); +#endif /* 0 */ + } else { + + /* + * NB: DumpGranEvent cannot be used because PE may be wrong + * (as well as the extra info) + */ + fprintf(output_file, "PE %2u [%s]: END %lx, SN %u, ST %lu, EXP %c, BB %u, HA %u, RT %u, BT %u (%u), FT %u (%u), LS %u, GS %u, MY %c\n" + ,proc + ,time_string + ,tso->id +#if defined(GRAN) + ,tso->gran.sparkname + ,tso->gran.startedat + ,tso->gran.exported ? 'T' : 'F' + ,tso->gran.basicblocks + ,tso->gran.allocs + ,tso->gran.exectime + ,tso->gran.blocktime + ,tso->gran.blockcount + ,tso->gran.fetchtime + ,tso->gran.fetchcount + ,tso->gran.localsparks + ,tso->gran.globalsparks +#elif defined(PAR) + ,tso->par.sparkname + ,tso->par.startedat + ,tso->par.exported ? 'T' : 'F' + ,tso->par.basicblocks + ,tso->par.allocs + ,tso->par.exectime + ,tso->par.blocktime + ,tso->par.blockcount + ,tso->par.fetchtime + ,tso->par.fetchcount + ,tso->par.localsparks + ,tso->par.globalsparks +#endif + ,mandatory_thread ? 'T' : 'F' + ); + } +} + +//@cindex DumpTSO +void +DumpTSO(tso) +StgTSO *tso; +{ + FILE *output_file; // DEBUGGING ONLY !!!!!!!!!!!!!!!!!!!!!!!!!1 + + output_file = gr_file; + ASSERT(output_file!=NULL); + fprintf(stderr,"TSO 0x%lx, NAME 0x%lx, ID %u, LINK 0x%lx, TYPE %s\n" + ,tso +#if defined(GRAN) + ,tso->gran.sparkname +#elif defined(PAR) + ,tso->par.sparkname +#endif + ,tso->id + ,tso->link + ,/*tso->state==T_MAIN?"MAIN": + TSO_TYPE(tso)==T_FAIL?"FAIL": + TSO_TYPE(tso)==T_REQUIRED?"REQUIRED": + TSO_TYPE(tso)==T_ADVISORY?"ADVISORY": + */ + "???" + ); + + fprintf(output_file,"TSO %lx: SN %u, ST %u, GBL %c, BB %u, HA %u, RT %u, BT %u (%u), FT %u (%u) LS %u, GS %u\n" + ,tso->id +#if defined(GRAN) + ,tso->gran.sparkname + ,tso->gran.startedat + ,tso->gran.exported?'T':'F' + ,tso->gran.basicblocks + ,tso->gran.allocs + ,tso->gran.exectime + ,tso->gran.blocktime + ,tso->gran.blockcount + ,tso->gran.fetchtime + ,tso->gran.fetchcount + ,tso->gran.localsparks + ,tso->gran.globalsparks +#elif defined(PAR) + ,tso->par.sparkname + ,tso->par.startedat + ,tso->par.exported?'T':'F' + ,tso->par.basicblocks + ,tso->par.allocs + ,tso->par.exectime + ,tso->par.blocktime + ,tso->par.blockcount + ,tso->par.fetchtime + ,tso->par.fetchcount + ,tso->par.localsparks + ,tso->par.globalsparks +#endif + ); +} + +#if 0 +/* + ToDo: fix binary output of log files, and support new log file format. +*/ +/* + Output a terminate event and an 8-byte time. 
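+   E.g. v = 0x0102030405060708 is written as the byte sequence
+   01 02 03 04 05 06 07 08 (most significant byte first); with a 4-byte
+   rtsTime the high four bytes are written as zeros.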
+*/ + +//@cindex grterminate +void +grterminate(v) +rtsTime v; +{ + if (!BINARY_STATS) + barf("grterminate: binary statistics not enabled\n"); + +# if defined(GRAN) + if (RtsFlags.GranFlags.GranSimStats.Suppressed) + return; +# endif + + DumpGranEvent(GR_TERMINATE, stgCast(StgTSO*,&END_TSO_QUEUE_closure)); + + if (sizeof(rtsTime) == 4) { + putc('\0', gr_file); + putc('\0', gr_file); + putc('\0', gr_file); + putc('\0', gr_file); + } else { + putc(v >> 56l, gr_file); + putc((v >> 48l) & 0xffl, gr_file); + putc((v >> 40l) & 0xffl, gr_file); + putc((v >> 32l) & 0xffl, gr_file); + } + putc((v >> 24l) & 0xffl, gr_file); + putc((v >> 16l) & 0xffl, gr_file); + putc((v >> 8l) & 0xffl, gr_file); + putc(v & 0xffl, gr_file); +} + +/* + Length-coded output: first 3 bits contain length coding + + 00x 1 byte + 01x 2 bytes + 10x 4 bytes + 110 8 bytes + 111 5 or 9 bytes +*/ + +//@cindex grputw +void +grputw(v) +rtsTime v; +{ + if (!BINARY_STATS) + barf("grputw: binary statistics not enabled\n"); + +# if defined(GRAN) + if (RtsFlags.GranFlags.GranSimStats.Suppressed) + return; +# endif + + if (v <= 0x3fl) { /* length v = 1 byte */ + fputc(v & 0x3f, gr_file); + } else if (v <= 0x3fffl) { /* length v = 2 byte */ + fputc((v >> 8l) | 0x40l, gr_file); + fputc(v & 0xffl, gr_file); + } else if (v <= 0x3fffffffl) { /* length v = 4 byte */ + fputc((v >> 24l) | 0x80l, gr_file); + fputc((v >> 16l) & 0xffl, gr_file); + fputc((v >> 8l) & 0xffl, gr_file); + fputc(v & 0xffl, gr_file); + } else if (sizeof(TIME) == 4) { + fputc(0x70, gr_file); + fputc((v >> 24l) & 0xffl, gr_file); + fputc((v >> 16l) & 0xffl, gr_file); + fputc((v >> 8l) & 0xffl, gr_file); + fputc(v & 0xffl, gr_file); + } else { + if (v <= 0x3fffffffffffffl) + putc((v >> 56l) | 0x60l, gr_file); + else { + putc(0x70, gr_file); + putc((v >> 56l) & 0xffl, gr_file); + } + + putc((v >> 48l) & 0xffl, gr_file); + putc((v >> 40l) & 0xffl, gr_file); + putc((v >> 32l) & 0xffl, gr_file); + putc((v >> 24l) & 0xffl, gr_file); + putc((v >> 16l) & 0xffl, gr_file); + putc((v >> 8l) & 0xffl, gr_file); + putc(v & 0xffl, gr_file); + } +} +#endif /* 0 */ + +#endif /* GRAN || PAR whole file */ diff --git a/ghc/rts/parallel/ParallelDebug.c b/ghc/rts/parallel/ParallelDebug.c new file mode 100644 index 0000000..2924b51 --- /dev/null +++ b/ghc/rts/parallel/ParallelDebug.c @@ -0,0 +1,1390 @@ +/* + Time-stamp: + +Various debugging routines for GranSim and GUM +*/ + +#if defined(GRAN) || defined(PAR) /* whole file */ + +//@node Debugging routines for GranSim and GUM, , , +//@section Debugging routines for GranSim and GUM + +//@menu +//* Includes:: +//* Constants and Variables:: +//* Closures:: +//* Threads:: +//* Events:: +//* Sparks:: +//* Processors:: +//* Shortcuts:: +//* Printing info type:: +//* Printing Pack:et Contents:: +//* End of File:: +//@end menu +//*/ + +//@node Includes, Prototypes, Debugging routines for GranSim and GUM, Debugging routines for GranSim and GUM +//@subsection Includes + +#include "Rts.h" +#include "RtsFlags.h" +#include "GranSimRts.h" +#include "ParallelRts.h" +#include "StgMiscClosures.h" +# if defined(DEBUG) +# include "ParallelDebug.h" +# endif + +//@node Prototypes, Constants and Variables, Includes, Debugging routines for GranSim and GUM +//@subsection Prototypes +/* +rtsBool isOffset(globalAddr *ga); +rtsBool isFixed(globalAddr *ga); +*/ +//@node Constants and Variables, Closures, Prototypes, Debugging routines for GranSim and GUM +//@subsection Constants and Variables + +/* Names as strings; needed by get_closure_info in ClosureMacros.h -- HWL */ 
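+/* (Indexed by closure type, e.g. closure_type_names[get_itbl(node)->type];
+    this table has to be kept in sync with the closure type constants.) */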
+static char *closure_type_names[] = { + "INVALID_OBJECT", /* 0 */ + "CONSTR", /* 1 */ + "CONSTR_1_0", /* 2 */ + "CONSTR_0_1", /* 3 */ + "CONSTR_2_0", /* 4 */ + "CONSTR_1_1", /* 5 */ + "CONSTR_0_2", /* 6 */ + "CONSTR_INTLIKE", /* 7 */ + "CONSTR_CHARLIKE", /* 8 */ + "CONSTR_STATIC", /* 9 */ + "CONSTR_NOCAF_STATIC", /* 10 */ + "FUN", /* 11 */ + "FUN_1_0", /* 12 */ + "FUN_0_1", /* 13 */ + "FUN_2_0", /* 14 */ + "FUN_1_1", /* 15 */ + "FUN_0_2", /* 16 */ + "FUN_STATIC", /* 17 */ + "THUNK", /* 18 */ + "THUNK_1_0", /* 19 */ + "THUNK_0_1", /* 20 */ + "THUNK_2_0", /* 21 */ + "THUNK_1_1", /* 22 */ + "THUNK_0_2", /* 23 */ + "THUNK_STATIC", /* 24 */ + "THUNK_SELECTOR", /* 25 */ + "BCO", /* 26 */ + "AP_UPD", /* 27 */ + "PAP", /* 28 */ + "IND", /* 29 */ + "IND_OLDGEN", /* 30 */ + "IND_PERM", /* 31 */ + "IND_OLDGEN_PERM", /* 32 */ + "IND_STATIC", /* 33 */ + "CAF_UNENTERED", /* 34 */ + "CAF_ENTERED", /* 35 */ + "CAF_BLACKHOLE", /* 36 */ + "RET_BCO", /* 37 */ + "RET_SMALL", /* 38 */ + "RET_VEC_SMALL", /* 39 */ + "RET_BIG", /* 40 */ + "RET_VEC_BIG", /* 41 */ + "RET_DYN", /* 42 */ + "UPDATE_FRAME", /* 43 */ + "CATCH_FRAME", /* 44 */ + "STOP_FRAME", /* 45 */ + "SEQ_FRAME", /* 46 */ + "BLACKHOLE", /* 47 */ + "BLACKHOLE_BQ", /* 48 */ + "SE_BLACKHOLE", /* 49 */ + "SE_CAF_BLACKHOLE", /* 50 */ + "MVAR", /* 51 */ + "ARR_WORDS", /* 52 */ + "MUT_ARR_PTRS", /* 53 */ + "MUT_ARR_PTRS_FROZEN", /* 54 */ + "MUT_VAR", /* 55 */ + "WEAK", /* 56 */ + "FOREIGN", /* 57 */ + "STABLE_NAME", /* 58 */ + "TSO", /* 59 */ + "BLOCKED_FETCH", /* 60 */ + "FETCH_ME", /* 61 */ + "EVACUATED", /* 62 */ + "N_CLOSURE_TYPES", /* 63 */ + "FETCH_ME_BQ", /* 64 */ + "RBH" /* 65 */ +}; + + +#if defined(GRAN) && defined(GRAN_CHECK) +//@node Closures, Threads, Constants and Variables, Debugging routines for GranSim and GUM +//@subsection Closures + +void +G_PRINT_NODE(node) +StgClosure* node; +{ + StgInfoTable *info_ptr; + StgTSO* bqe; + nat size = 0, ptrs = 0, nonptrs = 0, i, vhs = 0; + char info_hdr_ty[80], info_ty[80]; + + if (node==NULL) { + fprintf(stderr,"NULL\n"); + return; + } else if (node==END_TSO_QUEUE) { + fprintf(stderr,"END_TSO_QUEUE\n"); + return; + } + /* size_and_ptrs(node,&size,&ptrs); */ + info_ptr = get_closure_info(node, &size, &ptrs, &nonptrs, &vhs, info_hdr_ty); + + /* vhs = var_hdr_size(node); */ + display_info_type(info_ptr,info_ty); + + fprintf(stderr,"Node: 0x%lx", node); + +#if defined(PAR) + fprintf(stderr," [GA: 0x%lx]",GA(node)); +#endif + +#if defined(USE_COST_CENTRES) + fprintf(stderr," [CC: 0x%lx]",CC_HDR(node)); +#endif + +#if defined(GRAN) + fprintf(stderr," [Bitmask: 0%lo]",PROCS(node)); +#endif + + if (info_ptr->type==TSO) + fprintf(stderr," TSO: 0x%lx (%x) IP: 0x%lx (%s), type %s \n ", + (StgTSO*)node, ((StgTSO*)node)->id, info_ptr, info_hdr_ty, info_ty); + else + fprintf(stderr," IP: 0x%lx (%s), type %s \n VHS: %d, size: %ld, ptrs:%ld, nonptrs: %ld\n ", + info_ptr,info_hdr_ty,info_ty,vhs,size,ptrs,nonptrs); + + /* For now, we ignore the variable header */ + + fprintf(stderr," Ptrs: "); + for(i=0; i < ptrs; ++i) + { + if ( (i+1) % 6 == 0) + fprintf(stderr,"\n "); + fprintf(stderr," 0x%lx[P]",node->payload[i]); + }; + + fprintf(stderr," Data: "); + for(i=0; i < nonptrs; ++i) + { + if( (i+1) % 6 == 0) + fprintf(stderr,"\n "); + fprintf(stderr," %lu[D]",node->payload[ptrs+i]); + } + fprintf(stderr, "\n"); + + + switch (info_ptr->type) + { + case TSO: + fprintf(stderr,"\n TSO_LINK: %#lx", + ((StgTSO*)node)->link); + break; + + case BLACKHOLE: + case RBH: + bqe = ((StgBlockingQueue*)node)->blocking_queue; + 
fprintf(stderr," BQ of %#lx: ", node); + G_PRINT_BQ(bqe); + break; + case FETCH_ME: + case FETCH_ME_BQ: + printf("Panic: found FETCH_ME or FETCH_ME_BQ Infotable in GrAnSim system.\n"); + break; + default: + /* do nothing */ + } +} + +void +G_PPN(node) /* Extracted from PrintPacket in Pack.lc */ +StgClosure* node; +{ + StgInfoTable *info ; + nat size = 0, ptrs = 0, nonptrs = 0, i, vhs = 0, locn = 0; + char info_type[80]; + + /* size_and_ptrs(node,&size,&ptrs); */ + info = get_closure_info(node, &size, &ptrs, &nonptrs, &vhs, info_type); + + if (info->type == FETCH_ME || info->type == FETCH_ME_BQ || + info->type == BLACKHOLE || info->type == RBH ) + size = ptrs = nonptrs = vhs = 0; + + if (closure_THUNK(node)) { + if (!closure_UNPOINTED(node)) + fputs("SHARED ", stderr); + else + fputs("UNSHARED ", stderr); + } + if (info->type==BLACKHOLE) { + fputs("BLACK HOLE\n", stderr); + } else { + /* Fixed header */ + fprintf(stderr, "(%s) FH [%#lx", info_type, node[locn++]); + for (i = 1; i < FIXED_HS; i++) + fprintf(stderr, " %#lx", node[locn++]); + + /* Variable header */ + if (vhs > 0) { + fprintf(stderr, "] VH [%#lx", node->payload[0]); + + for (i = 1; i < vhs; i++) + fprintf(stderr, " %#lx", node->payload[i]); + } + + fprintf(stderr, "] PTRS %u", ptrs); + + /* Non-pointers */ + if (nonptrs > 0) { + fprintf(stderr, " NPTRS [%#lx", node->payload[ptrs]); + + for (i = 1; i < nonptrs; i++) + fprintf(stderr, " %#lx", node->payload[ptrs+i]); + + putc(']', stderr); + } + putc('\n', stderr); + } + +} + +#if 0 +// ToDo: fix this!! -- HWL +void +G_INFO_TABLE(node) +StgClosure *node; +{ + StgInfoTable *info_ptr; + nat size = 0, ptrs = 0, nonptrs = 0, vhs = 0; + char info_type[80], hdr_type[80]; + + info_hdr_type(info_ptr, hdr_type); + + // get_itbl(node); + info_ptr = get_closure_info(node, &size, &ptrs, &nonptrs, &vhs, info_type); + fprintf(stderr,"%s Info Ptr @0x%lx; Entry: 0x%lx; Size: %lu; Ptrs: %lu\n\n", + info_type,info_ptr,(W_) ENTRY_CODE(info_ptr), + size, ptrs); + // INFO_SIZE(info_ptr),INFO_NoPTRS(info_ptr)); + + if (closure_THUNK(node) && !closure_UNPOINTED(node) ) { + fprintf(stderr," RBH InfoPtr: %#lx\n", + RBH_INFOPTR(info_ptr)); + } + +#if defined(PAR) + fprintf(stderr,"Enter Flush Entry: 0x%lx;\tExit Flush Entry: 0x%lx\n",INFO_FLUSHENT(info_ptr),INFO_FLUSH(info_ptr)); +#endif + +#if defined(USE_COST_CENTRES) + fprintf(stderr,"Cost Centre (???): 0x%lx\n",INFO_CAT(info_ptr)); +#endif + +#if defined(_INFO_COPYING) + fprintf(stderr,"Evacuate Entry: 0x%lx;\tScavenge Entry: 0x%lx\n", + INFO_EVAC_2S(info_ptr),INFO_SCAV_2S(info_ptr)); +#endif + +#if defined(_INFO_COMPACTING) + fprintf(stderr,"Scan Link: 0x%lx;\tScan Move: 0x%lx\n", + (W_) INFO_SCAN_LINK_1S(info_ptr), (W_) INFO_SCAN_MOVE_1S(info_ptr)); + fprintf(stderr,"Mark: 0x%lx;\tMarked: 0x%lx;\t", + (W_) INFO_MARK_1S(info_ptr), (W_) INFO_MARKED_1S(info_ptr)); +#if 0 /* avoid INFO_TYPE */ + if(BASE_INFO_TYPE(info_ptr)==INFO_SPEC_TYPE) + fprintf(stderr,"plus specialised code\n"); + else + fprintf(stderr,"Marking: 0x%lx\n",(W_) INFO_MARKING_1S(info_ptr)); +#endif /* 0 */ +#endif /* _INFO_COMPACTING */ +} +#endif /* 0 */ + +//@cindex G_PRINT_BQ +void +G_PRINT_BQ(node) +StgClosure* node; +{ + StgInfoTable *info; + StgTSO *tso, *last; + char str[80], str0[80]; + + fprintf(stderr,"\n[PE %d] @ %lu BQ: ", + CurrentProc,CurrentTime[CurrentProc]); + if ( node == (StgClosure*)NULL ) { + fprintf(stderr," NULL.\n"); + return; + } + if ( node == END_TSO_QUEUE ) { + fprintf(stderr," _|_\n"); + return; + } + tso = ((StgBlockingQueue*)node)->blocking_queue; + while 
(node != END_TSO_QUEUE) { + PEs proc; + + /* Find where the tso lives */ + proc = where_is(node); + info = get_itbl(node); + + switch (info->type) { + case TSO: + strcpy(str0,"TSO"); + break; + case BLOCKED_FETCH: + strcpy(str0,"BLOCKED_FETCH"); + break; + default: + strcpy(str0,"???"); + break; + } + + if(proc == CurrentProc) + fprintf(stderr," %#lx (%x) L %s,", + node, ((StgBlockingQueue*)node)->blocking_queue, str0); + else + fprintf(stderr," %#lx (%x) G (PE %d) %s,", + node, ((StgBlockingQueue*)node)->blocking_queue, proc, str0); + + last = tso; + tso = last->link; + } + if ( tso == END_TSO_QUEUE ) + fprintf(stderr," _|_\n"); +} + +//@node Threads, Events, Closures, Debugging routines for GranSim and GUM +//@subsection Threads + +void +G_CURR_THREADQ(verbose) +StgInt verbose; +{ + fprintf(stderr,"Thread Queue on proc %d: ", CurrentProc); + G_THREADQ(run_queue_hd, verbose); +} + +void +G_THREADQ(closure, verbose) +StgTSO* closure; +StgInt verbose; +{ + StgTSO* x; + + fprintf(stderr,"Thread Queue: "); + for (x=closure; x!=END_TSO_QUEUE; x=x->link) + if (verbose) + G_TSO(x,0); + else + fprintf(stderr," %#lx",x); + + if (closure==END_TSO_QUEUE) + fprintf(stderr,"NIL\n"); + else + fprintf(stderr,"\n"); +} + +void +G_TSO(closure,verbose) +StgTSO* closure; +StgInt verbose; +{ + + if (closure==END_TSO_QUEUE) { + fprintf(stderr,"TSO at %#lx is END_TSO_QUEUE!\n"); + return; + } + + if ( verbose & 0x08 ) { /* short info */ + fprintf(stderr,"[TSO @ %#lx, PE %d]: Id: %#lx, Link: %#lx\n", + closure,where_is(closure), + closure->id,closure->link); + return; + } + + fprintf(stderr,"TSO at %#lx has the following contents:\n", + closure); + + fprintf(stderr,"> Id: \t%#lx",closure->id); + // fprintf(stderr,"\tstate: \t%#lx",closure->state); + fprintf(stderr,"\twhatNext: \t%#lx",closure->whatNext); + fprintf(stderr,"\tlink: \t%#lx\n",closure->link); + // fprintf(stderr,"\tType: \t%s\n",type_name[TSO_TYPE(closure)]); + fprintf(stderr,">PRI: \t%#lx", closure->gran.pri); + fprintf(stderr,"\tMAGIC: \t%#lx %s\n", closure->gran.magic, + (closure->gran.magic==TSO_MAGIC ? 
"it IS a TSO" : "THIS IS NO TSO!!")); + if ( verbose & 0x04 ) { + fprintf(stderr, "Stack: stack @ %#lx (stack_size: %u; max_stack_size: %u)\n", + closure->stack, closure->stack_size, closure->max_stack_size); + fprintf(stderr, " sp: %#lx, su: %#lx, splim: %#lx\n", + closure->sp, closure->su, closure->splim); + } + // fprintf(stderr,"\n"); + if (verbose & 0x01) { + // fprintf(stderr,"} LOCKED: \t%#lx",closure->locked); + fprintf(stderr,"} SPARKNAME: \t%#lx\n", closure->gran.sparkname); + fprintf(stderr,"} STARTEDAT: \t%#lx", closure->gran.startedat); + fprintf(stderr,"\tEXPORTED: \t%#lx\n", closure->gran.exported); + fprintf(stderr,"} BASICBLOCKS: \t%#lx", closure->gran.basicblocks); + fprintf(stderr,"\tALLOCS: \t%#lx\n", closure->gran.allocs); + fprintf(stderr,"} EXECTIME: \t%#lx", closure->gran.exectime); + fprintf(stderr,"\tFETCHTIME: \t%#lx\n", closure->gran.fetchtime); + fprintf(stderr,"} FETCHCOUNT: \t%#lx", closure->gran.fetchcount); + fprintf(stderr,"\tBLOCKTIME: \t%#lx\n", closure->gran.blocktime); + fprintf(stderr,"} BLOCKCOUNT: \t%#lx", closure->gran.blockcount); + fprintf(stderr,"\tBLOCKEDAT: \t%#lx\n", closure->gran.blockedat); + fprintf(stderr,"} GLOBALSPARKS:\t%#lx", closure->gran.globalsparks); + fprintf(stderr,"\tLOCALSPARKS:\t%#lx\n", closure->gran.localsparks); + } + if ( verbose & 0x02 ) { + fprintf(stderr,"BQ that starts with this TSO: "); + G_PRINT_BQ(closure); + } +} + +//@node Events, Sparks, Threads, Debugging routines for GranSim and GUM +//@subsection Events + +void +G_EVENT(event, verbose) +rtsEventQ event; +StgInt verbose; +{ + if (verbose) { + print_event(event); + }else{ + fprintf(stderr," %#lx",event); + } +} + +void +G_EVENTQ(verbose) +StgInt verbose; +{ + extern rtsEventQ EventHd; + rtsEventQ x; + + fprintf(stderr,"RtsEventQ (hd @%#lx):\n",EventHd); + for (x=EventHd; x!=NULL; x=x->next) { + G_EVENT(x,verbose); + } + if (EventHd==NULL) + fprintf(stderr,"NIL\n"); + else + fprintf(stderr,"\n"); +} + +void +G_PE_EQ(pe,verbose) +PEs pe; +StgInt verbose; +{ + extern rtsEventQ EventHd; + rtsEventQ x; + + fprintf(stderr,"RtsEventQ (hd @%#lx):\n",EventHd); + for (x=EventHd; x!=NULL; x=x->next) { + if (x->proc==pe) + G_EVENT(x,verbose); + } + if (EventHd==NULL) + fprintf(stderr,"NIL\n"); + else + fprintf(stderr,"\n"); +} + +//@node Sparks, Processors, Events, Debugging routines for GranSim and GUM +//@subsection Sparks + +void +G_SPARK(spark, verbose) +rtsSparkQ spark; +StgInt verbose; +{ + if (spark==(rtsSpark*)NULL) { + belch("G_SPARK: NULL spark; aborting"); + return; + } + if (verbose) + print_spark(spark); + else + fprintf(stderr," %#lx",spark); +} + +void +G_SPARKQ(spark,verbose) +rtsSparkQ spark; +StgInt verbose; +{ + rtsSparkQ x; + + if (spark==(rtsSpark*)NULL) { + belch("G_SPARKQ: NULL spark; aborting"); + return; + } + + fprintf(stderr,"RtsSparkQ (hd @%#lx):\n",spark); + for (x=spark; x!=NULL; x=x->next) { + G_SPARK(x,verbose); + } + if (spark==NULL) + fprintf(stderr,"NIL\n"); + else + fprintf(stderr,"\n"); +} + +void +G_CURR_SPARKQ(verbose) +StgInt verbose; +{ + G_SPARKQ(pending_sparks_hd,verbose); +} + +//@node Processors, Shortcuts, Sparks, Debugging routines for GranSim and GUM +//@subsection Processors + +void +G_PROC(proc,verbose) +StgInt proc; +StgInt verbose; +{ + extern rtsEventQ EventHd; + extern char *proc_status_names[]; + + fprintf(stderr,"Status of proc %d at time %d (%#lx): %s (%s)\n", + proc,CurrentTime[proc],CurrentTime[proc], + (CurrentProc==proc)?"ACTIVE":"INACTIVE", + proc_status_names[procStatus[proc]]); + 
G_THREADQ(run_queue_hds[proc],verbose & 0x2);
+  if ( (CurrentProc==proc) )
+    G_TSO(CurrentTSO,1);
+
+  if (EventHd!=NULL)
+    fprintf(stderr,"Next event (%s) is on proc %d\n",
+            event_names[EventHd->evttype],EventHd->proc);
+
+  if (verbose & 0x1) {
+    fprintf(stderr,"\nREQUIRED sparks: ");
+    G_SPARKQ(pending_sparks_hds[proc],1);
+    fprintf(stderr,"\nADVISORY sparks: ");
+    G_SPARKQ(pending_sparks_hds[proc],1);
+  }
+}
+
+//@node Shortcuts, Printing info type, Processors, Debugging routines for GranSim and GUM
+//@subsection Shortcuts
+
+/* Debug Processor */
+void
+GP(proc)
+StgInt proc;
+{ G_PROC(proc,1);
+}
+
+/* Debug Current Processor */
+void
+GCP(){ G_PROC(CurrentProc,2); }
+
+/* Debug TSO */
+void
+GT(StgPtr tso){
+  G_TSO(tso,1);
+}
+
+/* Debug CurrentTSO */
+void
+GCT(){
+  fprintf(stderr,"Current Proc: %d\n",CurrentProc);
+  G_TSO(CurrentTSO,1);
+}
+
+/* Shorthand for debugging event queue */
+void
+GEQ() { G_EVENTQ(1); }
+
+/* Shorthand for debugging thread queue of a processor */
+void
+GTQ(PEs p) { G_THREADQ(run_queue_hds[p],1); }
+
+/* Shorthand for debugging thread queue of current processor */
+void
+GCTQ() { G_THREADQ(run_queue_hds[CurrentProc],1); }
+
+/* Shorthand for debugging spark queue of a processor */
+void
+GSQ(PEs p) { G_SPARKQ(pending_sparks_hds[p],1); }
+
+/* Shorthand for debugging spark queue of current processor */
+void
+GCSQ() { G_CURR_SPARKQ(1); }
+
+/* Shorthand for printing a node */
+void
+GN(StgPtr node) { G_PRINT_NODE(node); }
+
+/* Shorthand for printing info table */
+#if 0
+// ToDo: fix -- HWL
+void
+GIT(StgPtr node) { G_INFO_TABLE(node); }
+#endif
+
+void
+printThreadQPtrs(void)
+{
+  PEs p;
+  for (p=0; p<RtsFlags.GranFlags.proc; p++)
+    fprintf(stderr, ", PE %d: %p", p, run_queue_hds[p]);
+  fputs("\n", stderr);
+}
+
+//@node Printing info type, Printing Packet Contents, Shortcuts, Debugging routines for GranSim and GUM
+//@subsection Printing info type
+
+char *
+info_type(StgClosure *closure){
+  return closure_type_names[get_itbl(closure)->type];
+}
+
+char *
+info_type_by_ip(StgInfoTable *ip){
+  return closure_type_names[ip->type];
+}
+
+void
+info_hdr_type(StgClosure *closure, char *res){
+  strcpy(res,closure_type_names[get_itbl(closure)->type]);
+}
+
+//@node Printing Packet Contents, End of File, Printing info type, Debugging routines for GranSim and GUM
+//@subsection Printing Packet Contents
+
+/*
+  PrintPacket is in Pack.c because it makes use of closure queues
+*/
+
+#if defined(GRAN) || defined(PAR)
+
+/*
+  Print graph rooted at q. The structure of this recursive printing routine
+  should be the same as in the graph traversals when packing a graph in
+  GUM. Thus, it demonstrates the structure of such a generic graph
+  traversal, and in particular, how to extract pointer and non-pointer info
+  from the multitude of different heap objects available.
+
+  {evacuate}Daq ngoqvam nIHlu'pu'!!
+*/
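+
+/*
+  A minimal usage sketch (editorial, hypothetical -- `dump_root' is not part
+  of this patch): PrintGraph is typically invoked on a graph root with an
+  initial indentation level of 0, and info_type above gives a printable name
+  for the root's closure type.
+*/
+#if 0 /* sketch only */
+static void
+dump_root(StgClosure *root)
+{
+  if (root == (StgClosure *)NULL) {
+    fprintf(stderr, "dump_root: NULL graph root\n");
+    return;
+  }
+  fprintf(stderr, "Graph rooted at %p (%s):\n", root, info_type(root));
+  PrintGraph(root, 0);   /* recursively print the transitive closure */
+}
+#endif /* 0 (sketch) */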
+
+void
+PrintGraph(StgClosure *p, int indent_level)
+{
+  StgPtr x, q;
+  rtsBool printed = rtsFalse;
+  nat i, j;
+  const StgInfoTable *info;
+
+  q = p;                        /* save ptr to object */
+
+  /* indentation */
+  for (j=0; j<indent_level; j++)
+    fputs(" ", stderr);
+
+  info = get_itbl((StgClosure *)p);
+
+  switch (info -> type) {
+
+    case BCO:
+    {
+        StgBCO* bco = stgCast(StgBCO*,p);
+        nat i;
+        fprintf(stderr, "BCO (%p) with %d pointers\n", p, bco->n_ptrs);
+        for (i = 0; i < bco->n_ptrs; i++) {
+            // bcoConstCPtr(bco,i) =
+            PrintGraph(bcoConstCPtr(bco,i), indent_level+1);
+        }
+        // p += bco_sizeW(bco);
+        break;
+    }
+
+    case MVAR:
+        /* treat MVars specially, because we don't want to PrintGraph the
+         * mut_link field in the middle of the closure.
+         */
+    {
+        StgMVar *mvar = ((StgMVar *)p);
+        // evac_gen = 0;
+        fprintf(stderr, "MVAR (%p) with 3 pointers (head, tail, value)\n", p);
+        // (StgClosure *)mvar->head =
+        PrintGraph((StgClosure *)mvar->head, indent_level+1);
+        // (StgClosure *)mvar->tail =
+        PrintGraph((StgClosure *)mvar->tail, indent_level+1);
+        //(StgClosure *)mvar->value =
+        PrintGraph((StgClosure *)mvar->value, indent_level+1);
+        // p += sizeofW(StgMVar);
+        // evac_gen = saved_evac_gen;
+        break;
+    }
+
+    case THUNK_2_0:
+      if (!printed) {
+        fprintf(stderr, "THUNK_2_0 (%p) with 2 pointers\n", p);
+        printed = rtsTrue;
+      }
+    case FUN_2_0:
+      if (!printed) {
+        fprintf(stderr, "FUN_2_0 (%p) with 2 pointers\n", p);
+        printed = rtsTrue;
+      }
+      // scavenge_srt(info);
+    case CONSTR_2_0:
+      if (!printed) {
+        fprintf(stderr, "CONSTR_2_0 (%p) with 2 pointers\n", p);
+        printed = rtsTrue;
+      }
+      // ((StgClosure *)p)->payload[0] =
+      PrintGraph(((StgClosure *)p)->payload[0],
+                 indent_level+1);
+      // ((StgClosure *)p)->payload[1] =
+      PrintGraph(((StgClosure *)p)->payload[1],
+                 indent_level+1);
+      // p += sizeofW(StgHeader) + 2;
+      break;
+
+    case THUNK_1_0:
+      // scavenge_srt(info);
+      fprintf(stderr, "THUNK_1_0 (%p) with 1 pointer\n", p);
+      // ((StgClosure *)p)->payload[0] =
+      PrintGraph(((StgClosure *)p)->payload[0],
+                 indent_level+1);
+      // p += sizeofW(StgHeader) + 2; /* MIN_UPD_SIZE */
+      break;
+
+    case FUN_1_0:
+      if (!printed) {
+        fprintf(stderr, "FUN_1_0 (%p) with 1 pointer\n", p);
+        printed = rtsTrue;
+      }
+      // scavenge_srt(info);
+    case CONSTR_1_0:
+      if (!printed) {
+        fprintf(stderr, "CONSTR_1_0 (%p) with 1 pointer\n", p);
+        printed = rtsTrue;
+      }
+      // ((StgClosure *)p)->payload[0] =
+      PrintGraph(((StgClosure *)p)->payload[0],
+                 indent_level+1);
+      // p += sizeofW(StgHeader) + 1;
+      break;
+
+    case THUNK_0_1:
+      fprintf(stderr, "THUNK_0_1 (%p) with 0 pointers\n", p);
+      // scavenge_srt(info);
+      // p += sizeofW(StgHeader) + 2; /* MIN_UPD_SIZE */
+      break;
+
+    case FUN_0_1:
+      if (!printed) {
+        fprintf(stderr, "FUN_0_1 (%p) with 0 pointers\n", p);
+        printed = rtsTrue;
+      }
+      //scavenge_srt(info);
+    case CONSTR_0_1:
+      if (!printed) {
+        fprintf(stderr, "CONSTR_0_1 (%p) with 0 pointers\n", p);
+        printed = rtsTrue;
+      }
+      //p += sizeofW(StgHeader) + 1;
+      break;
+
+    case THUNK_0_2:
+      if (!printed) {
+        fprintf(stderr, "THUNK_0_2 (%p) with 0 pointers\n", p);
+        printed = rtsTrue;
+      }
+    case FUN_0_2:
+      if (!printed) {
+        fprintf(stderr, "FUN_0_2 (%p) with 0 pointers\n", p);
+        printed = rtsTrue;
+      }
+      // scavenge_srt(info);
+    case CONSTR_0_2:
+      if (!printed) {
+        fprintf(stderr, "CONSTR_0_2 (%p) with 0 pointers\n", p);
+        printed = rtsTrue;
+      }
+      // p += sizeofW(StgHeader) + 2;
+      break;
+
+    case THUNK_1_1:
+      if (!printed) {
+        fprintf(stderr, "THUNK_1_1 (%p) with 1 pointer\n", p);
+        printed = rtsTrue;
+      }
+    case FUN_1_1:
+      if (!printed) {
+        fprintf(stderr, "FUN_1_1 (%p) with 1 pointer\n", p);
+        printed = rtsTrue;
+      }
+      // scavenge_srt(info);
+    case CONSTR_1_1:
+      if (!printed) {
+        fprintf(stderr, "CONSTR_1_1 (%p) with 1 pointer\n", p);
+        printed = rtsTrue;
+      }
+      // ((StgClosure *)p)->payload[0] =
+      PrintGraph(((StgClosure *)p)->payload[0],
+                 indent_level+1);
+      // p += sizeofW(StgHeader) + 2;
+      break;
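+
+    /* NB: the paired cases above (e.g. THUNK_2_0 falling through to
+       CONSTR_2_0) share their payload-printing code; the `printed' flag
+       makes sure that only the case that actually matched prints the
+       closure type.  The generic FUN/THUNK/CONSTR cases below use the
+       same trick. */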
+
+    case FUN:
+      if (!printed) {
+        fprintf(stderr, "FUN (%p) with %d pointers\n", p, info->layout.payload.ptrs);
+        printed = rtsTrue;
+      }
+      /* fall through */
+
+    case THUNK:
+      if (!printed) {
+        fprintf(stderr, "THUNK (%p) with %d pointers\n", p, info->layout.payload.ptrs);
+        printed = rtsTrue;
+      }
+      // scavenge_srt(info);
+      /* fall through */
+
+    case CONSTR:
+      if (!printed) {
+        fprintf(stderr, "CONSTR (%p) with %d pointers\n", p, info->layout.payload.ptrs);
+        printed = rtsTrue;
+      }
+      /* basically same as loop in STABLE_NAME case */
+      for (i=0; i<info->layout.payload.ptrs; i++)
+        PrintGraph(((StgClosure *)p)->payload[i],
+                   indent_level+1);
+      break;
+      /* NOT fall through */
+
+    case WEAK:
+      if (!printed) {
+        fprintf(stderr, "WEAK (%p) with %d pointers\n", p, info->layout.payload.ptrs);
+        printed = rtsTrue;
+      }
+      /* fall through */
+
+    case FOREIGN:
+      if (!printed) {
+        fprintf(stderr, "FOREIGN (%p) with %d pointers\n", p, info->layout.payload.ptrs);
+        printed = rtsTrue;
+      }
+      /* fall through */
+
+    case STABLE_NAME:
+    {
+      StgPtr end;
+
+      if (!printed) {
+        fprintf(stderr, "STABLE_NAME (%p) with %d pointers (not followed!)\n",
+                p, info->layout.payload.ptrs);
+        printed = rtsTrue;
+      }
+      end = (StgPtr)((StgClosure *)p)->payload + info->layout.payload.ptrs;
+      for (p = (StgPtr)((StgClosure *)p)->payload; p < end; p++) {
+        // (StgClosure *)*p =
+        //PrintGraph((StgClosure *)*p, indent_level+1);
+        fprintf(stderr, ", %p", *p);
+      }
+      //fputs("\n", stderr);
+      // p += info->layout.payload.nptrs;
+      break;
+    }
+
+    case IND_PERM:
+      //if (step->gen->no != 0) {
+      // SET_INFO(((StgClosure *)p), &IND_OLDGEN_PERM_info);
+      //}
+      if (!printed) {
+        fprintf(stderr, "IND_PERM (%p) with indirection to %p\n",
+                p, ((StgIndOldGen *)p)->indirectee);
+        printed = rtsTrue;
+      }
+      /* fall through */
+
+    case IND_OLDGEN_PERM:
+      if (!printed) {
+        fprintf(stderr, "IND_OLDGEN_PERM (%p) with indirection to %p\n",
+                p, ((StgIndOldGen *)p)->indirectee);
+        printed = rtsTrue;
+      }
+      // ((StgIndOldGen *)p)->indirectee =
+      PrintGraph(((StgIndOldGen *)p)->indirectee,
+                 indent_level+1);
+      //if (failed_to_evac) {
+      //  failed_to_evac = rtsFalse;
+      //  recordOldToNewPtrs((StgMutClosure *)p);
+      //}
+      // p += sizeofW(StgIndOldGen);
+      break;
+
+    case CAF_UNENTERED:
+    {
+        StgCAF *caf = (StgCAF *)p;
+
+        fprintf(stderr, "CAF_UNENTERED (%p) pointing to %p\n", p, caf->body);
+        PrintGraph(caf->body, indent_level+1);
+        //if (failed_to_evac) {
+        //  failed_to_evac = rtsFalse;
+        //  recordOldToNewPtrs((StgMutClosure *)p);
+        //} else {
+        //  caf->mut_link = NULL;
+        //}
+        //p += sizeofW(StgCAF);
+        break;
+    }
+
+    case CAF_ENTERED:
+    {
+        StgCAF *caf = (StgCAF *)p;
+
+        fprintf(stderr, "CAF_ENTERED (%p) pointing to %p and %p\n",
+                p, caf->body, caf->value);
+        // caf->body =
+        PrintGraph(caf->body, indent_level+1);
+        //caf->value =
+        PrintGraph(caf->value, indent_level+1);
+        //if (failed_to_evac) {
+        //  failed_to_evac = rtsFalse;
+        //  recordOldToNewPtrs((StgMutClosure *)p);
+        //} else {
+        //  caf->mut_link = NULL;
+        //}
+        //p += sizeofW(StgCAF);
+        break;
+    }
+
+    case MUT_VAR:
+      /* ignore MUT_CONSs */
+      fprintf(stderr, "MUT_VAR (%p) pointing to %p\n", p, ((StgMutVar *)p)->var);
+      if (((StgMutVar *)p)->header.info != &MUT_CONS_info) {
+        //evac_gen = 0;
+        PrintGraph(((StgMutVar *)p)->var, indent_level+1);
+        //evac_gen = saved_evac_gen;
+      }
+      //p += sizeofW(StgMutVar);
+      break;
+
+    case CAF_BLACKHOLE:
+      if (!printed) {
+        fprintf(stderr, "CAF_BLACKHOLE (%p) with 0 pointers\n", p);
+        printed = rtsTrue;
+      }
+    case SE_CAF_BLACKHOLE:
+      if (!printed) {
+        fprintf(stderr, "SE_CAF_BLACKHOLE (%p) with 0 pointers\n", p);
+        printed = rtsTrue;
+      }
+    case SE_BLACKHOLE:
+      if (!printed) {
+        fprintf(stderr, "SE_BLACKHOLE (%p) with 0 pointers\n", p);
+        printed = rtsTrue;
+      }
+    case BLACKHOLE:
+      if (!printed) {
+        fprintf(stderr, "BLACKHOLE (%p) with 0 pointers\n", p);
+        printed = rtsTrue;
+      }
+      //p += BLACKHOLE_sizeW();
+      break;
+
+    case BLACKHOLE_BQ:
+    {
+      StgBlockingQueue *bh = (StgBlockingQueue *)p;
+      // (StgClosure 
*)bh->blocking_queue = + fprintf(stderr, "BLACKHOLE_BQ (%p) pointing to %p\n", + p, (StgClosure *)bh->blocking_queue); + PrintGraph((StgClosure *)bh->blocking_queue, indent_level+1); + //if (failed_to_evac) { + // failed_to_evac = rtsFalse; + // recordMutable((StgMutClosure *)bh); + //} + // p += BLACKHOLE_sizeW(); + break; + } + + case THUNK_SELECTOR: + { + StgSelector *s = (StgSelector *)p; + fprintf(stderr, "THUNK_SELECTOR (%p) pointing to %p\n", + p, s->selectee); + PrintGraph(s->selectee, indent_level+1); + // p += THUNK_SELECTOR_sizeW(); + break; + } + + case IND: + fprintf(stderr, "IND (%p) pointing to %p\n", p, ((StgInd*)p)->indirectee); + PrintGraph(((StgInd*)p)->indirectee, indent_level+1); + break; + + case IND_OLDGEN: + fprintf(stderr, "IND_OLDGEN (%p) pointing to %p\n", + p, ((StgIndOldGen*)p)->indirectee); + PrintGraph(((StgIndOldGen*)p)->indirectee, indent_level+1); + break; + + case CONSTR_INTLIKE: + fprintf(stderr, "CONSTR_INTLIKE (%p) with 0 pointers\n", p); + break; + case CONSTR_CHARLIKE: + fprintf(stderr, "CONSTR_CHARLIKE (%p) with 0 pointers\n", p); + break; + case CONSTR_STATIC: + fprintf(stderr, "CONSTR_STATIC (%p) with 0 pointers\n", p); + break; + case CONSTR_NOCAF_STATIC: + fprintf(stderr, "CONSTR_NOCAF_STATIC (%p) with 0 pointers\n", p); + break; + case THUNK_STATIC: + fprintf(stderr, "THUNK_STATIC (%p) with 0 pointers\n", p); + break; + case FUN_STATIC: + fprintf(stderr, "FUN_STATIC (%p) with 0 pointers\n", p); + break; + case IND_STATIC: + fprintf(stderr, "IND_STATIC (%p) with 0 pointers\n", p); + break; + + case RET_BCO: + fprintf(stderr, "RET_BCO (%p) with 0 pointers\n", p); + break; + case RET_SMALL: + fprintf(stderr, "RET_SMALL (%p) with 0 pointers\n", p); + break; + case RET_VEC_SMALL: + fprintf(stderr, "RET_VEC_SMALL (%p) with 0 pointers\n", p); + break; + case RET_BIG: + fprintf(stderr, "RET_BIG (%p) with 0 pointers\n", p); + break; + case RET_VEC_BIG: + fprintf(stderr, "RET_VEC_BIG (%p) with 0 pointers\n", p); + break; + case RET_DYN: + fprintf(stderr, "RET_DYN (%p) with 0 pointers\n", p); + break; + case UPDATE_FRAME: + fprintf(stderr, "UPDATE_FRAME (%p) with 0 pointers\n", p); + break; + case STOP_FRAME: + fprintf(stderr, "STOP_FRAME (%p) with 0 pointers\n", p); + break; + case CATCH_FRAME: + fprintf(stderr, "CATCH_FRAME (%p) with 0 pointers\n", p); + break; + case SEQ_FRAME: + fprintf(stderr, "SEQ_FRAME (%p) with 0 pointers\n", p); + break; + + case AP_UPD: /* same as PAPs */ + fprintf(stderr, "AP_UPD (%p) with 0 pointers\n", p); + case PAP: + /* Treat a PAP just like a section of stack, not forgetting to + * PrintGraph the function pointer too... 
+       */
+    {
+        StgPAP* pap = stgCast(StgPAP*,p);
+
+        fprintf(stderr, "PAP (%p) pointing to %p\n", p, pap->fun);
+        // pap->fun =
+        PrintGraph(pap->fun, indent_level+1);
+        //scavenge_stack((P_)pap->payload, (P_)pap->payload + pap->n_args);
+        //p += pap_sizeW(pap);
+        break;
+    }
+
+    case ARR_WORDS:
+      fprintf(stderr, "ARR_WORDS (%p) with 0 pointers\n", p);
+      /* nothing to follow */
+      //p += arr_words_sizeW(stgCast(StgArrWords*,p));
+      break;
+
+    case MUT_ARR_PTRS:
+      /* follow everything */
+    {
+        StgPtr next;
+
+        fprintf(stderr, "MUT_ARR_PTRS (%p) with %d pointers (not followed)\n",
+                p, mut_arr_ptrs_sizeW((StgMutArrPtrs*)p));
+        // evac_gen = 0;           /* repeatedly mutable */
+        next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+        for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+            // (StgClosure *)*p =
+            //  PrintGraph((StgClosure *)*p, indent_level+1);
+          fprintf(stderr, ", %p", *p);
+        }
+        fputs("\n", stderr);
+        //evac_gen = saved_evac_gen;
+        break;
+    }
+
+    case MUT_ARR_PTRS_FROZEN:
+      /* follow everything */
+    {
+        StgPtr start = p, next;
+
+        fprintf(stderr, "MUT_ARR_PTRS_FROZEN (%p) with %d pointers (not followed)",
+                p, mut_arr_ptrs_sizeW((StgMutArrPtrs*)p));
+        next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+        for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+            // (StgClosure *)*p =
+            //  PrintGraph((StgClosure *)*p, indent_level+1);
+          fprintf(stderr, ", %p", *p);
+        }
+        fputs("\n", stderr);
+        //if (failed_to_evac) {
+          /* we can do this easier... */
+        //  recordMutable((StgMutClosure *)start);
+        //  failed_to_evac = rtsFalse;
+        //}
+        break;
+    }
+
+    case TSO:
+    {
+        StgTSO *tso;
+
+        tso = (StgTSO *)p;
+        fprintf(stderr, "TSO (%p) with link field %p\n", p, (StgClosure *)tso->link);
+        // evac_gen = 0;
+        /* chase the link field for any TSOs on the same queue */
+        // (StgClosure *)tso->link =
+        PrintGraph((StgClosure *)tso->link, indent_level+1);
+        //if (tso->blocked_on) {
+        //  tso->blocked_on = PrintGraph(tso->blocked_on);
+        //}
+        /* scavenge this thread's stack */
+        //scavenge_stack(tso->sp, &(tso->stack[tso->stack_size]));
+        //evac_gen = saved_evac_gen;
+        //p += tso_sizeW(tso);
+        break;
+    }
+
+#if defined(GRAN) || defined(PAR)
+    case RBH:
+    {
+      StgInfoTable *rip = REVERT_INFOPTR(get_itbl(p));
+      //if (LOOKS_LIKE_GHC_INFO(rip))
+      //  fprintf(stderr, "RBH (%p) with 0 pointers (reverted type=%s)\n",
+      //          p, info_type_by_ip(rip));
+      //else
+        fprintf(stderr, "RBH (%p) with 0 pointers (reverted IP=%x)\n",
+                p, rip);
+    }
+      break;
+#endif
+#if defined(PAR)
+    case BLOCKED_FETCH:
+      fprintf(stderr, "BLOCKED_FETCH (%p) with 0 pointers (link=%p)\n",
+              p, ((StgBlockedFetch *)p)->link);
+      break;
+    case FETCH_ME:
+      fprintf(stderr, "FETCH_ME (%p) with 0 pointers\n", p);
+      break;
+    case FETCH_ME_BQ:
+      fprintf(stderr, "FETCH_ME_BQ (%p) with 0 pointers (blocking_queue=%p)\n",
+              p, ((StgFetchMeBlockingQueue *)p)->blocking_queue);
+      break;
+#endif
+    case EVACUATED:
+      fprintf(stderr, "EVACUATED (%p) with 0 pointers (evacuee=%p)\n",
+              p, ((StgEvacuated *)p)->evacuee);
+      break;
+
+    default:
+      barf("PrintGraph: unknown closure %d (%s)",
+           info -> type, info_type_by_ip((StgInfoTable *)info));
+    }
+
+   /* If we didn't manage to promote all the objects pointed to by
+    * the current object, then we have to designate this object as
+    * mutable (because it contains old-to-new generation pointers).
+ */ + //if (failed_to_evac) { + // mkMutCons((StgClosure *)q, &generations[evac_gen]); + // failed_to_evac = rtsFalse; + //} +} + +#endif /* GRAN */ + +#endif /* GRAN || PAR */ +//@node End of File, , Printing Packet Contents, Debugging routines for GranSim and GUM +//@subsection End of File diff --git a/ghc/rts/parallel/ParallelDebug.h b/ghc/rts/parallel/ParallelDebug.h new file mode 100644 index 0000000..62f2232 --- /dev/null +++ b/ghc/rts/parallel/ParallelDebug.h @@ -0,0 +1,49 @@ +/* + Time-stamp: + + Prototypes of all parallel debugging functions. + */ + +#ifndef PARALLEL_DEBUG_H +#define PARALLEL_DEBUG_H + +#if defined(GRAN) // || defined(PAR) +void G_PRINT_NODE(StgClosure* node); +void G_PPN(StgClosure* node); +void G_INFO_TABLE(StgClosure* node); +void G_CURR_THREADQ(StgInt verbose); +void G_THREADQ(StgTSO* closure, StgInt verbose); +void G_TSO(StgTSO* closure, StgInt verbose); +void G_EVENT(rtsEventQ event, StgInt verbose); +void G_EVENTQ(StgInt verbose); +void G_PE_EQ(PEs pe, StgInt verbose); +void G_SPARK(rtsSparkQ spark, StgInt verbose); +void G_SPARKQ(rtsSparkQ spark, StgInt verbose); +void G_CURR_SPARKQ(StgInt verbose); +void G_PROC(StgInt proc, StgInt verbose); +void GP(StgInt proc); +void GCP(void); +void GT(StgPtr tso); +void GCT(void); +void GEQ(void); +void GTQ(PEs p); +void GCTQ(void); +void GSQ(PEs p); +void GCSQ(void); +void GN(StgPtr node); +void GIT(StgPtr node); +#endif + +#if defined(GRAN) || defined(PAR) + +char *display_info_type(StgClosure *closure, char *str); +void info_hdr_type(StgClosure *closure, char *res); +char *info_type(StgClosure *closure); +char *info_type_by_ip(StgInfoTable *ip); + +void PrintPacket(rtsPackBuffer *buffer); +void PrintGraph(StgClosure *p, int indent_level); + +#endif /* GRAN || PAR */ + +#endif /* PARALLEL_DEBUG_H */ diff --git a/ghc/rts/parallel/ParallelRts.h b/ghc/rts/parallel/ParallelRts.h new file mode 100644 index 0000000..e139541 --- /dev/null +++ b/ghc/rts/parallel/ParallelRts.h @@ -0,0 +1,294 @@ +/* -------------------------------------------------------------------------- + Time-stamp: + $Id: ParallelRts.h,v 1.2 2000/01/13 14:34:09 hwloidl Exp $ + + Variables and functions specific to the parallel RTS (i.e. 
GUM or GranSim) + ----------------------------------------------------------------------- */ + +#ifndef PARALLEL_RTS_H +#define PARALLEL_RTS_H + +#if defined(GRAN) || defined(PAR) + +//@menu +//* Packing routines:: +//* Spark handling routines:: +//* GC routines:: +//* Debugging routines:: +//* Generating .gr profiles:: +//* Common macros:: +//* Index:: +//@end menu + +#ifndef GRAN +// Dummy def for NO_PRI if not in GranSim +#define NO_PRI 0 +#endif + +//@node Packing routines, Spark handling routines +//@subsection Packing routines + +#if defined(GRAN) +/* Statistics info */ +extern nat tot_packets, tot_packet_size, tot_cuts, tot_thunks; +#endif + +#if defined(GRAN) +/* Pack.c */ +rtsPackBuffer *PackNearbyGraph(StgClosure* closure, StgTSO* tso, + nat *packBufferSize); +rtsPackBuffer *PackOneNode(StgClosure* closure, StgTSO* tso, + nat *packBufferSize); +rtsPackBuffer *PackTSO(StgTSO *tso, nat *packBufferSize); +rtsPackBuffer *PackStkO(StgPtr stko, nat *packBufferSize); +void PackFetchMe(StgClosure *closure); + +/* Unpack.c */ +StgClosure* UnpackGraph(rtsPackBuffer* buffer); +void InitPendingGABuffer(nat size); + +/* RBH.c */ +StgClosure *convertToRBH(StgClosure *closure); +void convertFromRBH(StgClosure *closure); + +/* General closure predicates */ +/* + {Parallel.h}Daq ngoqvam vIroQpu' + +StgInfoTable *get_closure_info(StgClosure* node, nat *size, nat *ptrs, nat *nonptrs, nat *vhs, char *info_hdr_ty); +rtsBool IS_BLACK_HOLE(StgClosure* node); +StgClosure *IS_INDIRECTION(StgClosure* node); +rtsBool IS_THUNK(StgClosure* closure); +*/ + +#elif defined(PAR) + +/* Pack.c */ +rtsPackBuffer *PackNearbyGraph(StgClosure* closure, StgTSO* tso, + nat *packBufferSize); + +rtsPackBuffer *PackTSO(StgTSO *tso, nat *packBufferSize); +rtsPackBuffer *PackStkO(StgPtr stko, nat *packBufferSize); +void PackFetchMe(StgClosure *closure); + +/* Unpack.c */ +void CommonUp(StgClosure *src, StgClosure *dst); +StgClosure *UnpackGraph(rtsPackBuffer *buffer, globalAddr **gamap, + nat *nGAs); + +/* RBH.c */ +StgClosure *convertToRBH(StgClosure *closure); +void convertToFetchMe(StgRBH *rbh, globalAddr *ga); + +/* General closure predicates */ +/* + {Parallel.h}Daq ngoqvam vIroQpu' + +StgInfoTable *get_closure_info(StgClosure* node, nat *size, nat *ptrs, nat *nonptrs, nat *vhs, char *info_hdr_ty); +rtsBool IS_BLACK_HOLE(StgClosure* node); +StgClosure *IS_INDIRECTION(StgClosure* node); +rtsBool IS_THUNK(StgClosure* closure); +*/ + +#endif + +/* this routine should be moved to a more general module; currently in Pack.c +StgInfoTable* get_closure_info(StgClosure* node, + nat *size, nat *ptrs, nat *nonptrs, nat *vhs, + char *info_hdr_ty); +*/ +void doGlobalGC(void); + +//@node Spark handling routines, GC routines, Packing routines +//@subsection Spark handling routines + +/* now in ../Sparks.c */ + +#if 0 + +#if defined(PAR) + +rtsSpark findLocalSpark(rtsBool forexport); +StgTSO* activateSpark (rtsSpark spark); +void disposeSpark(rtsSpark spark); +rtsBool add_to_spark_queue(StgClosure *closure, rtsBool required); +rtsBool initSparkPools (void); + +nat spark_queue_len(nat pool); +void markSparkQueue(void); +void print_sparkq(void); + +#elif defined(GRAN) + +void findLocalSpark (rtsEvent *event, + rtsBool *found_res, rtsSparkQ *spark_res); +rtsBool activateSpark (rtsEvent *event, rtsSparkQ spark); +rtsSpark *newSpark (StgClosure *node, StgInt name, StgInt gran_info, + StgInt size_info, StgInt par_info, StgInt local); +void disposeSpark(rtsSpark *spark); +void disposeSparkQ(rtsSparkQ spark); +void add_to_spark_queue(rtsSpark 
*spark); +void print_spark(rtsSpark *spark); +nat spark_queue_len(PEs proc); +rtsSpark *delete_from_sparkq (rtsSpark *spark, PEs p, rtsBool dispose_too); +void markSparkQueue(void); +void print_sparkq(PEs proc); +void print_sparkq_stats(void); + +#endif +#endif /* 0 */ + +//@node GC routines, Debugging routines, Spark handling routines +//@subsection GC routines + +#if defined(PAR) +/* HLComms.c */ +void freeRemoteGA(int pe, globalAddr *ga); +void sendFreeMessages(void); + +/* Global.c */ +void markLocalGAs(rtsBool full); +void RebuildGAtables(rtsBool full); +void RebuildLAGAtable(void); +#endif + +//@node Debugging routines, Generating .gr profiles, GC routines +//@subsection Debugging routines + +#if defined(PAR) +void printGA (globalAddr *ga); +void printGALA (GALA *gala); +void printLAGAtable(void); +#endif + +//@node Generating .gr profiles, Common macros, Debugging routines +//@subsection Generating .gr profiles + +#define STATS_FILENAME_MAXLEN 128 + +/* Where to write the log file */ +//@cindex gr_file +//@cindex gr_filename +extern FILE *gr_file; +extern char gr_filename[STATS_FILENAME_MAXLEN]; + +//@cindex init_gr_simulation +//@cindex end_gr_simulation +void init_gr_simulation(int rts_argc, char *rts_argv[], + int prog_argc, char *prog_argv[]); +void end_gr_simulation(void); + +//@node Common macros, Index, Generating .gr profiles +//@subsection Common macros + +/* + extracting specific info out of a closure; used in packing (GranSim, GUM) +*/ +//@cindex get_closure_info +static inline StgInfoTable* +get_closure_info(node, size, ptrs, nonptrs, vhs, info_hdr_ty) +StgClosure* node; +nat *size, *ptrs, *nonptrs, *vhs; +char *info_hdr_ty; +{ + StgInfoTable *info; + + info = get_itbl(node); + /* the switch shouldn't be necessary, really; just use default case */ + switch (info->type) { + case RBH: + { + StgInfoTable *rip = REVERT_INFOPTR(info); // closure to revert to + *size = sizeW_fromITBL(rip); + *ptrs = (nat) (rip->layout.payload.ptrs); + *nonptrs = (nat) (rip->layout.payload.nptrs); + *vhs = (nat) 0; // unknown +#if 0 /* DEBUG */ + info_hdr_type(node, info_hdr_ty); +#else + strcpy(info_hdr_ty, "UNKNOWN"); +#endif + return rip; // NB: we return the reverted info ptr for a RBH!!!!!! + } + + default: + *size = sizeW_fromITBL(info); + *ptrs = (nat) (info->layout.payload.ptrs); + *nonptrs = (nat) (info->layout.payload.nptrs); + *vhs = (nat) 0; // unknown +#if 0 /* DEBUG */ + info_hdr_type(node, info_hdr_ty); +#else + strcpy(info_hdr_ty, "UNKNOWN"); +#endif + return info; + } +} + +//@cindex IS_BLACK_HOLE +static inline rtsBool +IS_BLACK_HOLE(StgClosure* node) +{ + StgInfoTable *info; + switch (get_itbl(node)->type) { + case BLACKHOLE: + case BLACKHOLE_BQ: + case RBH: + case FETCH_ME: + case FETCH_ME_BQ: + return rtsTrue; + default: + return rtsFalse; + } +//return ((info->type == BLACKHOLE || info->type == RBH) ? 
rtsTrue : rtsFalse); +} + +//@cindex IS_INDIRECTION +static inline StgClosure * +IS_INDIRECTION(StgClosure* node) +{ + StgInfoTable *info; + info = get_itbl(node); + switch (info->type) { + case IND: + case IND_OLDGEN: + case IND_PERM: + case IND_OLDGEN_PERM: + case IND_STATIC: + /* relies on indirectee being at same place for all these closure types */ + return (((StgInd*)node) -> indirectee); + default: + return NULL; + } +} + +//@cindex unwindInd +static inline StgClosure * +UNWIND_IND (StgClosure *closure) +{ + StgClosure *next; + + while ((next = IS_INDIRECTION((StgClosure *)closure)) != NULL) + closure = next; + + ASSERT(next==(StgClosure *)NULL); + return closure; +} + +#endif /* defined(PAR) || defined(GRAN) */ + +#endif /* PARALLEL_RTS_H */ + +//@node Index, , Common macros +//@subsection Index + +//@index +//* IS_BLACK_HOLE:: @cindex\s-+IS_BLACK_HOLE +//* IS_INDIRECTION:: @cindex\s-+IS_INDIRECTION +//* end_gr_simulation:: @cindex\s-+end_gr_simulation +//* get_closure_info:: @cindex\s-+get_closure_info +//* gr_file:: @cindex\s-+gr_file +//* gr_filename:: @cindex\s-+gr_filename +//* init_gr_simulation:: @cindex\s-+init_gr_simulation +//* unwindInd:: @cindex\s-+unwindInd +//@end index diff --git a/ghc/rts/parallel/RBH.c b/ghc/rts/parallel/RBH.c new file mode 100644 index 0000000..faf2591 --- /dev/null +++ b/ghc/rts/parallel/RBH.c @@ -0,0 +1,338 @@ +/* + Time-stamp: + + Revertible Black Hole Manipulation. + Used in GUM and GranSim during the packing of closures. These black holes + must be revertible because a GC might occur while the packet is being + transmitted. In this case all RBHs have to be reverted. + */ + +#if defined(PAR) || defined(GRAN) /* whole file */ + +#include "Rts.h" +#include "RtsFlags.h" +#include "RtsUtils.h" +#include "GranSimRts.h" +#include "ParallelRts.h" +# if defined(DEBUG) +# include "ParallelDebug.h" +# endif +#include "Storage.h" // for recordMutable +#include "StgMacros.h" // inlined IS_... fcts + +/* + Turn a closure into a revertible black hole. After the conversion, the + first two words of the closure (after the fixed header, of course) will + be a link to the mutables list (if appropriate for the garbage + collector), and a pointer to the blocking queue. The blocking queue is + terminated by a 2-word SPEC closure which holds the original contents of + the first two words of the closure. +*/ + +//@menu +//* Externs and prototypes:: +//* Conversion Functions:: +//* Index:: +//@end menu +//*/ + +//@node Externs and prototypes, Conversion Functions +//@section Externs and prototypes + +EXTFUN(RBH_Save_0_info); +EXTFUN(RBH_Save_1_info); +EXTFUN(RBH_Save_2_info); + +//@node Conversion Functions, Index, Externs and prototypes +//@section Conversion Functions + +/* + A closure is turned into an RBH upon packing it (see PackClosure in Pack.c). + This is needed in case we have to do a GC before the packet is turned + into a graph on the PE receiving the packet. +*/ +//@cindex convertToRBH +StgClosure * +convertToRBH(closure) +StgClosure *closure; +{ + StgRBHSave *rbh_save; + StgInfoTable *info_ptr, *rbh_info_ptr, *old_info; + nat size, ptrs, nonptrs, vhs; + char str[80]; + + /* + Closure layout before this routine runs amuck: + +------------------- + | HEADER | DATA ... + +------------------- + | FIXED_HS | + */ + /* + Turn closure into an RBH. This is done by modifying the info_ptr, + grabbing the info_ptr of the RBH for this closure out of its + ITBL. 
Additionally, we have to save the words from the closure, which
+    will hold the link to the blocking queue.  For this purpose we use the
+    RBH_Save_N closures, with N being the number of pointers for this
+    closure.  */
+  IF_GRAN_DEBUG(pack,
+                belch(":*   Converting closure %p (%s) into an RBH",
+                      closure, info_type(closure)));
+  IF_PAR_DEBUG(pack,
+                belch(":*   Converting closure %p (%s) into an RBH",
+                      closure, info_type(closure)));
+
+  ASSERT(closure_THUNK(closure));
+
+  IF_GRAN_DEBUG(pack,
+                old_info = get_itbl(closure));
+
+  /* Allocate a new closure to hold the data ripped out of closure */
+  if ((rbh_save = (StgRBHSave *)allocate(FIXED_HS + 2)) == NULL)
+    return NULL;  /* have to Garbage Collect; check that in the caller! */
+
+  info_ptr = get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str);
+  ASSERT(size >= MIN_UPD_SIZE);
+
+  /* Fill in the RBH_Save closure with the original data from closure */
+  rbh_save->payload[0] = (StgPtr) ((StgRBH *)closure)->blocking_queue;
+  rbh_save->payload[1] = (StgPtr) ((StgRBH *)closure)->mut_link;
+
+  /* Set the info_ptr for the rbh_Save closure according to the number of
+     pointers in the original */
+
+  rbh_info_ptr = (StgInfoTable *) (ptrs == 0 ? &RBH_Save_0_info :
+                                   ptrs == 1 ? &RBH_Save_1_info :
+                                   &RBH_Save_2_info);
+  SET_INFO(rbh_save, rbh_info_ptr);
+  /* same bitmask as the original closure */
+  SET_GRAN_HDR(rbh_save, PROCS(closure));
+
+  /* Init the blocking queue of the RBH and have it point to the saved data */
+  ((StgRBH *)closure)->blocking_queue = (StgBlockingQueueElement *)rbh_save;
+
+  ASSERT(LOOKS_LIKE_GHC_INFO(RBH_INFOPTR(get_itbl(closure))));
+  /* Turn the closure into a RBH; a great system, indeed! */
+  SET_INFO(closure, RBH_INFOPTR(get_itbl(closure)));
+
+  /*
+    add closure to the mutable list!
+    do this after having turned the closure into an RBH, because an
+    RBH is mutable but the thing it was previously isn't
+  */
+  //recordMutable((StgMutClosure *)closure);
+
+  //IF_GRAN_DEBUG(pack,
+                /* sanity check; make sure that reverting the RBH yields the
+                   orig closure, again */
+  //ASSERT(REVERT_INFOPTR(get_itbl(closure))==old_info));
+
+  /*
+    Closure layout after this routine has run amuck:
+      +---------------------
+      | RBH-HEADER |   |   | ...
+      +--------------|---|--
+        | FIXED_HS  |   v
+                    |   Mutable-list ie another StgMutClosure
+                    v
+                    +---------
+                    | RBH_SAVE with 0-2 words of DATA
+                    +---------
+  */
+
+  return closure;
+}
+
+/*
+  An RBH closure is turned into a FETCH_ME when receiving an ACK message
+  indicating that the transferred closure has been unpacked on the other PE
+  (see processAck in HLComms.c). The ACK also contains the new GA of the
+  closure to which the FETCH_ME closure has to point.
+
+  Converting a closure to a FetchMe is trivial, unless the closure has
+  acquired a blocking queue. If that has happened, we first have to awaken
+  the blocking queue. What a nuisance!  Fortunately, @AwakenBlockingQueue@
+  should now know what to do.
+
+  A note on GrAnSim: In GrAnSim we don't have FetchMe closures. However,
+  we have to turn an RBH back to its original form when the simulated
+  transfer of the closure has been finished. Therefore we need the
+  @convertFromRBH@ routine below. After converting the RBH back to its
+  original form and awakening all TSOs, the first TSO will reenter the
+  closure which is now local and carry on merrily reducing it (the other
+  TSO will be less merrily blocked on the now local closure; we're costing
+  the difference between local and global blocks in the BQ code).  -- HWL
+*/
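+
+/*
+  Caller-side sketch (editorial, hypothetical -- the real call site is the
+  packing code in Pack.c; `tryToRBH' is an assumed name): the NULL return of
+  convertToRBH signals that allocate() failed and the caller has to garbage
+  collect before retrying, as noted above.
+*/
+#if 0 /* sketch only */
+static rtsBool
+tryToRBH(StgClosure *closure)
+{
+  if (convertToRBH(closure) == NULL) {
+    /* heap exhausted: caller must garbage collect and retry the packing */
+    return rtsFalse;
+  }
+  return rtsTrue;   /* closure is now a revertible black hole */
+}
+#endif /* 0 (sketch) */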
+
+# if defined(PAR)
+
+EXTFUN(FETCH_ME_info);
+
+//@cindex convertToFetchMe
+void
+convertToFetchMe(rbh, ga)
+StgRBH *rbh;
+globalAddr *ga;
+{
+  // StgInfoTable *ip = get_itbl(rbh);
+  StgBlockingQueueElement *bqe = rbh->blocking_queue;
+
+  ASSERT(get_itbl(rbh)->type==RBH);
+
+  IF_PAR_DEBUG(pack,
+               belch(":*   Converting RBH %p (%s) into a FETCH_ME for GA ((%x, %d, %x))",
+                     rbh, info_type(rbh),
+                     ga->payload.gc.gtid, ga->payload.gc.slot, ga->weight));
+
+  /* put closure on mutables list, while it is still a RBH */
+  //recordMutable((StgMutClosure *)rbh);
+
+  /* actually turn it into a FETCH_ME */
+  SET_INFO((StgClosure *)rbh, &FETCH_ME_info);
+
+  /* set the global pointer in the FETCH_ME closure to the given value */
+  ((StgFetchMe *)rbh)->ga = ga;
+
+  IF_PAR_DEBUG(pack,
+               if (get_itbl(bqe)->type==TSO || get_itbl(bqe)->type==BLOCKED_FETCH)
+                 belch(":*   Awakening non-empty BQ of RBH closure %p (first TSO is %d (%p)",
+                       rbh, ((StgTSO *)bqe)->id, ((StgTSO *)bqe)));
+
+  /* awaken all TSOs and BLOCKED_FETCHES on the blocking queue */
+  if (get_itbl(bqe)->type==TSO || get_itbl(bqe)->type==BLOCKED_FETCH)
+    awaken_blocked_queue(bqe, (StgClosure *)rbh);
+}
+# else  /* GRAN */
+/* Prototype */
+// void UnlinkFromMUT(StgPtr closure);
+
+/*
+  This routine in fact reverts the RBH into its original form; this code
+  should be of interest for GUM, too, but is not needed in the current version.
+  convertFromRBH is called where GUM uses convertToFetchMe.
+*/
+void
+convertFromRBH(closure)
+StgClosure *closure;
+{
+  StgBlockingQueueElement *bqe = ((StgRBH*)closure)->blocking_queue;
+  char str[NODE_STR_LEN];                                 // debugging only
+  StgInfoTable *rip = REVERT_INFOPTR(get_itbl(closure));  // debugging only
+
+  IF_GRAN_DEBUG(pack,
+                if (get_itbl(bqe)->type==TSO)
+                  sprintf(str, "%d (%p)",
+                          ((StgTSO *)bqe)->id, ((StgTSO *)bqe));
+                else
+                  strcpy(str, "empty");
+                belch(":*   Reverting RBH %p (%s) into a ??? closure again; BQ start: %s",
+                      closure, info_type(closure), str));
+
+  ASSERT(get_itbl(closure)->type==RBH);
+
+  /* awaken_blocked_queue also restores the RBH_Save closure
+     (have to call it even if there are no TSOs in the queue!) */
+  awaken_blocked_queue(bqe, closure);
+
+  /* Put back old info pointer (grabbed from the RBH's info table).
+     We do that *after* awakening the BQ to be sure node is an RBH when
+     calling awaken_blocked_queue (different in GUM!)
+  */
+  SET_INFO(closure, REVERT_INFOPTR(get_itbl(closure)));
+
+  /* put closure on mutables list */
+  //recordMutable((StgMutClosure *)closure);
+
+# if 0 /* rest of this fct */
+  /* ngoq ngo' */
+  /* FETCHME_GA(closure) = ga; */
+  if (IS_MUTABLE(INFO_PTR(bqe))) {
+    PROC old_proc = CurrentProc,        /* NB: For AwakenBlockingQueue, */
+         new_proc = where_is(closure);  /* CurrentProc must be where    */
+                                        /* closure lives.               */
+    CurrentProc = new_proc;
+
+#  if defined(GRAN_CHECK)
+    if (RTSflags.GranFlags.debug & 0x100)
+      fprintf(stderr,"===== AwBQ of node 0x%lx (%s) [PE %2u]\n",
+              closure, (isSpec ? "SPEC_RBH" : "GEN_RBH"), new_proc);
+#  endif
+
+    rbh_save = AwakenBlockingQueue(bqe);     /* AwakenBlockingQueue(bqe); */
+    CurrentProc = old_proc;
+  } else {
+    rbh_save = bqe;
+  }
+
+  /* Put data from special RBH save closures back into the closure */
+  if ( rbh_save == NULL ) {
+    fprintf(stderr,"convertFromRBH: No RBH_Save_? closure found at end of BQ!\n");
+    EXIT(EXIT_FAILURE);
+  } else {
+    closure[isSpec ? SPEC_HS : GEN_HS] = rbh_save[SPEC_HS];
+    closure[(isSpec ? SPEC_HS : GEN_HS) + 1] = rbh_save[SPEC_HS + 1];
+  }
+# endif  /* 0 */
+
+# if 0 && (defined(GCap) || defined(GCgn))
+  /* ngoq ngo' */
+  /* If we convert from an RBH in the old generation,
+      we have to make sure it goes on the mutables list */
+
+  if(closure <= StorageMgrInfo.OldLim) {
+    if (IS_MUTABLE(INFO_PTR(closure)) && MUT_LINK(closure) == MUT_NOT_LINKED) {
+      MUT_LINK(closure) = (StgWord) StorageMgrInfo.OldMutables;
+      StorageMgrInfo.OldMutables = closure;
+    }
+  }
+# endif /* 0 */
+}
+#endif /* PAR */
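+
+/*
+  Sketch of the GC interaction mentioned at the top of this file (editorial,
+  hypothetical -- `revertPackedClosures' and the array of packed closures are
+  assumed; the real bookkeeping lives in Pack.c): if a GC strikes while a
+  packet is in flight, every closure that was turned into an RBH has to be
+  reverted.
+*/
+#if 0 /* sketch only */
+static void
+revertPackedClosures(StgClosure *packed[], nat n)
+{
+  nat i;
+  for (i = 0; i < n; i++)
+    if (get_itbl(packed[i])->type == RBH)
+      convertFromRBH(packed[i]);  /* GranSim; GUM instead converts to a
+                                     FETCH_ME once an ACK has arrived */
+}
+#endif /* 0 (sketch) */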
+
+/* Remove closure from the mutables list */
+#if 0
+/* ngoq ngo' */
+void
+UnlinkFromMUT(StgPtr closure)
+{
+  StgPtr curr = StorageMgrInfo.OldMutables, prev = NULL;
+
+  while (curr != NULL && curr != closure) {
+    ASSERT(MUT_LINK(curr)!=MUT_NOT_LINKED);
+    prev=curr;
+    curr=MUT_LINK(curr);
+  }
+  if (curr==closure) {
+    if (prev==NULL)
+      StorageMgrInfo.OldMutables = MUT_LINK(curr);
+    else
+      MUT_LINK(prev) = MUT_LINK(curr);
+    MUT_LINK(curr) = MUT_NOT_LINKED;
+  }
+
+# if 0 && (defined(GCap) || defined(GCgn))
+  {
+    closq newclos;
+    extern closq ex_RBH_q;
+
+    newclos = (closq) stgMallocBytes(sizeof(struct clos), "UnlinkFromMUT");
+    CLOS_CLOSURE(newclos) = closure;
+    CLOS_PREV(newclos) = NULL;
+    CLOS_NEXT(newclos) = ex_RBH_q;
+    if (ex_RBH_q!=NULL)
+      CLOS_PREV(ex_RBH_q) = newclos;
+    ex_RBH_q = newclos;
+  }
+# endif
+}
+#endif /* 0 */
+
+#endif /* PAR || GRAN -- whole file */
+
+//@node Index, , Conversion Functions
+//@section Index
+
+//@index
+//* convertToFetchMe::  @cindex\s-+convertToFetchMe
+//* convertToRBH::  @cindex\s-+convertToRBH
+//@end index
diff --git a/ghc/rts/parallel/SysMan.c b/ghc/rts/parallel/SysMan.c
new file mode 100644
index 0000000..eaafc03
--- /dev/null
+++ b/ghc/rts/parallel/SysMan.c
@@ -0,0 +1,417 @@
+/* ----------------------------------------------------------------------------
+   Time-stamp: 
+   $Id: SysMan.c,v 1.2 2000/01/13 14:34:09 hwloidl Exp $
+
+   GUM System Manager Program
+   Handles startup, shutdown and global synchronisation of the parallel system.
+
+   The Parade/AQUA Projects, Glasgow University, 1994-1995.
+   GdH/APART Projects, Heriot-Watt University, Edinburgh, 1997-1999.
+   P. Trinder, November 30th. 1994.
+   Adapted for new RTS
+   P. Trinder, July 1997.
+   H-W. Loidl, November 1999.
+
+   ------------------------------------------------------------------------- */
+
+//@node GUM System Manager Program, , ,
+//@section GUM System Manager Program
+
+//@menu
+//* General docu::
+//* Includes::
+//* Macros etc::
+//* Variables::
+//* Main fct::
+//* Auxiliary fcts::
+//* Index::
+//@end menu
+
+//@node General docu, Includes, GUM System Manager Program, GUM System Manager Program
+//@subsection General docu
+
+/*
+
+The SysMan task currently controls initiation and termination of a
+parallel Haskell program running under GUM. In the future it may
+control global GC synchronisation and statistics gathering. Based on
+K. Hammond's SysMan.lc in Graph for PVM. SysMan is unusual in that it
+is not part of the executable produced by ghc: it is a free-standing
+program that spawns PVM tasks (logical PEs) to evaluate the
+program. After initialisation it runs in parallel with the PE tasks,
+awaiting messages.
+
+OK children, buckle down for some serious weirdness, it works like this ...
+
+
+o The argument vector (argv) for SysMan has one of the following two shapes:
+
+-------------------------------------------------------------------------------
+| SysMan path | debug flag | pvm-executable path | Num. PEs | Program Args ...|
+-------------------------------------------------------------------------------
+
+-------------------------------------------------------------------
+| SysMan path | pvm-executable path | Num. PEs | Program Args ... |
+-------------------------------------------------------------------
+
+The "pvm-executable path" is an absolute path of where PVM stashes the
+code for each PE. The arguments passed on to each PE-executable
+spawned by PVM are:
+
+-------------------------------
+| Num. PEs | Program Args ... |
+-------------------------------
+
+The arguments passed to the Main-thread PE-executable are
+
+-------------------------------------------------------------------
+| main flag | pvm-executable path | Num. PEs | Program Args ... |
+-------------------------------------------------------------------
+
+o SysMan's algorithm is as follows.
+
+o use PVM to spawn (nPEs-1) PVM tasks
+o fork SysMan to create the main-thread PE. This permits the main-thread to
+read and write to stdin and stdout.
+o Barrier-synchronise waiting for all of the PE-tasks to start.
+o Broadcast the SysMan task-id, so that the main thread knows it.
+o Wait for the Main-thread PE to send its task-id.
+o Broadcast an array of the PE task-ids to all of the PE-tasks.
+o Enter a loop awaiting incoming messages, e.g. failure, Garbage-collection,
+termination.
+
+The forked Main-thread algorithm, in SysMan, is as follows.
+
+o disconnects from PVM.
+o sets a flag in argv to indicate that it is the main thread.
+o `exec's a copy of the pvm-executable (i.e. the program being run)
+
+
+The pvm-executable run by each PE-task is initialised as follows.
+
+o Registers with PVM, obtaining a task-id.
+o Joins the barrier synchronisation awaiting the other PEs.
+o Receives and records the task-id of SysMan, for future use.
+o If the PE is the main thread it sends its task-id to SysMan.
+o Receives and records the array of task-ids of the other PEs.
+o Begins execution.
+
+(A sketch of this PE-side handshake appears after the macro definitions
+below.)
+
+*/
+
+//@node Includes, Macros etc, General docu, GUM System Manager Program
+//@subsection Includes
+
+#include "Rts.h"
+#include "ParTypes.h"
+#include "LLC.h"
+#include "Parallel.h"
+
+//@node Macros etc, Variables, Includes, GUM System Manager Program
+//@subsection Macros etc
+
+#define NON_POSIX_SOURCE /* so says Solaris */
+
+#define checkerr(c)	do { \
+			  if ((c)<0) { \
+			    pvm_perror("Sysman"); \
+			    fprintf(stderr,"Sysman"); \
+			    stg_exit(EXIT_FAILURE); \
+			  } \
+			} while(0)
+
+/* SysMan is put on top of the GHC routine that does the RtsFlags handling.
+   So, we cannot use the standard macros. For the time being we use a macro
+   that is fixed at compile time.
+*/
+/* debugging enabled */
+#define IF_PAR_DEBUG(c,s)  { s; }
+/* debugging disabled */
+// #define IF_PAR_DEBUG(c,s)  /* nothing */
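+
+/*
+  PE-side sketch of the startup handshake described in the General docu above
+  (editorial, hypothetical -- the real code lives in the parallel RTS, not in
+  SysMan; `initPE' is an assumed name, and only PVM calls that appear in this
+  file are used):
+*/
+#if 0 /* sketch only */
+static void
+initPE(void)
+{
+  int nbytes, opcode, sysman;
+
+  mytid = pvm_mytid();                     /* register with PVM        */
+  pvm_joingroup(PEGROUP);
+  pvm_joingroup(PECTLGROUP);
+  pvm_barrier(PECTLGROUP, nPEs + 1);       /* wait for the other PEs   */
+  pvm_bufinfo(pvm_recv(ANY_TASK, PP_SYSMAN_TID),
+              &nbytes, &opcode, &sysman);  /* record SysMan's task-id  */
+  /* ... the main PE then sends its own task-id (PP_MAIN_TASK) ...     */
+  pvm_recv(sysman, PP_PETIDS);             /* array of all PE task-ids */
+}
+#endif /* 0 (sketch) */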
+
+//@node Variables, Main fct, Macros etc, GUM System Manager Program
+//@subsection Variables
+
+/*
+   The following definitions are included so that SysMan can be linked with
+   Low Level Communications module (LLComms). They are not used in SysMan.
+*/
+
+GlobalTaskId  mytid, SysManTask;
+rtsBool       IAmMainThread;
+rtsBool       GlobalStopPending = rtsFalse;
+	      /* Handle unexpected messages correctly */
+
+static GlobalTaskId gtids[MAX_PES];
+static GlobalTaskId sysman_id, sender_id, mainThread_id;
+static unsigned PEsTerminated = 0;
+static rtsBool Finishing = rtsFalse;
+static long PEbuffer[MAX_PES];
+nat nPEs = 0;
+
+//@node Main fct, Auxiliary fcts, Variables, GUM System Manager Program
+//@subsection Main fct
+
+//@cindex main
+main (int argc, char **argv) {
+  int rbufid;
+  int opcode, nbytes;
+  char **pargv;
+  int i, cc, spawn_flag = PvmTaskDefault;
+  char *petask, *pvmExecutable;
+  rtsPacket addr;
+
+  setbuf(stdout, NULL);  // disable buffering of stdout
+  setbuf(stderr, NULL);  // disable buffering of stderr
+
+  if (argc > 1) {
+    if (*argv[1] == '-') {
+      spawn_flag = PvmTaskDebug;
+      argv[1] = argv[0];
+      argv++; argc--;
+    }
+    sysman_id = pvm_mytid();  /* This must be the first PVM call */
+
+    checkerr(sysman_id);
+
+    /*
+       Get the full path and filename of the pvm executable (stashed in some
+       PVM directory), and the number of PEs from the command line.
+    */
+    pvmExecutable = argv[1];
+    nPEs = atoi(argv[2]);
+
+    if ((petask = getenv(PETASK)) == NULL)  // PETASK set by driver
+      petask = PETASK;
+
+    IF_PAR_DEBUG(verbose,
+		 fprintf(stderr,"== [%x] nPEs (%s) = %d\n",
+			 sysman_id, petask, nPEs));
+
+    /* Check that we can create the number of PE and IMU tasks requested */
+    if (nPEs > MAX_PES) {
+      fprintf(stderr,"SysMan: No more than %d PEs allowed (%d requested)\n",
+	      MAX_PES, nPEs);
+      stg_exit(EXIT_FAILURE);
+    }
+    /*
+      Now create the PE Tasks. We spawn (nPEs-1) pvm threads: the Main Thread
+      (which starts execution and performs IO) is created by forking SysMan
+    */
+    nPEs--;
+    if (nPEs > 0) {
+      /* Initialise the PE task arguments from Sysman's arguments */
+      pargv = argv + 2;
+
+      IF_PAR_DEBUG(verbose,
+		   fprintf(stderr, "== [%x] Spawning %d PEs(%s) ...\n",
+			   sysman_id, nPEs, petask);
+		   fprintf(stderr, "  args: ");
+		   for (i = 0; pargv[i]; ++i)
+		     fprintf(stderr, "%s, ", pargv[i]);
+		   fprintf(stderr, "\n"));
+
+      checkerr(pvm_spawn(petask, pargv, spawn_flag, "", nPEs, gtids));
+      /*
+       * Stash the task-ids of the PEs away in a buffer, once we know
+       * the Main Thread's task-id, we'll broadcast them all.
+       */
+      for (i = 0; i < nPEs; i++)
+	PEbuffer[i+1] = (long) gtids[i];
+
+      IF_PAR_DEBUG(verbose,
+		   fprintf(stderr,"== [%x] Spawned\n", sysman_id));
+    }
+
+    /*
+       Create the MainThread PE by forking SysMan. This arcane coding
+       is required to allow MainThread to read stdin and write to stdout.
+       PWT 18/1/96
+    */
+    nPEs++;  /* Record that the number of PEs is increasing */
+    if ((cc = fork())) {
+      checkerr(cc);  /* Parent continues as SysMan */
+      IF_PAR_DEBUG(verbose,
+		   fprintf(stderr,"== [%x] SysMan Task is [t%x]\n",
+			   sysman_id, sysman_id));
+
+      /*
+	SysMan joins PECTLGROUP, so that it can wait (at the
+	barrier synchronisation a few instructions later) for the
+	other PE-tasks to start.
+
+	The manager group (MGRGROUP) is vestigial at the moment. It
+	may eventually include a statistics manager, and a (global)
+	garbage collector manager.
+      checkerr(pvm_joingroup(PECTLGROUP));
+      IF_PAR_DEBUG(verbose,
+                   fprintf(stderr,"== [%x] Joined PECTLGROUP\n", sysman_id));
+
+      /* Wait for all the PEs to arrive */
+      checkerr(pvm_barrier(PECTLGROUP, nPEs + 1));
+      IF_PAR_DEBUG(verbose,
+                   fprintf(stderr,"== [%x] PECTLGROUP barrier passed\n",
+                           sysman_id));
+
+      /* Broadcast SysMan's ID, so the Main Thread PE knows it */
+      pvm_initsend(PvmDataDefault);
+      pvm_bcast(PEGROUP, PP_SYSMAN_TID);
+
+      /* Wait for the Main Thread to identify itself */
+      addr = waitForPEOp(PP_MAIN_TASK, ANY_GLOBAL_TASK);
+      pvm_bufinfo(addr, &nbytes, &opcode, &mainThread_id);
+      PEbuffer[0] = mainThread_id;
+
+      IF_PAR_DEBUG(verbose,
+                   fprintf(stderr,"== [%x] SysMan received Main Task = %x\n",
+                           sysman_id, mainThread_id));
+
+      /* Now that we have them all, broadcast the Global Task Ids of all PEs */
+      pvm_initsend(PvmDataDefault);
+      PutArgs(PEbuffer, nPEs);
+      pvm_bcast(PEGROUP, PP_PETIDS);
+
+      IF_PAR_DEBUG(verbose,
+                   fprintf(stderr,"== [%x] SysMan successfully initialized!\n",
+                           sysman_id));
+
+//@cindex message handling loop
+      /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
+      /* Main message handling loop: process incoming messages          */
+      /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
+      while (1) {
+        if ((rbufid = pvm_recv(ANY_TASK, ANY_OPCODE)) < 0)
+          pvm_perror("Sysman: Receiving Message");
+        else {
+          pvm_bufinfo(rbufid, &nbytes, &opcode, &sender_id);
+
+          /*
+          IF_PAR_DEBUG(trace,
+                       fprintf(stderr,"== [%x] SysMan: Message received: rbufid=%x, nbytes=%d, opcode=%x, sender_id=%x\n",
+                               sysman_id, rbufid, nbytes, opcode, sender_id));
+          */
+          switch (opcode) {
+          case PP_GC_INIT:
+            /* Global GC is not yet implemented for GUM */
+            fprintf(stderr,"Global GC from %x not yet implemented for GUM!\n",
+                    sender_id);
+            sync(PECTLGROUP, PP_FULL_SYSTEM);
+            broadcast(PEGROUP, PP_GC_INIT);
+            /* DoGlobalGC(); */
+            /* broadcast(PEGROUP, PP_INIT); */
+            break;
+
+          case PP_STATS_ON:
+            fprintf(stderr,"PP_STATS_ON (from %x) not yet implemented for GUM!\n",
+                    sender_id);
+            break;
+
+          case PP_STATS_OFF:
+            fprintf(stderr,"PP_STATS_OFF (from %x) not yet implemented for GUM!\n",
+                    sender_id);
+            break;
+
+          case PP_FINISH:
+            IF_PAR_DEBUG(verbose,
+                         fprintf(stderr,"== [%x] Finish from %x\n",
+                                 sysman_id, sender_id));
+            if (!Finishing) {
+              /* first FINISH: tell all the PEs to shut down, too */
+              Finishing = rtsTrue;
+              PEsTerminated = 1;
+              pvm_initsend(PvmDataDefault);
+              pvm_bcast(PEGROUP, PP_FINISH);
+            } else {
+              ++PEsTerminated;
+            }
+            if (PEsTerminated >= nPEs) {
+              IF_PAR_DEBUG(verbose,
+                           fprintf(stderr,"== [%x] Global Shutdown, Goodbye!! (SysMan has received FINISHes from all PEs)\n",
+                                   sysman_id));
+              broadcast(PEGROUP, PP_FINISH);
+              broadcast(MGRGROUP, PP_FINISH);
+              pvm_lvgroup(PECTLGROUP);
+              pvm_lvgroup(MGRGROUP);
+              pvm_exit();
+              exit(EXIT_SUCCESS);  /* Qapla'! */
+            }
+            break;
+
+          case PP_FAIL:
+            IF_PAR_DEBUG(verbose,
+                         fprintf(stderr,"== [%x] Fail from %x\n",
+                                 sysman_id, sender_id));
+            if (!Finishing) {
+              Finishing = rtsTrue;
+              broadcast(PEGROUP, PP_FAIL);
+            }
+            break;
+
+          default:
+            {
+              /*
+              char *opname = GetOpName(opcode);
+              fprintf(stderr,"Sysman: Unrecognised opcode %s (%x)\n",
+                      opname,opcode);
+              */
+              fprintf(stderr,"Qagh: Sysman: Unrecognised opcode (%x)\n",
+                      opcode);
+            }
+            break;
+          } /* switch */
+        } /* else */
+      } /* while 1 */
+    } /* parent: SysMan proper */
+    else {
+      /* child: becomes the Main Thread PE */
+      fprintf(stderr, "Main Thread PE has been forked; doing an execv(%s,...)\n",
+              pvmExecutable);
+      pvmendtask();        /* Disconnect from PVM to avoid confusion; */
+                           /* the executable reconnects               */
+      *argv[0] = '-';      /* Flag that this is the Main Thread PE    */
+      execv(pvmExecutable, argv);  /* Child task becomes the Main Thread PE */
+    }
+  } /* argc > 1 */
+  return EXIT_SUCCESS;  /* only reached when no arguments were given */
+} /* main */
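+
+/*
+  A sketch of the PE-side counterpart of the startup handshake above,
+  following the documentation at the top of this file. The real code
+  lives in the parallel RTS (Parallel.c / LLComms), not in SysMan; the
+  function name startUpPE and the use of sendOp for the PP_MAIN_TASK
+  message are assumptions made for illustration.
+*/
+static void
+startUpPE (unsigned numPEs, rtsBool isMain)
+{
+  int nbytes, opcode;
+  GlobalTaskId sysman;
+  rtsPacket p;
+
+  mytid = pvm_mytid();                            /* register with PVM      */
+  checkerr(pvm_joingroup(PEGROUP));               /* for PEGROUP broadcasts */
+  checkerr(pvm_joingroup(PECTLGROUP));            /* for the start barrier  */
+  checkerr(pvm_barrier(PECTLGROUP, numPEs + 1));  /* all PEs plus SysMan    */
+
+  /* receive and record SysMan's task-id */
+  p = waitForPEOp(PP_SYSMAN_TID, ANY_GLOBAL_TASK);
+  pvm_bufinfo(p, &nbytes, &opcode, &sysman);
+
+  /* the Main Thread identifies itself to SysMan */
+  if (isMain)
+    sendOp(PP_MAIN_TASK, sysman);
+
+  /* receive the array of PE task-ids, then begin execution */
+  p = waitForPEOp(PP_PETIDS, ANY_GLOBAL_TASK);
+}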
+
+//@node Auxiliary fcts, Index, Main fct, GUM System Manager Program
+//@subsection Auxiliary fcts
+
+/*
+  These are reproduced from RtsUtils to save linking with a whole ball
+  of wax.
+*/
+
+/* result-checking malloc wrapper */
+
+//@cindex stgMallocBytes
+
+void *
+stgMallocBytes (int n, char *msg)
+{
+  char *space;
+
+  if ((space = (char *) malloc((size_t) n)) == NULL) {
+    fflush(stdout);
+    fprintf(stderr, "%s\n", msg);  /* never pass msg as the format string */
+    // MallocFailHook((W_) n, msg);
+    stg_exit(EXIT_FAILURE);
+  }
+  return space;
+}
+
+/* Needed here because it's used in loads of places, e.g. LLComms */
+
+//@cindex stg_exit
+
+void
+stg_exit (I_ n)
+{
+  exit(n);
+}
+
+//@node Index, , Auxiliary fcts, GUM System Manager Program
+//@subsection Index
+
+//@index
+//* main::  @cindex\s-+main
+//* message handling loop::  @cindex\s-+message handling loop
+//* stgMallocBytes::  @cindex\s-+stgMallocBytes
+//* stg_exit::  @cindex\s-+stg_exit
+//@end index
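+
+/*
+  Usage note (illustrative only): the wrapper above is used as, e.g.,
+
+      char *buf = (char *) stgMallocBytes(BUFSIZ, "SysMan: scratch buffer");
+
+  i.e. it either returns the requested space or prints the message and
+  aborts, so callers never have to check for NULL themselves.
+*/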