[project @ 1998-11-26 09:17:22 by sof]
[ghc-hetmet.git] / ghc / runtime / main / GranSim.lc
index de603dd..cdaee56 100644 (file)
@@ -1,5 +1,8 @@
 %
-% (c) The GRASP/AQUA Project, Glasgow University, 1995
+% (c) The GRASP/AQUA Project, Glasgow University, 1995 - 1996
+%     Hans Wolfgang Loidl
+%
+% Time-stamp: <Sun Oct 19 1997 23:39:59 Stardate: [-30]0119.72 hwloidl>
 %
 %************************************************************************
 %*                                                                      *
@@ -17,47 +20,239 @@ which should be <= the length of a word in bits.  -- HWL
 \begin{code}
 #if defined(GRAN) || defined(PAR)
 
+#ifndef _AIX
 #define NON_POSIX_SOURCE    /* gettimeofday */
+#endif
 
-#include "rtsdefs.h"
+#include "rtsdefs.h" 
+
+/* qaStaH nuq Sovpu' ngoqvam ghItlhpu'bogh nuv 'e' vItul */
+#  if defined(HAVE_GETCLOCK)
+#    if defined(HAVE_SYS_TIMERS_H)
+#    define POSIX_4D9 1
+#    include <sys/timers.h>
+#    endif
+#  else
+#    if defined(HAVE_GETTIMEOFDAY)
+#      if defined(HAVE_SYS_TIME_H)
+#      include <sys/time.h>
+#      endif
+#    else
+#      ifdef HAVE_TIME_H
+#      include <time.h>
+#      endif
+#    endif
+#  endif
+\end{code}
 
 
-#ifdef HAVE_GETCLOCK
+%****************************************************************
+%*                                                              *
+\subsection[GranSim-data-types]{Basic data types and set-up variables for GranSim}
+%*                                                              *
+%****************************************************************
 
-#ifdef HAVE_SYS_TIMERS_H
-#define POSIX_4D9 1
-#include <sys/timers.h>
-#endif
+\begin{code}
 
-#else
-#ifdef HAVE_GETTIMEOFDAY
+/* See GranSim.lh for the definition of the enum gran_event_types */
+char *gran_event_names[] = {
+    "START", "START(Q)",
+    "STEALING", "STOLEN", "STOLEN(Q)",
+    "FETCH", "REPLY", "BLOCK", "RESUME", "RESUME(Q)",
+    "SCHEDULE", "DESCHEDULE",
+    "END",
+    "SPARK", "SPARKAT", "USED", "PRUNED", "EXPORTED", "ACQUIRED",
+    "ALLOC",
+    "TERMINATE",
+    "SYSTEM_START", "SYSTEM_END",           /* only for debugging */
+    "??"
+};
 
-#ifdef HAVE_SYS_TIME_H
-#include <sys/time.h>
-#endif
+#if defined(GRAN)
+char *proc_status_names[] = {
+  "Idle", "Sparking", "Starting", "Fetching", "Fishing", "Busy", 
+  "UnknownProcStatus"
+};
 
-#else
+#define RAND_MAX  0x7fffffff    /* 2^31-1 = 0x80000000 - 1 (see lrand48(3)  */
 
-#ifdef HAVE_TIME_H
-#include <time.h>
-#endif
+unsigned CurrentProc = 0;
+rtsBool IgnoreEvents = rtsFalse; /* HACK only for testing */
 
+#if 0 && (defined(GCap) || defined(GCgn))
+closq ex_RBH_q = NULL;
 #endif
-#endif
+#endif  /* GRAN */
+\end{code}
+
+The following variables control the behaviour of GrAnSim. In general, there
+is one RTS option for enabling each of these features. In getting the
+desired setup of GranSim the following questions have to be answered:
+\begin{itemize}
+\item {\em Which scheduling algorithm} to use (@RTSflags.GranFlags.DoFairSchedule@)? 
+      Currently only unfair scheduling is supported.
+\item What to do when remote data is fetched (@RTSflags.GranFlags.DoReScheduleOnFetch@)? 
+      Either block and wait for the
+      data or reschedule and do some other work.
+      Thus, if this variable is true, asynchronous communication is
+      modelled. Block on fetch mainly makes sense for incremental fetching.
+
+      There is also a simplified fetch variant available
+      (@RTSflags.GranFlags.SimplifiedFetch@). This variant does not use events to model
+      communication. It is faster but the results will be less accurate.
+\item How aggressive to be in getting work after a reschedule on fetch
+      (@RTSflags.GranFlags.FetchStrategy@)?
+      This is determined by the so-called {\em fetching
+      strategy\/}. Currently, there are four possibilities:
+      \begin{enumerate}
+       \item Only run a runnable thread.
+       \item Turn a spark into a thread, if necessary.
+       \item Steal a remote spark, if necessary.
+       \item Steal a runnable thread from another processor, if necessary.
+      \end{itemize}
+      The variable @RTSflags.GranFlags.FetchStrategy@ determines how far to go in this list
+      when rescheduling on a fetch.
+\item Should sparks or threads be stolen first when looking for work
+      (@RTSflags.GranFlags.DoStealThreadsFirst@)? 
+      The default is to steal sparks first (much cheaper).
+\item Should the RTS use a lazy thread creation scheme
+      (@RTSflags.GranFlags.DoAlwaysCreateThreads@)?  By default yes i.e.\ sparks are only
+      turned into threads when work is needed. Also note, that sparks
+      can be discarded by the RTS (this is done in the case of an overflow
+      of the spark pool). Setting @RTSflags.GranFlags.DoAlwaysCreateThreads@  to @True@ forces
+      the creation of threads at the next possibility (i.e.\ when new work
+      is demanded the next time).
+\item Should data be fetched closure-by-closure or in packets
+      (@RTSflags.GranFlags.DoGUMMFetching@)? The default strategy is a GRIP-like incremental 
+      (i.e.\ closure-by-closure) strategy. This makes sense in a
+      low-latency setting but is bad in a high-latency system. Setting 
+      @RTSflags.GranFlags.DoGUMMFetching@ to @True@ enables bulk (packet) fetching. Other
+      parameters determine the size of the packets (@pack_buffer_size@) and the number of
+      thunks that should be put into one packet (@RTSflags.GranFlags.ThunksToPack@).
+\item If there is no other possibility to find work, should runnable threads
+      be moved to an idle processor (@RTSflags.GranFlags.DoThreadMigration@)? In any case, the
+      RTS tried to get sparks (either local or remote ones) first. Thread
+      migration is very expensive, since a whole TSO has to be transferred
+      and probably data locality becomes worse in the process. Note, that
+      the closure, which will be evaluated next by that TSO is not
+      transferred together with the TSO (that might block another thread).
+\item Should the RTS distinguish between sparks created by local nodes and
+      stolen sparks (@RTSflags.GranFlags.PreferSparksOfLocalNodes@)?  The idea is to improve 
+      data locality by preferring sparks of local nodes (it is more likely
+      that the data for those sparks is already on the local processor). 
+      However, such a distinction also imposes an overhead on the spark
+      queue management, and typically a large number of sparks are
+      generated during execution. By default this variable is set to @False@.
+\item Should the RTS use granularity control mechanisms? The idea of a 
+      granularity control mechanism is to make use of granularity
+      information provided via annotation of the @par@ construct in order
+      to prefer bigger threads when either turning a spark into a thread or
+      when choosing the next thread to schedule. Currently, three such
+      mechanisms are implemented:
+      \begin{itemize}
+        \item Cut-off: The granularity information is interpreted as a
+              priority. If a threshold priority is given to the RTS, then
+              only those sparks with a higher priority than the threshold 
+              are actually created. Other sparks are immediately discarded.
+              This is similar to a usual cut-off mechanism often used in 
+              parallel programs, where parallelism is only created if the 
+              input data is lage enough. With this option, the choice is 
+              hidden in the RTS and only the threshold value has to be 
+              provided as a parameter to the runtime system.
+        \item Priority Sparking: This mechanism keeps priorities for sparks
+              and chooses the spark with the highest priority when turning
+              a spark into a thread. After that the priority information is
+              discarded. The overhead of this mechanism comes from
+              maintaining a sorted spark queue.
+        \item Priority Scheduling: This mechanism keeps the granularity
+              information for threads, to. Thus, on each reschedule the 
+              largest thread is chosen. This mechanism has a higher
+              overhead, as the thread queue is sorted, too.
+       \end{itemize}  
+\end{itemize}
 
-void grputw PROTO((TIME v));
+\begin{code}
+#if defined(GRAN)
+
+/* Do we need to reschedule following a fetch? */
+rtsBool NeedToReSchedule = rtsFalse; 
+TIME TimeOfNextEvent, EndOfTimeSlice;   /* checked from the threaded world! */
+/* I_ avoidedCS=0; */ /* Unused!! ToDo: Remake libraries and nuke this var */
+
+/* For internal use (event statistics) only */
+char *event_names[] =
+    { "STARTTHREAD", "CONTINUETHREAD", "RESUMETHREAD", 
+      "MOVESPARK", "MOVETHREAD", "FINDWORK",
+      "FETCHNODE", "FETCHREPLY",
+      "GLOBALBLOCK", "UNBLOCKTHREAD"
+    };
+
+# if defined(GRAN_CHECK) && defined(GRAN) /* Just for testing */
+I_ noOfEvents = 0;
+I_ event_counts[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+I_ fetch_misses = 0;
+I_ tot_low_pri_sparks = 0;
+
+I_ rs_sp_count=0, rs_t_count=0, ntimes_total=0, fl_total=0, no_of_steals=0;
+
+/* Variables for gathering packet and queue statistics */
+I_ tot_packets = 0, tot_packet_size = 0, tot_cuts = 0, tot_thunks = 0;
+I_ tot_sq_len = 0, tot_sq_probes = 0,  tot_sparks = 0, withered_sparks = 0;
+I_ tot_add_threads = 0, tot_tq_len = 0, non_end_add_threads = 0;
+#  endif 
+
+#  if defined(GRAN_COUNT)
+/* Count the number of updates that are done. Mainly for testing, but 
+   could be useful for other purposes, too. */
+I_ nUPDs = 0, nUPDs_old = 0, nUPDs_new = 0, nUPDs_BQ = 0, nPAPs = 0,
+   BQ_lens = 0;
+#  endif
+
+/* Prototypes */
+I_ HandleFetchRequest(P_, PROC, P_);
+/* void HandleFetchRequest(P_, PROC, P_);  changed for GUMMFeching */
+static I_ blockFetch(P_ tso, PROC proc, P_ bh);
+
+#endif  /* GRAN */
+\end{code}
 
+%****************************************************************
+%*                                                              *
+\subsection[global-address-op]{Global Address Operations}
+%*                                                              *
+%****************************************************************
+
+These functions perform operations on the global-address (ga) part
+of a closure. The ga is the only new field (1 word) in a closure introduced
+by GrAnSim. It serves as a bitmask, indicating on which processor 
+the closure is residing. Since threads are described by Thread State
+Object (TSO), which is nothing but another kind of closure, this
+scheme allows gives placement information about threads.
+
+A ga is just a bitmask, so the operations on them are mainly bitmask
+manipulating functions. Note, that there are important macros like PROCS, 
+IS_LOCAL_TO etc. They are defined in @GrAnSim.lh@.
+
+NOTE: In GrAnSim-light we don't maintain placement information. This
+allows to simulate an arbitrary number  of processors. The price we have 
+to be is the lack of costing any communication properly. In short, 
+GrAnSim-light is meant to reveal the maximal parallelism in a program.
+From an implementation point of view the important thing is: 
+{\em GrAnSim-light does not maintain global-addresses}.
+
+\begin{code}
 #if defined(GRAN)
-/* Pointer to the event queue; events are currently malloc'ed */
-static eventq EventHd = NULL;
 
+/* ga_to_proc returns the first processor marked in the bitmask ga.
+   Normally only one bit in ga should be set. But for PLCs all bits
+   are set. That shouldn't hurt since we only need IS_LOCAL_TO for PLCs */
 PROC
 ga_to_proc(W_ ga)
 {
     PROC i;
-
     for (i = 0; i < MAX_PROC && !IS_LOCAL_TO(ga, i); i++);
-
     return (i);
 }
 
@@ -66,61 +261,155 @@ PROC
 where_is(P_ node)
 { return (ga_to_proc(PROCS(node))); }   /* Access the GA field of the node */
 
-#if 0
-PROC
-no_of_copies(W_ ga)    /* DaH lo'lu'Qo'; currently unused */
-{
-    PROC i, n;
-
-    for (i = 0, n = 0; i < MAX_PROC; i++)
-       if (IS_LOCAL_TO(ga, i))
-           n++;;
+rtsBool
+any_idle() {
+ I_ i; 
+ rtsBool any_idle; 
+ for(i=0, any_idle=rtsFalse; 
+     !any_idle && i<RTSflags.GranFlags.proc; 
+     any_idle = any_idle || IS_IDLE(i), i++) 
+ {} ;
+}
 
-    return (n);
+int
+idlers() {
+ I_ i, j; 
+ for(i=0, j=0;
+     i<RTSflags.GranFlags.proc; 
+     j += IS_IDLE(i)?1:0, i++) 
+ {} ;
+ return j;
 }
-#endif
+#endif  /* GRAN */
+\end{code}
+
+%****************************************************************
+%*                                                              *
+\subsection[event-queue]{The Global Event Queue}
+%*                                                              *
+%****************************************************************
+
+The following routines implement an ADT of an event-queue (FIFO). 
+ToDo: Put that in an own file(?)
+
+\begin{code}
+#if defined(GRAN)
+
+/* Pointer to the global event queue; events are currently malloc'ed */
+eventq EventHd = NULL;
 
 eventq 
-getnextevent()
+get_next_event()
 {
   static eventq entry = NULL;
 
   if(EventHd == NULL)
     {
-      fprintf(stderr,"No next event\n");
-      exit(EXIT_FAILURE); /* why not EXIT??? WDP 95/07 */
+      fprintf(stderr,"No next event. This may be caused by a circular data dependency in the program.\n");
+      EXIT(EXIT_FAILURE);
     }
 
   if(entry != NULL)
     free((char *)entry);
 
-#if defined(GRAN_CHECK) && defined(GRAN)
-  if (debug & 0x20) {     /* count events */
+#  if defined(GRAN_CHECK) && defined(GRAN)
+  if (RTSflags.GranFlags.debug & 0x20) {     /* count events */
     noOfEvents++;
-    event_counts[(EVENT_TYPE(EventHd)>=CONTINUETHREAD1) ? 
-                  CONTINUETHREAD :
-                  EVENT_TYPE(EventHd)]++;
+    event_counts[EVENT_TYPE(EventHd)]++;
   }
-#endif       
+#  endif       
 
   entry = EventHd;
   EventHd = EVENT_NEXT(EventHd);
   return(entry);
 }
 
-/* ToDo: replace malloc/free with a free list */
+/* When getting the time of the next event we ignore CONTINUETHREAD events:
+   we don't want to be interrupted before the end of the current time slice
+   unless there is something important to handle. 
+*/
+TIME
+get_time_of_next_event()
+{ 
+  eventq event = EventHd;
+
+  while (event != NULL && EVENT_TYPE(event)==CONTINUETHREAD) {
+    event = EVENT_NEXT(event);
+  }
+  if(event == NULL)
+      return ((TIME) 0);
+  else
+      return (EVENT_TIME(event));
+}
 
-/* NB: newevent unused (WDP 95/07) */
+/* ToDo: replace malloc/free with a free list */
 
 static 
-newevent(proc,creator,time,evttype,tso,node,spark)
-  PROC proc, creator;
-  TIME time;
-  EVTTYPE evttype;
-  P_ tso, node;
-  sparkq spark;
+insert_event(newentry)
+eventq newentry;
+{
+  EVTTYPE evttype = EVENT_TYPE(newentry);
+  eventq event, *prev;
+
+  /* if(evttype >= CONTINUETHREAD1) evttype = CONTINUETHREAD; */
+
+  /* Search the queue and insert at the right point:
+     FINDWORK before everything, CONTINUETHREAD after everything.
+
+     This ensures that we find any available work after all threads have
+     executed the current cycle.  This level of detail would normally be
+     irrelevant, but matters for ridiculously low latencies...
+  */
+
+  /* Changed the ordering: Now FINDWORK comes after everything but 
+     CONTINUETHREAD. This makes sure that a MOVESPARK comes before a 
+     FINDWORK. This is important when a GranSimSparkAt happens and
+     DoAlwaysCreateThreads is turned on. Also important if a GC occurs
+     when trying to build a new thread (see much_spark)  -- HWL 02/96  */
+
+  if(EventHd == NULL)
+    EventHd = newentry;
+  else {
+    for (event = EventHd, prev=&EventHd; 
+        event != NULL; 
+         prev = &(EVENT_NEXT(event)), event = EVENT_NEXT(event)) {
+      switch (evttype) {
+        case FINDWORK: if ( EVENT_TIME(event) < EVENT_TIME(newentry) ||
+                            ( (EVENT_TIME(event) ==  EVENT_TIME(newentry)) &&
+                             (EVENT_TYPE(event) != CONTINUETHREAD) ) )
+                         continue;
+                       else
+                         break;
+        case CONTINUETHREAD: if ( EVENT_TIME(event) <= EVENT_TIME(newentry) )
+                              continue;
+                            else
+                               break;
+        default: if ( EVENT_TIME(event) < EVENT_TIME(newentry) || 
+                     ((EVENT_TIME(event) == EVENT_TIME(newentry)) &&
+                      (EVENT_TYPE(event) == EVENT_TYPE(newentry))) )
+                  continue;
+                else
+                   break;
+       }
+       /* Insert newentry here (i.e. before event) */
+       *prev = newentry;
+       EVENT_NEXT(newentry) = event;
+       break;
+    }
+    if (event == NULL)
+      *prev = newentry;
+  }
+}
+
+void
+new_event(proc,creator,time,evttype,tso,node,spark)
+PROC proc, creator;
+TIME time;
+EVTTYPE evttype;
+P_ tso, node;
+sparkq spark;
 {
-  eventq newentry = (eventq) xmalloc(sizeof(struct event));
+  eventq newentry = (eventq) stgMallocBytes(sizeof(struct event), "new_event");
 
   EVENT_PROC(newentry) = proc;
   EVENT_CREATOR(newentry) = creator;
@@ -129,36 +418,501 @@ newevent(proc,creator,time,evttype,tso,node,spark)
   EVENT_TSO(newentry) =  tso;
   EVENT_NODE(newentry) =  node;
   EVENT_SPARK(newentry) =  spark;
+  EVENT_GC_INFO(newentry) =  0;
   EVENT_NEXT(newentry) = NULL;
 
   insert_event(newentry);
 }
 
-#endif /* GRAN ; HWL */ 
+void
+prepend_event(eventq event)       /* put event at beginning of EventQueue */
+{                                /* only used for GC! */
+ EVENT_NEXT(event) = EventHd;
+ EventHd = event;
+}
+
+eventq
+grab_event()             /* undo prepend_event i.e. get the event */
+{                       /* at the head of EventQ but don't free anything */
+ eventq event = EventHd;
+
+ if(EventHd == NULL) {
+   fprintf(stderr,"No next event (in grab_event). This may be caused by a circular data dependency in the program.\n");
+   EXIT(EXIT_FAILURE);
+ }
+
+ EventHd = EVENT_NEXT(EventHd);
+ return (event);
+}
+
+void 
+traverse_eventq_for_gc()
+{
+ eventq event = EventHd;
+ W_ bufsize;
+ P_ closure, tso, buffer, bufptr;
+ PROC proc, creator;
+
+ /* Traverse eventq and replace every FETCHREPLY by a FETCHNODE for the
+    orig closure (root of packed graph). This means that a graph, which is
+    between processors at the time of GC is fetched again at the time when
+    it would have arrived, had there been no GC. Slightly inaccurate but
+    safe for GC.
+    This is only needed for GUM style fetchng. */
+ if (!RTSflags.GranFlags.DoGUMMFetching)
+   return;
+
+ for(event = EventHd; event!=NULL; event=EVENT_NEXT(event)) {
+   if (EVENT_TYPE(event)==FETCHREPLY) {
+     buffer = EVENT_NODE(event);
+     ASSERT(buffer[PACK_FLAG_LOCN]==MAGIC_PACK_FLAG);  /* It's a pack buffer */
+     bufsize = buffer[PACK_SIZE_LOCN];
+     closure= (P_)buffer[PACK_HDR_SIZE];
+     tso = (P_)buffer[PACK_TSO_LOCN];
+     proc = EVENT_PROC(event);
+     creator = EVENT_CREATOR(event);                 /* similar to unpacking */
+     for (bufptr=buffer+PACK_HDR_SIZE; bufptr<(buffer+bufsize);
+         bufptr++) {
+        if ( (INFO_TYPE(INFO_PTR(*bufptr)) == INFO_SPEC_RBH_TYPE) ||
+             (INFO_TYPE(INFO_PTR(*bufptr)) == INFO_GEN_RBH_TYPE) ) {
+            convertFromRBH((P_)*bufptr);
+        }
+     }
+     free(buffer);
+     EVENT_TYPE(event) = FETCHNODE;
+     EVENT_PROC(event) = creator;
+     EVENT_CREATOR(event) = proc;
+     EVENT_NODE(event) = closure;
+     EVENT_TSO(event) = tso;
+     EVENT_GC_INFO(event) =  0;
+   }
+ }
+}
+
+void
+print_event(event)
+eventq event;
+{
+
+  char str_tso[16], str_node[16];
+
+  sprintf(str_tso,((EVENT_TSO(event)==PrelBase_Z91Z93_closure) ? "______" : "%#6lx"), 
+                  EVENT_TSO(event));
+  sprintf(str_node,((EVENT_NODE(event)==PrelBase_Z91Z93_closure) ? "______" : "%#6lx"), 
+                    EVENT_NODE(event));
+
+  if (event==NULL)
+    fprintf(stderr,"Evt: NIL\n");
+  else
+    fprintf(stderr,"Evt: %s (%u), PE %u [%u], Time %lu, TSO %s (%x), node %s\n",
+              event_names[EVENT_TYPE(event)],EVENT_TYPE(event),
+              EVENT_PROC(event), EVENT_CREATOR(event), EVENT_TIME(event), 
+             str_tso, TSO_ID(EVENT_TSO(event)), str_node
+             /*, EVENT_SPARK(event), EVENT_NEXT(event)*/ );
+
+}
+
+void
+print_eventq(hd)
+eventq hd;
+{
+  eventq x;
+
+  fprintf(stderr,"Event Queue with root at %x:\n",hd);
+  for (x=hd; x!=NULL; x=EVENT_NEXT(x)) {
+    print_event(x);
+  }
+}
+
+void
+print_spark(spark)
+  sparkq spark;
+{ 
+  char str[16];
+
+  sprintf(str,((SPARK_NODE(spark)==PrelBase_Z91Z93_closure) ? "______" : "%#6lx"), 
+              (W_) SPARK_NODE(spark));
+
+  if (spark==NULL)
+    fprintf(stderr,"Spark: NIL\n");
+  else
+    fprintf(stderr,"Spark: Node %8s, Name %#6lx, Exported %5s, Prev %#6x, Next %#6x\n",
+           str, SPARK_NAME(spark), 
+            ((SPARK_EXPORTED(spark))?"True":"False"), 
+            SPARK_PREV(spark), SPARK_NEXT(spark) );
+}
+
+void
+print_sparkq(hd)
+sparkq hd;
+{
+  sparkq x;
+
+  fprintf(stderr,"Spark Queue with root at %x:\n",hd);
+  for (x=hd; x!=NULL; x=SPARK_NEXT(x)) {
+    print_spark(x);
+  }
+}
+
+
+#endif  /* GRAN */ 
 \end{code}
 
 %****************************************************************************
 %
-\subsection[GrAnSim-profile]{Writing profiling info for GrAnSim}
+\subsection[entry-points]{Routines directly called from Haskell world}
 %
 %****************************************************************************
 
-Event dumping routines.
+The @GranSim...@ routines in here are directly called via macros from the
+threaded world. 
+
+First some auxiliary routines.
 
 \begin{code}
+#if defined(GRAN)
+/* Take the current thread off the thread queue and thereby activate the */
+/* next thread. It's assumed that the next ReSchedule after this uses */
+/* NEW_THREAD as param. */
+/* This fct is called from GranSimBlock and GranSimFetch */
 
-FILE *gr_file = NULL;
+void 
+ActivateNextThread (PROC proc)
+{
+  ASSERT(RunnableThreadsHd[proc]!=PrelBase_Z91Z93_closure);
+
+  RunnableThreadsHd[proc] = TSO_LINK(RunnableThreadsHd[proc]);
+  if(RunnableThreadsHd[proc]==PrelBase_Z91Z93_closure) {
+    MAKE_IDLE(proc);
+    RunnableThreadsTl[proc] = PrelBase_Z91Z93_closure;
+  } else {
+    CurrentTime[proc] += RTSflags.GranFlags.gran_threadcontextswitchtime;
+    if (RTSflags.GranFlags.granSimStats && 
+       (!RTSflags.GranFlags.Light || (RTSflags.GranFlags.debug & 0x20000))) 
+      DumpRawGranEvent(proc,0,GR_SCHEDULE,RunnableThreadsHd[proc],
+                       PrelBase_Z91Z93_closure,0);
+  }
+}
+\end{code}
 
-char *gran_event_names[] = {
-    "START", "START(Q)",
-    "STEALING", "STOLEN", "STOLEN(Q)",
-    "FETCH", "REPLY", "BLOCK", "RESUME", "RESUME(Q)",
-    "SCHEDULE", "DESCHEDULE",
-    "END",
-    "SPARK", "SPARKAT", "USED", "PRUNED", "EXPORTED", "ACQUIRED",
-    "TERMINATE",
-    "??"
-};
+Now the main stg-called routines:
+
+\begin{code}
+/* ------------------------------------------------------------------------ */
+/* The following GranSim... fcts are stg-called from the threaded world.    */
+/* ------------------------------------------------------------------------ */
+
+/* Called from HEAP_CHK  -- NB: node and liveness are junk here now. 
+   They are left temporarily to avoid complete recompilation.
+   KH 
+*/
+void 
+GranSimAllocate(n,node,liveness)
+I_ n;
+P_ node;
+W_ liveness;
+{
+  TSO_ALLOCS(CurrentTSO) += n;
+  ++TSO_BASICBLOCKS(CurrentTSO);
+
+  if (RTSflags.GranFlags.granSimStats_Heap) {
+      DumpRawGranEvent(CurrentProc,0,GR_ALLOC,CurrentTSO,
+                       PrelBase_Z91Z93_closure,n);
+  }
+  
+  TSO_EXECTIME(CurrentTSO) += RTSflags.GranFlags.gran_heapalloc_cost;
+  CurrentTime[CurrentProc] += RTSflags.GranFlags.gran_heapalloc_cost;
+}
+
+/*
+  Subtract the values added above, if a heap check fails and
+  so has to be redone.
+*/
+void 
+GranSimUnallocate(n,node,liveness)
+W_ n;
+P_ node;
+W_ liveness;
+{
+  TSO_ALLOCS(CurrentTSO) -= n;
+  --TSO_BASICBLOCKS(CurrentTSO);
+  
+  TSO_EXECTIME(CurrentTSO) -= RTSflags.GranFlags.gran_heapalloc_cost;
+  CurrentTime[CurrentProc] -= RTSflags.GranFlags.gran_heapalloc_cost;
+}
+
+/* NB: We now inline this code via GRAN_EXEC rather than calling this fct */
+void 
+GranSimExec(ariths,branches,loads,stores,floats)
+W_ ariths,branches,loads,stores,floats;
+{
+  W_ cost = RTSflags.GranFlags.gran_arith_cost*ariths + 
+            RTSflags.GranFlags.gran_branch_cost*branches + 
+            RTSflags.GranFlags.gran_load_cost * loads +
+            RTSflags.GranFlags.gran_store_cost*stores + 
+            RTSflags.GranFlags.gran_float_cost*floats;
+
+  TSO_EXECTIME(CurrentTSO) += cost;
+  CurrentTime[CurrentProc] += cost;
+}
+
+
+/* 
+   Fetch the node if it isn't local
+   -- result indicates whether fetch has been done.
+
+   This is GRIP-style single item fetching.
+*/
+
+/* This function in Threads.lc is only needed for SimplifiedFetch */
+FetchNode PROTO((P_ node,PROC CurrentProc));
+
+I_ 
+GranSimFetch(node /* , liveness_mask */ )
+P_ node;
+/* I_ liveness_mask; */
+{
+  if (RTSflags.GranFlags.Light) {
+     /* Always reschedule in GrAnSim-Light to prevent one TSO from
+        running off too far 
+     new_event(CurrentProc,CurrentProc,CurrentTime[CurrentProc],
+             CONTINUETHREAD,CurrentTSO,node,NULL);
+     */
+     NeedToReSchedule = rtsFalse;   
+     return(0); 
+  }
+
+  /* Note: once a node has been fetched, this test will be passed */
+  if(!IS_LOCAL_TO(PROCS(node),CurrentProc))
+    {
+      /* Add mpacktime to the remote PE for the reply */
+        {
+          PROC p = where_is(node);
+          TIME fetchtime;
+
+#  ifdef GRAN_CHECK
+         if ( ( RTSflags.GranFlags.debug & 0x40 ) &&
+              p == CurrentProc )
+           fprintf(stderr,"GranSimFetch: Trying to fetch from own processor%u\n", p);
+#  endif  /* GRAN_CHECK */
+
+          CurrentTime[CurrentProc] += RTSflags.GranFlags.gran_mpacktime;
+         /* NB: Fetch is counted on arrival (FETCHREPLY) */
+              
+          if (RTSflags.GranFlags.SimplifiedFetch)
+            {
+              FetchNode(node,CurrentProc);
+              CurrentTime[CurrentProc] += RTSflags.GranFlags.gran_mtidytime+
+                                         RTSflags.GranFlags.gran_fetchtime+
+                                          RTSflags.GranFlags.gran_munpacktime;
+              return(1);
+            }
+
+          fetchtime = STG_MAX(CurrentTime[CurrentProc],CurrentTime[p]) +
+                      RTSflags.GranFlags.gran_latency;
+
+          new_event(p,CurrentProc,fetchtime,FETCHNODE,CurrentTSO,node,NULL);
+          if (!RTSflags.GranFlags.DoReScheduleOnFetch)
+           MAKE_FETCHING(CurrentProc);
+          ++OutstandingFetches[CurrentProc];
+
+         if (fetchtime<TimeOfNextEvent)
+           TimeOfNextEvent = fetchtime;
+
+          /* About to block */
+          TSO_BLOCKEDAT(CurrentTSO) = CurrentTime[CurrentProc];
+
+          if (RTSflags.GranFlags.DoReScheduleOnFetch) 
+            {
+              /* Remove CurrentTSO from the queue 
+                 -- assumes head of queue == CurrentTSO */
+              if(!RTSflags.GranFlags.DoFairSchedule)
+                {
+                  if(RTSflags.GranFlags.granSimStats)
+                    DumpRawGranEvent(CurrentProc,p,GR_FETCH,CurrentTSO,
+                                    node,0);
+
+                  ActivateNextThread(CurrentProc);
+              
+#  if defined(GRAN_CHECK)
+                  if (RTSflags.GranFlags.debug & 0x10) {
+                    if (TSO_TYPE(CurrentTSO) & FETCH_MASK_TSO) {
+                      fprintf(stderr,"FETCHNODE: TSO 0x%x has fetch-mask set @ %d\n",
+                              CurrentTSO,CurrentTime[CurrentProc]);
+                      EXIT(EXIT_FAILURE);
+                    } else {
+                      TSO_TYPE(CurrentTSO) |= FETCH_MASK_TSO;
+                    }
+                  }
+#  endif
+                  TSO_LINK(CurrentTSO) = PrelBase_Z91Z93_closure;
+                  /* CurrentTSO = PrelBase_Z91Z93_closure; */
+
+                  /* ThreadQueueHd is now the next TSO to schedule or NULL */
+                  /* CurrentTSO is pointed to by the FETCHNODE event */
+                }
+              else  /* fair scheduling currently not supported -- HWL */
+                {
+                  fprintf(stderr,"Reschedule-on-fetch is not yet compatible with fair scheduling\n");
+                  EXIT(EXIT_FAILURE);
+                }
+            }
+          else                 /* !RTSflags.GranFlags.DoReScheduleOnFetch */
+            {
+              /* Note: CurrentProc is still busy as it's blocked on fetch */
+              if(RTSflags.GranFlags.granSimStats)
+                DumpRawGranEvent(CurrentProc,p,GR_FETCH,CurrentTSO,node,0);
+
+#  if defined(GRAN_CHECK)
+              if (RTSflags.GranFlags.debug & 0x04) 
+                BlockedOnFetch[CurrentProc] = CurrentTSO; /*- rtsTrue; -*/
+              if (RTSflags.GranFlags.debug & 0x10) {
+                if (TSO_TYPE(CurrentTSO) & FETCH_MASK_TSO) {
+                  fprintf(stderr,"FETCHNODE: TSO 0x%x has fetch-mask set @ %d\n",
+                          CurrentTSO,CurrentTime[CurrentProc]);
+                  EXIT(EXIT_FAILURE);
+                } else {
+                  TSO_TYPE(CurrentTSO) |= FETCH_MASK_TSO;
+                }
+                CurrentTSO = PrelBase_Z91Z93_closure;
+              }
+#  endif
+            }
+          CurrentTime[CurrentProc] += RTSflags.GranFlags.gran_mtidytime;
+
+          /* Rescheduling is necessary */
+          NeedToReSchedule = rtsTrue;
+
+          return(1); 
+        }
+    }
+  return(0);
+}
+
+void 
+GranSimSpark(local,node)
+W_ local;
+P_ node;
+{
+  /* ++SparksAvail;  Nope; do that in add_to_spark_queue */
+  if(RTSflags.GranFlags.granSimStats_Sparks)
+    DumpRawGranEvent(CurrentProc,(PROC)0,SP_SPARK,PrelBase_Z91Z93_closure,node,
+                      spark_queue_len(CurrentProc,ADVISORY_POOL)-1);
+
+  /* Force the PE to take notice of the spark */
+  if(RTSflags.GranFlags.DoAlwaysCreateThreads) {
+    new_event(CurrentProc,CurrentProc,CurrentTime[CurrentProc],
+             FINDWORK,PrelBase_Z91Z93_closure,PrelBase_Z91Z93_closure,NULL);
+    if (CurrentTime[CurrentProc]<TimeOfNextEvent)
+      TimeOfNextEvent = CurrentTime[CurrentProc];
+  }
+
+  if(local)
+    ++TSO_LOCALSPARKS(CurrentTSO);
+  else
+    ++TSO_GLOBALSPARKS(CurrentTSO);
+}
+
+void 
+GranSimSparkAt(spark,where,identifier)
+sparkq spark;
+P_  where;        /* This should be a node; alternatively could be a GA */
+I_ identifier;
+{
+  PROC p = where_is(where);
+  GranSimSparkAtAbs(spark,p,identifier);
+}
+
+void 
+GranSimSparkAtAbs(spark,proc,identifier)
+sparkq spark;
+PROC proc;        
+I_ identifier;
+{
+  TIME exporttime;
+
+  if ( spark == (sparkq)NULL)    /* Note: Granularity control might have */
+    return;                      /* turned a spark into a NULL. */
+
+  /* ++SparksAvail; Nope; do that in add_to_spark_queue */
+  if(RTSflags.GranFlags.granSimStats_Sparks)
+    DumpRawGranEvent(proc,0,SP_SPARKAT,PrelBase_Z91Z93_closure,SPARK_NODE(spark),
+                    spark_queue_len(proc,ADVISORY_POOL));
+
+  if (proc!=CurrentProc) {
+    CurrentTime[CurrentProc] += RTSflags.GranFlags.gran_mpacktime;
+    exporttime = (CurrentTime[proc] > CurrentTime[CurrentProc]? 
+                  CurrentTime[proc]: CurrentTime[CurrentProc])
+                 + RTSflags.GranFlags.gran_latency;
+  } else {
+    exporttime = CurrentTime[CurrentProc];
+  }
+
+  if ( RTSflags.GranFlags.Light )
+    /* Need CurrentTSO in event field to associate costs with creating
+       spark even in a GrAnSim Light setup */
+    new_event(proc,CurrentProc,exporttime,
+            MOVESPARK,CurrentTSO,PrelBase_Z91Z93_closure,spark);
+  else
+    new_event(proc,CurrentProc,exporttime,
+            MOVESPARK,PrelBase_Z91Z93_closure,PrelBase_Z91Z93_closure,spark);
+  /* Bit of a hack to treat placed sparks the same as stolen sparks */
+  ++OutstandingFishes[proc];
+
+  /* Force the PE to take notice of the spark (FINDWORK is put after a
+     MOVESPARK into the sparkq!) */
+  if(RTSflags.GranFlags.DoAlwaysCreateThreads) {
+    new_event(CurrentProc,CurrentProc,exporttime+1,
+              FINDWORK,PrelBase_Z91Z93_closure,PrelBase_Z91Z93_closure,NULL);
+  }
+
+  if (exporttime<TimeOfNextEvent)
+    TimeOfNextEvent = exporttime;
+
+  if (proc!=CurrentProc) {
+    CurrentTime[CurrentProc] += RTSflags.GranFlags.gran_mtidytime;
+    ++TSO_GLOBALSPARKS(CurrentTSO);
+  } else { 
+    ++TSO_LOCALSPARKS(CurrentTSO);
+  }
+}
+
+/* This function handles local and global blocking */
+/* It's called either from threaded code (RBH_entry, BH_entry etc) or */
+/* from blockFetch when trying to fetch an BH or RBH */
+
+void 
+GranSimBlock(P_ tso, PROC proc, P_ node)
+{
+  PROC node_proc = where_is(node);
+
+  ASSERT(tso==RunnableThreadsHd[proc]);
+
+  if(RTSflags.GranFlags.granSimStats)
+    DumpRawGranEvent(proc,node_proc,GR_BLOCK,tso,node,0);
+
+  ++TSO_BLOCKCOUNT(tso);
+  /* Distinction  between local and global block is made in blockFetch */
+  TSO_BLOCKEDAT(tso) = CurrentTime[proc];
+
+  CurrentTime[proc] += RTSflags.GranFlags.gran_threadqueuetime;
+  ActivateNextThread(proc);
+  TSO_LINK(tso) = PrelBase_Z91Z93_closure;  /* not really necessary; only for testing */
+}
+
+#endif  /* GRAN */
+
+\end{code}
+
+%****************************************************************************
+%
+\subsection[GrAnSim-profile]{Writing profiling info for GrAnSim}
+%
+%****************************************************************************
+
+Event dumping routines.
+
+\begin{code}
 
 /* 
  * If you're not using GNUC and you're on a 32-bit machine, you're 
@@ -206,90 +960,189 @@ msTime(STG_NO_ARGS)
 
 #endif /* !GRAN */
 
+#if defined(GRAN) || defined(PAR)
 
 void
 DumpGranEvent(name, tso)
 enum gran_event_types name;
 P_ tso;
 {
-    DumpRawGranEvent(CURRENT_PROC, name, TSO_ID(tso));
-}
-
-void
-DumpSparkGranEvent(name, id)
-enum gran_event_types name;
-W_ id;
-{
-    DumpRawGranEvent(CURRENT_PROC, name, id);
+    DumpRawGranEvent(CURRENT_PROC, (PROC)0, name, tso, PrelBase_Z91Z93_closure, 0);
 }
 
 void
-DumpGranEventAndNode(name, tso, node, proc)
+DumpRawGranEvent(proc, p, name, tso, node, len)
+PROC proc, p;         /* proc ... where it happens; p ... where node lives */
 enum gran_event_types name;
 P_ tso, node;
-PROC proc;
+I_ len;
 {
-    PROC pe = CURRENT_PROC;
-    W_ id;
+  W_ id;
+  char time_string[500], node_str[16]; /*ToDo: kill magic constants */
+  ullong_format_string(TIME_ON_PROC(proc), time_string, rtsFalse/*no commas!*/);
+#if defined(GRAN)
+  if (RTSflags.GranFlags.granSimStats_suppressed)
+    return;
+#endif
 
-    char time_string[500]; /*ToDo: kill magic constant */
-    ullong_format_string(CURRENT_TIME, time_string, rtsFalse/*no commas!*/);
+  id = tso == NULL ? -1 : TSO_ID(tso);
+  if (node==PrelBase_Z91Z93_closure)
+      strcpy(node_str,"________");  /* "PrelBase_Z91Z93_closure"); */
+  else
+      sprintf(node_str,"0x%-6lx",node);
 
-#ifdef PAR
-    id = tso == NULL ? -1 : TSO_ID(tso);
-#else
-    id = TSO_ID(tso);
-#endif
-    if (name > GR_EVENT_MAX)
+  if (name > GR_EVENT_MAX)
        name = GR_EVENT_MAX;
 
-    if (do_gr_binary) {
-       grputw(name);
-       grputw(pe);
-       abort(); /* die please: a single word doesn't represent long long times */
-       grputw(CURRENT_TIME); /* this line is bound to do the wrong thing */
-       grputw(id);
-    } else
-       fprintf(gr_file, "PE %2u [%s]: %s %lx \t0x%lx\t(from %2u)\n",
-         pe, time_string, gran_event_names[name], id, (W_) node, proc);
+  if(GRANSIMSTATS_BINARY)
+    /* ToDo: fix code for writing binary GrAnSim statistics */
+    switch (name) { 
+      case GR_START:
+      case GR_STARTQ:
+                      grputw(name);
+                     grputw(proc);
+                     abort();        /* die please: a single word */
+                                     /* doesn't represent long long times */
+                     grputw(TIME_ON_PROC(proc));
+                     grputw((W_)node);
+                     break;
+      case GR_FETCH:
+      case GR_REPLY:
+      case GR_BLOCK:
+                     grputw(name);
+                     grputw(proc);
+                     abort();        /* die please: a single word */
+                                     /* doesn't represent long long times */
+                     grputw(TIME_ON_PROC(proc));  /* this line is bound to */
+                     grputw(id);                  /*   do the wrong thing */
+                     break;
+      default: 
+                      grputw(name);
+                     grputw(proc);
+                     abort();        /* die please: a single word */
+                                     /* doesn't represent long long times */
+                     grputw(TIME_ON_PROC(proc));
+                     grputw((W_)node);
+    }
+  else
+    switch (name) { 
+     case GR_START:
+     case GR_STARTQ:
+        /* fprintf(gr_file,"PE %2u [%s]: %-9s\t%lx\t%s\t[sparks %u]\n", */
+       /* using spark queue length as optional argument ^^^^^^^^^ */
+        fprintf(gr_file,"PE %2u [%s]: %-9s\t%lx\t%s\t[SN %u]\n", 
+       /* using spark name as optional argument     ^^^^^^ */
+               proc,time_string,gran_event_names[name],
+               id,node_str,len);
+        break;
+     case GR_FETCH:
+     case GR_REPLY:
+     case GR_BLOCK:
+     case GR_STOLEN:
+     case GR_STOLENQ:
+       fprintf(gr_file, "PE %2u [%s]: %-9s\t%lx \t%s\t(from %2u)\n",
+               proc, time_string, gran_event_names[name], 
+               id,node_str,p);
+       break;
+     case GR_RESUME:
+     case GR_RESUMEQ:
+     case GR_SCHEDULE:
+     case GR_DESCHEDULE:
+        fprintf(gr_file,"PE %2u [%s]: %-9s\t%lx \n",
+               proc,time_string,gran_event_names[name],id);
+        break;
+     case GR_STEALING:
+        fprintf(gr_file,"PE %2u [%s]: %-9s\t%lx\t        \t(by %2u)\n",
+               proc,time_string,gran_event_names[name],id,p);
+        break;
+     case GR_ALLOC:
+        fprintf(gr_file,"PE %2u [%s]: %-9s\t%lx\t        \tallocating %u words\n",
+               proc,time_string,gran_event_names[name],id,len);
+        break;
+     default:
+        fprintf(gr_file,"PE %2u [%s]: %-9s\t%lx\t%s\t[sparks %u]\n",
+               proc,time_string,gran_event_names[name],id,node_str,len);
+    }
 }
 
+
+#if defined(GRAN)
+/* Only needed for special dynamic spark labelling support */
 void
-DumpRawGranEvent(pe, name, id)
-PROC pe;
+DumpStartEventAt(time, proc, p, name, tso, node, len)
+TIME time;
+PROC proc, p;         /* proc ... where it happens; p ... where node lives */
 enum gran_event_types name;
-W_ id;
+P_ tso, node;
+I_ len;
 {
-    char time_string[500]; /* ToDo: kill magic constant */
-
-    if (name > GR_EVENT_MAX)
+  W_ id;
+  char time_string[500], node_str[16]; /*ToDo: kill magic constants */
+  ullong_format_string(time, time_string, rtsFalse/*no commas!*/);
+                    /* ^^^^ only important change to DumpRawGranEvent */
+  if (RTSflags.GranFlags.granSimStats_suppressed)
+    return;
+
+  id = tso == NULL ? -1 : TSO_ID(tso);
+  if (node==PrelBase_Z91Z93_closure)
+      strcpy(node_str,"________");  /* "Z91Z93_closure"); */
+  else
+      sprintf(node_str,"0x%-6lx",node);
+
+  if (name > GR_EVENT_MAX)
        name = GR_EVENT_MAX;
 
-    ullong_format_string(CURRENT_TIME, time_string, rtsFalse/*no commas!*/);
-
-    if (do_gr_binary) {
-       grputw(name);
-       grputw(pe);
-       abort(); /* die please: a single word doesn't represent long long times */
-       grputw(CURRENT_TIME); /* this line is bound to fail */
-       grputw(id);
-    } else
-       fprintf(gr_file, "PE %2u [%s]: %s %lx\n",
-         pe, time_string, gran_event_names[name], id);
+  if(GRANSIMSTATS_BINARY)
+    /* ToDo: fix code for writing binary GrAnSim statistics */
+    switch (name) { 
+      case GR_START:
+      case GR_STARTQ:
+                      grputw(name);
+                     grputw(proc);
+                     abort();        /* die please: a single word */
+                                     /* doesn't represent long long times */
+                     grputw(TIME_ON_PROC(proc));
+                     grputw((W_)node);
+                     break;
+     default:
+        fprintf(stderr,"Error in DumpStartEventAt: event %s is not a START event\n",
+               gran_event_names[name]);
+    }
+  else
+    switch (name) { 
+     case GR_START:
+     case GR_STARTQ:
+        /* fprintf(gr_file,"PE %2u [%s]: %-9s\t%lx\t%s\t[sparks %u]\n", */
+       /* using spark queue length as optional argument ^^^^^^^^^ */
+        fprintf(gr_file,"PE %2u [%s]: %-9s\t%lx\t%s\t[SN %u]\n", 
+       /* using spark name as optional argument     ^^^^^^ */
+               proc,time_string,gran_event_names[name],
+               id,node_str,len);
+        break;
+     default:
+        fprintf(stderr,"Error in DumpStartEventAt: event %s is not a START event\n",
+               gran_event_names[name]);
+    }
 }
+#endif  /* GRAN  */
 
 void
-DumpGranInfo(pe, tso, mandatory_thread)
-PROC pe;
+DumpGranInfo(proc, tso, mandatory_thread)
+PROC proc;
 P_ tso;
 rtsBool mandatory_thread;
 {
     char time_string[500]; /* ToDo: kill magic constant */
     ullong_format_string(CURRENT_TIME, time_string, rtsFalse/*no commas!*/);
 
-    if (do_gr_binary) {
+#if defined(GRAN)
+    if (RTSflags.GranFlags.granSimStats_suppressed)
+      return;
+#endif
+
+    if (GRANSIMSTATS_BINARY) {
        grputw(GR_END);
-       grputw(pe);
+       grputw(proc);
        abort(); /* die please: a single word doesn't represent long long times */
        grputw(CURRENT_TIME); /* this line is bound to fail */
        grputw(TSO_ID(tso));
@@ -324,11 +1177,11 @@ rtsBool mandatory_thread;
     } else {
 
        /*
-        * NB: DumpGranEvent cannot be used because PE may be wrong (as well as the
-        * extra info)
+        * NB: DumpGranEvent cannot be used because PE may be wrong 
+        * (as well as the extra info)
         */
        fprintf(gr_file, "PE %2u [%s]: END %lx, SN %lu, ST %lu, EXP %c, BB %lu, HA %lu, RT %lu, BT %lu (%lu), FT %lu (%lu), LS %lu, GS %lu, MY %c\n"
-         ,pe
+         ,proc
          ,time_string
          ,TSO_ID(tso)
          ,TSO_SPARKNAME(tso)
@@ -348,6 +1201,47 @@ rtsBool mandatory_thread;
     }
 }
 
+void
+DumpTSO(tso)
+P_ tso;
+{
+  fprintf(stderr,"TSO 0x%lx, NAME 0x%lx, ID %lu, LINK 0x%lx, TYPE %s\n"
+          ,tso
+          ,TSO_NAME(tso)
+          ,TSO_ID(tso)
+          ,TSO_LINK(tso)
+          ,TSO_TYPE(tso)==T_MAIN?"MAIN":
+           TSO_TYPE(tso)==T_FAIL?"FAIL":
+           TSO_TYPE(tso)==T_REQUIRED?"REQUIRED":
+           TSO_TYPE(tso)==T_ADVISORY?"ADVISORY":
+           "???"
+          );
+          
+  fprintf(stderr,"PC (0x%lx,0x%lx), ARG (0x%lx), SWITCH %lx0x\n"
+          ,TSO_PC1(tso)
+          ,TSO_PC2(tso)
+          ,TSO_ARG1(tso)
+          /* ,TSO_ARG2(tso) */
+          ,TSO_SWITCH(tso)
+          );
+
+  fprintf(gr_file,"TSO %lx: SN %lu, ST %lu, GBL %c, BB %lu, HA %lu, RT %lu, BT %lu (%lu), FT %lu (%lu) LS %lu, GS %lu\n"
+         ,TSO_ID(tso)
+          ,TSO_SPARKNAME(tso)
+          ,TSO_STARTEDAT(tso)
+          ,TSO_EXPORTED(tso)?'T':'F'
+          ,TSO_BASICBLOCKS(tso)
+          ,TSO_ALLOCS(tso)
+          ,TSO_EXECTIME(tso)
+          ,TSO_BLOCKTIME(tso)
+          ,TSO_BLOCKCOUNT(tso)
+          ,TSO_FETCHTIME(tso)
+          ,TSO_FETCHCOUNT(tso)
+          ,TSO_LOCALSPARKS(tso)
+          ,TSO_GLOBALSPARKS(tso)
+          );
+}
+
 /*
    Output a terminate event and an 8-byte time.
 */
@@ -356,7 +1250,12 @@ void
 grterminate(v)
 TIME v;
 {
-    DumpGranEvent(GR_TERMINATE, 0);
+#if defined(GRAN)
+    if (RTSflags.GranFlags.granSimStats_suppressed)
+      return;
+#endif
+
+    DumpGranEvent(GR_TERMINATE, PrelBase_Z91Z93_closure);
 
     if (sizeof(TIME) == 4) {
        putc('\0', gr_file);
@@ -389,12 +1288,17 @@ void
 grputw(v)
 TIME v;
 {
-    if (v <= 0x3fl) {
+#if defined(GRAN)
+    if (RTSflags.GranFlags.granSimStats_suppressed)
+      return;
+#endif
+
+    if (v <= 0x3fl) {                           /* length v = 1 byte */ 
        fputc(v & 0x3f, gr_file);
-    } else if (v <= 0x3fffl) {
+    } else if (v <= 0x3fffl) {                  /* length v = 2 byte */ 
        fputc((v >> 8l) | 0x40l, gr_file);
        fputc(v & 0xffl, gr_file);
-    } else if (v <= 0x3fffffffl) {
+    } else if (v <= 0x3fffffffl) {              /* length v = 4 byte */ 
        fputc((v >> 24l) | 0x80l, gr_file);
        fputc((v >> 16l) & 0xffl, gr_file);
        fputc((v >> 8l) & 0xffl, gr_file);
@@ -423,6 +1327,7 @@ TIME v;
     }
 }
 
+#endif /* GRAN || PAR */
 \end{code}
 
 %****************************************************************************
@@ -431,10 +1336,14 @@ TIME v;
 %
 %****************************************************************************
 
+General routines for GranSim. Mainly, startup and shutdown routines, called
+from @main.lc@.
+
 \begin{code}
-#ifdef GRAN
-char gr_filename[32]; /*ToDo: magic short filename constant????? WDP 95/07 */
-I_ do_gr_sim = 0;
+#if defined(GRAN)
+FILE *gr_file = NULL;
+char gr_filename[STATS_FILENAME_MAXLEN];
+/* I_ do_gr_sim = 0; */ /* In GrAnSim setup always do simulation */
 
 int
 init_gr_simulation(rts_argc, rts_argv, prog_argc, prog_argv)
@@ -443,19 +1352,21 @@ int prog_argc, rts_argc;
 {
     I_ i;
 
-    if (do_gr_sim) {
-       char *extension = do_gr_binary ? "gb" : "gr";
+    char *extension = RTSflags.GranFlags.granSimStats_Binary ? "gb" : "gr";
 
-       sprintf(gr_filename, "%0.28s.%0.2s", prog_argv[0], extension);
+    if (RTSflags.GranFlags.granSimStats_suppressed)
+       return;
 
-       if ((gr_file = fopen(gr_filename, "w")) == NULL) {
-           fprintf(stderr, "Can't open granularity simulation report file %s\n", gr_filename);
-           exit(EXIT_FAILURE); /* why not EXIT??? WDP 95/07 */
-       }
-#if defined(GRAN_CHECK) && defined(GRAN)
-       if (DoReScheduleOnFetch)
+    sprintf(gr_filename, GR_FILENAME_FMT, prog_argv[0], extension);
+
+    if ((gr_file = fopen(gr_filename, "w")) == NULL) {
+       fprintf(stderr, "Can't open granularity simulation report file %s\n", gr_filename);
+       EXIT(EXIT_FAILURE); 
+    }
+#  if 0  /* that's obsolete now, I think -- HWL */
+       if (RTSflags.GranFlags.DoReScheduleOnFetch)
            setbuf(gr_file, NULL);
-#endif
+#  endif
 
        fputs("Granularity Simulation for ", gr_file);
        for (i = 0; i < prog_argc; ++i) {
@@ -471,87 +1382,195 @@ int prog_argc, rts_argc;
                fputc(' ', gr_file);
            }
        }
+
+       fputs("\nStart time: ", gr_file);
+       fputs(time_str(), gr_file); /* defined in main.lc */
+       fputc('\n', gr_file);
+    
        fputs("\n\n--------------------\n\n", gr_file);
 
        fputs("General Parameters:\n\n", gr_file);
 
-       fprintf(gr_file, "PEs %u, %s Scheduler, %sMigrate Threads%s ????? %s\n",
-         max_proc, DoFairSchedule ? "Fair" : "Unfair",
-         DoThreadMigration ? "" : "Don't ",
-         DoThreadMigration && DoStealThreadsFirst ? " Before Sparks" : "",
-         DoReScheduleOnFetch ? "" : "Don't ");
-
-       fprintf(gr_file, "%s, Fetch %s in Each Packet\n",
-         SimplifiedFetch ? "Simplified Fetch" : (DoReScheduleOnFetch ? "Reschedule on Fetch" : "Block on Fetch"),
-         DoGUMMFetching ? "Many Closures" : "Exactly One Closure");
-       fprintf(gr_file, "Fetch Strategy(%lu): If outstanding fetches %s\n",
-         FetchStrategy,
-         FetchStrategy == 1 ? "only run runnable threads (don't create new ones" :
-         FetchStrategy == 2 ? "create threads only from local sparks" :
-         FetchStrategy == 3 ? "create threads from local or global sparks" :
-         FetchStrategy == 4 ? "create sparks and steal threads if necessary" :
-         "unknown");
+        if (RTSflags.GranFlags.Light) 
+          fprintf(gr_file, "GrAnSim-Light\nPEs infinite, %s Scheduler, %sMigrate Threads %s, %s\n",
+                RTSflags.GranFlags.DoFairSchedule?"Fair":"Unfair",
+                RTSflags.GranFlags.DoThreadMigration?"":"Don't ",
+                RTSflags.GranFlags.DoThreadMigration && RTSflags.GranFlags.DoStealThreadsFirst?" Before Sparks":"",
+               RTSflags.GranFlags.SimplifiedFetch ? "Simplified Fetch" :
+               RTSflags.GranFlags.DoReScheduleOnFetch ? "Reschedule on Fetch" :
+               "Block on Fetch");
+        else 
+          fprintf(gr_file, "PEs %u, %s Scheduler, %sMigrate Threads %s, %s\n",
+                RTSflags.GranFlags.proc,RTSflags.GranFlags.DoFairSchedule?"Fair":"Unfair",
+                RTSflags.GranFlags.DoThreadMigration?"":"Don't ",
+                RTSflags.GranFlags.DoThreadMigration && RTSflags.GranFlags.DoStealThreadsFirst?" Before Sparks":"",
+               RTSflags.GranFlags.SimplifiedFetch ? "Simplified Fetch" :
+               RTSflags.GranFlags.DoReScheduleOnFetch ? "Reschedule on Fetch" :
+               "Block on Fetch");
+
+       if (RTSflags.GranFlags.DoGUMMFetching) 
+         if (RTSflags.GranFlags.ThunksToPack)
+           fprintf(gr_file, "Bulk Fetching: Fetch %d Thunks in Each Packet (Packet Size = %d closures)\n",
+                   RTSflags.GranFlags.ThunksToPack, 
+                   RTSflags.GranFlags.packBufferSize);
+         else
+           fprintf(gr_file, "Bulk Fetching: Fetch as many closures as possible (Packet Size = %d closures)\n",
+                   RTSflags.GranFlags.packBufferSize);
+       else
+         fprintf(gr_file, "Incremental Fetching: Fetch Exactly One Closure in Each Packet\n");
+
+        fprintf(gr_file, "Fetch Strategy(%u):If outstanding fetches %s\n",
+                RTSflags.GranFlags.FetchStrategy,
+                RTSflags.GranFlags.FetchStrategy==0 ?
+                 " block (block-on-fetch)":
+                RTSflags.GranFlags.FetchStrategy==1 ?
+                 "only run runnable threads":
+                RTSflags.GranFlags.FetchStrategy==2 ? 
+                 "create threads only from local sparks":
+                RTSflags.GranFlags.FetchStrategy==3 ? 
+                 "create threads from local or global sparks":
+                RTSflags.GranFlags.FetchStrategy==4 ?
+                 "create sparks and steal threads if necessary":
+                  "unknown");
+
+       if (RTSflags.GranFlags.DoPrioritySparking)
+         fprintf(gr_file, "Priority Sparking (i.e. keep sparks ordered by priority)\n");
+
+       if (RTSflags.GranFlags.DoPriorityScheduling)
+         fprintf(gr_file, "Priority Scheduling (i.e. keep threads ordered by priority)\n");
 
        fprintf(gr_file, "Thread Creation Time %lu, Thread Queue Time %lu\n",
-         gran_threadcreatetime, gran_threadqueuetime);
+               RTSflags.GranFlags.gran_threadcreatetime, 
+               RTSflags.GranFlags.gran_threadqueuetime);
        fprintf(gr_file, "Thread DeSchedule Time %lu, Thread Schedule Time %lu\n",
-         gran_threaddescheduletime, gran_threadscheduletime);
+               RTSflags.GranFlags.gran_threaddescheduletime, 
+               RTSflags.GranFlags.gran_threadscheduletime);
        fprintf(gr_file, "Thread Context-Switch Time %lu\n",
-         gran_threadcontextswitchtime);
+               RTSflags.GranFlags.gran_threadcontextswitchtime);
        fputs("\n\n--------------------\n\n", gr_file);
 
        fputs("Communication Metrics:\n\n", gr_file);
        fprintf(gr_file,
          "Latency %lu (1st) %lu (rest), Fetch %lu, Notify %lu (Global) %lu (Local)\n",
-         gran_latency, gran_additional_latency, gran_fetchtime,
-         gran_gunblocktime, gran_lunblocktime);
+               RTSflags.GranFlags.gran_latency, 
+               RTSflags.GranFlags.gran_additional_latency, 
+               RTSflags.GranFlags.gran_fetchtime,
+               RTSflags.GranFlags.gran_gunblocktime, 
+               RTSflags.GranFlags.gran_lunblocktime);
        fprintf(gr_file,
          "Message Creation %lu (+ %lu after send), Message Read %lu\n",
-         gran_mpacktime, gran_mtidytime, gran_munpacktime);
+               RTSflags.GranFlags.gran_mpacktime, 
+               RTSflags.GranFlags.gran_mtidytime, 
+               RTSflags.GranFlags.gran_munpacktime);
        fputs("\n\n--------------------\n\n", gr_file);
 
        fputs("Instruction Metrics:\n\n", gr_file);
        fprintf(gr_file, "Arith %lu, Branch %lu, Load %lu, Store %lu, Float %lu, Alloc %lu\n",
-         gran_arith_cost, gran_branch_cost,
-         gran_load_cost, gran_store_cost, gran_float_cost, gran_heapalloc_cost);
+               RTSflags.GranFlags.gran_arith_cost, 
+               RTSflags.GranFlags.gran_branch_cost,
+               RTSflags.GranFlags.gran_load_cost, 
+               RTSflags.GranFlags.gran_store_cost, 
+               RTSflags.GranFlags.gran_float_cost, 
+               RTSflags.GranFlags.gran_heapalloc_cost);
        fputs("\n\n++++++++++++++++++++\n\n", gr_file);
-    }
-    if (do_gr_binary)
+
+    if (RTSflags.GranFlags.granSimStats_Binary)
        grputw(sizeof(TIME));
 
-    Idlers = max_proc;
     return (0);
 }
 
 void
 end_gr_simulation(STG_NO_ARGS)
 {
-    if (do_gr_sim) {
-       fprintf(stderr, "The simulation is finished. Look at %s for details.\n",
-         gr_filename);
-       fclose(gr_file);
-    }
-}
+   char time_string[500]; /* ToDo: kill magic constant */
+   ullong_format_string(CURRENT_TIME, time_string, rtsFalse/*no commas!*/);
 
-#endif /* GRAN */
+   if (RTSflags.GranFlags.granSimStats_suppressed)
+     return;
 
-#ifdef PAR
-char gr_filename[50]; /*ToDo: (small) magic constant alert!!!! WDP 95/07 */
+#if defined(GRAN_CHECK) && defined(GRAN)
+   /* Print event stats */
+   if (RTSflags.GranFlags.debug & 0x20) {
+     int i;
+   
+     fprintf(stderr,"Event statistics (number of events: %d):\n",
+             noOfEvents);
+     for (i=0; i<=MAX_EVENT; i++) {
+       fprintf(stderr,"  %s (%d): \t%ld \t%f%%\t%f%%\n",
+               event_names[i],i,event_counts[i],
+               (float)(100*event_counts[i])/(float)(noOfEvents),
+               (i==CONTINUETHREAD ? 0.0 :
+                  (float)(100*(event_counts[i])/(float)(noOfEvents-event_counts[CONTINUETHREAD])) ));
+     }
+     fprintf(stderr,"Randomized steals: %u sparks, %u threads \n \t(Sparks: #%u (avg ntimes=%f; avg fl=%f) \n", 
+                    rs_sp_count, rs_t_count, no_of_steals, 
+                    (float)ntimes_total/(float)STG_MAX(no_of_steals,1),
+                    (float)fl_total/(float)STG_MAX(no_of_steals,1));
+     fprintf(stderr,"Moved sparks: %d  Withered sparks: %d (%.2f %%)\n",
+             tot_sparks,withered_sparks,
+             ( tot_sparks == 0 ? 0 :
+                  (float)(100*withered_sparks)/(float)(tot_sparks)) );
+     /* Print statistics about priority sparking */
+     if (RTSflags.GranFlags.DoPrioritySparking) {
+       fprintf(stderr,"About Priority Sparking:\n");
+       fprintf(stderr,"  Total no. NewThreads: %d   Avg. spark queue len: %.2f \n", tot_sq_probes, (float)tot_sq_len/(float)tot_sq_probes);
+     }
+     /* Print statistics about priority sparking */
+     if (RTSflags.GranFlags.DoPriorityScheduling) {
+       fprintf(stderr,"About Priority Scheduling:\n");
+       fprintf(stderr,"  Total no. of StartThreads: %d (non-end: %d) Avg. thread queue len: %.2f\n", 
+               tot_add_threads, non_end_add_threads, 
+               (float)tot_tq_len/(float)tot_add_threads);
+     }
+     /* Print packet statistics if GUMM fetching is turned on */
+     if (RTSflags.GranFlags.DoGUMMFetching) {
+       fprintf(stderr,"Packet statistcs:\n");
+       fprintf(stderr,"  Total no. of packets: %d   Avg. packet size: %.2f \n", tot_packets, (float)tot_packet_size/(float)tot_packets);
+       fprintf(stderr,"  Total no. of thunks: %d   Avg. thunks/packet: %.2f \n", tot_thunks, (float)tot_thunks/(float)tot_packets);
+       fprintf(stderr,"  Total no. of cuts: %d   Avg. cuts/packet: %.2f\n", tot_cuts, (float)tot_cuts/(float)tot_packets);
+        /* 
+       if (closure_queue_overflows>0) 
+         fprintf(stderr,"  Number of closure queue overflows: %u\n",
+                 closure_queue_overflows);
+       */
+     }
+   }
+
+   if (RTSflags.GranFlags.PrintFetchMisses)
+     fprintf(stderr,"Number of fetch misses: %d\n",fetch_misses);
+
+# if defined(GRAN_COUNT)
+    fprintf(stderr,"Update count statistics:\n");
+    fprintf(stderr,"  Total number of updates: %u\n",nUPDs);
+    fprintf(stderr,"  Needed to awaken BQ: %u with avg BQ len of: %f\n",
+           nUPDs_BQ,(float)BQ_lens/(float)nUPDs_BQ);
+    fprintf(stderr,"  Number of PAPs: %u\n",nPAPs);
+# endif
 
-I_ do_gr_profile = 0;
-I_ do_sp_profile = 0;
-I_ do_gr_binary = 0;
+#endif /* GRAN_CHECK */
+
+       fprintf(stderr, "Simulation finished after @ %s @ cycles. Look at %s for details.\n",
+         time_string,gr_filename);
+       if (RTSflags.GranFlags.granSimStats) 
+           fclose(gr_file);
+}
+#elif defined(PAR)
+FILE *gr_file = NULL;
+char gr_filename[STATS_FILENAME_MAXLEN];
+
+/* I_ do_sp_profile = 0; */
 
 void
 init_gr_profiling(rts_argc, rts_argv, prog_argc, prog_argv)
-char *prog_argv[], *rts_argv[];
-int prog_argc, rts_argc;
+  char *prog_argv[], *rts_argv[];
+  int prog_argc, rts_argc;
 {
     int i;
 
-    char *extension = do_gr_binary ? "gb" : "gr";
+    char *extension = RTSflags.ParFlags.granSimStats_Binary ? "gb" : "gr";
 
-    sprintf(gr_filename, "%0.28s.%03d.%0.2s", prog_argv[0], thisPE, extension);
+    sprintf(gr_filename, GR_FILENAME_FMT_GUM, prog_argv[0], thisPE, extension);
 
     if ((gr_file = fopen(gr_filename, "w")) == NULL) {
        fprintf(stderr, "Can't open activity report file %s\n", gr_filename);
@@ -573,6 +1592,10 @@ int prog_argc, rts_argc;
     }
     fputc('\n', gr_file);
 
+    fputs("Start-Time: ", gr_file);
+    fputs(time_str(), gr_file); /* defined in main.lc */
+    fputc('\n', gr_file);
+    
     startTime = CURRENT_TIME;
 
     if (startTime > LL(1000000000)) {
@@ -584,12 +1607,12 @@ int prog_argc, rts_argc;
        fprintf(gr_file, "PE %2u [%lu]: TIME\n", thisPE, (TIME) startTime);
     }
 
-    if (do_gr_binary)
+    if (RTSflags.ParFlags.granSimStats_Binary)
         grputw(sizeof(TIME));
 }
 #endif /* PAR */
 
-#endif /* GRAN || PAR */
+#endif   /* GRAN || PAR */ 
 \end{code}