% % (c) The GRASP/AQUA Project, Glasgow University, 1995 - 1996 % Hans Wolfgang Loidl % % Time-stamp: % %************************************************************************ %* * \section[GranSim.lc]{Granularity Simulator Routines} %* * %************************************************************************ Macros for dealing with the new and improved GA field for simulating parallel execution. Based on @CONCURRENT@ package. The GA field now contains a mask, where the n-th bit stands for the n-th processor, where this data can be found. In case of multiple copies, several bits are set. The total number of processors is bounded by @MAX_PROC@, which should be <= the length of a word in bits. -- HWL \begin{code} #if defined(GRAN) || defined(PAR) #define NON_POSIX_SOURCE /* gettimeofday */ #include "rtsdefs.h" /* qaStaH nuq Sovpu' ngoqvam ghItlhpu'bogh nuv 'e' vItul */ # if defined(HAVE_GETCLOCK) # if defined(HAVE_SYS_TIMERS_H) # define POSIX_4D9 1 # include # endif # else # if defined(HAVE_GETTIMEOFDAY) # if defined(HAVE_SYS_TIME_H) # include # endif # else # ifdef HAVE_TIME_H # include # endif # endif # endif \end{code} %**************************************************************** %* * \subsection[GranSim-data-types]{Basic data types and set-up variables for GranSim} %* * %**************************************************************** \begin{code} /* See GranSim.lh for the definition of the enum gran_event_types */ char *gran_event_names[] = { "START", "START(Q)", "STEALING", "STOLEN", "STOLEN(Q)", "FETCH", "REPLY", "BLOCK", "RESUME", "RESUME(Q)", "SCHEDULE", "DESCHEDULE", "END", "SPARK", "SPARKAT", "USED", "PRUNED", "EXPORTED", "ACQUIRED", "ALLOC", "TERMINATE", "SYSTEM_START", "SYSTEM_END", /* only for debugging */ "??" }; #if defined(GRAN) char *proc_status_names[] = { "Idle", "Sparking", "Starting", "Fetching", "Fishing", "Busy", "UnknownProcStatus" }; #define RAND_MAX 0x7fffffff /* 2^31-1 = 0x80000000 - 1 (see lrand48(3) */ unsigned CurrentProc = 0; rtsBool IgnoreEvents = rtsFalse; /* HACK only for testing */ #if 0 && (defined(GCap) || defined(GCgn)) closq ex_RBH_q = NULL; #endif #endif /* GRAN */ \end{code} The following variables control the behaviour of GrAnSim. In general, there is one RTS option for enabling each of these features. In getting the desired setup of GranSim the following questions have to be answered: \begin{itemize} \item {\em Which scheduling algorithm} to use (@RTSflags.GranFlags.DoFairSchedule@)? Currently only unfair scheduling is supported. \item What to do when remote data is fetched (@RTSflags.GranFlags.DoReScheduleOnFetch@)? Either block and wait for the data or reschedule and do some other work. Thus, if this variable is true, asynchronous communication is modelled. Block on fetch mainly makes sense for incremental fetching. There is also a simplified fetch variant available (@RTSflags.GranFlags.SimplifiedFetch@). This variant does not use events to model communication. It is faster but the results will be less accurate. \item How aggressive to be in getting work after a reschedule on fetch (@RTSflags.GranFlags.FetchStrategy@)? This is determined by the so-called {\em fetching strategy\/}. Currently, there are four possibilities: \begin{enumerate} \item Only run a runnable thread. \item Turn a spark into a thread, if necessary. \item Steal a remote spark, if necessary. \item Steal a runnable thread from another processor, if necessary. \end{itemize} The variable @RTSflags.GranFlags.FetchStrategy@ determines how far to go in this list when rescheduling on a fetch. \item Should sparks or threads be stolen first when looking for work (@RTSflags.GranFlags.DoStealThreadsFirst@)? The default is to steal sparks first (much cheaper). \item Should the RTS use a lazy thread creation scheme (@RTSflags.GranFlags.DoAlwaysCreateThreads@)? By default yes i.e.\ sparks are only turned into threads when work is needed. Also note, that sparks can be discarded by the RTS (this is done in the case of an overflow of the spark pool). Setting @RTSflags.GranFlags.DoAlwaysCreateThreads@ to @True@ forces the creation of threads at the next possibility (i.e.\ when new work is demanded the next time). \item Should data be fetched closure-by-closure or in packets (@RTSflags.GranFlags.DoGUMMFetching@)? The default strategy is a GRIP-like incremental (i.e.\ closure-by-closure) strategy. This makes sense in a low-latency setting but is bad in a high-latency system. Setting @RTSflags.GranFlags.DoGUMMFetching@ to @True@ enables bulk (packet) fetching. Other parameters determine the size of the packets (@pack_buffer_size@) and the number of thunks that should be put into one packet (@RTSflags.GranFlags.ThunksToPack@). \item If there is no other possibility to find work, should runnable threads be moved to an idle processor (@RTSflags.GranFlags.DoThreadMigration@)? In any case, the RTS tried to get sparks (either local or remote ones) first. Thread migration is very expensive, since a whole TSO has to be transferred and probably data locality becomes worse in the process. Note, that the closure, which will be evaluated next by that TSO is not transferred together with the TSO (that might block another thread). \item Should the RTS distinguish between sparks created by local nodes and stolen sparks (@RTSflags.GranFlags.PreferSparksOfLocalNodes@)? The idea is to improve data locality by preferring sparks of local nodes (it is more likely that the data for those sparks is already on the local processor). However, such a distinction also imposes an overhead on the spark queue management, and typically a large number of sparks are generated during execution. By default this variable is set to @False@. \item Should the RTS use granularity control mechanisms? The idea of a granularity control mechanism is to make use of granularity information provided via annotation of the @par@ construct in order to prefer bigger threads when either turning a spark into a thread or when choosing the next thread to schedule. Currently, three such mechanisms are implemented: \begin{itemize} \item Cut-off: The granularity information is interpreted as a priority. If a threshold priority is given to the RTS, then only those sparks with a higher priority than the threshold are actually created. Other sparks are immediately discarded. This is similar to a usual cut-off mechanism often used in parallel programs, where parallelism is only created if the input data is lage enough. With this option, the choice is hidden in the RTS and only the threshold value has to be provided as a parameter to the runtime system. \item Priority Sparking: This mechanism keeps priorities for sparks and chooses the spark with the highest priority when turning a spark into a thread. After that the priority information is discarded. The overhead of this mechanism comes from maintaining a sorted spark queue. \item Priority Scheduling: This mechanism keeps the granularity information for threads, to. Thus, on each reschedule the largest thread is chosen. This mechanism has a higher overhead, as the thread queue is sorted, too. \end{itemize} \end{itemize} \begin{code} #if defined(GRAN) /* Do we need to reschedule following a fetch? */ rtsBool NeedToReSchedule = rtsFalse; TIME TimeOfNextEvent, EndOfTimeSlice; /* checked from the threaded world! */ /* I_ avoidedCS=0; */ /* Unused!! ToDo: Remake libraries and nuke this var */ /* For internal use (event statistics) only */ char *event_names[] = { "STARTTHREAD", "CONTINUETHREAD", "RESUMETHREAD", "MOVESPARK", "MOVETHREAD", "FINDWORK", "FETCHNODE", "FETCHREPLY", "GLOBALBLOCK", "UNBLOCKTHREAD" }; # if defined(GRAN_CHECK) && defined(GRAN) /* Just for testing */ I_ noOfEvents = 0; I_ event_counts[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; I_ fetch_misses = 0; I_ tot_low_pri_sparks = 0; I_ rs_sp_count=0, rs_t_count=0, ntimes_total=0, fl_total=0, no_of_steals=0; /* Variables for gathering packet and queue statistics */ I_ tot_packets = 0, tot_packet_size = 0, tot_cuts = 0, tot_thunks = 0; I_ tot_sq_len = 0, tot_sq_probes = 0, tot_sparks = 0, withered_sparks = 0; I_ tot_add_threads = 0, tot_tq_len = 0, non_end_add_threads = 0; # endif # if defined(GRAN_COUNT) /* Count the number of updates that are done. Mainly for testing, but could be useful for other purposes, too. */ I_ nUPDs = 0, nUPDs_old = 0, nUPDs_new = 0, nUPDs_BQ = 0, nPAPs = 0, BQ_lens = 0; # endif /* Prototypes */ I_ HandleFetchRequest(P_, PROC, P_); /* void HandleFetchRequest(P_, PROC, P_); changed for GUMMFeching */ static I_ blockFetch(P_ tso, PROC proc, P_ bh); #endif /* GRAN */ \end{code} %**************************************************************** %* * \subsection[global-address-op]{Global Address Operations} %* * %**************************************************************** These functions perform operations on the global-address (ga) part of a closure. The ga is the only new field (1 word) in a closure introduced by GrAnSim. It serves as a bitmask, indicating on which processor the closure is residing. Since threads are described by Thread State Object (TSO), which is nothing but another kind of closure, this scheme allows gives placement information about threads. A ga is just a bitmask, so the operations on them are mainly bitmask manipulating functions. Note, that there are important macros like PROCS, IS_LOCAL_TO etc. They are defined in @GrAnSim.lh@. NOTE: In GrAnSim-light we don't maintain placement information. This allows to simulate an arbitrary number of processors. The price we have to be is the lack of costing any communication properly. In short, GrAnSim-light is meant to reveal the maximal parallelism in a program. From an implementation point of view the important thing is: {\em GrAnSim-light does not maintain global-addresses}. \begin{code} #if defined(GRAN) /* ga_to_proc returns the first processor marked in the bitmask ga. Normally only one bit in ga should be set. But for PLCs all bits are set. That shouldn't hurt since we only need IS_LOCAL_TO for PLCs */ PROC ga_to_proc(W_ ga) { PROC i; for (i = 0; i < MAX_PROC && !IS_LOCAL_TO(ga, i); i++); return (i); } /* NB: This takes a *node* rather than just a ga as input */ PROC where_is(P_ node) { return (ga_to_proc(PROCS(node))); } /* Access the GA field of the node */ rtsBool any_idle() { I_ i; rtsBool any_idle; for(i=0, any_idle=rtsFalse; !any_idle && i= CONTINUETHREAD1) evttype = CONTINUETHREAD; */ /* Search the queue and insert at the right point: FINDWORK before everything, CONTINUETHREAD after everything. This ensures that we find any available work after all threads have executed the current cycle. This level of detail would normally be irrelevant, but matters for ridiculously low latencies... */ /* Changed the ordering: Now FINDWORK comes after everything but CONTINUETHREAD. This makes sure that a MOVESPARK comes before a FINDWORK. This is important when a GranSimSparkAt happens and DoAlwaysCreateThreads is turned on. Also important if a GC occurs when trying to build a new thread (see much_spark) -- HWL 02/96 */ if(EventHd == NULL) EventHd = newentry; else { for (event = EventHd, prev=&EventHd; event != NULL; prev = &(EVENT_NEXT(event)), event = EVENT_NEXT(event)) { switch (evttype) { case FINDWORK: if ( EVENT_TIME(event) < EVENT_TIME(newentry) || ( (EVENT_TIME(event) == EVENT_TIME(newentry)) && (EVENT_TYPE(event) != CONTINUETHREAD) ) ) continue; else break; case CONTINUETHREAD: if ( EVENT_TIME(event) <= EVENT_TIME(newentry) ) continue; else break; default: if ( EVENT_TIME(event) < EVENT_TIME(newentry) || ((EVENT_TIME(event) == EVENT_TIME(newentry)) && (EVENT_TYPE(event) == EVENT_TYPE(newentry))) ) continue; else break; } /* Insert newentry here (i.e. before event) */ *prev = newentry; EVENT_NEXT(newentry) = event; break; } if (event == NULL) *prev = newentry; } } void new_event(proc,creator,time,evttype,tso,node,spark) PROC proc, creator; TIME time; EVTTYPE evttype; P_ tso, node; sparkq spark; { eventq newentry = (eventq) stgMallocBytes(sizeof(struct event), "new_event"); EVENT_PROC(newentry) = proc; EVENT_CREATOR(newentry) = creator; EVENT_TIME(newentry) = time; EVENT_TYPE(newentry) = evttype; EVENT_TSO(newentry) = tso; EVENT_NODE(newentry) = node; EVENT_SPARK(newentry) = spark; EVENT_GC_INFO(newentry) = 0; EVENT_NEXT(newentry) = NULL; insert_event(newentry); } void prepend_event(eventq event) /* put event at beginning of EventQueue */ { /* only used for GC! */ EVENT_NEXT(event) = EventHd; EventHd = event; } eventq grab_event() /* undo prepend_event i.e. get the event */ { /* at the head of EventQ but don't free anything */ eventq event = EventHd; if(EventHd == NULL) { fprintf(stderr,"No next event (in grab_event)\n"); EXIT(EXIT_FAILURE); } EventHd = EVENT_NEXT(EventHd); return (event); } void traverse_eventq_for_gc() { eventq event = EventHd; W_ bufsize; P_ closure, tso, buffer, bufptr; PROC proc, creator; /* Traverse eventq and replace every FETCHREPLY by a FETCHNODE for the orig closure (root of packed graph). This means that a graph, which is between processors at the time of GC is fetched again at the time when it would have arrived, had there been no GC. Slightly inaccurate but safe for GC. This is only needed for GUM style fetchng. */ if (!RTSflags.GranFlags.DoGUMMFetching) return; for(event = EventHd; event!=NULL; event=EVENT_NEXT(event)) { if (EVENT_TYPE(event)==FETCHREPLY) { buffer = EVENT_NODE(event); ASSERT(buffer[PACK_FLAG_LOCN]==MAGIC_PACK_FLAG); /* It's a pack buffer */ bufsize = buffer[PACK_SIZE_LOCN]; closure= (P_)buffer[PACK_HDR_SIZE]; tso = (P_)buffer[PACK_TSO_LOCN]; proc = EVENT_PROC(event); creator = EVENT_CREATOR(event); /* similar to unpacking */ for (bufptr=buffer+PACK_HDR_SIZE; bufptr<(buffer+bufsize); bufptr++) { if ( (INFO_TYPE(INFO_PTR(*bufptr)) == INFO_SPEC_RBH_TYPE) || (INFO_TYPE(INFO_PTR(*bufptr)) == INFO_GEN_RBH_TYPE) ) { convertFromRBH((P_)*bufptr); } } free(buffer); EVENT_TYPE(event) = FETCHNODE; EVENT_PROC(event) = creator; EVENT_CREATOR(event) = proc; EVENT_NODE(event) = closure; EVENT_TSO(event) = tso; EVENT_GC_INFO(event) = 0; } } } void print_event(event) eventq event; { char str_tso[16], str_node[16]; sprintf(str_tso,((EVENT_TSO(event)==Prelude_Z91Z93_closure) ? "______" : "%#6lx"), EVENT_TSO(event)); sprintf(str_node,((EVENT_NODE(event)==Prelude_Z91Z93_closure) ? "______" : "%#6lx"), EVENT_NODE(event)); if (event==NULL) fprintf(stderr,"Evt: NIL\n"); else fprintf(stderr,"Evt: %s (%u), PE %u [%u], Time %lu, TSO %s (%x), node %s\n", event_names[EVENT_TYPE(event)],EVENT_TYPE(event), EVENT_PROC(event), EVENT_CREATOR(event), EVENT_TIME(event), str_tso, TSO_ID(EVENT_TSO(event)), str_node /*, EVENT_SPARK(event), EVENT_NEXT(event)*/ ); } void print_eventq(hd) eventq hd; { eventq x; fprintf(stderr,"Event Queue with root at %x:\n",hd); for (x=hd; x!=NULL; x=EVENT_NEXT(x)) { print_event(x); } } void print_spark(spark) sparkq spark; { char str[16]; sprintf(str,((SPARK_NODE(spark)==Prelude_Z91Z93_closure) ? "______" : "%#6lx"), (W_) SPARK_NODE(spark)); if (spark==NULL) fprintf(stderr,"Spark: NIL\n"); else fprintf(stderr,"Spark: Node %8s, Name %#6lx, Exported %5s, Prev %#6x, Next %#6x\n", str, SPARK_NAME(spark), ((SPARK_EXPORTED(spark))?"True":"False"), SPARK_PREV(spark), SPARK_NEXT(spark) ); } void print_sparkq(hd) sparkq hd; { sparkq x; fprintf(stderr,"Spark Queue with root at %x:\n",hd); for (x=hd; x!=NULL; x=SPARK_NEXT(x)) { print_spark(x); } } #endif /* GRAN */ \end{code} %**************************************************************************** % \subsection[entry-points]{Routines directly called from Haskell world} % %**************************************************************************** The @GranSim...@ routines in here are directly called via macros from the threaded world. First some auxiliary routines. \begin{code} #if defined(GRAN) /* Take the current thread off the thread queue and thereby activate the */ /* next thread. It's assumed that the next ReSchedule after this uses */ /* NEW_THREAD as param. */ /* This fct is called from GranSimBlock and GranSimFetch */ void ActivateNextThread (PROC proc) { ASSERT(RunnableThreadsHd[proc]!=Prelude_Z91Z93_closure); RunnableThreadsHd[proc] = TSO_LINK(RunnableThreadsHd[proc]); if(RunnableThreadsHd[proc]==Prelude_Z91Z93_closure) { MAKE_IDLE(proc); RunnableThreadsTl[proc] = Prelude_Z91Z93_closure; } else { CurrentTime[proc] += RTSflags.GranFlags.gran_threadcontextswitchtime; if (RTSflags.GranFlags.granSimStats && (!RTSflags.GranFlags.Light || (RTSflags.GranFlags.debug & 0x20000))) DumpRawGranEvent(proc,0,GR_SCHEDULE,RunnableThreadsHd[proc], Prelude_Z91Z93_closure,0); } } \end{code} Now the main stg-called routines: \begin{code} /* ------------------------------------------------------------------------ */ /* The following GranSim... fcts are stg-called from the threaded world. */ /* ------------------------------------------------------------------------ */ /* Called from HEAP_CHK -- NB: node and liveness are junk here now. They are left temporarily to avoid complete recompilation. KH */ void GranSimAllocate(n,node,liveness) I_ n; P_ node; W_ liveness; { TSO_ALLOCS(CurrentTSO) += n; ++TSO_BASICBLOCKS(CurrentTSO); if (RTSflags.GranFlags.granSimStats_Heap) { DumpRawGranEvent(CurrentProc,0,GR_ALLOC,CurrentTSO, Prelude_Z91Z93_closure,n); } TSO_EXECTIME(CurrentTSO) += RTSflags.GranFlags.gran_heapalloc_cost; CurrentTime[CurrentProc] += RTSflags.GranFlags.gran_heapalloc_cost; } /* Subtract the values added above, if a heap check fails and so has to be redone. */ void GranSimUnallocate(n,node,liveness) W_ n; P_ node; W_ liveness; { TSO_ALLOCS(CurrentTSO) -= n; --TSO_BASICBLOCKS(CurrentTSO); TSO_EXECTIME(CurrentTSO) -= RTSflags.GranFlags.gran_heapalloc_cost; CurrentTime[CurrentProc] -= RTSflags.GranFlags.gran_heapalloc_cost; } /* NB: We now inline this code via GRAN_EXEC rather than calling this fct */ void GranSimExec(ariths,branches,loads,stores,floats) W_ ariths,branches,loads,stores,floats; { W_ cost = RTSflags.GranFlags.gran_arith_cost*ariths + RTSflags.GranFlags.gran_branch_cost*branches + RTSflags.GranFlags.gran_load_cost * loads + RTSflags.GranFlags.gran_store_cost*stores + RTSflags.GranFlags.gran_float_cost*floats; TSO_EXECTIME(CurrentTSO) += cost; CurrentTime[CurrentProc] += cost; } /* Fetch the node if it isn't local -- result indicates whether fetch has been done. This is GRIP-style single item fetching. */ /* This function in Threads.lc is only needed for SimplifiedFetch */ FetchNode PROTO((P_ node,PROC CurrentProc)); I_ GranSimFetch(node /* , liveness_mask */ ) P_ node; /* I_ liveness_mask; */ { if (RTSflags.GranFlags.Light) { /* Always reschedule in GrAnSim-Light to prevent one TSO from running off too far new_event(CurrentProc,CurrentProc,CurrentTime[CurrentProc], CONTINUETHREAD,CurrentTSO,node,NULL); */ NeedToReSchedule = rtsFalse; return(0); } /* Note: once a node has been fetched, this test will be passed */ if(!IS_LOCAL_TO(PROCS(node),CurrentProc)) { /* Add mpacktime to the remote PE for the reply */ { PROC p = where_is(node); TIME fetchtime; # ifdef GRAN_CHECK if ( ( RTSflags.GranFlags.debug & 0x40 ) && p == CurrentProc ) fprintf(stderr,"GranSimFetch: Trying to fetch from own processor%u\n", p); # endif /* GRAN_CHECK */ CurrentTime[CurrentProc] += RTSflags.GranFlags.gran_mpacktime; /* NB: Fetch is counted on arrival (FETCHREPLY) */ if (RTSflags.GranFlags.SimplifiedFetch) { FetchNode(node,CurrentProc); CurrentTime[CurrentProc] += RTSflags.GranFlags.gran_mtidytime+ RTSflags.GranFlags.gran_fetchtime+ RTSflags.GranFlags.gran_munpacktime; return(1); } fetchtime = STG_MAX(CurrentTime[CurrentProc],CurrentTime[p]) + RTSflags.GranFlags.gran_latency; new_event(p,CurrentProc,fetchtime,FETCHNODE,CurrentTSO,node,NULL); if (!RTSflags.GranFlags.DoReScheduleOnFetch) MAKE_FETCHING(CurrentProc); ++OutstandingFetches[CurrentProc]; if (fetchtime CurrentTime[CurrentProc]? CurrentTime[proc]: CurrentTime[CurrentProc]) + RTSflags.GranFlags.gran_latency; } else { exporttime = CurrentTime[CurrentProc]; } if ( RTSflags.GranFlags.Light ) /* Need CurrentTSO in event field to associate costs with creating spark even in a GrAnSim Light setup */ new_event(proc,CurrentProc,exporttime, MOVESPARK,CurrentTSO,Prelude_Z91Z93_closure,spark); else new_event(proc,CurrentProc,exporttime, MOVESPARK,Prelude_Z91Z93_closure,Prelude_Z91Z93_closure,spark); /* Bit of a hack to treat placed sparks the same as stolen sparks */ ++OutstandingFishes[proc]; /* Force the PE to take notice of the spark (FINDWORK is put after a MOVESPARK into the sparkq!) */ if(RTSflags.GranFlags.DoAlwaysCreateThreads) { new_event(CurrentProc,CurrentProc,exporttime+1, FINDWORK,Prelude_Z91Z93_closure,Prelude_Z91Z93_closure,NULL); } if (exporttime GR_EVENT_MAX) name = GR_EVENT_MAX; if(GRANSIMSTATS_BINARY) /* ToDo: fix code for writing binary GrAnSim statistics */ switch (name) { case GR_START: case GR_STARTQ: grputw(name); grputw(proc); abort(); /* die please: a single word */ /* doesn't represent long long times */ grputw(TIME_ON_PROC(proc)); grputw((W_)node); break; case GR_FETCH: case GR_REPLY: case GR_BLOCK: grputw(name); grputw(proc); abort(); /* die please: a single word */ /* doesn't represent long long times */ grputw(TIME_ON_PROC(proc)); /* this line is bound to */ grputw(id); /* do the wrong thing */ break; default: grputw(name); grputw(proc); abort(); /* die please: a single word */ /* doesn't represent long long times */ grputw(TIME_ON_PROC(proc)); grputw((W_)node); } else switch (name) { case GR_START: case GR_STARTQ: /* fprintf(gr_file,"PE %2u [%s]: %-9s\t%lx\t%s\t[sparks %u]\n", */ /* using spark queue length as optional argument ^^^^^^^^^ */ fprintf(gr_file,"PE %2u [%s]: %-9s\t%lx\t%s\t[SN %u]\n", /* using spark name as optional argument ^^^^^^ */ proc,time_string,gran_event_names[name], id,node_str,len); break; case GR_FETCH: case GR_REPLY: case GR_BLOCK: case GR_STOLEN: case GR_STOLENQ: fprintf(gr_file, "PE %2u [%s]: %-9s\t%lx \t%s\t(from %2u)\n", proc, time_string, gran_event_names[name], id,node_str,p); break; case GR_RESUME: case GR_RESUMEQ: case GR_SCHEDULE: case GR_DESCHEDULE: fprintf(gr_file,"PE %2u [%s]: %-9s\t%lx \n", proc,time_string,gran_event_names[name],id); break; case GR_STEALING: fprintf(gr_file,"PE %2u [%s]: %-9s\t%lx\t \t(by %2u)\n", proc,time_string,gran_event_names[name],id,p); break; case GR_ALLOC: fprintf(gr_file,"PE %2u [%s]: %-9s\t%lx\t \tallocating %u words\n", proc,time_string,gran_event_names[name],id,len); break; default: fprintf(gr_file,"PE %2u [%s]: %-9s\t%lx\t%s\t[sparks %u]\n", proc,time_string,gran_event_names[name],id,node_str,len); } } #if defined(GRAN) /* Only needed for special dynamic spark labelling support */ void DumpStartEventAt(time, proc, p, name, tso, node, len) TIME time; PROC proc, p; /* proc ... where it happens; p ... where node lives */ enum gran_event_types name; P_ tso, node; I_ len; { W_ id; char time_string[500], node_str[16]; /*ToDo: kill magic constants */ ullong_format_string(time, time_string, rtsFalse/*no commas!*/); /* ^^^^ only important change to DumpRawGranEvent */ if (RTSflags.GranFlags.granSimStats_suppressed) return; id = tso == NULL ? -1 : TSO_ID(tso); if (node==Prelude_Z91Z93_closure) strcpy(node_str,"________"); /* "Z91Z93_closure"); */ else sprintf(node_str,"0x%-6lx",node); if (name > GR_EVENT_MAX) name = GR_EVENT_MAX; if(GRANSIMSTATS_BINARY) /* ToDo: fix code for writing binary GrAnSim statistics */ switch (name) { case GR_START: case GR_STARTQ: grputw(name); grputw(proc); abort(); /* die please: a single word */ /* doesn't represent long long times */ grputw(TIME_ON_PROC(proc)); grputw((W_)node); break; default: fprintf(stderr,"Error in DumpStartEventAt: event %s is not a START event\n", gran_event_names[name]); } else switch (name) { case GR_START: case GR_STARTQ: /* fprintf(gr_file,"PE %2u [%s]: %-9s\t%lx\t%s\t[sparks %u]\n", */ /* using spark queue length as optional argument ^^^^^^^^^ */ fprintf(gr_file,"PE %2u [%s]: %-9s\t%lx\t%s\t[SN %u]\n", /* using spark name as optional argument ^^^^^^ */ proc,time_string,gran_event_names[name], id,node_str,len); break; default: fprintf(stderr,"Error in DumpStartEventAt: event %s is not a START event\n", gran_event_names[name]); } } #endif /* GRAN */ void DumpGranInfo(proc, tso, mandatory_thread) PROC proc; P_ tso; rtsBool mandatory_thread; { char time_string[500]; /* ToDo: kill magic constant */ ullong_format_string(CURRENT_TIME, time_string, rtsFalse/*no commas!*/); #if defined(GRAN) if (RTSflags.GranFlags.granSimStats_suppressed) return; #endif if (GRANSIMSTATS_BINARY) { grputw(GR_END); grputw(proc); abort(); /* die please: a single word doesn't represent long long times */ grputw(CURRENT_TIME); /* this line is bound to fail */ grputw(TSO_ID(tso)); #ifdef PAR grputw(0); grputw(0); grputw(0); grputw(0); grputw(0); grputw(0); grputw(0); grputw(0); grputw(0); grputw(0); grputw(0); grputw(0); #else grputw(TSO_SPARKNAME(tso)); grputw(TSO_STARTEDAT(tso)); grputw(TSO_EXPORTED(tso)); grputw(TSO_BASICBLOCKS(tso)); grputw(TSO_ALLOCS(tso)); grputw(TSO_EXECTIME(tso)); grputw(TSO_BLOCKTIME(tso)); grputw(TSO_BLOCKCOUNT(tso)); grputw(TSO_FETCHTIME(tso)); grputw(TSO_FETCHCOUNT(tso)); grputw(TSO_LOCALSPARKS(tso)); grputw(TSO_GLOBALSPARKS(tso)); #endif grputw(mandatory_thread); } else { /* * NB: DumpGranEvent cannot be used because PE may be wrong * (as well as the extra info) */ fprintf(gr_file, "PE %2u [%s]: END %lx, SN %lu, ST %lu, EXP %c, BB %lu, HA %lu, RT %lu, BT %lu (%lu), FT %lu (%lu), LS %lu, GS %lu, MY %c\n" ,proc ,time_string ,TSO_ID(tso) ,TSO_SPARKNAME(tso) ,TSO_STARTEDAT(tso) ,TSO_EXPORTED(tso) ? 'T' : 'F' ,TSO_BASICBLOCKS(tso) ,TSO_ALLOCS(tso) ,TSO_EXECTIME(tso) ,TSO_BLOCKTIME(tso) ,TSO_BLOCKCOUNT(tso) ,TSO_FETCHTIME(tso) ,TSO_FETCHCOUNT(tso) ,TSO_LOCALSPARKS(tso) ,TSO_GLOBALSPARKS(tso) ,mandatory_thread ? 'T' : 'F' ); } } void DumpTSO(tso) P_ tso; { fprintf(stderr,"TSO 0x%lx, NAME 0x%lx, ID %lu, LINK 0x%lx, TYPE %s\n" ,tso ,TSO_NAME(tso) ,TSO_ID(tso) ,TSO_LINK(tso) ,TSO_TYPE(tso)==T_MAIN?"MAIN": TSO_TYPE(tso)==T_FAIL?"FAIL": TSO_TYPE(tso)==T_REQUIRED?"REQUIRED": TSO_TYPE(tso)==T_ADVISORY?"ADVISORY": "???" ); fprintf(stderr,"PC (0x%lx,0x%lx), ARG (0x%lx), SWITCH %lx0x\n" ,TSO_PC1(tso) ,TSO_PC2(tso) ,TSO_ARG1(tso) /* ,TSO_ARG2(tso) */ ,TSO_SWITCH(tso) ); fprintf(gr_file,"TSO %lx: SN %lu, ST %lu, GBL %c, BB %lu, HA %lu, RT %lu, BT %lu (%lu), FT %lu (%lu) LS %lu, GS %lu\n" ,TSO_ID(tso) ,TSO_SPARKNAME(tso) ,TSO_STARTEDAT(tso) ,TSO_EXPORTED(tso)?'T':'F' ,TSO_BASICBLOCKS(tso) ,TSO_ALLOCS(tso) ,TSO_EXECTIME(tso) ,TSO_BLOCKTIME(tso) ,TSO_BLOCKCOUNT(tso) ,TSO_FETCHTIME(tso) ,TSO_FETCHCOUNT(tso) ,TSO_LOCALSPARKS(tso) ,TSO_GLOBALSPARKS(tso) ); } /* Output a terminate event and an 8-byte time. */ void grterminate(v) TIME v; { #if defined(GRAN) if (RTSflags.GranFlags.granSimStats_suppressed) return; #endif DumpGranEvent(GR_TERMINATE, Prelude_Z91Z93_closure); if (sizeof(TIME) == 4) { putc('\0', gr_file); putc('\0', gr_file); putc('\0', gr_file); putc('\0', gr_file); } else { putc(v >> 56l, gr_file); putc((v >> 48l) & 0xffl, gr_file); putc((v >> 40l) & 0xffl, gr_file); putc((v >> 32l) & 0xffl, gr_file); } putc((v >> 24l) & 0xffl, gr_file); putc((v >> 16l) & 0xffl, gr_file); putc((v >> 8l) & 0xffl, gr_file); putc(v & 0xffl, gr_file); } /* Length-coded output: first 3 bits contain length coding 00x 1 byte 01x 2 bytes 10x 4 bytes 110 8 bytes 111 5 or 9 bytes */ void grputw(v) TIME v; { #if defined(GRAN) if (RTSflags.GranFlags.granSimStats_suppressed) return; #endif if (v <= 0x3fl) { /* length v = 1 byte */ fputc(v & 0x3f, gr_file); } else if (v <= 0x3fffl) { /* length v = 2 byte */ fputc((v >> 8l) | 0x40l, gr_file); fputc(v & 0xffl, gr_file); } else if (v <= 0x3fffffffl) { /* length v = 4 byte */ fputc((v >> 24l) | 0x80l, gr_file); fputc((v >> 16l) & 0xffl, gr_file); fputc((v >> 8l) & 0xffl, gr_file); fputc(v & 0xffl, gr_file); } else if (sizeof(TIME) == 4) { fputc(0x70, gr_file); fputc((v >> 24l) & 0xffl, gr_file); fputc((v >> 16l) & 0xffl, gr_file); fputc((v >> 8l) & 0xffl, gr_file); fputc(v & 0xffl, gr_file); } else { if (v <= 0x3fffffffffffffl) putc((v >> 56l) | 0x60l, gr_file); else { putc(0x70, gr_file); putc((v >> 56l) & 0xffl, gr_file); } putc((v >> 48l) & 0xffl, gr_file); putc((v >> 40l) & 0xffl, gr_file); putc((v >> 32l) & 0xffl, gr_file); putc((v >> 24l) & 0xffl, gr_file); putc((v >> 16l) & 0xffl, gr_file); putc((v >> 8l) & 0xffl, gr_file); putc(v & 0xffl, gr_file); } } #endif /* GRAN || PAR */ \end{code} %**************************************************************************** % \subsection[gr-simulation]{Granularity Simulation} % %**************************************************************************** General routines for GranSim. Mainly, startup and shutdown routines, called from @main.lc@. \begin{code} #if defined(GRAN) FILE *gr_file = NULL; char gr_filename[STATS_FILENAME_MAXLEN]; /* I_ do_gr_sim = 0; */ /* In GrAnSim setup always do simulation */ int init_gr_simulation(rts_argc, rts_argv, prog_argc, prog_argv) char *prog_argv[], *rts_argv[]; int prog_argc, rts_argc; { I_ i; char *extension = RTSflags.GranFlags.granSimStats_Binary ? "gb" : "gr"; if (RTSflags.GranFlags.granSimStats_suppressed) return; sprintf(gr_filename, GR_FILENAME_FMT, prog_argv[0], extension); if ((gr_file = fopen(gr_filename, "w")) == NULL) { fprintf(stderr, "Can't open granularity simulation report file %s\n", gr_filename); EXIT(EXIT_FAILURE); } # if 0 /* that's obsolete now, I think -- HWL */ if (RTSflags.GranFlags.DoReScheduleOnFetch) setbuf(gr_file, NULL); # endif fputs("Granularity Simulation for ", gr_file); for (i = 0; i < prog_argc; ++i) { fputs(prog_argv[i], gr_file); fputc(' ', gr_file); } if (rts_argc > 0) { fputs("+RTS ", gr_file); for (i = 0; i < rts_argc; ++i) { fputs(rts_argv[i], gr_file); fputc(' ', gr_file); } } fputs("\nStart time: ", gr_file); fputs(time_str(), gr_file); /* defined in main.lc */ fputc('\n', gr_file); fputs("\n\n--------------------\n\n", gr_file); fputs("General Parameters:\n\n", gr_file); if (RTSflags.GranFlags.Light) fprintf(gr_file, "GrAnSim-Light\nPEs infinite, %s Scheduler, %sMigrate Threads %s, %s\n", RTSflags.GranFlags.DoFairSchedule?"Fair":"Unfair", RTSflags.GranFlags.DoThreadMigration?"":"Don't ", RTSflags.GranFlags.DoThreadMigration && RTSflags.GranFlags.DoStealThreadsFirst?" Before Sparks":"", RTSflags.GranFlags.SimplifiedFetch ? "Simplified Fetch" : RTSflags.GranFlags.DoReScheduleOnFetch ? "Reschedule on Fetch" : "Block on Fetch"); else fprintf(gr_file, "PEs %u, %s Scheduler, %sMigrate Threads %s, %s\n", RTSflags.GranFlags.proc,RTSflags.GranFlags.DoFairSchedule?"Fair":"Unfair", RTSflags.GranFlags.DoThreadMigration?"":"Don't ", RTSflags.GranFlags.DoThreadMigration && RTSflags.GranFlags.DoStealThreadsFirst?" Before Sparks":"", RTSflags.GranFlags.SimplifiedFetch ? "Simplified Fetch" : RTSflags.GranFlags.DoReScheduleOnFetch ? "Reschedule on Fetch" : "Block on Fetch"); if (RTSflags.GranFlags.DoGUMMFetching) if (RTSflags.GranFlags.ThunksToPack) fprintf(gr_file, "Bulk Fetching: Fetch %d Thunks in Each Packet (Packet Size = %d closures)\n", RTSflags.GranFlags.ThunksToPack, RTSflags.GranFlags.packBufferSize); else fprintf(gr_file, "Bulk Fetching: Fetch as many closures as possible (Packet Size = %d closures)\n", RTSflags.GranFlags.packBufferSize); else fprintf(gr_file, "Incremental Fetching: Fetch Exactly One Closure in Each Packet\n"); fprintf(gr_file, "Fetch Strategy(%u):If outstanding fetches %s\n", RTSflags.GranFlags.FetchStrategy, RTSflags.GranFlags.FetchStrategy==0 ? " block (block-on-fetch)": RTSflags.GranFlags.FetchStrategy==1 ? "only run runnable threads": RTSflags.GranFlags.FetchStrategy==2 ? "create threads only from local sparks": RTSflags.GranFlags.FetchStrategy==3 ? "create threads from local or global sparks": RTSflags.GranFlags.FetchStrategy==4 ? "create sparks and steal threads if necessary": "unknown"); if (RTSflags.GranFlags.DoPrioritySparking) fprintf(gr_file, "Priority Sparking (i.e. keep sparks ordered by priority)\n"); if (RTSflags.GranFlags.DoPriorityScheduling) fprintf(gr_file, "Priority Scheduling (i.e. keep threads ordered by priority)\n"); fprintf(gr_file, "Thread Creation Time %lu, Thread Queue Time %lu\n", RTSflags.GranFlags.gran_threadcreatetime, RTSflags.GranFlags.gran_threadqueuetime); fprintf(gr_file, "Thread DeSchedule Time %lu, Thread Schedule Time %lu\n", RTSflags.GranFlags.gran_threaddescheduletime, RTSflags.GranFlags.gran_threadscheduletime); fprintf(gr_file, "Thread Context-Switch Time %lu\n", RTSflags.GranFlags.gran_threadcontextswitchtime); fputs("\n\n--------------------\n\n", gr_file); fputs("Communication Metrics:\n\n", gr_file); fprintf(gr_file, "Latency %lu (1st) %lu (rest), Fetch %lu, Notify %lu (Global) %lu (Local)\n", RTSflags.GranFlags.gran_latency, RTSflags.GranFlags.gran_additional_latency, RTSflags.GranFlags.gran_fetchtime, RTSflags.GranFlags.gran_gunblocktime, RTSflags.GranFlags.gran_lunblocktime); fprintf(gr_file, "Message Creation %lu (+ %lu after send), Message Read %lu\n", RTSflags.GranFlags.gran_mpacktime, RTSflags.GranFlags.gran_mtidytime, RTSflags.GranFlags.gran_munpacktime); fputs("\n\n--------------------\n\n", gr_file); fputs("Instruction Metrics:\n\n", gr_file); fprintf(gr_file, "Arith %lu, Branch %lu, Load %lu, Store %lu, Float %lu, Alloc %lu\n", RTSflags.GranFlags.gran_arith_cost, RTSflags.GranFlags.gran_branch_cost, RTSflags.GranFlags.gran_load_cost, RTSflags.GranFlags.gran_store_cost, RTSflags.GranFlags.gran_float_cost, RTSflags.GranFlags.gran_heapalloc_cost); fputs("\n\n++++++++++++++++++++\n\n", gr_file); if (RTSflags.GranFlags.granSimStats_Binary) grputw(sizeof(TIME)); return (0); } void end_gr_simulation(STG_NO_ARGS) { char time_string[500]; /* ToDo: kill magic constant */ ullong_format_string(CURRENT_TIME, time_string, rtsFalse/*no commas!*/); if (RTSflags.GranFlags.granSimStats_suppressed) return; #if defined(GRAN_CHECK) && defined(GRAN) /* Print event stats */ if (RTSflags.GranFlags.debug & 0x20) { int i; fprintf(stderr,"Event statistics (number of events: %d):\n", noOfEvents); for (i=0; i<=MAX_EVENT; i++) { fprintf(stderr," %s (%d): \t%ld \t%f%%\t%f%%\n", event_names[i],i,event_counts[i], (float)(100*event_counts[i])/(float)(noOfEvents), (i==CONTINUETHREAD ? 0.0 : (float)(100*(event_counts[i])/(float)(noOfEvents-event_counts[CONTINUETHREAD])) )); } fprintf(stderr,"Randomized steals: %u sparks, %u threads \n \t(Sparks: #%u (avg ntimes=%f; avg fl=%f) \n", rs_sp_count, rs_t_count, no_of_steals, (float)ntimes_total/(float)STG_MAX(no_of_steals,1), (float)fl_total/(float)STG_MAX(no_of_steals,1)); fprintf(stderr,"Moved sparks: %d Withered sparks: %d (%.2f %%)\n", tot_sparks,withered_sparks, ( tot_sparks == 0 ? 0 : (float)(100*withered_sparks)/(float)(tot_sparks)) ); /* Print statistics about priority sparking */ if (RTSflags.GranFlags.DoPrioritySparking) { fprintf(stderr,"About Priority Sparking:\n"); fprintf(stderr," Total no. NewThreads: %d Avg. spark queue len: %.2f \n", tot_sq_probes, (float)tot_sq_len/(float)tot_sq_probes); } /* Print statistics about priority sparking */ if (RTSflags.GranFlags.DoPriorityScheduling) { fprintf(stderr,"About Priority Scheduling:\n"); fprintf(stderr," Total no. of StartThreads: %d (non-end: %d) Avg. thread queue len: %.2f\n", tot_add_threads, non_end_add_threads, (float)tot_tq_len/(float)tot_add_threads); } /* Print packet statistics if GUMM fetching is turned on */ if (RTSflags.GranFlags.DoGUMMFetching) { fprintf(stderr,"Packet statistcs:\n"); fprintf(stderr," Total no. of packets: %d Avg. packet size: %.2f \n", tot_packets, (float)tot_packet_size/(float)tot_packets); fprintf(stderr," Total no. of thunks: %d Avg. thunks/packet: %.2f \n", tot_thunks, (float)tot_thunks/(float)tot_packets); fprintf(stderr," Total no. of cuts: %d Avg. cuts/packet: %.2f\n", tot_cuts, (float)tot_cuts/(float)tot_packets); /* if (closure_queue_overflows>0) fprintf(stderr," Number of closure queue overflows: %u\n", closure_queue_overflows); */ } } if (RTSflags.GranFlags.PrintFetchMisses) fprintf(stderr,"Number of fetch misses: %d\n",fetch_misses); # if defined(GRAN_COUNT) fprintf(stderr,"Update count statistics:\n"); fprintf(stderr," Total number of updates: %u\n",nUPDs); fprintf(stderr," Needed to awaken BQ: %u with avg BQ len of: %f\n", nUPDs_BQ,(float)BQ_lens/(float)nUPDs_BQ); fprintf(stderr," Number of PAPs: %u\n",nPAPs); # endif #endif /* GRAN_CHECK */ fprintf(stderr, "Simulation finished after @ %s @ cycles. Look at %s for details.\n", time_string,gr_filename); if (RTSflags.GranFlags.granSimStats) fclose(gr_file); } #elif defined(PAR) FILE *gr_file = NULL; char gr_filename[STATS_FILENAME_MAXLEN]; /* I_ do_sp_profile = 0; */ void init_gr_profiling(rts_argc, rts_argv, prog_argc, prog_argv) char *prog_argv[], *rts_argv[]; int prog_argc, rts_argc; { int i; char *extension = RTSflags.ParFlags.granSimStats_Binary ? "gb" : "gr"; sprintf(gr_filename, GR_FILENAME_FMT_GUM, prog_argv[0], thisPE, extension); if ((gr_file = fopen(gr_filename, "w")) == NULL) { fprintf(stderr, "Can't open activity report file %s\n", gr_filename); EXIT(EXIT_FAILURE); } for (i = 0; i < prog_argc; ++i) { fputs(prog_argv[i], gr_file); fputc(' ', gr_file); } if (rts_argc > 0) { fputs("+RTS ", gr_file); for (i = 0; i < rts_argc; ++i) { fputs(rts_argv[i], gr_file); fputc(' ', gr_file); } } fputc('\n', gr_file); fputs("Start-Time: ", gr_file); fputs(time_str(), gr_file); /* defined in main.lc */ fputc('\n', gr_file); startTime = CURRENT_TIME; if (startTime > LL(1000000000)) { /* This shouldn't overflow twice */ fprintf(gr_file, "PE %2u [%lu%lu]: TIME\n", thisPE, (TIME) (startTime / LL(1000000000)), (TIME) (startTime % LL(1000000000))); } else { fprintf(gr_file, "PE %2u [%lu]: TIME\n", thisPE, (TIME) startTime); } if (RTSflags.ParFlags.granSimStats_Binary) grputw(sizeof(TIME)); } #endif /* PAR */ #endif /* GRAN || PAR */ \end{code}