Refactor PAPI support, and add profiling of multithreaded GC
author Simon Marlow <simonmar@microsoft.com>
Wed, 31 Oct 2007 16:30:15 +0000 (16:30 +0000)
committer Simon Marlow <simonmar@microsoft.com>
Wed, 31 Oct 2007 16:30:15 +0000 (16:30 +0000)
rts/Papi.c
rts/Papi.h
rts/RtsStartup.c
rts/Stats.c
rts/sm/GC.c
rts/sm/GC.h

index 78cf9b1..8e8900f 100644 (file)
 
 #ifdef USE_PAPI /* ugly */
 
+#include <papi.h>
+
 #include "Papi.h"
 #include "Rts.h"
 #include "RtsUtils.h"
 #include "Stats.h"
 #include "RtsFlags.h"
+#include "OSThreads.h"
 
+// used to protect the aggregated counters
+#ifdef THREADED_RTS
+static Mutex papi_counter_mutex;
+#endif
 
 struct _papi_events {
   int event_code;
   char * event_name;
 };
 
-#define PAPI_ADD_EVENT(EVENT) \
-    {                        \
-       ASSERT(n_papi_events<MAX_PAPI_EVENTS);     \
-       papi_events[n_papi_events].event_code = EVENT;  \
-       papi_events[n_papi_events].event_name = #EVENT; \
-       n_papi_events++;                                \
-    }
-
-/* Report the value of a counter */
-#define PAPI_REPORT(EVENTSET,EVENT) \
-  { \
-    ullong_format_string(papi_counter(EVENTSET,EVENT),temp,rtsTrue/*commas*/); \
-    statsPrintf("  (" #EVENT ")  : %s\n",temp);                                \
-  }
-
-/* Report the value of a counter as a percentage of another counter */
-#define PAPI_REPORT_PCT(EVENTSET,EVENT,EVENTTOT) \
-  statsPrintf("  (" #EVENT ") %% of (" #EVENTTOT ") : %.1f%%\n", \
-             papi_counter(EVENTSET,EVENT)*100.0/papi_counter(EVENTSET,EVENTTOT))
-
 /* Beware, these counters are Opteron specific
  * I obtained the numbers using the papi_avail
  * and papi_native_avail utilities.
@@ -65,12 +52,16 @@ struct _papi_events {
 #define DC_L2_REFILL_MOES 0x40001e1b
 #define DC_SYS_REFILL_MOES 0x40001e1c
 
-/* Number of counted events, computed from size of papi_events */
-#define N_PAPI_EVENTS n_papi_events
-
 /* This is bad, it should be in a header */
 #define BIG_STRING_LEN 512
 
+
+/* Run a PAPI call, store its status in papi_error, and log a message when
+ * the status is not PAPI_OK.  The failure is only reported; it is not
+ * propagated to the caller.
+ *
+ * Wrapped in do/while(0) so that a use followed by ';' is exactly one
+ * statement and the inner 'if' cannot capture an 'else' of the caller.
+ */
+#define PAPI_CHECK(CALL) \
+  do { \
+    if((papi_error=(CALL)) != PAPI_OK) { \
+      debugBelch("PAPI function failed in module %s at line %d with error code %d\n", \
+                 __FILE__,__LINE__,papi_error);                            \
+    } \
+  } while (0)
+
 /* While PAPI reporting is going on this flag is on */
 int papi_is_reporting;
 
@@ -100,6 +91,9 @@ long_long gc_cycles;
 
 
 
+static long_long papi_counter(long_long values[],int event);
+static void papi_add_events(int EventSet);
+
 /* If you want to add events to count, extend the
  * init_countable_events and the papi_report function.
  * Be aware that your processor can count a limited number
@@ -109,31 +103,43 @@ long_long gc_cycles;
 static void
 init_countable_events(void) 
 {
+/* Append EVENT to papi_events[], storing its code and its spelling (via
+ * stringification) as the name.  Calls barf() when the table already holds
+ * MAX_PAPI_EVENTS entries.
+ *
+ * do/while(0) makes every use followed by ';' a single statement, so the
+ * macro stays safe inside an unbraced if/else.
+ */
+#define PAPI_ADD_EVENT(EVENT)                           \
+    do {                                                \
+        if (n_papi_events >= MAX_PAPI_EVENTS) {         \
+            barf("too many PAPI events");               \
+        }                                               \
+        papi_events[n_papi_events].event_code = EVENT;  \
+        papi_events[n_papi_events].event_name = #EVENT; \
+        n_papi_events++;                                \
+    } while (0)
+
     PAPI_ADD_EVENT(PAPI_TOT_INS);
-    if(RtsFlags.PapiFlags.eventType==PAPI_FLAG_BRANCH) {
+    if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_BRANCH) {
        PAPI_ADD_EVENT(FR_BR);
        PAPI_ADD_EVENT(FR_BR_MIS);
        /* Docs are wrong? Opteron does not count indirect branch misses exclusively */
        PAPI_ADD_EVENT(FR_BR_MISCOMPARE);
-    }
-    if(RtsFlags.PapiFlags.eventType==PAPI_FLAG_STALLS) {
+    } else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_STALLS) {
        PAPI_ADD_EVENT(FR_DISPATCH_STALLS);
        PAPI_ADD_EVENT(FR_DISPATCH_STALLS_BR);
        PAPI_ADD_EVENT(FR_DISPATCH_STALLS_FULL_LS);
-    }
-    if(RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L1) {
+    } else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L1) {
        PAPI_ADD_EVENT(PAPI_L1_DCA);
        PAPI_ADD_EVENT(PAPI_L1_DCM);
-    }
-    if(RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L2) {
+    } else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L2) {
        PAPI_ADD_EVENT(PAPI_L2_DCA);
        PAPI_ADD_EVENT(PAPI_L2_DCM);
-    }
-    if(RtsFlags.PapiFlags.eventType==PAPI_FLAG_CB_EVENTS) {
+    } else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CB_EVENTS) {
        PAPI_ADD_EVENT(DC_L2_REFILL_MOES);
        PAPI_ADD_EVENT(DC_SYS_REFILL_MOES);
        PAPI_ADD_EVENT(FR_BR_MIS);
+    } else {
+       PAPI_ADD_EVENT(PAPI_STL_ICY);
     }
+
+    // We might also consider:
+    //  PAPI_BR_MSP     Conditional branch instructions mispredicted
+    //  PAPI_RES_STL    Cycles stalled on any resource
 };
 
 
@@ -154,21 +160,33 @@ papi_gc_cycles()
 }
 
 /* This function reports counters for GC and mutator */
-void
+static void
 papi_report(long_long PapiCounters[])
 {
 
-    /* I need to improve formatting aesthetics */
+/* Report the value of a counter: look EVENT up in EVENTSET with
+ * papi_counter(), format it (with commas) into 'temp', and print it
+ * labelled with the event's name.
+ *
+ * do/while(0) makes every use followed by ';' a single statement.
+ */
+#define PAPI_REPORT(EVENTSET,EVENT) \
+  do { \
+    ullong_format_string(papi_counter(EVENTSET,EVENT),temp,rtsTrue/*commas*/); \
+    statsPrintf("  (" #EVENT ")  : %s\n",temp);                                \
+  } while (0)
+
+/* Report the value of a counter as a percentage of another counter */
+#define PAPI_REPORT_PCT(EVENTSET,EVENT,EVENTTOT) \
+  statsPrintf("  (" #EVENT ") %% of (" #EVENTTOT ") : %.1f%%\n", \
+             papi_counter(EVENTSET,EVENT)*100.0/papi_counter(EVENTSET,EVENTTOT))
+
+  /* I need to improve formatting aesthetics */
     PAPI_REPORT(PapiCounters,PAPI_TOT_INS);
 
-    if(RtsFlags.PapiFlags.eventType==PAPI_FLAG_BRANCH) {
+    if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_BRANCH) {
        PAPI_REPORT(PapiCounters,FR_BR);
        PAPI_REPORT(PapiCounters,FR_BR_MIS);
        PAPI_REPORT_PCT(PapiCounters,FR_BR_MIS,FR_BR);
        PAPI_REPORT_PCT(PapiCounters,FR_BR_MISCOMPARE,FR_BR);
     }
 
-    if(RtsFlags.PapiFlags.eventType==PAPI_FLAG_STALLS) {
+    else if(RtsFlags.PapiFlags.eventType==PAPI_FLAG_STALLS) {
        PAPI_REPORT(PapiCounters,FR_DISPATCH_STALLS);
        PAPI_REPORT(PapiCounters,FR_DISPATCH_STALLS_BR);
        //PAPI_REPORT_PCT(PapiCounters,FR_DISPATCH_STALLS_BR,PAPI_TOT_CYC);
@@ -176,50 +194,87 @@ papi_report(long_long PapiCounters[])
        //PAPI_REPORT_PCT(PapiCounters,FR_DISPATCH_STALLS_FULL_LS,PAPI_TOT_CYC);
     }
 
-    if(RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L1) {
+    else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L1) {
        PAPI_REPORT(PapiCounters,PAPI_L1_DCA);
        PAPI_REPORT(PapiCounters,PAPI_L1_DCM);
        PAPI_REPORT_PCT(PapiCounters,PAPI_L1_DCM,PAPI_L1_DCA);
     }
 
-    if(RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L2) {
+    else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L2) {
        PAPI_REPORT(PapiCounters,PAPI_L2_DCA);
        PAPI_REPORT(PapiCounters,PAPI_L2_DCM);
        PAPI_REPORT_PCT(PapiCounters,PAPI_L2_DCM,PAPI_L2_DCA);
     }
 
-    if(RtsFlags.PapiFlags.eventType==PAPI_FLAG_CB_EVENTS) {
+    else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CB_EVENTS) {
        PAPI_REPORT(PapiCounters,DC_L2_REFILL_MOES);
        PAPI_REPORT(PapiCounters,DC_SYS_REFILL_MOES);
        PAPI_REPORT(PapiCounters,FR_BR_MIS);
     }
 
+    else {
+       PAPI_REPORT(PapiCounters,PAPI_STL_ICY);
+    }        
 }
 
-
+void
+papi_stats_report (void)
+{   /* Print the PAPI part of the stats output: a section for the mutator
+     * counters followed by a section for the GC counters.
+     */
+    statsPrintf("  -- CPU Mutator counters --\n");
+    papi_mut_cycles(); // body not visible here; presumably reports the mutator cycle count -- TODO confirm
+    papi_report(MutatorCounters); // values accumulated by papi_stop_mutator_count()
+    
+    statsPrintf("\n  -- CPU GC counters --\n");
+    papi_gc_cycles(); // GC counterpart of papi_mut_cycles()
+    papi_report(GCCounters); // values accumulated by the GC stop functions
+}
+    
+/* Register the calling thread with PAPI, create an event set for it, and
+ * add the events listed in papi_events[] to that set.
+ *
+ * event_set: out-parameter that receives the new event set handle.
+ *
+ * Failures are reported through PAPI_CHECK (logged, status left in
+ * papi_error); nothing is returned to the caller.
+ */
+void
+papi_init_eventset (int *event_set)
+{
+    // PAPI_register_thread() returns a status code like the other PAPI
+    // calls; check it instead of silently dropping a failure.
+    PAPI_CHECK( PAPI_register_thread());
+    PAPI_CHECK( PAPI_create_eventset(event_set));
+    papi_add_events(*event_set);
+}
 
 void
-papi_init_eventsets(void)
+papi_init (void)
 {
+    /* Initialise the performance tracking library */
+    int ver;
+    if ((ver = PAPI_library_init(PAPI_VER_CURRENT)) != PAPI_VER_CURRENT) {
+        if (ver > 0) {
+            errorBelch("PAPI_library_init: wrong version: %x", ver);
+            stg_exit(EXIT_FAILURE);
+        } else {
+            sysErrorBelch("PAPI_library_init");
+            stg_exit(EXIT_FAILURE);
+        }
+    }
 
-    init_countable_events();
+#ifdef THREADED_RTS
+    {
+        int err;
+        if ((err = PAPI_thread_init(osThreadId)) < 0) {
+            barf("PAPI_thread_init: %d",err);
+        }
 
-    /* One event set for the mutator and another for the GC */
-    PAPI_CHECK( PAPI_create_eventset(&MutatorEvents));
-    PAPI_CHECK( PAPI_create_eventset(&GCEvents));
+        initMutex(&papi_counter_mutex);
+    }
+#endif
 
-    /* Both sets contain the same events */
-    papi_add_events(MutatorEvents);
-    papi_add_events(GCEvents);
+    init_countable_events();
 
+    papi_init_eventset(&MutatorEvents);
+    papi_init_eventset(&GCEvents);
 }
 
 /* Extract the value corresponding to an event */
-long_long
+static long_long
 papi_counter(long_long values[],int event)
 {
   int i;
-  for(i=0;i<N_PAPI_EVENTS;i++) {
+  for(i=0;i<n_papi_events;i++) {
     if(papi_events[i].event_code==event) {
       return values[i];
     }
@@ -230,11 +285,11 @@ papi_counter(long_long values[],int event)
 }
 
 /* Add the events of papi_events into an event set */
-void
+static void
 papi_add_events(int EventSet)
 {
   int i;
-  for(i=0;i<N_PAPI_EVENTS;i++) {
+  for(i=0;i<n_papi_events;i++) {
     if((papi_error=PAPI_add_event(EventSet,
                                  papi_events[i].event_code))
        != PAPI_OK)
@@ -253,32 +308,57 @@ papi_add_events(int EventSet)
 void
 papi_start_mutator_count(void)
 {
+    ACQUIRE_LOCK(&papi_counter_mutex);
     PAPI_CHECK( PAPI_start(MutatorEvents));
     start_mutator_cycles = PAPI_cycles();
+    RELEASE_LOCK(&papi_counter_mutex);
 }
 
 void
 papi_stop_mutator_count(void)
 {
+    ACQUIRE_LOCK(&papi_counter_mutex);
     mutator_cycles += PAPI_cycles() - start_mutator_cycles;
     PAPI_CHECK( PAPI_accum(MutatorEvents,MutatorCounters));
     PAPI_CHECK( PAPI_stop(MutatorEvents,NULL));
+    RELEASE_LOCK(&papi_counter_mutex);
 }
 
 void
 papi_start_gc_count(void)
 {
-      PAPI_CHECK( PAPI_start(GCEvents));
-      start_gc_cycles = PAPI_cycles();
+    ACQUIRE_LOCK(&papi_counter_mutex);
+    PAPI_CHECK( PAPI_start(GCEvents));
+    start_gc_cycles = PAPI_cycles();
+    RELEASE_LOCK(&papi_counter_mutex);
 }
 
 void
 papi_stop_gc_count(void)
 {
-      gc_cycles += PAPI_cycles() - start_gc_cycles;
-      PAPI_CHECK( PAPI_accum(GCEvents,GCCounters));
-      PAPI_CHECK( PAPI_stop(GCEvents,NULL));
+    ACQUIRE_LOCK(&papi_counter_mutex);
+    PAPI_CHECK( PAPI_accum(GCEvents,GCCounters));
+    PAPI_CHECK( PAPI_stop(GCEvents,NULL));
+    gc_cycles += PAPI_cycles() - start_gc_cycles;
+    RELEASE_LOCK(&papi_counter_mutex);
 }
 
 
+void
+papi_thread_start_gc_count(int event_set)
+{   /* Start counting on event_set, the event set handed in by a GC
+     * sub-thread.  A failure is logged by PAPI_CHECK, not returned.
+     */
+    ACQUIRE_LOCK(&papi_counter_mutex);
+    PAPI_CHECK( PAPI_start(event_set)); // PAPI_CHECK stores the status in the global papi_error
+    RELEASE_LOCK(&papi_counter_mutex);
+}
+
+void
+papi_thread_stop_gc_count(int event_set)
+{   /* Fold the counts of event_set into the shared GCCounters totals,
+     * then stop counting on event_set.  Failures are logged by
+     * PAPI_CHECK, not returned.
+     */
+    ACQUIRE_LOCK(&papi_counter_mutex); // GCCounters is also updated by papi_stop_gc_count()
+    PAPI_CHECK( PAPI_accum(event_set,GCCounters));
+    PAPI_CHECK( PAPI_stop(event_set,NULL)); // NULL: the final values are not read back here
+    RELEASE_LOCK(&papi_counter_mutex);
+}
+
 #endif /* USE_PAPI */
index 835eea6..eaabdf5 100644 (file)
@@ -5,41 +5,21 @@
  *
  * ---------------------------------------------------------------------------*/
 
-
-#include <papi.h>
-
-
-
-#define PAPI_CHECK(CALL) \
-  if((papi_error=(CALL)) != PAPI_OK) { \
-   debugBelch("PAPI function failed in module %s at line %d with error code %d\n", \
-             __FILE__,__LINE__,papi_error);                            \
-  }
-
 /* Check the error value of a PAPI call, reporting an error, if needed */
 extern int papi_error;
 
 /* While PAPI reporting is going on this flag is on */
 extern int papi_is_reporting;
 
-/* Event sets and counter arrays for GC and mutator */
-
-extern int MutatorEvents;
-extern int GCEvents;
-
-extern long_long MutatorCounters[];
-extern long_long GCCounters[];
-
-long_long papi_counter(long_long values[],int event);
-void papi_report(long_long PapiCounters[]);
-void papi_mut_cycles(void);
-void papi_gc_cycles(void);
-void papi_add_events(int EventSet);
-
-void papi_init_eventsets(void);
+void papi_stats_report(void);
+void papi_init_eventset(int * event_set);
+void papi_init(void);
 void papi_start_mutator_count(void);
 void papi_stop_mutator_count(void);
 void papi_start_gc_count(void);
 void papi_stop_gc_count(void);
 
-
+// for multithreaded GC, each sub-thread uses these functions to count
+// events and aggregate them into the main GC counters.
+void papi_thread_start_gc_count(int event_set);
+void papi_thread_stop_gc_count(int event_set);
index 774de72..d1025a3 100644 (file)
@@ -167,28 +167,8 @@ hs_init(int *argc, char **argv[])
     argv++; argc--;
 #endif
 
-    /* Initialise the performance tracking library */
 #ifdef USE_PAPI
-    {
-       int ver;
-       if ((ver = PAPI_library_init(PAPI_VER_CURRENT)) != PAPI_VER_CURRENT) {
-           if (ver > 0) {
-               errorBelch("PAPI_library_init: wrong version: %x", ver);
-               stg_exit(EXIT_FAILURE);
-           } else {
-               sysErrorBelch("PAPI_library_init");
-               stg_exit(EXIT_FAILURE);
-           }
-       }
-    }
-#ifdef THREADED_RTS
-    {
-       int err;
-       if ((err = PAPI_thread_init(osThreadId)) < 0) {
-           barf("PAPI_thread_init: %d",err);
-       }
-    }
-#endif
+    papi_init();
 #endif
 
     /* Set the RTS flags to default values. */
index 1b0abaa..1127b81 100644 (file)
@@ -171,8 +171,6 @@ stat_endInit(void)
        InitElapsedTime = elapsed - ElapsedTimeStart;
     }
 #if USE_PAPI
-    papi_init_eventsets();
-
     /* We start counting events for the mutator
      * when garbage collection starts
      * we switch to the GC event set. */
@@ -606,16 +604,7 @@ stat_exit(int alloc)
             */
 
 #if USE_PAPI
-           /* PAPI reporting, should put somewhere else?
-            * Note that the cycles are counted _after_ the initialization of the RTS -- AR */
-
-           statsPrintf("  -- CPU Mutator counters --\n");
-           papi_mut_cycles();
-           papi_report(MutatorCounters);
-
-           statsPrintf("\n  -- CPU GC counters --\n");
-           papi_gc_cycles();
-           papi_report(GCCounters);
+            papi_stats_report();
 #endif
        }
 
index 2fc3f4d..f248a75 100644 (file)
@@ -40,6 +40,7 @@
 #include "RetainerProfile.h"
 #include "RaiseAsync.h"
 #include "Sparks.h"
+#include "Papi.h"
 
 #include "GC.h"
 #include "Compact.h"
@@ -873,6 +874,10 @@ alloc_gc_thread (gc_thread *t, int n)
 
     init_gc_thread(t);
     
+#ifdef USE_PAPI
+    t->papi_events = -1;
+#endif
+
     t->steps = stgMallocBytes(RtsFlags.GcFlags.generations * 
                                sizeof(step_workspace *), 
                                "initialise_gc_thread");
@@ -1011,7 +1016,20 @@ gc_thread_mainloop (void)
        gct->wakeup = rtsFalse;
        if (gct->exit) break;
 
+#ifdef USE_PAPI
+        // start performance counters in this thread...
+        if (gct->papi_events == -1) {
+            papi_init_eventset(&gct->papi_events);
+        }
+        papi_thread_start_gc_count(gct->papi_events);
+#endif
+
        gc_thread_work();
+
+#ifdef USE_PAPI
+        // count events in this thread towards the GC totals
+        papi_thread_stop_gc_count(gct->papi_events);
+#endif
     }
 }      
 #endif
index 488a2db..5d7924e 100644 (file)
@@ -142,6 +142,10 @@ typedef struct gc_thread_ {
 
     lnat thunk_selector_depth;     // ummm.... not used as of now
 
+#ifdef USE_PAPI
+    int papi_events;
+#endif
+    
 } gc_thread;
 
 extern nat N;