Refactor PAPI support, and add profiling of multithreaded GC
[ghc-hetmet.git] / rts / Papi.c
1 /* -----------------------------------------------------------------------------
2  * (c) The GHC Team 2006
3  * 
4  * Initialization and use of the PAPI performance monitoring library
5  *
6  *
7  * To add events, or counters specific to your processor, modify
8  *
9  *   init_countable_events
10  *   papi_report
11  *
12  * ---------------------------------------------------------------------------*/
13
14
15 #ifdef USE_PAPI /* ugly */
16
17 #include <papi.h>
18
19 #include "Papi.h"
20 #include "Rts.h"
21 #include "RtsUtils.h"
22 #include "Stats.h"
23 #include "RtsFlags.h"
24 #include "OSThreads.h"
25
/* Lock taken around updates of the aggregated counters (threaded RTS only) */
#ifdef THREADED_RTS
static Mutex papi_counter_mutex;
#endif
30
/* Description of one hardware event that is being counted. */
struct _papi_events {
  int event_code;          /* code handed to PAPI_add_event() */
  const char *event_name;  /* printable name; only ever set from string
                            * literals, hence pointer-to-const */
};
35
/* Native event codes.
 *
 * Beware: these values are Opteron specific.  They were obtained with
 * the papi_avail and papi_native_avail utilities, which is certainly
 * not the official PAPI way of doing things.
 */
#define FR_BR                                0x40000040
#define FR_BR_MIS                            0x40000041
#define FR_BR_MISCOMPARE                     0x40000048
#define DC_ACCESS                            0x40000019
#define DC_MISS                              0x4000001a
#define FR_DISPATCH_STALLS                   0x40000054
#define FR_DISPATCH_STALLS_BR                0x40000055
#define FR_DISPATCH_STALLS_FULL_REORDER      0x40000058
#define FR_DISPATCH_STALLS_FULL_RESERVATION  0x40000059
#define FR_DISPATCH_STALLS_FULL_LS           0x4000005b
#define DC_L2_REFILL_MOES                    0x40001e1b
#define DC_SYS_REFILL_MOES                   0x40001e1c

/* Size of the scratch buffer used for formatting numbers.
 * This is bad: the definition really belongs in a header. */
#define BIG_STRING_LEN 512
57
58
/* Evaluate a PAPI call, store its result in papi_error and, if the result
 * is not PAPI_OK, report the failure through debugBelch.  Execution always
 * continues afterwards.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves like a
 * single statement: "if (c) PAPI_CHECK(x); else ..." parses as expected.
 */
#define PAPI_CHECK(CALL) \
  do { \
    if((papi_error=(CALL)) != PAPI_OK) { \
      debugBelch("PAPI function failed in module %s at line %d with error code %d\n", \
                 __FILE__,__LINE__,papi_error);                         \
    } \
  } while (0)
64
65 /* While PAPI reporting is going on this flag is on */
66 int papi_is_reporting;
67
68 /* Event sets and counter arrays for GC and mutator */
69
70 int MutatorEvents = PAPI_NULL;
71 int GCEvents = PAPI_NULL;
72
73 int papi_error;
74
75 /* Arbitrary, to avoid using malloc */
76 #define MAX_PAPI_EVENTS 10
77
78 int n_papi_events = 0;
79
80
81 /* Events counted during GC and Mutator execution */
82 /* There's a trailing comma, do all C compilers accept that? */
83 static struct _papi_events papi_events[MAX_PAPI_EVENTS];
84 long_long MutatorCounters[MAX_PAPI_EVENTS];
85 long_long GCCounters[MAX_PAPI_EVENTS];
86
87 long_long start_mutator_cycles;
88 long_long start_gc_cycles;
89 long_long mutator_cycles;
90 long_long gc_cycles;
91
92
93
94 static long_long papi_counter(long_long values[],int event);
95 static void papi_add_events(int EventSet);
96
97 /* If you want to add events to count, extend the
98  * init_countable_events and the papi_report function.
99  * Be aware that your processor can count a limited number
100  * of events simultaneously, you can turn on multiplexing
101  * to increase that number, though.
102  */
103 static void
104 init_countable_events(void) 
105 {
106 #define PAPI_ADD_EVENT(EVENT)                           \
107     {                                                   \
108         if (n_papi_events >= MAX_PAPI_EVENTS) {         \
109            barf("too many PAPI events");                \
110         }                                               \
111         papi_events[n_papi_events].event_code = EVENT;  \
112         papi_events[n_papi_events].event_name = #EVENT; \
113         n_papi_events++;                                \
114     }
115
116     PAPI_ADD_EVENT(PAPI_TOT_INS);
117     if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_BRANCH) {
118         PAPI_ADD_EVENT(FR_BR);
119         PAPI_ADD_EVENT(FR_BR_MIS);
120         /* Docs are wrong? Opteron does not count indirect branch misses exclusively */
121         PAPI_ADD_EVENT(FR_BR_MISCOMPARE);
122     } else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_STALLS) {
123         PAPI_ADD_EVENT(FR_DISPATCH_STALLS);
124         PAPI_ADD_EVENT(FR_DISPATCH_STALLS_BR);
125         PAPI_ADD_EVENT(FR_DISPATCH_STALLS_FULL_LS);
126     } else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L1) {
127         PAPI_ADD_EVENT(PAPI_L1_DCA);
128         PAPI_ADD_EVENT(PAPI_L1_DCM);
129     } else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L2) {
130         PAPI_ADD_EVENT(PAPI_L2_DCA);
131         PAPI_ADD_EVENT(PAPI_L2_DCM);
132     } else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CB_EVENTS) {
133         PAPI_ADD_EVENT(DC_L2_REFILL_MOES);
134         PAPI_ADD_EVENT(DC_SYS_REFILL_MOES);
135         PAPI_ADD_EVENT(FR_BR_MIS);
136     } else {
137         PAPI_ADD_EVENT(PAPI_STL_ICY);
138     }
139
140     // We might also consider:
141     //  PAPI_BR_MSP     Conditional branch instructions mispredicted
142     //  PAPI_RES_STL    Cycles stalled on any resource
143 };
144
145
146 static char temp[BIG_STRING_LEN];
147
148 void
149 papi_mut_cycles()
150 {
151     ullong_format_string(mutator_cycles,temp,rtsTrue/*commas*/); 
152     statsPrintf("  (MUT_CYCLES)  : %s\n",temp);
153 }
154
155 void
156 papi_gc_cycles()
157 {
158     ullong_format_string(gc_cycles,temp,rtsTrue/*commas*/); 
159     statsPrintf("  (GC_CYCLES)  : %s\n",temp);
160 }
161
162 /* This function reports counters for GC and mutator */
163 static void
164 papi_report(long_long PapiCounters[])
165 {
166
167 /* Report the value of a counter */
168 #define PAPI_REPORT(EVENTSET,EVENT) \
169   { \
170     ullong_format_string(papi_counter(EVENTSET,EVENT),temp,rtsTrue/*commas*/); \
171     statsPrintf("  (" #EVENT ")  : %s\n",temp);                         \
172   }
173
174 /* Report the value of a counter as a percentage of another counter */
175 #define PAPI_REPORT_PCT(EVENTSET,EVENT,EVENTTOT) \
176   statsPrintf("  (" #EVENT ") %% of (" #EVENTTOT ") : %.1f%%\n", \
177               papi_counter(EVENTSET,EVENT)*100.0/papi_counter(EVENTSET,EVENTTOT))
178
179   /* I need to improve formatting aesthetics */
180     PAPI_REPORT(PapiCounters,PAPI_TOT_INS);
181
182     if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_BRANCH) {
183         PAPI_REPORT(PapiCounters,FR_BR);
184         PAPI_REPORT(PapiCounters,FR_BR_MIS);
185         PAPI_REPORT_PCT(PapiCounters,FR_BR_MIS,FR_BR);
186         PAPI_REPORT_PCT(PapiCounters,FR_BR_MISCOMPARE,FR_BR);
187     }
188
189     else if(RtsFlags.PapiFlags.eventType==PAPI_FLAG_STALLS) {
190         PAPI_REPORT(PapiCounters,FR_DISPATCH_STALLS);
191         PAPI_REPORT(PapiCounters,FR_DISPATCH_STALLS_BR);
192         //PAPI_REPORT_PCT(PapiCounters,FR_DISPATCH_STALLS_BR,PAPI_TOT_CYC);
193         PAPI_REPORT(PapiCounters,FR_DISPATCH_STALLS_FULL_LS);
194         //PAPI_REPORT_PCT(PapiCounters,FR_DISPATCH_STALLS_FULL_LS,PAPI_TOT_CYC);
195     }
196
197     else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L1) {
198         PAPI_REPORT(PapiCounters,PAPI_L1_DCA);
199         PAPI_REPORT(PapiCounters,PAPI_L1_DCM);
200         PAPI_REPORT_PCT(PapiCounters,PAPI_L1_DCM,PAPI_L1_DCA);
201     }
202
203     else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L2) {
204         PAPI_REPORT(PapiCounters,PAPI_L2_DCA);
205         PAPI_REPORT(PapiCounters,PAPI_L2_DCM);
206         PAPI_REPORT_PCT(PapiCounters,PAPI_L2_DCM,PAPI_L2_DCA);
207     }
208
209     else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CB_EVENTS) {
210         PAPI_REPORT(PapiCounters,DC_L2_REFILL_MOES);
211         PAPI_REPORT(PapiCounters,DC_SYS_REFILL_MOES);
212         PAPI_REPORT(PapiCounters,FR_BR_MIS);
213     }
214
215     else {
216         PAPI_REPORT(PapiCounters,PAPI_STL_ICY);
217     }        
218 }
219
220 void
221 papi_stats_report (void)
222 {
223     statsPrintf("  -- CPU Mutator counters --\n");
224     papi_mut_cycles();
225     papi_report(MutatorCounters);
226     
227     statsPrintf("\n  -- CPU GC counters --\n");
228     papi_gc_cycles();
229     papi_report(GCCounters);
230 }
231     
/* Register the calling thread with PAPI, create a new event set in
 * *event_set and add every event listed in papi_events[] to it.
 *
 * All three steps are best-effort: a failing PAPI call is reported (via
 * PAPI_CHECK or papi_add_events) and execution continues.  The result of
 * PAPI_register_thread is checked like that of the other PAPI calls
 * instead of being dropped silently.
 */
void
papi_init_eventset (int *event_set)
{
    PAPI_CHECK( PAPI_register_thread());
    PAPI_CHECK( PAPI_create_eventset(event_set));
    papi_add_events(*event_set);
}
239
240 void
241 papi_init (void)
242 {
243     /* Initialise the performance tracking library */
244     int ver;
245     if ((ver = PAPI_library_init(PAPI_VER_CURRENT)) != PAPI_VER_CURRENT) {
246         if (ver > 0) {
247             errorBelch("PAPI_library_init: wrong version: %x", ver);
248             stg_exit(EXIT_FAILURE);
249         } else {
250             sysErrorBelch("PAPI_library_init");
251             stg_exit(EXIT_FAILURE);
252         }
253     }
254
255 #ifdef THREADED_RTS
256     {
257         int err;
258         if ((err = PAPI_thread_init(osThreadId)) < 0) {
259             barf("PAPI_thread_init: %d",err);
260         }
261
262         initMutex(&papi_counter_mutex);
263     }
264 #endif
265
266     init_countable_events();
267
268     papi_init_eventset(&MutatorEvents);
269     papi_init_eventset(&GCEvents);
270 }
271
272 /* Extract the value corresponding to an event */
273 static long_long
274 papi_counter(long_long values[],int event)
275 {
276   int i;
277   for(i=0;i<n_papi_events;i++) {
278     if(papi_events[i].event_code==event) {
279       return values[i];
280     }
281   }
282   /* Passed a wrong event? */
283   debugBelch("Event %d is not part of event set\n",event);
284   return 0;
285 }
286
287 /* Add the events of papi_events into an event set */
288 static void
289 papi_add_events(int EventSet)
290 {
291   int i;
292   for(i=0;i<n_papi_events;i++) {
293     if((papi_error=PAPI_add_event(EventSet,
294                                   papi_events[i].event_code))
295        != PAPI_OK)
296       debugBelch("Failed adding %s to event set with error code %d\n",
297                  papi_events[i].event_name,papi_error);
298   }
299 }
300
/* Source of the cycle counts.
 *
 * To be consistent with the time metric chosen in Stats.c (elapsed time)
 * we should be using elapsed cycles.  What is used here is only an
 * approximation to the cycles that the program spends.  Note that the
 * counters, in contrast, are virtual and user space.
 */
#define PAPI_cycles PAPI_get_virt_cyc
307
308 void
309 papi_start_mutator_count(void)
310 {
311     ACQUIRE_LOCK(&papi_counter_mutex);
312     PAPI_CHECK( PAPI_start(MutatorEvents));
313     start_mutator_cycles = PAPI_cycles();
314     RELEASE_LOCK(&papi_counter_mutex);
315 }
316
317 void
318 papi_stop_mutator_count(void)
319 {
320     ACQUIRE_LOCK(&papi_counter_mutex);
321     mutator_cycles += PAPI_cycles() - start_mutator_cycles;
322     PAPI_CHECK( PAPI_accum(MutatorEvents,MutatorCounters));
323     PAPI_CHECK( PAPI_stop(MutatorEvents,NULL));
324     RELEASE_LOCK(&papi_counter_mutex);
325 }
326
327 void
328 papi_start_gc_count(void)
329 {
330     ACQUIRE_LOCK(&papi_counter_mutex);
331     PAPI_CHECK( PAPI_start(GCEvents));
332     start_gc_cycles = PAPI_cycles();
333     RELEASE_LOCK(&papi_counter_mutex);
334 }
335
336 void
337 papi_stop_gc_count(void)
338 {
339     ACQUIRE_LOCK(&papi_counter_mutex);
340     PAPI_CHECK( PAPI_accum(GCEvents,GCCounters));
341     PAPI_CHECK( PAPI_stop(GCEvents,NULL));
342     gc_cycles += PAPI_cycles() - start_gc_cycles;
343     RELEASE_LOCK(&papi_counter_mutex);
344 }
345
346
347 void
348 papi_thread_start_gc_count(int event_set)
349 {
350     ACQUIRE_LOCK(&papi_counter_mutex);
351     PAPI_CHECK( PAPI_start(event_set));
352     RELEASE_LOCK(&papi_counter_mutex);
353 }
354
355 void
356 papi_thread_stop_gc_count(int event_set)
357 {
358     ACQUIRE_LOCK(&papi_counter_mutex);
359     PAPI_CHECK( PAPI_accum(event_set,GCCounters));
360     PAPI_CHECK( PAPI_stop(event_set,NULL));
361     RELEASE_LOCK(&papi_counter_mutex);
362 }
363
364 #endif /* USE_PAPI */