1 /* -------------------------------------------------------------------------
2 * $Id: ParTicky.c,v 1.1 2001/03/22 03:51:11 hwloidl Exp $
4 * (c) Hans-Wolfgang Loidl, 2000-
6 * Parallel ticky profiling, monitoring basic RTS operations in GUM.
7 * Similar in structure to TICKY_TICKY profiling, but doesn't need a
8 * separate way of building GHC.
9 *-------------------------------------------------------------------------- */
11 #if defined(PAR) && defined(PAR_TICKY)
16 //#include "StoragePriv.h"
18 //#include "Schedule.h"
21 #include "ParTicky.h" // ToDo: move into Rts.h
22 #include "ParallelRts.h"
24 #if defined(PAR) && defined(HAVE_GETRUSAGE)
25 #include <sys/resource.h>
29 extern double ElapsedTimeStart;
31 extern ullong GC_tot_alloc;
32 extern ullong GC_tot_copied;
34 extern lnat MaxResidency; /* in words; for stats only */
35 extern lnat ResidencySamples; /* for stats only */
37 /* ngIplu' {Stats.c}vo' */
38 #define BIG_STRING_LEN 512
40 /* ngIplu' {Ticky.c}vo' */
41 #define INTAVG(a,b) ((b == 0) ? 0.0 : ((double) (a) / (double) (b)))
42 #define PC(a) (100.0 * a)
45 StgDouble avg##thing = INTAVG(tot##thing,ctr##thing)
/* Debug helper: record the current user time into *x.
 * NOTE(review): body lines are missing from this extraction — presumably
 * `*x = usertime();` — confirm against the full file. */
50 set_foo_time(double *x) {
/* Debug helper: print and return the user time elapsed since x was recorded
 * (x is a snapshot previously taken via usertime()/set_foo_time). */
55 get_foo_time(double x) {
56 fprintf(stderr, "get_foo_time: %7.2f (%7.5f,%7.5f) \n",
57 usertime()-x,usertime(),x);
58 return (usertime()-x);
62 static double start_time_GA = 0.0;
63 static double start_mark = 0.0;
64 static double start_pack = 0.0;
65 static double start_unpack = 0.0;
/* Emit a one-shot resource-usage snapshot (via getrusage) at PE startup,
 * when detailed GC stats (-s level > 1) are enabled and a stats file exists.
 * On platforms without getrusage it just prints an apology to stderr. */
68 par_ticky_Par_start (void) {
69 # if !defined(HAVE_GETRUSAGE) || irix_TARGET_OS || defined(_WIN32)
70 fprintf(stderr, "|| sorry don't have RUSAGE\n");
73 FILE *sf = RtsFlags.GcFlags.statsFile;
/* Only bother when the user asked for verbose stats AND a stats file is open. */
77 if (RtsFlags.GcFlags.giveStats>1 && sf != NULL) {
78 getrusage(RUSAGE_SELF, &t);
/* Convert the timeval pairs to floating-point seconds. */
80 utime = t.ru_utime.tv_sec + 1e-6*t.ru_utime.tv_usec;
81 stime = t.ru_stime.tv_sec + 1e-6*t.ru_stime.tv_usec;
/* NOTE(review): the argument line(s) of this fprintf are missing from this
 * extraction (presumably utime, stime) — confirm against the full file. */
83 fprintf(stderr, "|| user time: %5.2f; system time: %5.2f\n",
85 fprintf(stderr, "|| max RSS: %ld; int SM size: %ld; int USM data size: %ld; int USS size: %ld\n",
86 t.ru_maxrss, t.ru_ixrss, t.ru_idrss, t.ru_isrss);
95 struct timeval ru_utime; /* user time used */
96 struct timeval ru_stime; /* system time used */
97 long ru_maxrss; /* maximum resident set size */
98 long ru_ixrss; /* integral shared memory size */
99 long ru_idrss; /* integral unshared data size */
100 long ru_isrss; /* integral unshared stack size */
101 long ru_minflt; /* page reclaims */
102 long ru_majflt; /* page faults */
103 long ru_nswap; /* swaps */
104 long ru_inblock; /* block input operations */
105 long ru_oublock; /* block output operations */
106 long ru_msgsnd; /* messages sent */
107 long ru_msgrcv; /* messages received */
108 long ru_nsignals; /* signals received */
109 long ru_nvcsw; /* voluntary context switches */
110 long ru_nivcsw; /* involuntary context switches */
/* Start timing a rebuild of the GA (global address) tables: snapshot the
 * current user time into the file-scope start_time_GA, which the matching
 * par_ticky_rebuildGAtables_end() reads to compute the elapsed interval. */
116 par_ticky_rebuildGAtables_start(void) {
117 // collect parallel global statistics (currently done together with GC stats)
118 if (RtsFlags.ParFlags.ParStats.Global &&
119 RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
120 //set_foo_time(&start_time_GA);
121 start_time_GA = usertime();
/* Finish timing a GA-table rebuild and fold the results into globalParStats:
 *   n       -- number of live GAs after the rebuild (total/residency counters)
 *   size_GA -- size of the GA table (total/residency counters)
 * Also accumulates the elapsed user time since the matching _start call. */
126 par_ticky_rebuildGAtables_end(nat n, nat size_GA) {
127 // collect parallel global statistics (currently done together with GC stats)
128 if (RtsFlags.ParFlags.ParStats.Global &&
129 RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
/* NOTE(review): `static` here makes foo persist across calls, but it is
 * unconditionally reassigned before use — a plain local would do. */
130 static double foo = 0.0;
131 foo = usertime() - start_time_GA; // get_foo_time(start_time_GA);
132 globalParStats.cnt_rebuild_GA++;
133 globalParStats.tot_rebuild_GA += n;
/* Track the high-water mark of live GAs. */
134 if ( n > globalParStats.res_rebuild_GA )
135 globalParStats.res_rebuild_GA = n;
136 // fprintf(stderr, "rebuildGAtables: footime=%7.2f (%11.5f, %11.5f)\n",
137 // foo, usertime(), start_time_GA);
138 globalParStats.time_rebuild_GA += foo;
139 globalParStats.tot_size_GA += size_GA;
/* Track the high-water mark of the GA table size. */
140 if ( size_GA > globalParStats.res_size_GA )
141 globalParStats.res_size_GA = size_GA;
143 // fprintf(stderr, ">> n: %d; size: %d;; tot: %d; res: %d\n",
144 // n, size_GA, globalParStats.tot_size_GA, globalParStats.res_size_GA);
/* Start timing the marking of local GAs: snapshot user time into
 * start_time_GA for the matching par_ticky_markLocalGAs_end(). */
148 par_ticky_markLocalGAs_start(void) {
149 // collect parallel global statistics (currently done together with GC stats)
150 if (RtsFlags.ParFlags.ParStats.Global &&
151 RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
152 start_time_GA = usertime();
/* Finish timing local-GA marking: n is the number of GAs marked this pass.
 * Accumulates count/total/residency and the elapsed user time. */
157 par_ticky_markLocalGAs_end(nat n) {
158 // collect parallel global statistics (currently done together with GC stats)
159 if (RtsFlags.ParFlags.ParStats.Global &&
160 RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
161 globalParStats.cnt_mark_GA++;
162 globalParStats.tot_mark_GA += n;
/* High-water mark of GAs marked in a single pass. */
163 if ( n > globalParStats.res_mark_GA )
164 globalParStats.res_mark_GA = n;
165 globalParStats.time_mark_GA += usertime() - start_time_GA;
/* Start timing the marking of the spark queue: snapshot user time into the
 * file-scope start_mark for the matching par_ticky_markSparkQueue_end(). */
170 par_ticky_markSparkQueue_start(void) {
171 // collect parallel global statistics (currently done together with GC stats)
172 if (RtsFlags.ParFlags.ParStats.Global &&
173 RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
174 start_mark=usertime();
/* Finish timing spark-queue marking: n is the number of sparks marked.
 * Accumulates elapsed user time, total, and per-pass high-water mark. */
179 par_ticky_markSparkQueue_end(nat n) {
180 // collect parallel global statistics (currently done together with GC stats)
181 if (RtsFlags.ParFlags.ParStats.Global &&
182 RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
183 globalParStats.time_sparks += usertime() - start_mark;
185 globalParStats.tot_sparks_marked += n;
186 if ( n > globalParStats.res_sparks_marked )
187 globalParStats.res_sparks_marked = n;
/* Start timing graph packing (serialisation for sending to another PE):
 * snapshot user time into start_pack for par_ticky_PackNearbyGraph_end(). */
192 par_ticky_PackNearbyGraph_start (void) {
193 if (RtsFlags.ParFlags.ParStats.Global &&
194 RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
195 start_pack=usertime();
/* Finish timing graph packing:
 *   n      -- size of the packet just built (units per caller; added to
 *             tot_packet_size, so presumably bytes/words — confirm at caller)
 *   thunks -- number of thunks included in the packet
 * Accumulates elapsed pack time, packet count, and size/thunk high-water marks. */
200 par_ticky_PackNearbyGraph_end(nat n, nat thunks) {
201 // collect parallel global statistics (currently done together with GC stats)
202 if (RtsFlags.ParFlags.ParStats.Global &&
203 RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
204 globalParStats.time_pack += usertime() - start_pack;
206 globalParStats.tot_packets++;
207 globalParStats.tot_packet_size += n;
208 if ( n > globalParStats.res_packet_size )
209 globalParStats.res_packet_size = n;
210 globalParStats.tot_thunks += thunks;
211 if ( thunks > globalParStats.res_thunks )
212 globalParStats.res_thunks = thunks;
/* Start timing graph unpacking (deserialising a received packet):
 * snapshot user time into start_unpack for par_ticky_UnpackGraph_end(). */
217 par_ticky_UnpackGraph_start (void) {
218 if (RtsFlags.ParFlags.ParStats.Global &&
219 RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
220 start_unpack=usertime();
/* Finish timing graph unpacking: n is the received packet size, thunks the
 * number of thunks it contained. Mirrors par_ticky_PackNearbyGraph_end but
 * updates the rec_* (received) counters.
 * NOTE(review): the high-water marks updated here are res_packet_size /
 * res_thunks — the same fields the send path updates — yet the report in
 * globalParStat_exit prints rec_res_packet_size / rec_res_thunks for the
 * receive side. Likely these should update the rec_res_* fields; confirm
 * against the globalParStats declaration. */
225 par_ticky_UnpackGraph_end(nat n, nat thunks) {
226 // collect parallel global statistics (currently done together with GC stats)
227 if (RtsFlags.ParFlags.ParStats.Global &&
228 RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
229 globalParStats.time_unpack += usertime() - start_unpack;
231 globalParStats.rec_packets++;
232 globalParStats.rec_packet_size += n;
234 if ( n > globalParStats.res_packet_size )
235 globalParStats.res_packet_size = n;
237 globalParStats.rec_thunks += thunks;
239 if ( thunks > globalParStats.res_thunks )
240 globalParStats.res_thunks = thunks;
/* Sample the sizes of this PE's thread pool (run queue) and spark pool,
 * accumulating totals, sample counts, empty-sample counts and high-water
 * marks in globalParStats. Called per scheduling event (TP = thread pool). */
246 par_ticky_TP (void) {
248 nat tp_size, sp_size; // stats only
250 // Global stats gathering
251 /* the spark pool for the current PE */
252 pool = &(MainRegTable.rSparks); // generalise to cap = &MainRegTable
254 // Global statistics: residency of thread and spark pool
255 if (RtsFlags.ParFlags.ParStats.Global &&
256 RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
/* +1 accounts for the TSO just popped off the run queue. */
257 tp_size = run_queue_len() + 1; // add the TSO just popped
258 // No: there may be many blocked threads being awoken at the same time
259 // ASSERT(tp_size <= RtsFlags.ParFlags.maxThreads);
260 globalParStats.tot_tp += tp_size;
261 globalParStats.emp_tp += (tp_size==0) ? 1 : 0;
262 globalParStats.cnt_tp++;
263 if ( tp_size > globalParStats.res_tp)
264 globalParStats.res_tp = tp_size;
265 // fprintf(stderr, "run_queue_len() = %d (max %d)\n", run_queue_len(), globalParStats.res_tp);
/* Same bookkeeping for the spark pool. */
266 sp_size = spark_queue_len(pool);
267 //ASSERT(sp_size <= RtsFlags.ParFlags.maxLocalSparks);
268 globalParStats.tot_sp += sp_size;
269 globalParStats.emp_sp += (sp_size==0) ? 1 : 0;
270 globalParStats.cnt_sp++;
271 if ( sp_size > globalParStats.res_sp)
272 globalParStats.res_sp = sp_size;
273 // fprintf(stderr, "spark_queue_len(pool) = %d (max %d)\n", spark_queue_len(pool), globalParStats.res_sp);
/* Print the accumulated globalParStats report to the GC stats file at RTS
 * shutdown: thread/spark counts, pool residencies, message traffic, global
 * heap (GA) statistics, pack/unpack volumes and times, and scheduler events.
 * Prints nothing unless both -qP-style global par stats and GC stats
 * (e.g. -sstderr) are enabled. */
278 globalParStat_exit(void)
280 FILE *sf = RtsFlags.GcFlags.statsFile;
283 /* print only if GC stats is enabled, too; i.e. -sstderr */
284 if (!(RtsFlags.ParFlags.ParStats.Global &&
285 RtsFlags.GcFlags.giveStats > NO_GC_STATS))
/* Wall-clock time since program start; `time` below is presumably the
 * corresponding user-time total declared in a line missing from this
 * extraction — otherwise it would name the libc time() function.
 * NOTE(review): confirm the local `time` declaration in the full file. */
289 etime = elapsedtime() - ElapsedTimeStart;
290 // fprintf(stderr, "foo=%7.2f\n", time);
/* Scratch buffer for comma-formatted ullong values. */
293 char temp[BIG_STRING_LEN];
295 // GC_tot_alloc += alloc;
298 fprintf(sf, "%11d threads created\n",
299 globalParStats.tot_threads_created);
301 Would need to add a ++ to the par macro to use this
303 fprintf(sf, "%11d sparks created\n",
304 globalParStats.tot_sparks_created);
305 fprintf(sf, "%11d sparks ignored\n",
306 globalParStats.tot_sparks_ignored);
/* Thread-pool residency: max, average, and fraction of empty samples. */
308 ullong_format_string(globalParStats.res_tp, temp, rtsTrue/*commas*/);
309 fprintf(sf, "%11s thread pool residency", temp);
310 fprintf(sf, " (avg: %3.2f; %d times (%2.2f%%) of %d empty)\n",
311 (double)globalParStats.tot_tp/(double)globalParStats.cnt_tp,
312 globalParStats.emp_tp,
313 globalParStats.emp_tp*100.0/(double)globalParStats.cnt_tp,
314 globalParStats.cnt_tp);
/* Spark-pool residency, same shape as the thread-pool line. */
315 ullong_format_string(globalParStats.res_sp, temp, rtsTrue/*commas*/);
316 fprintf(sf, "%11s spark pool residency", temp);
318 fprintf(sf, " (avg: %3.2f; %d times (%2.2f%%) of %d empty)\n",
319 (double)globalParStats.tot_sp/(double)globalParStats.cnt_sp,
320 globalParStats.emp_sp,
321 globalParStats.emp_sp*100.0/(double)globalParStats.cnt_sp,
322 globalParStats.cnt_sp);
/* Message traffic, sent then received, broken down by message kind. */
323 //ullong_format_string(globalParStats.tot_fishes, temp, rtsTrue/*commas*/);
324 fprintf(sf, "%11d messages sent (%d fish, %d fetch, %d resume, %d schedule",
325 globalParStats.tot_fish_mess+globalParStats.tot_fetch_mess+
326 globalParStats.tot_resume_mess+globalParStats.tot_schedule_mess,
327 globalParStats.tot_fish_mess, globalParStats.tot_fetch_mess,
328 globalParStats.tot_resume_mess, globalParStats.tot_schedule_mess);
330 fprintf(sf, "%d revals", globalParStats.tot_reval_mess);
333 fprintf(sf, "%11d messages received (%d fish, %d fetch, %d resume, %d schedule",
334 globalParStats.rec_fish_mess+globalParStats.rec_fetch_mess+
335 globalParStats.rec_resume_mess+globalParStats.rec_schedule_mess,
336 globalParStats.rec_fish_mess, globalParStats.rec_fetch_mess,
337 globalParStats.rec_resume_mess, globalParStats.rec_schedule_mess);
339 fprintf(sf, "%d revals", globalParStats.rec_reval_mess);
/* Global heap (GA) statistics, scaled from words to bytes via sizeof(W_). */
343 ullong_format_string(globalParStats.tot_size_GA*sizeof(W_), temp, rtsTrue/*commas*/);
344 fprintf(sf, "%11s bytes of global heap in total ", temp);
345 fprintf(sf, "(%5.2f%% of total allocated heap)\n",
/* BUG(review): `a*100.0/(double)b*sizeof(W_)` parses as
 * ((a*100.0/b)*sizeof(W_)) — the trailing *sizeof(W_) multiplies the
 * quotient instead of the denominator, inflating the percentage by a
 * factor of sizeof(W_)^2 overall. Since the numerator already carries
 * sizeof(W_), the intended expression is likely
 * tot_size_GA*100.0/(double)GC_tot_alloc (both sides in words). */
346 globalParStats.tot_size_GA*sizeof(W_)*100.0/(double)GC_tot_alloc*sizeof(W_));
347 ullong_format_string(globalParStats.res_size_GA*sizeof(W_), temp, rtsTrue/*commas*/);
348 fprintf(sf, "%11s bytes global heap residency ", temp);
349 fprintf(sf, "(%5.2f%% of max heap residency)\n",
/* BUG(review): same precedence problem as line 346 above, here against
 * MaxResidency. */
350 globalParStats.res_size_GA*sizeof(W_)*100.0/(double)MaxResidency*sizeof(W_));
352 //ullong_format_string(globalParStats.res_mark_GA, temp, rtsTrue/*commas*/);
353 //fprintf(sf, "%11s GAs residency in GALA table ", temp);
354 // ullong_format_string(globalParStats.tot_mark_GA, temp, rtsTrue/*commas*/);
355 //fprintf(sf, "(avg %5.2f; %d samples)\n",
356 // (double)globalParStats.tot_mark_GA/(double)globalParStats.cnt_mark_GA,
357 // globalParStats.cnt_mark_GA);
359 ullong_format_string(globalParStats.local_alloc_GA, temp, rtsTrue/*commas*/);
360 fprintf(sf, "%11s GAs locally allocated (calls to makeGlobal)\n", temp);
362 ullong_format_string(globalParStats.tot_rebuild_GA, temp, rtsTrue/*commas*/);
363 fprintf(sf, "%11s live GAs in total (after rebuilding tables)\n", temp);
364 ullong_format_string(globalParStats.res_rebuild_GA, temp, rtsTrue/*commas*/);
365 fprintf(sf, "%11s GAs residency (after rebuilding tables) ", temp);
/* NOTE(review): these averages divide by cnt_* with no zero guard; if no
 * samples were taken this is a division by zero (the INTAVG macro at the
 * top of the file guards exactly this case but is not used here). */
366 fprintf(sf, "(avg %5.2f; %d samples)\n",
367 (double)globalParStats.tot_rebuild_GA/(double)globalParStats.cnt_rebuild_GA,
368 globalParStats.cnt_rebuild_GA);
369 ullong_format_string(globalParStats.res_free_GA, temp, rtsTrue/*commas*/);
370 fprintf(sf, "%11s residency of freeing GAs", temp);
371 fprintf(sf, " (avg %5.2f; %d samples)\n",
372 (double)globalParStats.tot_free_GA/(double)globalParStats.cnt_free_GA,
373 globalParStats.cnt_free_GA);
/* Timing section: each phase as absolute seconds and as % of user (`time`)
 * and elapsed (`etime`) totals. */
375 fprintf(sf, "%11.2fs spent marking GAs (%7.2f%% of %7.2fs)\n",
376 globalParStats.time_mark_GA,
377 globalParStats.time_mark_GA*100./time, time);
378 fprintf(sf, "%11.2fs spent rebuilding GALA tables (%7.2f%% of %7.2fs; %7.2f%% of %7.2fs)\n",
379 globalParStats.time_rebuild_GA,
380 globalParStats.time_rebuild_GA*100./time, time,
381 globalParStats.time_rebuild_GA*100./etime, etime);
383 ullong_format_string(globalParStats.tot_sparks_marked, temp, rtsTrue/*commas*/);
384 fprintf(sf, "%11s sparks marked\t", temp);
385 ullong_format_string(globalParStats.res_sparks_marked, temp, rtsTrue/*commas*/);
386 fprintf(sf, "%6s spark mark residency\n", temp);
387 fprintf(sf, "%11.2fs spent marking sparks (%7.2f%% of %7.2fs; %7.2f%% of %7.2fs elapsed)\n",
388 globalParStats.time_sparks,
389 globalParStats.time_sparks*100./time, time,
390 globalParStats.time_sparks*100./etime, etime);
/* Packing (send) volumes and time. */
394 ullong_format_string(globalParStats.tot_packets, temp, rtsTrue/*commas*/);
395 fprintf(sf, "%11s packets sent\n", temp);
396 ullong_format_string(globalParStats.tot_packet_size, temp, rtsTrue/*commas*/);
397 fprintf(sf, "%11s bytes of graph sent in total (max %d; avg %.2f)\n",
398 temp, globalParStats.res_packet_size,
399 (double)globalParStats.tot_packet_size/(double)globalParStats.tot_packets);
400 ullong_format_string(globalParStats.tot_thunks, temp, rtsTrue/*commas*/);
401 fprintf(sf, "%11s thunks sent in total (max %d; avg %.2f)\n",
402 temp, globalParStats.res_thunks,
403 (double)globalParStats.tot_thunks/(double)globalParStats.tot_packets);
404 fprintf(sf, "%11.2fs spent packing graph structures (%7.2f%% of %7.2fs; %7.2f%% of %7.2fs elapsed)\n",
405 globalParStats.time_pack,
406 globalParStats.time_pack*100./time, time,
407 globalParStats.time_pack*100./etime, etime);
/* Unpacking (receive) volumes and time. */
409 ullong_format_string(globalParStats.rec_packets, temp, rtsTrue/*commas*/);
410 fprintf(sf, "%11s packets received\n", temp);
411 ullong_format_string(globalParStats.rec_packet_size, temp, rtsTrue/*commas*/);
412 fprintf(sf, "%11s bytes of graph received in total (max %d; avg %.2f)\n",
413 temp, globalParStats.rec_res_packet_size,
414 (double)globalParStats.rec_packet_size/(double)globalParStats.rec_packets);
415 ullong_format_string(globalParStats.rec_thunks, temp, rtsTrue/*commas*/);
416 fprintf(sf, "%11s thunks received in total (max %d; avg %.2f)\n",
417 temp, globalParStats.rec_res_thunks,
418 (double)globalParStats.rec_thunks/(double)globalParStats.rec_packets);
419 fprintf(sf, "%11.2fs spent unpacking graph structures (%7.2f%% of %7.2fs; %7.2f%% of %7.2fs elapsed)\n",
420 globalParStats.time_unpack,
421 globalParStats.time_unpack*100./time, time,
422 globalParStats.time_unpack*100./etime, etime);
426 ullong_format_string(globalParStats.tot_arrs, temp, rtsTrue/*commas*/);
427 fprintf(sf, "%11s bytearrays sent; ", temp);
428 ullong_format_string(globalParStats.tot_arr_size, temp, rtsTrue/*commas*/);
/* NOTE(review): the `temp,` argument line for this fprintf (original line
 * 430) is missing from this extraction. */
429 fprintf(sf, " %s bytes in total (avg %.2f)\n",
431 (double)globalParStats.tot_arr_size/(double)globalParStats.tot_arrs);
435 fprintf(sf, "%11d yields, %d stack overflows, %d heap overflows\n",
436 globalParStats.tot_yields, globalParStats.tot_stackover,
437 globalParStats.tot_heapover);
441 //fprintf(stderr, "Printing this pathetic statistics took %7.2fs (start @ %7.2f)\n",
442 // usertime()-time, time);
445 // Open filehandle needed by other stats printing fcts