• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1Demonstrations of offcputime, the Linux eBPF/bcc version.
2
3
4This program shows stack traces that were blocked, and the total duration they
5were blocked. It works by tracing when threads block and when they return to
6CPU, measuring both the time they were blocked (aka the "off-CPU time") and the
7blocked stack trace and the task name. This data is summarized in kernel by
8summing the blocked time by unique stack trace and task name.
9
10Here is some example output. The -K option was used to only match kernel stacks.
11To explain what we are seeing: the very first stack trace looks like a page
12fault (do_page_fault() etc) from the "chmod" command, and in total was off-CPU
13for 13 microseconds.
14
15# ./offcputime -K
16Tracing off-CPU time (us) of all threads by kernel stack... Hit Ctrl-C to end.
17^C
18    schedule
19    schedule_timeout
20    io_schedule_timeout
21    bit_wait_io
22    __wait_on_bit
23    wait_on_page_bit_killable
24    __lock_page_or_retry
25    filemap_fault
26    __do_fault
27    handle_mm_fault
28    __do_page_fault
29    do_page_fault
30    page_fault
31    chmod
32        13
33
34    schedule
35    rcu_nocb_kthread
36    kthread
37    ret_from_fork
38    ddebug_tables
39    rcuos/0
40        22
41
42    schedule
43    schedule_timeout
44    io_schedule_timeout
45    bit_wait_io
46    __wait_on_bit_lock
47    __lock_page
48    lock_page
49    __do_fault
50    handle_mm_fault
51    __do_page_fault
52    do_page_fault
53    page_fault
54    run
55        27
56
57    schedule
58    schedule_timeout
59    io_schedule_timeout
60    bit_wait_io
61    __wait_on_bit
62    wait_on_page_bit_killable
63    __lock_page_or_retry
64    filemap_fault
65    __do_fault
66    handle_mm_fault
67    __do_page_fault
68    do_page_fault
69    page_fault
70    clear_user
71    padzero
72    load_elf_binary
73    search_binary_handler
74    load_script
75    search_binary_handler
76    do_execveat_common.isra.27
77    run
78        28
79
80    schedule
81    schedule_timeout
82    io_schedule_timeout
83    bit_wait_io
84    __wait_on_bit
85    wait_on_page_bit_killable
86    __lock_page_or_retry
87    filemap_fault
88    __do_fault
89    handle_mm_fault
90    __do_page_fault
91    do_page_fault
92    page_fault
93    run
94        82
95
96    schedule
97    pipe_wait
98    pipe_read
99    __vfs_read
100    vfs_read
101    sys_read
102    entry_SYSCALL_64_fastpath
103    bash
104        94
105
106    schedule
107    rcu_gp_kthread
108    kthread
109    ret_from_fork
110    ddebug_tables
111    rcu_sched
112        104
113
114    schedule
115    schedule_timeout
116    io_schedule_timeout
117    bit_wait_io
118    __wait_on_bit
119    out_of_line_wait_on_bit
120    __wait_on_buffer
121    jbd2_journal_commit_transaction
122    kjournald2
123    kthread
124    ret_from_fork
125    mb_cache_list
126    jbd2/xvda1-8
127        986
128
129    schedule
130    schedule_timeout
131    io_schedule_timeout
132    bit_wait_io
133    __wait_on_bit
134    out_of_line_wait_on_bit
135    __wait_on_buffer
136    jbd2_journal_commit_transaction
137    kjournald2
138    kthread
139    ret_from_fork
140    mb_cache_list
141    jbd2/xvda1-8
142        6630
143
144    schedule
145    schedule_timeout
146    io_schedule_timeout
147    bit_wait_io
148    __wait_on_bit
149    out_of_line_wait_on_bit
150    do_get_write_access
151    jbd2_journal_get_write_access
152    __ext4_journal_get_write_access
153    ext4_mb_mark_diskspace_used
154    ext4_mb_new_blocks
155    ext4_ext_map_blocks
156    ext4_map_blocks
157    ext4_writepages
158    do_writepages
159    __filemap_fdatawrite_range
160    filemap_flush
161    ext4_alloc_da_blocks
162    ext4_rename
163    ext4_rename2
164    supervise
165        6645
166
167    schedule
168    schedule_timeout
169    io_schedule_timeout
170    bit_wait_io
171    __wait_on_bit
172    out_of_line_wait_on_bit
173    do_get_write_access
174    jbd2_journal_get_write_access
175    __ext4_journal_get_write_access
176    __ext4_new_inode
177    ext4_create
178    vfs_create
179    path_openat
180    do_filp_open
181    do_sys_open
182    sys_open
183    entry_SYSCALL_64_fastpath
184    supervise
185        12702
186
187    schedule
188    rcu_nocb_kthread
189    kthread
190    ret_from_fork
191    rcuos/2
192        16036
193
194    schedule
195    rcu_nocb_kthread
196    kthread
197    ret_from_fork
198    rcuos/4
199        24085
200
201    schedule
202    do_wait
203    sys_wait4
204    entry_SYSCALL_64_fastpath
205    run
206        233055
207
208    schedule
209    schedule_timeout
210    io_schedule_timeout
211    bit_wait_io
212    __wait_on_bit
213    wait_on_page_bit
214    truncate_inode_pages_range
215    truncate_inode_pages_final
216    ext4_evict_inode
217    evict
218    iput
219    __dentry_kill
220    dput
221    sys_rename
222    entry_SYSCALL_64_fastpath
223    supervise
224        297113
225
226    schedule
227    schedule_timeout
228    wait_woken
229    n_tty_read
230    tty_read
231    __vfs_read
232    vfs_read
233    sys_read
234    entry_SYSCALL_64_fastpath
235    bash
236        1789866
237
238    schedule
239    schedule_timeout
240    io_schedule_timeout
241    do_blockdev_direct_IO
242    __blockdev_direct_IO
243    blkdev_direct_IO
244    generic_file_read_iter
245    blkdev_read_iter
246    __vfs_read
247    vfs_read
248    sys_read
249    entry_SYSCALL_64_fastpath
250    dd
251        3310763
252
253    schedule
254    smpboot_thread_fn
255    kthread
256    ret_from_fork
257    watchdog/1
258        3999989
259
260    schedule
261    smpboot_thread_fn
262    kthread
263    ret_from_fork
264    watchdog/5
265        3999995
266
267    schedule
268    smpboot_thread_fn
269    kthread
270    ret_from_fork
271    watchdog/4
272        3999996
273
274    schedule
275    smpboot_thread_fn
276    kthread
277    ret_from_fork
278    watchdog/0
279        3999996
280
281    schedule
282    smpboot_thread_fn
283    kthread
284    ret_from_fork
285    watchdog/3
286        3999998
287
288    schedule
289    smpboot_thread_fn
290    kthread
291    ret_from_fork
292    watchdog/7
293        3999999
294
295    schedule
296    smpboot_thread_fn
297    kthread
298    ret_from_fork
299    watchdog/2
300        4000001
301
302    schedule
303    smpboot_thread_fn
304    kthread
305    ret_from_fork
306    watchdog/6
307        4000001
308
309    schedule
310    do_wait
311    sys_wait4
312    entry_SYSCALL_64_fastpath
313    bash
314        4039675
315
316    schedule
317    do_nanosleep
318    hrtimer_nanosleep
319    sys_nanosleep
320    entry_SYSCALL_64_fastpath
321    svscan
322        5000112
323
324    schedule
325    schedule_hrtimeout_range_clock
326    schedule_hrtimeout_range
327    poll_schedule_timeout
328    do_select
329    core_sys_select
330    sys_select
331    entry_SYSCALL_64_fastpath
332    snmpd
333        5998761
334
335    schedule
336    smpboot_thread_fn
337    kthread
338    ret_from_fork
339    migration/3
340        6149779
341
342    schedule
343    schedule_hrtimeout_range_clock
344    schedule_hrtimeout_range
345    poll_schedule_timeout
346    do_select
347    core_sys_select
348    sys_select
349    entry_SYSCALL_64_fastpath
350    ntpd
351        6999832
352
353    schedule
354    worker_thread
355    kthread
356    ret_from_fork
357    kworker/u16:2
358        7131941
359
360    schedule
361    worker_thread
362    kthread
363    ret_from_fork
364    kworker/3:0
365        7999844
366
367    schedule
368    worker_thread
369    kthread
370    ret_from_fork
371    kworker/1:1
372        7999872
373
374    schedule
375    worker_thread
376    kthread
377    ret_from_fork
378    kworker/2:1
379        7999889
380
381    schedule
382    worker_thread
383    kthread
384    ret_from_fork
385    kworker/5:1
386        7999936
387
388    schedule
389    worker_thread
390    kthread
391    ret_from_fork
392    kworker/7:1
393        7999938
394
395    schedule
396    worker_thread
397    kthread
398    ret_from_fork
399    kworker/6:1
400        7999940
401
402    schedule
403    do_nanosleep
404    hrtimer_nanosleep
405    sys_nanosleep
406    entry_SYSCALL_64_fastpath
407    tail
408        8000905
409
410    schedule
411    smpboot_thread_fn
412    kthread
413    ret_from_fork
414    migration/7
415        8197046
416
417    schedule
418    pipe_wait
419    pipe_read
420    __vfs_read
421    vfs_read
422    sys_read
423    entry_SYSCALL_64_fastpath
424    readproctitle
425        8197835
426
427    schedule
428    smpboot_thread_fn
429    kthread
430    ret_from_fork
431    migration/4
432        8201851
433
434    schedule
435    smpboot_thread_fn
436    kthread
437    ret_from_fork
438    migration/2
439        8203375
440
441    schedule
442    smpboot_thread_fn
443    kthread
444    ret_from_fork
445    migration/6
446        8208664
447
448    schedule
449    smpboot_thread_fn
450    kthread
451    ret_from_fork
452    migration/5
453        8209819
454
455    schedule
456    smpboot_thread_fn
457    kthread
458    ret_from_fork
459    ddebug_tables
460    migration/0
461        8211292
462
463    schedule
464    smpboot_thread_fn
465    kthread
466    ret_from_fork
467    migration/1
468        8212100
469
470    schedule
471    worker_thread
472    kthread
473    ret_from_fork
474    kworker/0:2
475        8270305
476
477    schedule
478    rcu_nocb_kthread
479    kthread
480    ret_from_fork
481    rcuos/3
482        8349697
483
484    schedule
485    rcu_nocb_kthread
486    kthread
487    ret_from_fork
488    rcuos/2
489        8363357
490
491    schedule
492    rcu_nocb_kthread
493    kthread
494    ret_from_fork
495    rcuos/1
496        8365338
497
498    schedule
499    schedule_timeout
500    xfs_buf_terminate
501    kthread
502    ret_from_fork
503    xfsaild/md0
504        8371514
505
506    schedule
507    rcu_nocb_kthread
508    kthread
509    ret_from_fork
510    rcuos/4
511        8384013
512
513    schedule
514    rcu_nocb_kthread
515    kthread
516    ret_from_fork
517    rcuos/5
518        8390016
519
520    schedule
521    rcu_nocb_kthread
522    kthread
523    ret_from_fork
524    ddebug_tables
525    rcuos/0
526        8405428
527
528    schedule
529    schedule_timeout
530    rcu_gp_kthread
531    kthread
532    ret_from_fork
533    ddebug_tables
534    rcu_sched
535        8406930
536
537    schedule
538    rcu_nocb_kthread
539    kthread
540    ret_from_fork
541    rcuos/7
542        8409575
543
544    schedule
545    rcu_nocb_kthread
546    kthread
547    ret_from_fork
548    rcuos/6
549        8415062
550
551    schedule
552    schedule_hrtimeout_range_clock
553    schedule_hrtimeout_range
554    poll_schedule_timeout
555    do_select
556    core_sys_select
557    sys_select
558    entry_SYSCALL_64_fastpath
559    offcputime
560        8421478
561
562    schedule
563    worker_thread
564    kthread
565    ret_from_fork
566    kworker/4:0
567        8421492
568
569    schedule
570    schedule_hrtimeout_range_clock
571    schedule_hrtimeout_range
572    poll_schedule_timeout
573    do_select
574    core_sys_select
575    sys_select
576    entry_SYSCALL_64_fastpath
577    sshd
578        14249005
579
580    schedule
581    schedule_hrtimeout_range_clock
582    schedule_hrtimeout_range
583    poll_schedule_timeout
584    do_sys_poll
585    sys_poll
586    entry_SYSCALL_64_fastpath
587    supervise
588        81670888
589
590The last few stack traces aren't very interesting, since they are threads that
591are often blocked off-CPU waiting for work.
592
593Do be somewhat careful with overhead: this is tracing scheduler functions, which
594can be called very frequently. While this uses in-kernel summaries for
595efficiency, the rate of scheduler functions can be very high (> 1,000,000/sec),
596and this is performing stack walks when threads return to CPU. At some point
597the overhead will be measurable.
598
599
600A -p option can be used to filter (in-kernel) on a single process ID. For
601example, only matching PID 26651, which is a running "dd" command:
602
603# ./offcputime -K -p 26651
604Tracing off-CPU time (us) of all threads by kernel stack... Hit Ctrl-C to end.
605^C
606    schedule
607    schedule_timeout
608    io_schedule_timeout
609    do_blockdev_direct_IO
610    __blockdev_direct_IO
611    blkdev_direct_IO
612    generic_file_read_iter
613    blkdev_read_iter
614    __vfs_read
615    vfs_read
616    sys_read
617    entry_SYSCALL_64_fastpath
618    dd
619        2405710
620
621The stack trace shows "dd" is blocked waiting on disk I/O, as expected, for a
622total of 2.4 seconds during tracing.
623
624
625A duration can be added, for example, tracing for 5 seconds only:
626
627# ./offcputime -K -p 26651 5
628Tracing off-CPU time (us) of all threads by kernel stack for 5 secs.
629
630    schedule
631    schedule_timeout
632    io_schedule_timeout
633    do_blockdev_direct_IO
634    __blockdev_direct_IO
635    blkdev_direct_IO
636    generic_file_read_iter
637    blkdev_read_iter
638    __vfs_read
639    vfs_read
640    sys_read
641    entry_SYSCALL_64_fastpath
642    dd
643        4413909
644
645Here, dd was blocked for 4.4 seconds out of 5. Or put differently, likely
646on-CPU for about 12% of the time. Which matches the ratio seen by time(1):
647
648# time dd if=/dev/md0 iflag=direct of=/dev/null bs=1k
649^C108115+0 records in
650108114+0 records out
651110708736 bytes (111 MB) copied, 13.7565 s, 8.0 MB/s
652
653real	0m13.760s
654user	0m0.000s
655sys	0m1.739s
656
657
658A -f option will emit output using the "folded stacks" format, which can be
659read directly by flamegraph.pl from the FlameGraph open source software
660(https://github.com/brendangregg/FlameGraph). Eg:
661
662# ./offcputime -K -f 5
663bash;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;tty_read;n_tty_read;call_rwsem_down_read_failed;rwsem_down_read_failed;schedule 8
664yes;entry_SYSCALL_64_fastpath;sys_write;vfs_write;__vfs_write;tty_write;n_tty_write;call_rwsem_down_read_failed;rwsem_down_read_failed;schedule 14
665run;page_fault;do_page_fault;__do_page_fault;handle_mm_fault;__do_fault;filemap_fault;__lock_page_or_retry;wait_on_page_bit_killable;__wait_on_bit;bit_wait_io;io_schedule_timeout;schedule_timeout;schedule 33
666rcuos/4;ret_from_fork;kthread;rcu_nocb_kthread;schedule 45
667bash;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;pipe_read;pipe_wait;schedule 88
668run;page_fault;do_page_fault;__do_page_fault;handle_mm_fault;__do_fault;filemap_fault;__lock_page_or_retry;wait_on_page_bit_killable;__wait_on_bit;bit_wait_io;io_schedule_timeout;schedule_timeout;schedule 108
669jbd2/xvda1-8;mb_cache_list;ret_from_fork;kthread;kjournald2;jbd2_journal_commit_transaction;__wait_on_buffer;out_of_line_wait_on_bit;__wait_on_bit;bit_wait_io;io_schedule_timeout;schedule_timeout;schedule 828
670jbd2/xvda1-8;mb_cache_list;ret_from_fork;kthread;kjournald2;jbd2_journal_commit_transaction;__wait_on_buffer;out_of_line_wait_on_bit;__wait_on_bit;bit_wait_io;io_schedule_timeout;schedule_timeout;schedule 6201
671supervise;entry_SYSCALL_64_fastpath;sys_rename;dput;__dentry_kill;iput;evict;ext4_evict_inode;truncate_inode_pages_final;truncate_inode_pages_range;wait_on_page_bit;__wait_on_bit;bit_wait_io;io_schedule_timeout;schedule_timeout;schedule 41049
672run;entry_SYSCALL_64_fastpath;sys_wait4;do_wait;schedule 120709
673bash;entry_SYSCALL_64_fastpath;sys_wait4;do_wait;schedule 699320
674ksoftirqd/0;ret_from_fork;kthread;smpboot_thread_fn;schedule 1077529
675bash;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;tty_read;n_tty_read;wait_woken;schedule_timeout;schedule 1362045
676sshd;entry_SYSCALL_64_fastpath;sys_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule 1377627
677migration/0;ddebug_tables;ret_from_fork;kthread;smpboot_thread_fn;schedule 2040753
678snmpd;entry_SYSCALL_64_fastpath;sys_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule 2197568
679migration/5;ret_from_fork;kthread;smpboot_thread_fn;schedule 3079426
680migration/7;ret_from_fork;kthread;smpboot_thread_fn;schedule 3084746
681kworker/6:2;ret_from_fork;kthread;worker_thread;schedule 3940583
682kworker/5:1;ret_from_fork;kthread;worker_thread;schedule 3944892
683kworker/1:2;ret_from_fork;kthread;worker_thread;schedule 3999646
684ntpd;entry_SYSCALL_64_fastpath;sys_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule 3999904
685kworker/u16:0;ret_from_fork;kthread;worker_thread;schedule 3999967
686kworker/7:0;ret_from_fork;kthread;worker_thread;schedule 3999987
687tail;entry_SYSCALL_64_fastpath;sys_nanosleep;hrtimer_nanosleep;do_nanosleep;schedule 4000473
688migration/1;ret_from_fork;kthread;smpboot_thread_fn;schedule 4091150
689migration/4;ret_from_fork;kthread;smpboot_thread_fn;schedule 4095217
690readproctitle;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;pipe_read;pipe_wait;schedule 4108470
691migration/3;ret_from_fork;kthread;smpboot_thread_fn;schedule 4109264
692migration/2;ret_from_fork;kthread;smpboot_thread_fn;schedule 4109280
693migration/6;ret_from_fork;kthread;smpboot_thread_fn;schedule 4111143
694kworker/4:0;ret_from_fork;kthread;worker_thread;schedule 4402350
695kworker/3:0;ret_from_fork;kthread;worker_thread;schedule 4433988
696kworker/2:1;ret_from_fork;kthread;worker_thread;schedule 4636142
697kworker/0:2;ret_from_fork;kthread;worker_thread;schedule 4832023
698rcuos/1;ret_from_fork;kthread;rcu_nocb_kthread;schedule 4974186
699rcuos/5;ret_from_fork;kthread;rcu_nocb_kthread;schedule 4977137
700rcuos/6;ret_from_fork;kthread;rcu_nocb_kthread;schedule 4987769
701rcuos/3;ret_from_fork;kthread;rcu_nocb_kthread;schedule 4992282
702rcuos/4;ret_from_fork;kthread;rcu_nocb_kthread;schedule 4992364
703rcuos/2;ret_from_fork;kthread;rcu_nocb_kthread;schedule 4992714
704rcuos/0;ddebug_tables;ret_from_fork;kthread;rcu_nocb_kthread;schedule 4996504
705rcuos/7;ret_from_fork;kthread;rcu_nocb_kthread;schedule 4998497
706rcu_sched;ddebug_tables;ret_from_fork;kthread;rcu_gp_kthread;schedule_timeout;schedule 5000686
707offcputime;entry_SYSCALL_64_fastpath;sys_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule 5005063
708dd;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;blkdev_read_iter;generic_file_read_iter;blkdev_direct_IO;__blockdev_direct_IO;do_blockdev_direct_IO;io_schedule_timeout;schedule_timeout;schedule 8025599
709supervise;entry_SYSCALL_64_fastpath;sys_poll;do_sys_poll;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule 40835611
710
711The stack traces are shown as single lines, with functions separated by
712semicolons. The first entry is the task name. The 2nd column is the total
713off-CPU time.
714
715I'd save this output to a file, then move it to the system where you'll be
716creating your "off-CPU time flame graphs".
717
718
719USAGE message:
720
721# ./offcputime.py -h
722usage: offcputime.py [-h] [-p PID | -t TID | -u | -k] [-U | -K] [-d] [-f]
723                     [--stack-storage-size STACK_STORAGE_SIZE]
724                     [-m MIN_BLOCK_TIME] [-M MAX_BLOCK_TIME] [--state STATE]
725                     [duration]
726
727Summarize off-CPU time by stack trace
728
729positional arguments:
730  duration              duration of trace, in seconds
731
732optional arguments:
733  -h, --help            show this help message and exit
734  -p PID, --pid PID     trace this PID only
735  -t TID, --tid TID     trace this TID only
736  -u, --user-threads-only
737                        user threads only (no kernel threads)
738  -k, --kernel-threads-only
739                        kernel threads only (no user threads)
740  -U, --user-stacks-only
741                        show stacks from user space only (no kernel space
742                        stacks)
743  -K, --kernel-stacks-only
744                        show stacks from kernel space only (no user space
745                        stacks)
746  -d, --delimited       insert delimiter between kernel/user stacks
747  -f, --folded          output folded format
748  --stack-storage-size STACK_STORAGE_SIZE
749                        the number of unique stack traces that can be stored
750                        and displayed (default 1024)
751  -m MIN_BLOCK_TIME, --min-block-time MIN_BLOCK_TIME
752                        the amount of time in microseconds over which we store
753                        traces (default 1)
754  -M MAX_BLOCK_TIME, --max-block-time MAX_BLOCK_TIME
755                        the amount of time in microseconds under which we
756                        store traces (default U64_MAX)
757  --state STATE         filter on this thread state bitmask (eg, 2 ==
758                        TASK_UNINTERRUPTIBLE) see include/linux/sched.h
759
760examples:
761    ./offcputime             # trace off-CPU stack time until Ctrl-C
762    ./offcputime 5           # trace for 5 seconds only
763    ./offcputime -f 5        # 5 seconds, and output in folded format
764    ./offcputime -m 1000     # trace only events that last more than 1000 usec
765    ./offcputime -M 10000    # trace only events that last less than 10000 usec
766    ./offcputime -p 185      # only trace threads for PID 185
767    ./offcputime -t 188      # only trace thread 188
768    ./offcputime -u          # only trace user threads (no kernel)
769    ./offcputime -k          # only trace kernel threads (no user)
770    ./offcputime -U          # only show user space stacks (no kernel)
771    ./offcputime -K          # only show kernel space stacks (no user)
772