• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/python
2#
3# klockstat traces lock events and display locks statistics.
4#
5# USAGE: klockstat
6#
7
8from __future__ import print_function
9from bcc import BPF, USDT
10import argparse
11import subprocess
12import ctypes as ct
13from time import sleep, strftime
14from datetime import datetime, timedelta
15import errno
16from sys import stderr
17
18examples = """
19    klockstat                           # trace system wide
20    klockstat -d 5                      # trace for 5 seconds only
21    klockstat -i 5                      # display stats every 5 seconds
22    klockstat -p 123                    # trace locks for PID 123
23    klockstat -t 321                    # trace locks for PID 321
24    klockstat -c pipe_                  # display stats only for lock callers with 'pipe_' substring
25    klockstat -S acq_count              # sort lock acquired results on acquired count
26    klockstat -S hld_total              # sort lock held results on total held time
27    klockstat -S acq_count,hld_total    # combination of above
28    klockstat -n 3                      # display 3 locks
29    klockstat -s 3                      # display 3 levels of stack
30"""
31
32# arg validation
33def positive_int(val):
34    try:
35        ival = int(val)
36    except ValueError:
37        raise argparse.ArgumentTypeError("must be an integer")
38
39    if ival < 0:
40        raise argparse.ArgumentTypeError("must be positive")
41    return ival
42
43def positive_nonzero_int(val):
44    ival = positive_int(val)
45    if ival == 0:
46        raise argparse.ArgumentTypeError("must be nonzero")
47    return ival
48
49def stack_id_err(stack_id):
50    # -EFAULT in get_stackid normally means the stack-trace is not available,
51    # Such as getting kernel stack trace in userspace code
52    return (stack_id < 0) and (stack_id != -errno.EFAULT)
53
54parser = argparse.ArgumentParser(
55    description="",
56    formatter_class=argparse.RawDescriptionHelpFormatter,
57    epilog=examples)
58
59time_group = parser.add_mutually_exclusive_group()
60time_group.add_argument("-d", "--duration", type=int,
61    help="total duration of trace in seconds")
62time_group.add_argument("-i", "--interval", type=int,
63    help="print summary at this interval (seconds)")
64parser.add_argument("-n", "--locks", type=int, default=99999999,
65    help="print given number of locks")
66parser.add_argument("-s", "--stacks", type=int, default=1,
67    help="print given number of stack entries")
68parser.add_argument("-c", "--caller",
69    help="print locks taken by given caller")
70parser.add_argument("-S", "--sort",
71    help="sort data on <aq_field,hd_field>, fields: acq_[max|total|count] hld_[max|total|count]")
72parser.add_argument("-p", "--pid",
73    help="trace this PID only")
74parser.add_argument("-t", "--tid",
75    help="trace this TID only")
76parser.add_argument("--stack-storage-size", default=16384,
77    type=positive_nonzero_int,
78    help="the number of unique stack traces that can be stored and "
79         "displayed (default 16384)")
80
81args = parser.parse_args()
82
83program = """
84#include <uapi/linux/ptrace.h>
85
86struct depth_id {
87  u64 id;
88  u64 depth;
89};
90
91BPF_ARRAY(enabled,   u64, 1);
92BPF_HASH(track,      u64, u64);
93BPF_HASH(time_aq,    u64, u64);
94BPF_HASH(lock_depth, u64, u64);
95BPF_HASH(time_held,  struct depth_id, u64);
96BPF_HASH(stack,      struct depth_id, int);
97
98BPF_HASH(aq_report_count, int, u64);
99BPF_HASH(aq_report_max,   int, u64);
100BPF_HASH(aq_report_total, int, u64);
101
102BPF_HASH(hl_report_count, int, u64);
103BPF_HASH(hl_report_max,   int, u64);
104BPF_HASH(hl_report_total, int, u64);
105
106BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);
107
108static bool is_enabled(void)
109{
110    int key = 0;
111    u64 *ret;
112
113    ret = enabled.lookup(&key);
114    return ret && *ret == 1;
115}
116
117static bool allow_pid(u64 id)
118{
119    u32 pid = id >> 32; // PID is higher part
120    u32 tid = id;       // Cast and get the lower part
121
122    FILTER
123
124    return 1;
125}
126
127static int do_mutex_lock_enter(void *ctx, int skip)
128{
129    if (!is_enabled())
130        return 0;
131
132    u64 id = bpf_get_current_pid_tgid();
133
134    if (!allow_pid(id))
135        return 0;
136
137    u64 one = 1, zero = 0;
138
139    track.update(&id, &one);
140
141    u64 *depth = lock_depth.lookup(&id);
142
143    if (!depth) {
144        lock_depth.update(&id, &zero);
145
146        depth = lock_depth.lookup(&id);
147        /* something is wrong.. */
148        if (!depth)
149            return 0;
150    }
151
152    int stackid = stack_traces.get_stackid(ctx, skip);
153    struct depth_id did = {
154      .id    = id,
155      .depth = *depth,
156    };
157    stack.update(&did, &stackid);
158
159    u64 ts = bpf_ktime_get_ns();
160    time_aq.update(&id, &ts);
161
162    *depth += 1;
163    return 0;
164}
165
166static void update_aq_report_count(int *stackid)
167{
168    u64 *count, one = 1;
169
170    count = aq_report_count.lookup(stackid);
171    if (!count) {
172        aq_report_count.update(stackid, &one);
173    } else {
174        *count += 1;
175    }
176}
177
178static void update_hl_report_count(int *stackid)
179{
180    u64 *count, one = 1;
181
182    count = hl_report_count.lookup(stackid);
183    if (!count) {
184        hl_report_count.update(stackid, &one);
185    } else {
186        *count += 1;
187    }
188}
189
190static void update_aq_report_max(int *stackid, u64 time)
191{
192    u64 *max;
193
194    max = aq_report_max.lookup(stackid);
195    if (!max || *max < time)
196        aq_report_max.update(stackid, &time);
197}
198
199static void update_hl_report_max(int *stackid, u64 time)
200{
201    u64 *max;
202
203    max = hl_report_max.lookup(stackid);
204    if (!max || *max < time)
205        hl_report_max.update(stackid, &time);
206}
207
208static void update_aq_report_total(int *stackid, u64 delta)
209{
210    u64 *count, *time;
211
212    count = aq_report_count.lookup(stackid);
213    if (!count)
214        return;
215
216    time = aq_report_total.lookup(stackid);
217    if (!time) {
218        aq_report_total.update(stackid, &delta);
219    } else {
220        *time = *time + delta;
221    }
222}
223
224static void update_hl_report_total(int *stackid, u64 delta)
225{
226    u64 *count, *time;
227
228    count = hl_report_count.lookup(stackid);
229    if (!count)
230        return;
231
232    time = hl_report_total.lookup(stackid);
233    if (!time) {
234        hl_report_total.update(stackid, &delta);
235    } else {
236        *time = *time + delta;
237    }
238}
239
240static int do_mutex_lock_return(void)
241{
242    if (!is_enabled())
243        return 0;
244
245    u64 id = bpf_get_current_pid_tgid();
246
247    if (!allow_pid(id))
248        return 0;
249
250    u64 *one = track.lookup(&id);
251
252    if (!one)
253        return 0;
254
255    track.delete(&id);
256
257    u64 *depth = lock_depth.lookup(&id);
258    if (!depth)
259        return 0;
260
261    struct depth_id did = {
262      .id    = id,
263      .depth = *depth - 1,
264    };
265
266    u64 *aq = time_aq.lookup(&id);
267    if (!aq)
268        return 0;
269
270    int *stackid = stack.lookup(&did);
271    if (!stackid)
272        return 0;
273
274    int stackid_ = *stackid;
275    u64 cur = bpf_ktime_get_ns();
276
277    if (cur > *aq) {
278        int val = cur - *aq;
279        update_aq_report_count(&stackid_);
280        update_aq_report_max(&stackid_, val);
281        update_aq_report_total(&stackid_, val);
282    }
283
284    time_held.update(&did, &cur);
285    return 0;
286}
287
288static int do_mutex_unlock_enter(void)
289{
290    if (!is_enabled())
291        return 0;
292
293    u64 id = bpf_get_current_pid_tgid();
294
295    if (!allow_pid(id))
296        return 0;
297
298    u64 *depth = lock_depth.lookup(&id);
299
300    if (!depth || *depth == 0)
301        return 0;
302
303    *depth -= 1;
304
305    struct depth_id did = {
306      .id    = id,
307      .depth = *depth,
308    };
309
310    u64 *held = time_held.lookup(&did);
311    if (!held)
312        return 0;
313
314    int *stackid = stack.lookup(&did);
315    if (!stackid)
316        return 0;
317
318
319    int stackid_ = *stackid;
320    u64 cur = bpf_ktime_get_ns();
321
322    if (cur > *held) {
323        u64 val = cur - *held;
324        update_hl_report_count(&stackid_);
325        update_hl_report_max(&stackid_, val);
326        update_hl_report_total(&stackid_, val);
327    }
328
329    stack.delete(&did);
330    time_held.delete(&did);
331    return 0;
332}
333"""
334
335program_kprobe = """
336int mutex_unlock_enter(struct pt_regs *ctx)
337{
338    return do_mutex_unlock_enter();
339}
340
341int mutex_lock_return(struct pt_regs *ctx)
342{
343    return do_mutex_lock_return();
344}
345
346int mutex_lock_enter(struct pt_regs *ctx)
347{
348    return do_mutex_lock_enter(ctx, 0);
349}
350"""
351
352program_kfunc = """
353KFUNC_PROBE(mutex_unlock, void *lock)
354{
355    return do_mutex_unlock_enter();
356}
357
358KRETFUNC_PROBE(mutex_lock, void *lock, int ret)
359{
360    return do_mutex_lock_return();
361}
362
363KFUNC_PROBE(mutex_lock, void *lock)
364{
365    return do_mutex_lock_enter(ctx, 3);
366}
367
368"""
369
370program_kfunc_nested = """
371KFUNC_PROBE(mutex_unlock, void *lock)
372{
373    return do_mutex_unlock_enter();
374}
375
376KRETFUNC_PROBE(mutex_lock_nested, void *lock, int ret)
377{
378    return do_mutex_lock_return();
379}
380
381KFUNC_PROBE(mutex_lock_nested, void *lock)
382{
383    return do_mutex_lock_enter(ctx, 3);
384}
385
386"""
387
388is_support_kfunc = BPF.support_kfunc()
389if is_support_kfunc:
390    if BPF.get_kprobe_functions(b"mutex_lock_nested"):
391        program += program_kfunc_nested
392    else:
393        program += program_kfunc
394else:
395    program += program_kprobe
396
397def sort_list(maxs, totals, counts):
398    if (not args.sort):
399        return maxs;
400
401    for field in args.sort.split(','):
402        if (field == "acq_max" or field == "hld_max"):
403            return maxs
404        if (field == "acq_total" or field == "hld_total"):
405            return totals
406        if (field == "acq_count" or field == "hld_count"):
407            return counts
408
409    print("Wrong sort argument: %s", args.sort)
410    exit(-1)
411
412def display(sort, maxs, totals, counts):
413    global missing_stacks
414    global has_enomem
415
416    for k, v in sorted(sort.items(), key=lambda sort: sort[1].value, reverse=True)[:args.locks]:
417        missing_stacks += int(stack_id_err(k.value))
418        has_enomem      = has_enomem or (k.value == -errno.ENOMEM)
419
420        caller = "[Missed Kernel Stack]"
421        stack  = []
422
423        if (k.value >= 0):
424            stack  = list(stack_traces.walk(k.value))
425            caller = b.ksym(stack[1], show_offset=True)
426
427            if (args.caller and caller.find(args.caller.encode())):
428                continue
429
430        avg = totals[k].value / counts[k].value
431
432        print("%40s %10lu %6lu %10lu %10lu" % (caller, avg, counts[k].value, maxs[k].value, totals[k].value))
433
434        for addr in stack[2:args.stacks]:
435            print("%40s" %  b.ksym(addr, show_offset=True))
436
437
438if args.tid:  # TID trumps PID
439    program = program.replace('FILTER',
440        'if (tid != %s) { return 0; }' % args.tid)
441elif args.pid:
442    program = program.replace('FILTER',
443        'if (pid != %s) { return 0; }' % args.pid)
444else:
445    program = program.replace('FILTER', '')
446
447program = program.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size))
448
449b = BPF(text=program)
450
451if not is_support_kfunc:
452    b.attach_kprobe(event="mutex_unlock", fn_name="mutex_unlock_enter")
453    # Depending on whether DEBUG_LOCK_ALLOC is set, the proper kprobe may be either mutex_lock or mutex_lock_nested
454    if BPF.get_kprobe_functions(b"mutex_lock_nested"):
455        b.attach_kretprobe(event="mutex_lock_nested", fn_name="mutex_lock_return")
456        b.attach_kprobe(event="mutex_lock_nested", fn_name="mutex_lock_enter")
457    else:
458        b.attach_kretprobe(event="mutex_lock", fn_name="mutex_lock_return")
459        b.attach_kprobe(event="mutex_lock", fn_name="mutex_lock_enter")
460
461enabled = b.get_table("enabled");
462
463stack_traces = b.get_table("stack_traces")
464aq_counts = b.get_table("aq_report_count")
465aq_maxs   = b.get_table("aq_report_max")
466aq_totals = b.get_table("aq_report_total")
467
468hl_counts = b.get_table("hl_report_count")
469hl_maxs   = b.get_table("hl_report_max")
470hl_totals = b.get_table("hl_report_total")
471
472aq_sort = sort_list(aq_maxs, aq_totals, aq_counts)
473hl_sort = sort_list(hl_maxs, hl_totals, hl_counts)
474
475print("Tracing lock events... Hit Ctrl-C to end.")
476
477# duration and interval are mutualy exclusive
478exiting = 0 if args.interval else 1
479exiting = 1 if args.duration else 0
480
481seconds = 99999999
482if args.interval:
483    seconds = args.interval
484if args.duration:
485    seconds = args.duration
486
487missing_stacks = 0
488has_enomem     = False
489
490while (1):
491    enabled[ct.c_int(0)] = ct.c_int(1)
492
493    try:
494        sleep(seconds)
495    except KeyboardInterrupt:
496        exiting = 1
497
498    enabled[ct.c_int(0)] = ct.c_int(0)
499
500    print("\n%40s %10s %6s %10s %10s" % ("Caller", "Avg Spin", "Count", "Max spin", "Total spin"))
501    display(aq_sort, aq_maxs, aq_totals, aq_counts)
502
503
504    print("\n%40s %10s %6s %10s %10s" % ("Caller", "Avg Hold", "Count", "Max hold", "Total hold"))
505    display(hl_sort, hl_maxs, hl_totals, hl_counts)
506
507    if exiting:
508        break;
509
510    stack_traces.clear()
511    aq_counts.clear()
512    aq_maxs.clear()
513    aq_totals.clear()
514    hl_counts.clear()
515    hl_maxs.clear()
516    hl_totals.clear()
517
518if missing_stacks > 0:
519    enomem_str = " Consider increasing --stack-storage-size."
520    print("WARNING: %d stack traces lost and could not be displayed.%s" %
521        (missing_stacks, (enomem_str if has_enomem else "")),
522        file=stderr)
523