• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/python
2# @lint-avoid-python-3-compatibility-imports
3#
4# compactsnoop  Trace compact zone and print details including issuing PID.
5#       For Linux, uses BCC, eBPF.
6#
7# This uses in-kernel eBPF maps to cache process details (PID and comm) by
8# compact zone begin, as well as a starting timestamp for calculating
9# latency.
10#
11# Copyright (c) 2019 Wenbo Zhang
12# Licensed under the Apache License, Version 2.0 (the "License")
13#
14# 11-NOV-2019   Wenbo Zhang   Created this.
15
16from __future__ import print_function
17from bcc import BPF
18import argparse
19import platform
20from datetime import datetime, timedelta
21
# arguments
# Usage examples shown verbatim at the end of --help output.
examples = """examples:
    ./compactsnoop          # trace all compact stall
    ./compactsnoop -T       # include timestamps
    ./compactsnoop -d 10    # trace for 10 seconds only
    ./compactsnoop -K       # output kernel stack trace
    ./compactsnoop -e       # show extended fields
"""

parser = argparse.ArgumentParser(
    description="Trace compact zone",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples,
)
parser.add_argument("-T", "--timestamp", action="store_true",
                    help="include timestamp on output")
# type=int: the PID is later substituted into the BPF C source, so anything
# that is not a plain integer is rejected here with a normal usage error.
parser.add_argument("-p", "--pid", type=int, help="trace this PID only")
# type=int: a non-numeric duration is reported by argparse instead of
# surfacing as a ValueError traceback from int() below.
parser.add_argument("-d", "--duration", type=int,
                    help="total duration of trace in seconds")
parser.add_argument("-K", "--kernel-stack", action="store_true",
                    help="output kernel stack trace")
parser.add_argument("-e", "--extended_fields", action="store_true",
                    help="show system memory state")
# Hidden option: print the generated BPF program text and exit.
parser.add_argument("--ebpf", action="store_true", help=argparse.SUPPRESS)
args = parser.parse_args()
# Set to a truthy value to print the BPF program text before loading it.
debug = 0
if args.duration:
    # Stored as a timedelta so it can be compared against elapsed wall time.
    args.duration = timedelta(seconds=int(args.duration))
50
# C preprocessor prologues prepended to bpf_text to switch the extended
# fields off or on.  The macro spelling (EXTNEDED_FIELDS) is the one tested
# by the #ifdef guards inside bpf_text, so it must be kept exactly as is.

# Prologue used without -e: make sure the macro is not defined.
NO_EXTENDED = (
    "\n"
    "#ifdef EXTNEDED_FIELDS\n"
    "#undef EXTNEDED_FIELDS\n"
    "#endif\n"
)

# Prologue used with -e: define the macro so the extra fields are compiled in.
EXTENDED = (
    "\n"
    "#define EXTNEDED_FIELDS    1\n"
)
60
# BPF program (C source) compiled by BCC.
#
# State is kept in the `start` hash, keyed by bpf_get_current_pid_tgid():
#   compact_zone entry          -> create the entry, record cc->sync
#   compaction_suitable entry   -> record node id, zone index and order
#                                  (plus watermarks / free pages if extended)
#   fragmentation_index return  -> record the fragmentation index (extended)
#   compaction_suitable return  -> delete the entry unless the result is
#                                  COMPACT_CONTINUE, else stamp the start time
#   compact_zone return         -> submit an event with the latency and
#                                  delete the entry
#
# PID_FILTER is a placeholder that the Python code replaces before the
# program is compiled.  The EXTNEDED_FIELDS spelling matches the EXTENDED /
# NO_EXTENDED prologues and has to stay in sync with them.
# NOTE(review): struct compact_control is declared here instead of coming
# from a kernel header, so its layout presumably has to match the running
# kernel -- confirm before using this on a different kernel version.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
struct node;
#include <linux/compaction.h>

struct compact_control {
    struct list_head freepages;     /* List of free pages to migrate to */
    struct list_head migratepages;  /* List of pages being migrated */
    unsigned long nr_freepages;     /* Number of isolated free pages */
    unsigned long nr_migratepages;  /* Number of pages to migrate */
    unsigned long free_pfn;         /* isolate_freepages search base */
    unsigned long migrate_pfn;      /* isolate_migratepages search base */
    bool sync;                      /* Synchronous migration */
};

// state carried from the entry probes to the compact_zone return probe
struct val_t {
    int nid;
    int idx;
    int order;
    int sync;
#ifdef EXTNEDED_FIELDS
    int fragindex;
    int low;
    int min;
    int high;
    int free;
#endif
    u64 ts;    // compaction begin time
};

// record submitted to user space through the events perf buffer
struct data_t {
    u32 pid;
    u32 tid;
    int nid;
    int idx;
    int order;
    u64 delta;
    u64 ts;    // compaction end time
    int sync;
#ifdef EXTNEDED_FIELDS
    int fragindex;
    int low;
    int min;
    int high;
    int free;
#endif
    int status;
    int stack_id;
    char comm[TASK_COMM_LEN];
};

BPF_HASH(start, u64, struct val_t);
BPF_PERF_OUTPUT(events);
BPF_STACK_TRACE(stack_traces, 2048);

#ifdef CONFIG_NUMA
static inline int zone_to_nid_(struct zone *zone)
{
    int node;
    bpf_probe_read(&node, sizeof(node), &zone->node);
    return node;
}
#else
static inline int zone_to_nid_(struct zone *zone)
{
    return 0;
}
#endif

// #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
static inline int zone_idx_(struct zone *zone)
{
    struct pglist_data *zone_pgdat = NULL;
    bpf_probe_read(&zone_pgdat, sizeof(zone_pgdat), &zone->zone_pgdat);
    return zone - zone_pgdat->node_zones;
}

#ifdef EXTNEDED_FIELDS
// copy the zone watermarks and the free page counter into *valp
static inline void get_all_wmark_pages(struct zone *zone, struct val_t *valp)
{
    u64 watermark[NR_WMARK] = {};

    bpf_probe_read(&watermark, sizeof(watermark), &zone->watermark);
    valp->min = watermark[WMARK_MIN];
    valp->low = watermark[WMARK_LOW];
    valp->high = watermark[WMARK_HIGH];
    bpf_probe_read(&valp->free, sizeof(valp->free),
                   &zone->vm_stat[NR_FREE_PAGES]);
}
#endif

// compact_zone entry: create the per-thread entry and record the sync mode
int trace_compact_zone_entry(struct pt_regs *ctx, struct zone *zone,
                             struct compact_control *cc)
{
#ifdef EXTNEDED_FIELDS
    struct val_t val = { .fragindex=-1000 };
#else
    struct val_t val = { };
#endif
    u64 id = bpf_get_current_pid_tgid();
    PID_FILTER
    val.sync = cc->sync;
    start.update(&id, &val);
    return 0;
}

// compaction_suitable entry: record node, zone index and requested order
int trace_compaction_suitable_entry(struct pt_regs *ctx, struct zone *zone,
                                    int order)
{
    u64 id = bpf_get_current_pid_tgid();
    struct val_t *valp = start.lookup(&id);
    if (valp == NULL) {
        // missed entry
        return 0;
    }
    valp->nid = zone_to_nid_(zone);
    valp->idx = zone_idx_(zone);
    valp->order = order;

#ifdef EXTNEDED_FIELDS
     get_all_wmark_pages(zone, valp);
#endif

    return 0;
}

// fragmentation_index return: keep the returned index (extended mode only)
int trace_fragmentation_index_return(struct pt_regs *ctx)
{
    int ret = PT_REGS_RC(ctx);
    u64 id = bpf_get_current_pid_tgid();
    struct val_t *valp = start.lookup(&id);
    if (valp == NULL) {
        // missed entry
        return 0;
    }
#ifdef EXTNEDED_FIELDS
    valp->fragindex = ret;
#endif
    return 0;
}

// compaction_suitable return: drop the entry unless compaction continues,
// otherwise record the begin timestamp
int trace_compaction_suitable_return(struct pt_regs *ctx)
{
    int ret = PT_REGS_RC(ctx);
    u64 id = bpf_get_current_pid_tgid();
    struct val_t *valp = start.lookup(&id);
    if (valp == NULL) {
        // missed entry
        return 0;
    }
    if (ret != COMPACT_CONTINUE)
        start.delete(&id);
    else
        valp->ts = bpf_ktime_get_ns();
    return 0;
}

// compact_zone return: build the event, submit it and delete the entry
int trace_compact_zone_return(struct pt_regs *ctx)
{
    int ret = PT_REGS_RC(ctx);
    struct data_t data = {};
    u64 ts = bpf_ktime_get_ns();
    u64 id = bpf_get_current_pid_tgid();
    struct val_t *valp = start.lookup(&id);
    if (valp == NULL) {
        // missed entry or unsuitable
        return 0;
    }

    data.delta = ts - valp->ts;
    data.ts = ts / 1000;
    data.pid = id >> 32;
    data.tid = id;
    bpf_get_current_comm(&data.comm, sizeof(data.comm));
    data.nid = valp->nid;
    data.idx = valp->idx;
    data.order = valp->order;
    data.sync = valp->sync;

#ifdef EXTNEDED_FIELDS
    data.fragindex = valp->fragindex;
    data.min = valp->min;
    data.low = valp->low;
    data.high = valp->high;
    data.free = valp->free;
#endif

    data.status = ret;
    data.stack_id = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID);

    events.perf_submit(ctx, &data, sizeof(data));

    start.delete(&id);
    return 0;
}
"""
260
# The zone index -> name table in zone_idx_to_str() only covers x86_64, so
# refuse to run on any other machine type.
if platform.machine() != 'x86_64':
    print("""
          Currently only support x86_64 servers, if you want to use it on
          other platforms, please refer to include/linux/mmzone.h to modify
          zone_idx_to_str to get the right zone type
    """)
    exit()
268
# Select the preprocessor prologue that enables or disables the extended
# fields in the BPF program.
if args.extended_fields:
    bpf_text = EXTENDED + bpf_text
else:
    bpf_text = NO_EXTENDED + bpf_text

if args.pid:
    # The value is pasted into C source, so force it to an integer first;
    # arbitrary text must never reach the compiler.
    try:
        pid_filter = int(args.pid)
    except ValueError:
        exit("invalid PID: %s" % args.pid)
    bpf_text = bpf_text.replace(
        "PID_FILTER", "if (id >> 32 != %d) { return 0; }" % pid_filter)
else:
    bpf_text = bpf_text.replace("PID_FILTER", "")
if debug or args.ebpf:
    # Show the final program text; with --ebpf nothing is loaded afterwards.
    print(bpf_text)
    if args.ebpf:
        exit()
283
# load BPF program
b = BPF(text=bpf_text)

# (attach method, kernel function, BPF handler); attached in this order.
for _attach, _kernel_fn, _handler in (
    (b.attach_kprobe, "compact_zone", "trace_compact_zone_entry"),
    (b.attach_kretprobe, "compact_zone", "trace_compact_zone_return"),
    (b.attach_kprobe, "compaction_suitable",
     "trace_compaction_suitable_entry"),
    (b.attach_kretprobe, "fragmentation_index",
     "trace_fragmentation_index_return"),
    (b.attach_kretprobe, "compaction_suitable",
     "trace_compaction_suitable_return"),
):
    _attach(event=_kernel_fn, fn_name=_handler)

# Stack trace table used by print_event() when -K is given.
stack_traces = b.get_table("stack_traces")
# Timestamp of the first event seen; 0 means "no event yet".
initial_ts = 0
300
def zone_idx_to_str(idx):
    """Return the zone name for a zone index, or str(idx) if it is unknown.

    The table follows include/linux/mmzone.h.
    NOTICE: consider only x86_64 servers.
    """
    known_zones = {
        0: "ZONE_DMA",
        1: "ZONE_DMA32",
        2: "ZONE_NORMAL",
    }
    return known_zones.get(idx, str(idx))
314
def compact_result_to_str(status):
    """Return a short name for a compaction status code.

    The codes follow include/linux/compaction.h; a code that is not in the
    table is returned as str(status).
    """
    status_names = {
        # COMPACT_SKIPPED: compaction didn't start as it was not possible
        # or direct reclaim was more suitable
        0: "skipped",
        # COMPACT_CONTINUE: compaction should continue to another pageblock
        1: "continue",
        # COMPACT_PARTIAL: direct compaction partially compacted a zone and
        # there are suitable pages
        2: "partial",
        # COMPACT_COMPLETE: The full zone was compacted
        3: "complete",
    }
    return status_names.get(status, str(status))
334
# header
# Printed piecewise on one line; the optional columns mirror the optional
# columns emitted by print_event().
if args.timestamp:
    print("%-14s" % "TIME(s)", end=" ")

_base_columns = ("COMM", "PID", "NODE", "ZONE", "ORDER", "MODE")
print("%-14s %-6s %-4s %-12s %-5s %-7s" % _base_columns, end=" ")

if args.extended_fields:
    _extended_columns = ("FRAGIDX", "MIN", "LOW", "HIGH", "FREE")
    print("%-8s %-8s %-8s %-8s %-8s" % _extended_columns, end=" ")

print("%9s %16s" % ("LAT(ms)", "STATUS"))
347
# process event
def print_event(cpu, data, size):
    """Perf buffer callback: print one compaction event as a table row.

    cpu and size are supplied by the perf buffer and are not used; data is
    the raw record, decoded with b["events"].event().

    Columns follow the header: optional time since the first event, then
    comm, PID, node, zone, order and mode, optional extended memory fields,
    and finally latency and status.  With -K the kernel stack is printed
    below the row, followed by a single blank line.
    """
    global initial_ts

    event = b["events"].event(data)

    # The first event seen becomes the zero point for the TIME(s) column.
    if not initial_ts:
        initial_ts = event.ts

    if args.timestamp:
        # event.ts is in microseconds (the BPF side divides ns by 1000).
        delta = event.ts - initial_ts
        print("%-14.9f" % (float(delta) / 1000000), end=" ")

    print("%-14.14s %-6s %-4s %-12s %-5s %-7s" % (
            event.comm.decode("utf-8", "replace"),
            event.pid,
            event.nid,
            zone_idx_to_str(event.idx),
            event.order,
            "SYNC" if event.sync else "ASYNC"), end=" ")
    if args.extended_fields:
        # fragindex is scaled by 1000 on the kernel side of this program.
        print("%-8.3f %-8s %-8s %-8s %-8s" % (
                float(event.fragindex) / 1000,
                event.min,
                event.low,
                event.high,
                event.free), end=" ")
    # event.delta is in nanoseconds; show milliseconds.
    print("%9.3f %16s" % (
        float(event.delta) / 1000000, compact_result_to_str(event.status)))
    if args.kernel_stack:
        # A negative stack_id means the stack could not be captured, so
        # there is nothing to walk.
        if event.stack_id >= 0:
            for addr in stack_traces.walk(event.stack_id):
                sym = b.ksym(addr, show_offset=True)
                # ksym() may hand back bytes; decode so Python 3 does not
                # print the b'...' representation.
                if isinstance(sym, bytes):
                    sym = sym.decode("utf-8", "replace")
                print("\t%s" % sym)
        # One blank line after the whole trace, not after every frame.
        print("")
382
# loop with callback to print_event
b["events"].open_perf_buffer(print_event, page_cnt=64)
start_time = datetime.now()
# With -d, poll with a 1 s timeout so the duration check is re-evaluated
# even when no events arrive; without -d, block until there is an event.
poll_timeout_ms = 1000 if args.duration else -1
while not args.duration or datetime.now() - start_time < args.duration:
    try:
        b.perf_buffer_poll(timeout=poll_timeout_ms)
    except KeyboardInterrupt:
        exit()
391