• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/python
2# @lint-avoid-python-3-compatibility-imports
3#
4# compactsnoop  Trace compact zone and print details including issuing PID.
5#       For Linux, uses BCC, eBPF.
6#
7# This uses in-kernel eBPF maps to cache process details (PID and comm) by
8# compact zone begin, as well as a starting timestamp for calculating
9# latency.
10#
11# Copyright (c) 2019 Wenbo Zhang
12# Licensed under the Apache License, Version 2.0 (the "License")
13#
14# 11-NOV-2019   Wenbo Zhang   Created this.
15
16from __future__ import print_function
17from bcc import BPF
18import argparse
19import platform
20from datetime import datetime, timedelta
21import sys
22
# arguments
examples = """examples:
    ./compactsnoop          # trace all compact stall
    ./compactsnoop -T       # include timestamps
    ./compactsnoop -d 10    # trace for 10 seconds only
    ./compactsnoop -K       # output kernel stack trace
    ./compactsnoop -e       # show extended fields
"""


def _positive_int(value):
    """argparse ``type=`` helper: accept only a strictly positive integer.

    Raises argparse.ArgumentTypeError (which argparse turns into a normal
    usage error) for anything that is not an integer greater than zero.
    """
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError("invalid integer value: %r" % value)
    if number <= 0:
        raise argparse.ArgumentTypeError("must be greater than 0: %r" % value)
    return number


parser = argparse.ArgumentParser(
    description="Trace compact zone",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples,
)
parser.add_argument("-T", "--timestamp", action="store_true",
        help="include timestamp on output")
# The PID is later substituted verbatim into the BPF C source, so it must be
# validated as a number here instead of being accepted as a free-form string.
parser.add_argument("-p", "--pid", type=_positive_int,
        help="trace this PID only")
# type=int makes argparse report a bad duration as a usage error instead of
# the script dying with a ValueError traceback.
parser.add_argument("-d", "--duration", type=int,
        help="total duration of trace in seconds")
parser.add_argument("-K", "--kernel-stack", action="store_true",
        help="output kernel stack trace")
parser.add_argument("-e", "--extended_fields", action="store_true",
        help="show system memory state")
parser.add_argument("--ebpf", action="store_true", help=argparse.SUPPRESS)
args = parser.parse_args()
debug = 0
# Keep the duration as a timedelta so it can be compared with elapsed time.
if args.duration is not None:
    args.duration = timedelta(seconds=args.duration)
51
# Preamble prepended to bpf_text when extended fields are NOT requested:
# makes sure the EXTENDED_FIELDS macro is not defined.
NO_EXTENDED = """
#ifdef EXTENDED_FIELDS
#undef EXTENDED_FIELDS
#endif
"""

# Preamble prepended to bpf_text when extended fields ARE requested.
EXTENDED = """
#define EXTENDED_FIELDS    1
"""

# eBPF program source. The PID_FILTER placeholder (two occurrences) is
# replaced by the Python code before the program is compiled.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/compaction.h>

// State cached per thread between the probes of one compaction run.
struct val_t {
    int nid;        // NUMA node id of the zone
    int idx;        // index of the zone inside its node
    int order;      // order passed to mm_compaction_suitable
    int sync;       // sync flag passed to mm_compaction_begin
#ifdef EXTENDED_FIELDS
    int fragindex;  // return value of fragmentation_index()
    int low;        // low watermark plus watermark boost
    int min;        // min watermark plus watermark boost
    int high;       // high watermark plus watermark boost
    int free;       // zone vm_stat[NR_FREE_PAGES]
#endif
    u64 ts;    // compaction begin time
};

// Event record sent to user space when a compaction run ends.
struct data_t {
    u32 pid;
    u32 tid;
    int nid;
    int idx;
    int order;
    u64 delta;  // end time minus begin time, in ns
    u64 ts;    // compaction end time
    int sync;
#ifdef EXTENDED_FIELDS
    int fragindex;
    int low;
    int min;
    int high;
    int free;
#endif
    int status;
    int stack_id;
    char comm[TASK_COMM_LEN];
};

// In-flight compactions, keyed by the value of bpf_get_current_pid_tgid().
BPF_HASH(start, u64, struct val_t);
BPF_PERF_OUTPUT(events);
BPF_STACK_TRACE(stack_traces, 2048);

#ifdef CONFIG_NUMA
static inline int zone_to_nid_(struct zone *zone)
{
    int node;
    bpf_probe_read_kernel(&node, sizeof(node), &zone->node);
    return node;
}
#else
static inline int zone_to_nid_(struct zone *zone)
{
    return 0;
}
#endif

// #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
static inline int zone_idx_(struct zone *zone)
{
    struct pglist_data *zone_pgdat = NULL;
    bpf_probe_read_kernel(&zone_pgdat, sizeof(zone_pgdat), &zone->zone_pgdat);
    return zone - zone_pgdat->node_zones;
}

#ifdef EXTENDED_FIELDS
// Copy the zone watermarks (each plus the watermark boost) and the free page
// counter into *valp.
static inline void get_all_wmark_pages(struct zone *zone, struct val_t *valp)
{
    u64 _watermark[NR_WMARK] = {};
    u64 watermark_boost = 0;

    bpf_probe_read_kernel(&_watermark, sizeof(_watermark), &zone->_watermark);
    bpf_probe_read_kernel(&watermark_boost, sizeof(watermark_boost),
                    &zone->watermark_boost);
    valp->min = _watermark[WMARK_MIN] + watermark_boost;
    valp->low = _watermark[WMARK_LOW] + watermark_boost;
    valp->high = _watermark[WMARK_HIGH] + watermark_boost;
    bpf_probe_read_kernel(&valp->free, sizeof(valp->free),
                    &zone->vm_stat[NR_FREE_PAGES]);
}
#endif

// Build an event from the cached state of the current thread, send it to
// user space and drop the cached state.
static inline void submit_event(void *ctx, int status)
{
    struct data_t data = {};
    u64 ts = bpf_ktime_get_ns();
    u64 id = bpf_get_current_pid_tgid();
    struct val_t *valp = start.lookup(&id);
    if (valp == NULL) {
        // missed entry
        return;
    }

    data.delta = ts - valp->ts;
    data.ts = ts / 1000;
    data.pid = id >> 32;
    data.tid = id;
    bpf_get_current_comm(&data.comm, sizeof(data.comm));
    data.nid = valp->nid;
    data.idx = valp->idx;
    data.order = valp->order;
    data.sync = valp->sync;

#ifdef EXTENDED_FIELDS
    data.fragindex = valp->fragindex;
    data.min = valp->min;
    data.low = valp->low;
    data.high = valp->high;
    data.free = valp->free;
#endif

    data.status = status;
    data.stack_id = stack_traces.get_stackid(ctx, 0);

    events.perf_submit(ctx, &data, sizeof(data));

    start.delete(&id);
}

#ifdef EXTENDED_FIELDS
// kretprobe handler: remember the value returned by fragmentation_index()
// for the current thread.
int trace_fragmentation_index_return(struct pt_regs *ctx)
{
    struct val_t val = { };
    int ret = PT_REGS_RC(ctx);
    u64 id = bpf_get_current_pid_tgid();
    PID_FILTER
    val.fragindex = ret;
    start.update(&id, &val);
    return 0;
}
#endif

static inline void fill_compact_info(struct val_t *valp,
                                     struct zone *zone,
                                     int order)
{
    valp->nid = zone_to_nid_(zone);
    valp->idx = zone_idx_(zone);
    valp->order = order;
}

RAW_TRACEPOINT_PROBE(mm_compaction_suitable)
{
    // TP_PROTO(struct zone *zone, int order, int ret)
    struct zone *zone = (struct zone *)ctx->args[0];
    int order = (int)ctx->args[1];
    int ret = (int)ctx->args[2];
    u64 id;

    if(ret != COMPACT_CONTINUE)
        return 0;

    id = bpf_get_current_pid_tgid();
    PID_FILTER

#ifdef EXTENDED_FIELDS
    struct val_t *valp = start.lookup(&id);
    if (valp == NULL) {
        // missed entry or order <= PAGE_ALLOC_COSTLY_ORDER, eg:
        // manual trigger echo 1 > /proc/sys/vm/compact_memory
        struct val_t val = { .fragindex = -1000 };
        // The map stores a copy of the value, so the stack copy must be
        // completely filled in before it is inserted; otherwise the zone
        // info and watermarks never reach the map.
        fill_compact_info(&val, zone, order);
        get_all_wmark_pages(zone, &val);
        start.update(&id, &val);
        return 0;
    }
    // Entry already present: update the map value in place.
    fill_compact_info(valp, zone, order);
    get_all_wmark_pages(zone, valp);
#else
    struct val_t val = { };
    fill_compact_info(&val, zone, order);
    start.update(&id, &val);
#endif

    return 0;
}

RAW_TRACEPOINT_PROBE(mm_compaction_begin)
{
    // TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn,
    //          unsigned long free_pfn, unsigned long zone_end, bool sync)
    bool sync = (bool)ctx->args[4];

    u64 id = bpf_get_current_pid_tgid();
    struct val_t *valp = start.lookup(&id);
    if (valp == NULL) {
        // missed entry
        return 0;
    }

    valp->ts = bpf_ktime_get_ns();
    valp->sync = sync;
    return 0;
}

RAW_TRACEPOINT_PROBE(mm_compaction_end)
{
    // TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn,
    //          unsigned long free_pfn, unsigned long zone_end, bool sync,
    //          int status)
    submit_event(ctx, ctx->args[5]);
    return 0;
}
"""
267
# The zone index -> name table used for the output only covers x86_64, so
# refuse to run elsewhere and report the failure through the exit status.
if platform.machine() != 'x86_64':
    print("""
          Currently only support x86_64 servers, if you want to use it on
          other platforms, please refer include/linux/mmzone.h to modify
          zone_idx_to_str to get the right zone type
    """)
    sys.exit(1)

# Select the preamble that defines (or undefines) the extended-fields macro.
if args.extended_fields:
    bpf_text = EXTENDED + bpf_text
else:
    bpf_text = NO_EXTENDED + bpf_text

if args.pid is not None:
    # The PID ends up inside C source code: force it through int() so that
    # nothing but a number can ever be spliced into the program.
    try:
        target_pid = int(args.pid)
    except ValueError:
        sys.exit("invalid PID: %r" % args.pid)
    bpf_text = bpf_text.replace("PID_FILTER",
                                "if (id >> 32 != %d) { return 0; }"
                                % target_pid)
else:
    bpf_text = bpf_text.replace("PID_FILTER", "")
if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        sys.exit()

# load BPF program
b = BPF(text=bpf_text)
if args.extended_fields:
    # The handler only exists in the program when extended fields are built.
    b.attach_kretprobe(event="fragmentation_index",
                       fn_name="trace_fragmentation_index_return")

stack_traces = b.get_table("stack_traces")
# Timestamp of the first event seen; 0 means "no event yet".
initial_ts = 0
299
def zone_idx_to_str(idx):
    """Return the zone name for a zone index, or str(idx) if it is unknown."""
    # from include/linux/mmzone.h
    # NOTICE: consider only x86_64 servers
    names_by_index = dict(enumerate(("ZONE_DMA", "ZONE_DMA32", "ZONE_NORMAL")))

    try:
        return names_by_index[idx]
    except KeyError:
        # Unknown index: fall back to its decimal representation.
        return str(idx)
313
def compact_result_to_str(status):
    """Return the name of a compaction status code, or str(status) if unknown."""
    # from include/trace/events/mmflags.h
    # from include/linux/compaction.h
    status_names = (
        # COMPACT_NOT_SUITABLE_ZONE: For more detailed tracepoint
        # output - internal to compaction
        "not_suitable_zone",
        # COMPACT_SKIPPED: compaction didn't start as it was not
        # possible or direct reclaim was more suitable
        "skipped",
        # COMPACT_DEFERRED: compaction didn't start as it was
        # deferred due to past failures
        "deferred",
        # COMPACT_NO_SUITABLE_PAGE: For more detailed tracepoint
        # output - internal to compaction
        "no_suitable_page",
        # COMPACT_CONTINUE: compaction should continue to another pageblock
        "continue",
        # COMPACT_COMPLETE: The full zone was scanned but compaction wasn't
        # successful at producing suitable pages.
        "complete",
        # COMPACT_PARTIAL_SKIPPED: direct compaction has scanned part of the
        # zone but wasn't successful to compact suitable pages.
        "partial_skipped",
        # COMPACT_CONTENDED: compaction terminated prematurely due to lock
        # contentions
        "contended",
        # COMPACT_SUCCESS: direct compaction terminated after concluding
        # that the allocation should now succeed
        "success",
    )
    # The position in the tuple is the numeric status code (0..8).
    lookup = dict(enumerate(status_names))

    return lookup.get(status, str(status))
350
# header
# Collect the column groups that are enabled, then emit them as one line
# separated by single spaces.
header_parts = []
if args.timestamp:
    header_parts.append("%-14s" % ("TIME(s)"))
header_parts.append("%-14s %-6s %-4s %-12s %-5s %-7s" %
                    ("COMM", "PID", "NODE", "ZONE", "ORDER", "MODE"))
if args.extended_fields:
    header_parts.append("%-8s %-8s %-8s %-8s %-8s" %
                        ("FRAGIDX", "MIN", "LOW", "HIGH", "FREE"))
header_parts.append("%9s %16s" % ("LAT(ms)", "STATUS"))
print(" ".join(header_parts))
360
361# process event
def print_event(cpu, data, size):
    """Perf buffer callback: print one line for a finished compaction run.

    cpu and size are supplied by the perf buffer but are not used; data is
    the raw event, decoded through the "events" table.
    """
    global initial_ts

    event = b["events"].event(data)

    # The first event seen becomes the time origin of the TIME(s) column.
    if not initial_ts:
        initial_ts = event.ts

    if args.timestamp:
        delta = event.ts - initial_ts
        print("%-14.9f" % (float(delta) / 1000000), end=" ")

    print("%-14.14s %-6s %-4s %-12s %-5s %-7s" % (
            event.comm.decode("utf-8", "replace"),
            event.pid,
            event.nid,
            zone_idx_to_str(event.idx),
            event.order,
            "SYNC" if event.sync else "ASYNC"), end=" ")
    if args.extended_fields:
        print("%-8.3f %-8s %-8s %-8s %-8s" % (
            (float(event.fragindex) / 1000),
            event.min, event.low, event.high, event.free
            ), end=" ")
    print("%9.3f %16s" % (
        float(event.delta) / 1000000, compact_result_to_str(event.status)))
    if args.kernel_stack:
        if event.stack_id >= 0:
            for addr in stack_traces.walk(event.stack_id):
                sym = b.ksym(addr, show_offset=True)
                # ksym() hands back bytes; decode so Python 3 does not print
                # the symbol as b'...'.
                if isinstance(sym, bytes):
                    sym = sym.decode("utf-8", "replace")
                print("\t%s" % sym)
        else:
            # A negative id means the stack could not be captured, so there
            # is nothing to walk.
            print("\t[Missed Kernel Stack]")
        print("")

    sys.stdout.flush()
395
# loop with callback to print_event
b["events"].open_perf_buffer(print_event, page_cnt=64)
start_time = datetime.now()
while True:
    if args.duration:
        # Bound every poll by the time that is left: an unbounded poll only
        # returns when an event arrives, so the requested duration would
        # never expire while no compaction is happening.
        remaining = args.duration - (datetime.now() - start_time)
        if remaining <= timedelta(0):
            break
        poll_timeout_ms = max(1, int(remaining.total_seconds() * 1000))
    else:
        # No duration requested: block until the next event.
        poll_timeout_ms = -1
    try:
        b.perf_buffer_poll(timeout=poll_timeout_ms)
    except KeyboardInterrupt:
        sys.exit()
404