#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# compactsnoop  Trace compact zone and print details including issuing PID.
#               For Linux, uses BCC, eBPF.
#
# This uses in-kernel eBPF maps to cache process details (PID and comm) by
# compact zone begin, as well as a starting timestamp for calculating
# latency.
#
# Copyright (c) 2019 Wenbo Zhang
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 11-NOV-2019   Wenbo Zhang   Created this.

from __future__ import print_function
from bcc import BPF
import argparse
import platform
from datetime import datetime, timedelta

# arguments
examples = """examples:
    ./compactsnoop          # trace all compact stall
    ./compactsnoop -T       # include timestamps
    ./compactsnoop -d 10    # trace for 10 seconds only
    ./compactsnoop -K       # output kernel stack trace
    ./compactsnoop -e       # show extended fields
"""

parser = argparse.ArgumentParser(
    description="Trace compact zone",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples,
)
parser.add_argument("-T", "--timestamp", action="store_true",
                    help="include timestamp on output")
# The PID is spliced into the BPF C source below, so it must be validated as
# an integer here rather than accepted as free-form text.
parser.add_argument("-p", "--pid", type=int, help="trace this PID only")
parser.add_argument("-d", "--duration", type=int,
                    help="total duration of trace in seconds")
parser.add_argument("-K", "--kernel-stack", action="store_true",
                    help="output kernel stack trace")
parser.add_argument("-e", "--extended_fields", action="store_true",
                    help="show system memory state")
parser.add_argument("--ebpf", action="store_true", help=argparse.SUPPRESS)
args = parser.parse_args()
debug = 0
if args.duration:
    args.duration = timedelta(seconds=args.duration)

# Upper bound (in milliseconds) for a single perf buffer poll, so that the
# --duration deadline is re-checked even when no events arrive.
POLL_TIMEOUT_MS = 1000

# NOTE: the macro is spelled "EXTNEDED_FIELDS" consistently in every snippet
# below; keep all occurrences in sync if it is ever renamed.
NO_EXTENDED = """
#ifdef EXTNEDED_FIELDS
#undef EXTNEDED_FIELDS
#endif
"""

EXTENDED = """
#define EXTNEDED_FIELDS 1
"""

bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
struct node;
#include <linux/compaction.h>

struct compact_control {
    struct list_head freepages;     /* List of free pages to migrate to */
    struct list_head migratepages;  /* List of pages being migrated */
    unsigned long nr_freepages;     /* Number of isolated free pages */
    unsigned long nr_migratepages;  /* Number of pages to migrate */
    unsigned long free_pfn;         /* isolate_freepages search base */
    unsigned long migrate_pfn;      /* isolate_migratepages search base */
    bool sync;                      /* Synchronous migration */
};

struct val_t {
    int nid;
    int idx;
    int order;
    int sync;
#ifdef EXTNEDED_FIELDS
    int fragindex;
    int low;
    int min;
    int high;
    int free;
#endif
    u64 ts; // compaction begin time
};

struct data_t {
    u32 pid;
    u32 tid;
    int nid;
    int idx;
    int order;
    u64 delta;
    u64 ts; // compaction end time
    int sync;
#ifdef EXTNEDED_FIELDS
    int fragindex;
    int low;
    int min;
    int high;
    int free;
#endif
    int status;
    int stack_id;
    char comm[TASK_COMM_LEN];
};

BPF_HASH(start, u64, struct val_t);
BPF_PERF_OUTPUT(events);
BPF_STACK_TRACE(stack_traces, 2048);

#ifdef CONFIG_NUMA
static inline int zone_to_nid_(struct zone *zone)
{
    int node;
    bpf_probe_read(&node, sizeof(node), &zone->node);
    return node;
}
#else
static inline int zone_to_nid_(struct zone *zone)
{
    return 0;
}
#endif

// #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
static inline int zone_idx_(struct zone *zone)
{
    struct pglist_data *zone_pgdat = NULL;
    bpf_probe_read(&zone_pgdat, sizeof(zone_pgdat), &zone->zone_pgdat);
    return zone - zone_pgdat->node_zones;
}

#ifdef EXTNEDED_FIELDS
static inline void get_all_wmark_pages(struct zone *zone, struct val_t *valp)
{
    u64 watermark[NR_WMARK] = {};

    bpf_probe_read(&watermark, sizeof(watermark), &zone->watermark);
    valp->min = watermark[WMARK_MIN];
    valp->low = watermark[WMARK_LOW];
    valp->high = watermark[WMARK_HIGH];
    bpf_probe_read(&valp->free, sizeof(valp->free),
                   &zone->vm_stat[NR_FREE_PAGES]);
}
#endif

int trace_compact_zone_entry(struct pt_regs *ctx, struct zone *zone,
                             struct compact_control *cc)
{
#ifdef EXTNEDED_FIELDS
    struct val_t val = { .fragindex=-1000 };
#else
    struct val_t val = { };
#endif
    u64 id = bpf_get_current_pid_tgid();
    PID_FILTER
    val.sync = cc->sync;
    start.update(&id, &val);
    return 0;
}

int trace_compaction_suitable_entry(struct pt_regs *ctx, struct zone *zone,
                                    int order)
{
    u64 id = bpf_get_current_pid_tgid();
    struct val_t *valp = start.lookup(&id);
    if (valp == NULL) {
        // missed entry
        return 0;
    }
    valp->nid = zone_to_nid_(zone);
    valp->idx = zone_idx_(zone);
    valp->order = order;

#ifdef EXTNEDED_FIELDS
    get_all_wmark_pages(zone, valp);
#endif

    return 0;
}

int trace_fragmentation_index_return(struct pt_regs *ctx)
{
    int ret = PT_REGS_RC(ctx);
    u64 id = bpf_get_current_pid_tgid();
    struct val_t *valp = start.lookup(&id);
    if (valp == NULL) {
        // missed entry
        return 0;
    }
#ifdef EXTNEDED_FIELDS
    valp->fragindex = ret;
#endif
    return 0;
}

int trace_compaction_suitable_return(struct pt_regs *ctx)
{
    int ret = PT_REGS_RC(ctx);
    u64 id = bpf_get_current_pid_tgid();
    struct val_t *valp = start.lookup(&id);
    if (valp == NULL) {
        // missed entry
        return 0;
    }
    if (ret != COMPACT_CONTINUE)
        start.delete(&id);
    else
        valp->ts = bpf_ktime_get_ns();
    return 0;
}

int trace_compact_zone_return(struct pt_regs *ctx)
{
    int ret = PT_REGS_RC(ctx);
    struct data_t data = {};
    u64 ts = bpf_ktime_get_ns();
    u64 id = bpf_get_current_pid_tgid();
    struct val_t *valp = start.lookup(&id);
    if (valp == NULL) {
        // missed entry or unsuitable
        return 0;
    }

    data.delta = ts - valp->ts;
    data.ts = ts / 1000;
    data.pid = id >> 32;
    data.tid = id;
    bpf_get_current_comm(&data.comm, sizeof(data.comm));
    data.nid = valp->nid;
    data.idx = valp->idx;
    data.order = valp->order;
    data.sync = valp->sync;

#ifdef EXTNEDED_FIELDS
    data.fragindex = valp->fragindex;
    data.min = valp->min;
    data.low = valp->low;
    data.high = valp->high;
    data.free = valp->free;
#endif

    data.status = ret;
    data.stack_id = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID);

    events.perf_submit(ctx, &data, sizeof(data));

    start.delete(&id);
    return 0;
}
"""

if platform.machine() != 'x86_64':
    print("""
    Currently only support x86_64 servers, if you want to use it on
    other platforms, please refer include/linux/mmzone.h to modify
    zone_idx_to_str to get the right zone type
    """)
    exit()

# Select the extended-field layout by prepending the matching preprocessor
# snippet to the BPF program text.
if args.extended_fields:
    bpf_text = EXTENDED + bpf_text
else:
    bpf_text = NO_EXTENDED + bpf_text

# Expand the PID_FILTER placeholder in trace_compact_zone_entry. args.pid is
# an int (enforced by argparse), so only a number can reach the C source.
if args.pid is not None:
    bpf_text = bpf_text.replace(
        "PID_FILTER", "if (id >> 32 != %d) { return 0; }" % args.pid)
else:
    bpf_text = bpf_text.replace("PID_FILTER", "")
if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()

# load BPF program
b = BPF(text=bpf_text)
b.attach_kprobe(event="compact_zone", fn_name="trace_compact_zone_entry")
b.attach_kretprobe(event="compact_zone", fn_name="trace_compact_zone_return")
b.attach_kprobe(
    event="compaction_suitable", fn_name="trace_compaction_suitable_entry"
)
b.attach_kretprobe(
    event="fragmentation_index", fn_name="trace_fragmentation_index_return"
)
b.attach_kretprobe(
    event="compaction_suitable", fn_name="trace_compaction_suitable_return"
)

stack_traces = b.get_table("stack_traces")
initial_ts = 0

# from include/linux/mmzone.h
# NOTICE: consider only x86_64 servers
ZONE_TYPES = {
    0: "ZONE_DMA",
    1: "ZONE_DMA32",
    2: "ZONE_NORMAL",
}

# from include/linux/compaction.h
COMPACT_STATUS = {
    # COMPACT_SKIPPED: compaction didn't start as it was not possible
    # or direct reclaim was more suitable
    0: "skipped",
    # COMPACT_CONTINUE: compaction should continue to another pageblock
    1: "continue",
    # COMPACT_PARTIAL: direct compaction partially compacted a zone and
    # there are suitable pages
    2: "partial",
    # COMPACT_COMPLETE: The full zone was compacted
    3: "complete",
}


def zone_idx_to_str(idx):
    """Return the zone name for zone index *idx*.

    Indexes missing from ZONE_TYPES are returned as their decimal string.
    """
    return ZONE_TYPES.get(idx, str(idx))


def compact_result_to_str(status):
    """Return a short label for the compact_zone return code *status*.

    Codes missing from COMPACT_STATUS are returned as their decimal string.
    """
    return COMPACT_STATUS.get(status, str(status))


# header
if args.timestamp:
    print("%-14s" % ("TIME(s)"), end=" ")
print(
    "%-14s %-6s %-4s %-12s %-5s %-7s"
    % ("COMM", "PID", "NODE", "ZONE", "ORDER", "MODE"),
    end=" ",
)
if args.extended_fields:
    print("%-8s %-8s %-8s %-8s %-8s" %
          ("FRAGIDX", "MIN", "LOW", "HIGH", "FREE"), end=" ")
print("%9s %16s" % ("LAT(ms)", "STATUS"))


# process event
def print_event(cpu, data, size):
    """Perf buffer callback: print one output row for a compaction event.

    cpu and size are supplied by the perf buffer machinery and are unused;
    data is the raw record that is decoded into the BPF program's data_t.
    """
    global initial_ts

    event = b["events"].event(data)

    # The first event seen becomes time zero for the TIME(s) column.
    if not initial_ts:
        initial_ts = event.ts

    if args.timestamp:
        # event.ts is in microseconds (the BPF side divides ns by 1000).
        delta = event.ts - initial_ts
        print("%-14.9f" % (float(delta) / 1000000), end=" ")

    print("%-14.14s %-6s %-4s %-12s %-5s %-7s" % (
        event.comm.decode("utf-8", "replace"),
        event.pid,
        event.nid,
        zone_idx_to_str(event.idx),
        event.order,
        "SYNC" if event.sync else "ASYNC"), end=" ")
    if args.extended_fields:
        print("%-8.3f %-8s %-8s %-8s %-8s" % (
            float(event.fragindex) / 1000,
            event.min,
            event.low,
            event.high,
            event.free), end=" ")
    # event.delta is in nanoseconds; report it in milliseconds.
    print("%9.3f %16s" % (
        float(event.delta) / 1000000, compact_result_to_str(event.status)))
    if args.kernel_stack:
        # get_stackid() reports failure as a negative value; there is no
        # stored stack to walk in that case.
        if event.stack_id >= 0:
            for addr in stack_traces.walk(event.stack_id):
                sym = b.ksym(addr, show_offset=True)
                print("\t%s" % sym)
        print("")


# loop with callback to print_event
b["events"].open_perf_buffer(print_event, page_cnt=64)
start_time = datetime.now()
while not args.duration or datetime.now() - start_time < args.duration:
    try:
        # Poll with a bounded timeout: an unbounded poll blocks until the
        # next event, so on an idle system --duration would never expire.
        b.perf_buffer_poll(timeout=POLL_TIMEOUT_MS)
    except KeyboardInterrupt:
        exit()