#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# compactsnoop  Trace compact zone and print details including issuing PID.
#               For Linux, uses BCC, eBPF.
#
# This uses in-kernel eBPF maps to cache process details (PID and comm) by
# compact zone begin, as well as a starting timestamp for calculating
# latency.
#
# Copyright (c) 2019 Wenbo Zhang
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 11-NOV-2019  Wenbo Zhang  Created this.

from __future__ import print_function
from bcc import BPF
import argparse
import platform
from datetime import datetime, timedelta
import sys

# arguments
examples = """examples:
    ./compactsnoop          # trace all compact stall
    ./compactsnoop -T       # include timestamps
    ./compactsnoop -d 10    # trace for 10 seconds only
    ./compactsnoop -K       # output kernel stack trace
    ./compactsnoop -e       # show extended fields
"""

parser = argparse.ArgumentParser(
    description="Trace compact zone",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples,
)
parser.add_argument("-T", "--timestamp", action="store_true",
                    help="include timestamp on output")
# type=int: the value is spliced into the BPF C source below, so it must be
# validated as a number rather than passed through as an arbitrary string.
parser.add_argument("-p", "--pid", type=int, help="trace this PID only")
parser.add_argument("-d", "--duration", type=int,
                    help="total duration of trace in seconds")
parser.add_argument("-K", "--kernel-stack", action="store_true",
                    help="output kernel stack trace")
parser.add_argument("-e", "--extended_fields", action="store_true",
                    help="show system memory state")
parser.add_argument("--ebpf", action="store_true", help=argparse.SUPPRESS)
args = parser.parse_args()
debug = 0
if args.duration:
    args.duration = timedelta(seconds=args.duration)

# Prepended to bpf_text to switch the extended (-e) fields off or on.
NO_EXTENDED = """
#ifdef EXTNEDED_FIELDS
#undef EXTNEDED_FIELDS
#endif
"""

EXTENDED = """
#define EXTNEDED_FIELDS 1
"""

bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/compaction.h>

// Per-thread state cached between the tracepoints of one compaction run.
struct val_t {
    int nid;
    int idx;
    int order;
    int sync;
#ifdef EXTNEDED_FIELDS
    int fragindex;
    int low;
    int min;
    int high;
    int free;
#endif
    u64 ts; // compaction begin time
};

// Event record sent to user space when a compaction run ends.
struct data_t {
    u32 pid;
    u32 tid;
    int nid;
    int idx;
    int order;
    u64 delta;
    u64 ts; // compaction end time
    int sync;
#ifdef EXTNEDED_FIELDS
    int fragindex;
    int low;
    int min;
    int high;
    int free;
#endif
    int status;
    int stack_id;
    char comm[TASK_COMM_LEN];
};

BPF_HASH(start, u64, struct val_t);
BPF_PERF_OUTPUT(events);
BPF_STACK_TRACE(stack_traces, 2048);

#ifdef CONFIG_NUMA
static inline int zone_to_nid_(struct zone *zone)
{
    int node;
    bpf_probe_read_kernel(&node, sizeof(node), &zone->node);
    return node;
}
#else
static inline int zone_to_nid_(struct zone *zone)
{
    return 0;
}
#endif

// #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
static inline int zone_idx_(struct zone *zone)
{
    struct pglist_data *zone_pgdat = NULL;
    bpf_probe_read_kernel(&zone_pgdat, sizeof(zone_pgdat), &zone->zone_pgdat);
    return zone - zone_pgdat->node_zones;
}

#ifdef EXTNEDED_FIELDS
static inline void get_all_wmark_pages(struct zone *zone, struct val_t *valp)
{
    u64 _watermark[NR_WMARK] = {};
    u64 watermark_boost = 0;

    bpf_probe_read_kernel(&_watermark, sizeof(_watermark), &zone->_watermark);
    bpf_probe_read_kernel(&watermark_boost, sizeof(watermark_boost),
                          &zone->watermark_boost);
    valp->min = _watermark[WMARK_MIN] + watermark_boost;
    valp->low = _watermark[WMARK_LOW] + watermark_boost;
    valp->high = _watermark[WMARK_HIGH] + watermark_boost;
    bpf_probe_read_kernel(&valp->free, sizeof(valp->free),
                          &zone->vm_stat[NR_FREE_PAGES]);
}
#endif

static inline void submit_event(void *ctx, int status)
{
    struct data_t data = {};
    u64 ts = bpf_ktime_get_ns();
    u64 id = bpf_get_current_pid_tgid();
    struct val_t *valp = start.lookup(&id);
    if (valp == NULL) {
        // missed entry
        return;
    }

    data.delta = ts - valp->ts;
    data.ts = ts / 1000;
    data.pid = id >> 32;
    data.tid = id;
    bpf_get_current_comm(&data.comm, sizeof(data.comm));
    data.nid = valp->nid;
    data.idx = valp->idx;
    data.order = valp->order;
    data.sync = valp->sync;

#ifdef EXTNEDED_FIELDS
    data.fragindex = valp->fragindex;
    data.min = valp->min;
    data.low = valp->low;
    data.high = valp->high;
    data.free = valp->free;
#endif

    data.status = status;
    data.stack_id = stack_traces.get_stackid(ctx, 0);

    events.perf_submit(ctx, &data, sizeof(data));

    start.delete(&id);
}

#ifdef EXTNEDED_FIELDS
int trace_fragmentation_index_return(struct pt_regs *ctx)
{
    struct val_t val = { };
    int ret = PT_REGS_RC(ctx);
    u64 id = bpf_get_current_pid_tgid();
    PID_FILTER
    val.fragindex = ret;
    start.update(&id, &val);
    return 0;
}
#endif

static inline void fill_compact_info(struct val_t *valp,
                                     struct zone *zone,
                                     int order)
{
    valp->nid = zone_to_nid_(zone);
    valp->idx = zone_idx_(zone);
    valp->order = order;
}

RAW_TRACEPOINT_PROBE(mm_compaction_suitable)
{
    // TP_PROTO(struct zone *zone, int order, int ret)
    struct zone *zone = (struct zone *)ctx->args[0];
    int order = (int)ctx->args[1];
    int ret = (int)ctx->args[2];
    u64 id;

    if(ret != COMPACT_CONTINUE)
        return 0;

    id = bpf_get_current_pid_tgid();
    PID_FILTER

#ifdef EXTNEDED_FIELDS
    struct val_t *valp = start.lookup(&id);
    if (valp == NULL) {
        // missed entry or order <= PAGE_ALLOC_COSTLY_ORDER, eg:
        // manual trigger echo 1 > /proc/sys/vm/compact_memory
        struct val_t val = { .fragindex = -1000 };
        // Fill the local value BEFORE inserting it: start.update() copies
        // the value into the map, so anything written to the stack copy
        // afterwards would never reach the stored entry.
        fill_compact_info(&val, zone, order);
        get_all_wmark_pages(zone, &val);
        start.update(&id, &val);
        return 0;
    }
    // Entry already created by the fragmentation_index kretprobe:
    // complete it in place.
    fill_compact_info(valp, zone, order);
    get_all_wmark_pages(zone, valp);
#else
    struct val_t val = { };
    fill_compact_info(&val, zone, order);
    start.update(&id, &val);
#endif

    return 0;
}

RAW_TRACEPOINT_PROBE(mm_compaction_begin)
{
    // TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn,
    //          unsigned long free_pfn, unsigned long zone_end, bool sync)
    bool sync = (bool)ctx->args[4];

    u64 id = bpf_get_current_pid_tgid();
    struct val_t *valp = start.lookup(&id);
    if (valp == NULL) {
        // missed entry
        return 0;
    }

    valp->ts = bpf_ktime_get_ns();
    valp->sync = sync;
    return 0;
}

RAW_TRACEPOINT_PROBE(mm_compaction_end)
{
    // TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn,
    //          unsigned long free_pfn, unsigned long zone_end, bool sync,
    //          int status)
    submit_event(ctx, ctx->args[5]);
    return 0;
}
"""

if platform.machine() != 'x86_64':
    print("""
    Currently only support x86_64 servers, if you want to use it on
    other platforms, please refer include/linux/mmzone.h to modify
    zone_idx_to_str to get the right zone type
    """)
    sys.exit()

if args.extended_fields:
    bpf_text = EXTENDED + bpf_text
else:
    bpf_text = NO_EXTENDED + bpf_text

# "is not None" keeps the filter active for "-p 0", as the original string
# based check did.
if args.pid is not None:
    bpf_text = bpf_text.replace("PID_FILTER",
                                "if (id >> 32 != %d) { return 0; }" % args.pid)
else:
    bpf_text = bpf_text.replace("PID_FILTER", "")
if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        sys.exit()

# load BPF program
b = BPF(text=bpf_text)
if args.extended_fields:
    b.attach_kretprobe(event="fragmentation_index",
                       fn_name="trace_fragmentation_index_return")

stack_traces = b.get_table("stack_traces")
initial_ts = 0

# from include/linux/mmzone.h
# NOTICE: consider only x86_64 servers
ZONE_TYPE = {
    0: "ZONE_DMA",
    1: "ZONE_DMA32",
    2: "ZONE_NORMAL",
}

# from include/trace/events/mmflags.h
# from include/linux/compaction.h
COMPACT_STATUS = {
    # COMPACT_NOT_SUITABLE_ZONE: For more detailed tracepoint
    # output - internal to compaction
    0: "not_suitable_zone",
    # COMPACT_SKIPPED: compaction didn't start as it was not
    # possible or direct reclaim was more suitable
    1: "skipped",
    # COMPACT_DEFERRED: compaction didn't start as it was
    # deferred due to past failures
    2: "deferred",
    # COMPACT_NOT_SUITABLE_PAGE: For more detailed tracepoint
    # output - internal to compaction
    3: "no_suitable_page",
    # COMPACT_CONTINUE: compaction should continue to another pageblock
    4: "continue",
    # COMPACT_COMPLETE: The full zone was compacted scanned but wasn't
    # successful to compact suitable pages.
    5: "complete",
    # COMPACT_PARTIAL_SKIPPED: direct compaction has scanned part of the
    # zone but wasn't successful to compact suitable pages.
    6: "partial_skipped",
    # COMPACT_CONTENDED: compaction terminated prematurely due to lock
    # contentions
    7: "contended",
    # COMPACT_SUCCESS: direct compaction terminated after concluding
    # that the allocation should now succeed
    8: "success",
}


def zone_idx_to_str(idx):
    """Return the zone name for a zone index, or str(idx) if unknown."""
    return ZONE_TYPE.get(idx, str(idx))


def compact_result_to_str(status):
    """Return the name of a compaction status, or str(status) if unknown."""
    return COMPACT_STATUS.get(status, str(status))


# header
if args.timestamp:
    print("%-14s" % ("TIME(s)"), end=" ")
print("%-14s %-6s %-4s %-12s %-5s %-7s" %
      ("COMM", "PID", "NODE", "ZONE", "ORDER", "MODE"), end=" ")
if args.extended_fields:
    print("%-8s %-8s %-8s %-8s %-8s" %
          ("FRAGIDX", "MIN", "LOW", "HIGH", "FREE"), end=" ")
print("%9s %16s" % ("LAT(ms)", "STATUS"))


# process event
def print_event(cpu, data, size):
    """Perf-buffer callback: print one output row for a compaction event.

    cpu and size are supplied by the perf buffer and are not used; data is
    the raw record decoded with the "events" table.
    """
    event = b["events"].event(data)

    global initial_ts

    # The first event seen becomes the origin of the TIME(s) column.
    if not initial_ts:
        initial_ts = event.ts

    if args.timestamp:
        # event.ts is in microseconds (the BPF side stores ns / 1000).
        delta = event.ts - initial_ts
        print("%-14.9f" % (float(delta) / 1000000), end=" ")

    print("%-14.14s %-6s %-4s %-12s %-5s %-7s" % (
        event.comm.decode("utf-8", "replace"),
        event.pid,
        event.nid,
        zone_idx_to_str(event.idx),
        event.order,
        "SYNC" if event.sync else "ASYNC"), end=" ")
    if args.extended_fields:
        print("%-8.3f %-8s %-8s %-8s %-8s" % (
            (float(event.fragindex) / 1000),
            event.min, event.low, event.high, event.free
        ), end=" ")
    # event.delta is in nanoseconds; report milliseconds.
    print("%9.3f %16s" % (
        float(event.delta) / 1000000, compact_result_to_str(event.status)))
    if args.kernel_stack:
        for addr in stack_traces.walk(event.stack_id):
            sym = b.ksym(addr, show_offset=True)
            print("\t%s" % sym)
        print("")

    sys.stdout.flush()


# loop with callback to print_event
b["events"].open_perf_buffer(print_event, page_cnt=64)
start_time = datetime.now()
while not args.duration or datetime.now() - start_time < args.duration:
    try:
        # Poll with a timeout so the duration check above is re-evaluated
        # even when no events arrive; without it "-d" never expires on an
        # idle system.
        b.perf_buffer_poll(timeout=1000)
    except KeyboardInterrupt:
        sys.exit()