1#!/usr/bin/env python 2# 3# kvmexit.py 4# 5# Display the exit_reason and its statistics of each vm exit 6# for all vcpus of all virtual machines. For example: 7# $./kvmexit.py 8# PID TID KVM_EXIT_REASON COUNT 9# 1273551 1273568 EXIT_REASON_MSR_WRITE 6 10# 1274253 1274261 EXIT_REASON_EXTERNAL_INTERRUPT 1 11# 1274253 1274261 EXIT_REASON_HLT 12 12# ... 13# 14# Besides, we also allow users to specify one pid, tid(s), or one 15# pid and its vcpu. See kvmexit_example.txt for more examples. 16# 17# @PID: each vitual machine's pid in the user space. 18# @TID: the user space's thread of each vcpu of that virtual machine. 19# @KVM_EXIT_REASON: the reason why the vm exits. 20# @COUNT: the counts of the @KVM_EXIT_REASONS. 21# 22# REQUIRES: Linux 4.7+ (BPF_PROG_TYPE_TRACEPOINT support) 23# 24# Copyright (c) 2021 ByteDance Inc. All rights reserved. 25# 26# Author(s): 27# Fei Li <lifei.shirley@bytedance.com> 28 29 30from __future__ import print_function 31from time import sleep 32from bcc import BPF 33import argparse 34import multiprocessing 35import os 36import subprocess 37 38# 39# Process Arguments 40# 41def valid_args_list(args): 42 args_list = args.split(",") 43 for arg in args_list: 44 try: 45 int(arg) 46 except: 47 raise argparse.ArgumentTypeError("must be valid integer") 48 return args_list 49 50# arguments 51examples = """examples: 52 ./kvmexit # Display kvm_exit_reason and its statistics in real-time until Ctrl-C 53 ./kvmexit 5 # Display in real-time after sleeping 5s 54 ./kvmexit -p 3195281 # Collpase all tids for pid 3195281 with exit reasons sorted in descending order 55 ./kvmexit -p 3195281 20 # Collpase all tids for pid 3195281 with exit reasons sorted in descending order, and display after sleeping 20s 56 ./kvmexit -p 3195281 -v 0 # Display only vcpu0 for pid 3195281, descending sort by default 57 ./kvmexit -p 3195281 -a # Display all tids for pid 3195281 58 ./kvmexit -t 395490 # Display only for tid 395490 with exit reasons sorted in descending order 59 ./kvmexit -t 395490 20 # Display only for tid 395490 with exit reasons sorted in descending order after sleeping 20s 60 ./kvmexit -T '395490,395491' # Display for a union like {395490, 395491} 61""" 62parser = argparse.ArgumentParser( 63 description="Display kvm_exit_reason and its statistics at a timed interval", 64 formatter_class=argparse.RawDescriptionHelpFormatter, 65 epilog=examples) 66parser.add_argument("duration", nargs="?", default=99999999, type=int, help="show delta for next several seconds") 67parser.add_argument("-p", "--pid", type=int, help="trace this PID only") 68exgroup = parser.add_mutually_exclusive_group() 69exgroup.add_argument("-t", "--tid", type=int, help="trace this TID only") 70exgroup.add_argument("-T", "--tids", type=valid_args_list, help="trace a comma separated series of tids with no space in between") 71exgroup.add_argument("-v", "--vcpu", type=int, help="trace this vcpu only") 72exgroup.add_argument("-a", "--alltids", action="store_true", help="trace all tids for this pid") 73args = parser.parse_args() 74duration = int(args.duration) 75 76# 77# Setup BPF 78# 79 80# load BPF program 81bpf_text = """ 82#include <linux/delay.h> 83 84#define REASON_NUM 69 85#define TGID_NUM 1024 86 87struct exit_count { 88 u64 exit_ct[REASON_NUM]; 89}; 90BPF_PERCPU_ARRAY(init_value, struct exit_count, 1); 91BPF_TABLE("percpu_hash", u64, struct exit_count, pcpu_kvm_stat, TGID_NUM); 92 93struct cache_info { 94 u64 cache_pid_tgid; 95 struct exit_count cache_exit_ct; 96}; 97BPF_PERCPU_ARRAY(pcpu_cache, struct cache_info, 1); 98 99FUNC_ENTRY { 100 int cache_miss = 0; 101 int zero = 0; 102 u32 er = GET_ER; 103 if (er >= REASON_NUM) { 104 return 0; 105 } 106 107 u64 cur_pid_tgid = bpf_get_current_pid_tgid(); 108 u32 tgid = cur_pid_tgid >> 32; 109 u32 pid = cur_pid_tgid; 110 111 if (THREAD_FILTER) 112 return 0; 113 114 struct exit_count *tmp_info = NULL, *initial = NULL; 115 struct cache_info *cache_p; 116 cache_p = pcpu_cache.lookup(&zero); 117 if (cache_p == NULL) { 118 return 0; 119 } 120 121 if (cache_p->cache_pid_tgid == cur_pid_tgid) { 122 //a. If the cur_pid_tgid hit this physical cpu consecutively, save it to pcpu_cache 123 tmp_info = &cache_p->cache_exit_ct; 124 } else { 125 //b. If another pid_tgid matches this pcpu for the last hit, OR it is the first time to hit this physical cpu. 126 cache_miss = 1; 127 128 // b.a Try to load the last cache struct if exists. 129 tmp_info = pcpu_kvm_stat.lookup(&cur_pid_tgid); 130 131 // b.b If it is the first time for the cur_pid_tgid to hit this pcpu, employ a 132 // per_cpu array to initialize pcpu_kvm_stat's exit_count with each exit reason's count is zero 133 if (tmp_info == NULL) { 134 initial = init_value.lookup(&zero); 135 if (initial == NULL) { 136 return 0; 137 } 138 139 pcpu_kvm_stat.update(&cur_pid_tgid, initial); 140 tmp_info = pcpu_kvm_stat.lookup(&cur_pid_tgid); 141 // To pass the verifier 142 if (tmp_info == NULL) { 143 return 0; 144 } 145 } 146 } 147 148 if (er < REASON_NUM) { 149 tmp_info->exit_ct[er]++; 150 if (cache_miss == 1) { 151 if (cache_p->cache_pid_tgid != 0) { 152 // b.*.a Let's save the last hit cache_info into kvm_stat. 153 pcpu_kvm_stat.update(&cache_p->cache_pid_tgid, &cache_p->cache_exit_ct); 154 } 155 // b.* As the cur_pid_tgid meets current pcpu_cache_array for the first time, save it. 156 cache_p->cache_pid_tgid = cur_pid_tgid; 157 bpf_probe_read(&cache_p->cache_exit_ct, sizeof(*tmp_info), tmp_info); 158 } 159 return 0; 160 } 161 162 return 0; 163} 164""" 165 166# format output 167exit_reasons = ( 168 "EXCEPTION_NMI", 169 "EXTERNAL_INTERRUPT", 170 "TRIPLE_FAULT", 171 "INIT_SIGNAL", 172 "N/A", 173 "N/A", 174 "N/A", 175 "INTERRUPT_WINDOW", 176 "NMI_WINDOW", 177 "TASK_SWITCH", 178 "CPUID", 179 "N/A", 180 "HLT", 181 "INVD", 182 "INVLPG", 183 "RDPMC", 184 "RDTSC", 185 "N/A", 186 "VMCALL", 187 "VMCLEAR", 188 "VMLAUNCH", 189 "VMPTRLD", 190 "VMPTRST", 191 "VMREAD", 192 "VMRESUME", 193 "VMWRITE", 194 "VMOFF", 195 "VMON", 196 "CR_ACCESS", 197 "DR_ACCESS", 198 "IO_INSTRUCTION", 199 "MSR_READ", 200 "MSR_WRITE", 201 "INVALID_STATE", 202 "MSR_LOAD_FAIL", 203 "N/A", 204 "MWAIT_INSTRUCTION", 205 "MONITOR_TRAP_FLAG", 206 "N/A", 207 "MONITOR_INSTRUCTION", 208 "PAUSE_INSTRUCTION", 209 "MCE_DURING_VMENTRY", 210 "N/A", 211 "TPR_BELOW_THRESHOLD", 212 "APIC_ACCESS", 213 "EOI_INDUCED", 214 "GDTR_IDTR", 215 "LDTR_TR", 216 "EPT_VIOLATION", 217 "EPT_MISCONFIG", 218 "INVEPT", 219 "RDTSCP", 220 "PREEMPTION_TIMER", 221 "INVVPID", 222 "WBINVD", 223 "XSETBV", 224 "APIC_WRITE", 225 "RDRAND", 226 "INVPCID", 227 "VMFUNC", 228 "ENCLS", 229 "RDSEED", 230 "PML_FULL", 231 "XSAVES", 232 "XRSTORS", 233 "N/A", 234 "N/A", 235 "UMWAIT", 236 "TPAUSE" 237) 238 239# 240# Do some checks 241# 242try: 243 # Currently, only adapte on intel architecture 244 cmd = "cat /proc/cpuinfo | grep vendor_id | head -n 1" 245 arch_info = subprocess.check_output(cmd, shell=True).strip() 246 if b"Intel" in arch_info: 247 pass 248 else: 249 raise Exception("Currently we only support Intel architecture, please do expansion if needs more.") 250 251 # Check if kvm module is loaded 252 if os.access("/dev/kvm", os.R_OK | os.W_OK): 253 pass 254 else: 255 raise Exception("Please insmod kvm module to use kvmexit tool.") 256except Exception as e: 257 raise Exception("Failed to do precondition check, due to: %s." % e) 258 259try: 260 if BPF.support_raw_tracepoint_in_module(): 261 # Let's firstly try raw_tracepoint_in_module 262 func_entry = "RAW_TRACEPOINT_PROBE(kvm_exit)" 263 get_er = "ctx->args[0]" 264 else: 265 # If raw_tp_in_module is not supported, fall back to regular tp 266 func_entry = "TRACEPOINT_PROBE(kvm, kvm_exit)" 267 get_er = "args->exit_reason" 268except Exception as e: 269 raise Exception("Failed to catch kvm exit reasons due to: %s" % e) 270 271 272def find_tid(tgt_dir, tgt_vcpu): 273 for tid in os.listdir(tgt_dir): 274 path = tgt_dir + "/" + tid + "/comm" 275 fp = open(path, "r") 276 comm = fp.read() 277 if (comm.find(tgt_vcpu) != -1): 278 return tid 279 return -1 280 281# set process/thread filter 282thread_context = "" 283header_format = "" 284need_collapse = not args.alltids 285if args.tid is not None: 286 thread_context = "TID %s" % args.tid 287 thread_filter = 'pid != %s' % args.tid 288elif args.tids is not None: 289 thread_context = "TIDS %s" % args.tids 290 thread_filter = "pid != " + " && pid != ".join(args.tids) 291 header_format = "TIDS " 292elif args.pid is not None: 293 thread_context = "PID %s" % args.pid 294 thread_filter = 'tgid != %s' % args.pid 295 if args.vcpu is not None: 296 thread_context = "PID %s VCPU %s" % (args.pid, args.vcpu) 297 # transfer vcpu to tid 298 tgt_dir = '/proc/' + str(args.pid) + '/task' 299 tgt_vcpu = "CPU " + str(args.vcpu) 300 args.tid = find_tid(tgt_dir, tgt_vcpu) 301 if args.tid == -1: 302 raise Exception("There's no v%s for PID %d." % (tgt_vcpu, args.pid)) 303 thread_filter = 'pid != %s' % args.tid 304 elif args.alltids: 305 thread_context = "PID %s and its all threads" % args.pid 306 header_format = "TID " 307else: 308 thread_context = "all threads" 309 thread_filter = '0' 310 header_format = "PID TID " 311bpf_text = bpf_text.replace('THREAD_FILTER', thread_filter) 312 313# For kernel >= 5.0, use RAW_TRACEPOINT_MODULE for performance consideration 314bpf_text = bpf_text.replace('FUNC_ENTRY', func_entry) 315bpf_text = bpf_text.replace('GET_ER', get_er) 316b = BPF(text=bpf_text) 317 318 319# header 320print("Display kvm exit reasons and statistics for %s" % thread_context, end="") 321if duration < 99999999: 322 print(" after sleeping %d secs." % duration) 323else: 324 print("... Hit Ctrl-C to end.") 325 326try: 327 sleep(duration) 328except KeyboardInterrupt: 329 print() 330 331 332# Currently, sort multiple tids in descending order is not supported. 333if (args.pid or args.tid): 334 ct_reason = [] 335 if args.pid: 336 tgid_exit = [0 for i in range(len(exit_reasons))] 337 338# output 339print("%s%-35s %s" % (header_format, "KVM_EXIT_REASON", "COUNT")) 340 341pcpu_kvm_stat = b["pcpu_kvm_stat"] 342pcpu_cache = b["pcpu_cache"] 343for k, v in pcpu_kvm_stat.items(): 344 tgid = k.value >> 32 345 pid = k.value & 0xffffffff 346 for i in range(0, len(exit_reasons)): 347 sum1 = 0 348 for inner_cpu in range(0, multiprocessing.cpu_count()): 349 cachePIDTGID = pcpu_cache[0][inner_cpu].cache_pid_tgid 350 # Take priority to check if it is in cache 351 if cachePIDTGID == k.value: 352 sum1 += pcpu_cache[0][inner_cpu].cache_exit_ct.exit_ct[i] 353 # If not in cache, find from kvm_stat 354 else: 355 sum1 += v[inner_cpu].exit_ct[i] 356 if sum1 == 0: 357 continue 358 359 if (args.pid and args.pid == tgid and need_collapse): 360 tgid_exit[i] += sum1 361 elif (args.tid and args.tid == pid): 362 ct_reason.append((sum1, i)) 363 elif not need_collapse or args.tids: 364 print("%-8u %-35s %-8u" % (pid, exit_reasons[i], sum1)) 365 else: 366 print("%-8u %-8u %-35s %-8u" % (tgid, pid, exit_reasons[i], sum1)) 367 368 # Display only for the target tid in descending sort 369 if (args.tid and args.tid == pid): 370 ct_reason.sort(reverse=True) 371 for i in range(0, len(ct_reason)): 372 if ct_reason[i][0] == 0: 373 continue 374 print("%-35s %-8u" % (exit_reasons[ct_reason[i][1]], ct_reason[i][0])) 375 break 376 377 378# Aggregate all tids' counts for this args.pid in descending sort 379if args.pid and need_collapse: 380 for i in range(0, len(exit_reasons)): 381 ct_reason.append((tgid_exit[i], i)) 382 ct_reason.sort(reverse=True) 383 for i in range(0, len(ct_reason)): 384 if ct_reason[i][0] == 0: 385 continue 386 print("%-35s %-8u" % (exit_reasons[ct_reason[i][1]], ct_reason[i][0])) 387