1#!/usr/bin/env python 2# 3# deadlock_detector Detects potential deadlocks (lock order inversions) 4# on a running process. For Linux, uses BCC, eBPF. 5# 6# USAGE: deadlock_detector.py [-h] [--binary BINARY] [--dump-graph DUMP_GRAPH] 7# [--verbose] [--lock-symbols LOCK_SYMBOLS] 8# [--unlock-symbols UNLOCK_SYMBOLS] 9# pid 10# 11# This traces pthread mutex lock and unlock calls to build a directed graph 12# representing the mutex wait graph: 13# 14# - Nodes in the graph represent mutexes. 15# - Edge (A, B) exists if there exists some thread T where lock(A) was called 16# and lock(B) was called before unlock(A) was called. 17# 18# If the program finds a potential lock order inversion, the program will dump 19# the cycle of mutexes and the stack traces where each mutex was acquired, and 20# then exit. 21# 22# This program can only find potential deadlocks that occur while the program 23# is tracing the process. It cannot find deadlocks that may have occurred 24# before the program was attached to the process. 25# 26# Since this traces all mutex lock and unlock events and all thread creation 27# events on the traced process, the overhead of this bpf program can be very 28# high if the process has many threads and mutexes. You should only run this on 29# a process where the slowdown is acceptable. 30# 31# Note: This tool does not work for shared mutexes or recursive mutexes. 32# 33# For shared (read-write) mutexes, a deadlock requires a cycle in the wait 34# graph where at least one of the mutexes in the cycle is acquiring exclusive 35# (write) ownership. 36# 37# For recursive mutexes, lock() is called multiple times on the same mutex. 38# However, there is no way to determine if a mutex is a recursive mutex 39# after the mutex has been created. As a result, this tool will not find 40# potential deadlocks that involve only one mutex. 41# 42# Copyright 2017 Facebook, Inc. 43# Licensed under the Apache License, Version 2.0 (the "License") 44# 45# 01-Feb-2017 Kenny Yu Created this. 46 47from __future__ import ( 48 absolute_import, division, unicode_literals, print_function 49) 50from bcc import BPF 51from collections import defaultdict 52import argparse 53import json 54import os 55import subprocess 56import sys 57import time 58 59 60class DiGraph(object): 61 ''' 62 Adapted from networkx: http://networkx.github.io/ 63 Represents a directed graph. Edges can store (key, value) attributes. 64 ''' 65 66 def __init__(self): 67 # Map of node -> set of nodes 68 self.adjacency_map = {} 69 # Map of (node1, node2) -> map string -> arbitrary attribute 70 # This will not be copied in subgraph() 71 self.attributes_map = {} 72 73 def neighbors(self, node): 74 return self.adjacency_map.get(node, set()) 75 76 def edges(self): 77 edges = [] 78 for node, neighbors in self.adjacency_map.items(): 79 for neighbor in neighbors: 80 edges.append((node, neighbor)) 81 return edges 82 83 def nodes(self): 84 return self.adjacency_map.keys() 85 86 def attributes(self, node1, node2): 87 return self.attributes_map[(node1, node2)] 88 89 def add_edge(self, node1, node2, **kwargs): 90 if node1 not in self.adjacency_map: 91 self.adjacency_map[node1] = set() 92 if node2 not in self.adjacency_map: 93 self.adjacency_map[node2] = set() 94 self.adjacency_map[node1].add(node2) 95 self.attributes_map[(node1, node2)] = kwargs 96 97 def remove_node(self, node): 98 self.adjacency_map.pop(node, None) 99 for _, neighbors in self.adjacency_map.items(): 100 neighbors.discard(node) 101 102 def subgraph(self, nodes): 103 graph = DiGraph() 104 for node in nodes: 105 for neighbor in self.neighbors(node): 106 if neighbor in nodes: 107 graph.add_edge(node, neighbor) 108 return graph 109 110 def node_link_data(self): 111 ''' 112 Returns the graph as a dictionary in a format that can be 113 serialized. 114 ''' 115 data = { 116 'directed': True, 117 'multigraph': False, 118 'graph': {}, 119 'links': [], 120 'nodes': [], 121 } 122 123 # Do one pass to build a map of node -> position in nodes 124 node_to_number = {} 125 for node in self.adjacency_map.keys(): 126 node_to_number[node] = len(data['nodes']) 127 data['nodes'].append({'id': node}) 128 129 # Do another pass to build the link information 130 for node, neighbors in self.adjacency_map.items(): 131 for neighbor in neighbors: 132 link = self.attributes_map[(node, neighbor)].copy() 133 link['source'] = node_to_number[node] 134 link['target'] = node_to_number[neighbor] 135 data['links'].append(link) 136 return data 137 138 139def strongly_connected_components(G): 140 ''' 141 Adapted from networkx: http://networkx.github.io/ 142 Parameters 143 ---------- 144 G : DiGraph 145 Returns 146 ------- 147 comp : generator of sets 148 A generator of sets of nodes, one for each strongly connected 149 component of G. 150 ''' 151 preorder = {} 152 lowlink = {} 153 scc_found = {} 154 scc_queue = [] 155 i = 0 # Preorder counter 156 for source in G.nodes(): 157 if source not in scc_found: 158 queue = [source] 159 while queue: 160 v = queue[-1] 161 if v not in preorder: 162 i = i + 1 163 preorder[v] = i 164 done = 1 165 v_nbrs = G.neighbors(v) 166 for w in v_nbrs: 167 if w not in preorder: 168 queue.append(w) 169 done = 0 170 break 171 if done == 1: 172 lowlink[v] = preorder[v] 173 for w in v_nbrs: 174 if w not in scc_found: 175 if preorder[w] > preorder[v]: 176 lowlink[v] = min([lowlink[v], lowlink[w]]) 177 else: 178 lowlink[v] = min([lowlink[v], preorder[w]]) 179 queue.pop() 180 if lowlink[v] == preorder[v]: 181 scc_found[v] = True 182 scc = {v} 183 while ( 184 scc_queue and preorder[scc_queue[-1]] > preorder[v] 185 ): 186 k = scc_queue.pop() 187 scc_found[k] = True 188 scc.add(k) 189 yield scc 190 else: 191 scc_queue.append(v) 192 193 194def simple_cycles(G): 195 ''' 196 Adapted from networkx: http://networkx.github.io/ 197 Parameters 198 ---------- 199 G : DiGraph 200 Returns 201 ------- 202 cycle_generator: generator 203 A generator that produces elementary cycles of the graph. 204 Each cycle is represented by a list of nodes along the cycle. 205 ''' 206 207 def _unblock(thisnode, blocked, B): 208 stack = set([thisnode]) 209 while stack: 210 node = stack.pop() 211 if node in blocked: 212 blocked.remove(node) 213 stack.update(B[node]) 214 B[node].clear() 215 216 # Johnson's algorithm requires some ordering of the nodes. 217 # We assign the arbitrary ordering given by the strongly connected comps 218 # There is no need to track the ordering as each node removed as processed. 219 # save the actual graph so we can mutate it here 220 # We only take the edges because we do not want to 221 # copy edge and node attributes here. 222 subG = G.subgraph(G.nodes()) 223 sccs = list(strongly_connected_components(subG)) 224 while sccs: 225 scc = sccs.pop() 226 # order of scc determines ordering of nodes 227 startnode = scc.pop() 228 # Processing node runs 'circuit' routine from recursive version 229 path = [startnode] 230 blocked = set() # vertex: blocked from search? 231 closed = set() # nodes involved in a cycle 232 blocked.add(startnode) 233 B = defaultdict(set) # graph portions that yield no elementary circuit 234 stack = [(startnode, list(subG.neighbors(startnode)))] 235 while stack: 236 thisnode, nbrs = stack[-1] 237 if nbrs: 238 nextnode = nbrs.pop() 239 if nextnode == startnode: 240 yield path[:] 241 closed.update(path) 242 elif nextnode not in blocked: 243 path.append(nextnode) 244 stack.append((nextnode, list(subG.neighbors(nextnode)))) 245 closed.discard(nextnode) 246 blocked.add(nextnode) 247 continue 248 # done with nextnode... look for more neighbors 249 if not nbrs: # no more nbrs 250 if thisnode in closed: 251 _unblock(thisnode, blocked, B) 252 else: 253 for nbr in subG.neighbors(thisnode): 254 if thisnode not in B[nbr]: 255 B[nbr].add(thisnode) 256 stack.pop() 257 path.pop() 258 # done processing this node 259 subG.remove_node(startnode) 260 H = subG.subgraph(scc) # make smaller to avoid work in SCC routine 261 sccs.extend(list(strongly_connected_components(H))) 262 263 264def find_cycle(graph): 265 ''' 266 Looks for a cycle in the graph. If found, returns the first cycle. 267 If nodes a1, a2, ..., an are in a cycle, then this returns: 268 [(a1,a2), (a2,a3), ... (an-1,an), (an, a1)] 269 Otherwise returns an empty list. 270 ''' 271 cycles = list(simple_cycles(graph)) 272 if cycles: 273 nodes = cycles[0] 274 nodes.append(nodes[0]) 275 edges = [] 276 prev = nodes[0] 277 for node in nodes[1:]: 278 edges.append((prev, node)) 279 prev = node 280 return edges 281 else: 282 return [] 283 284 285def print_cycle(binary, graph, edges, thread_info, print_stack_trace_fn): 286 ''' 287 Prints the cycle in the mutex graph in the following format: 288 289 Potential Deadlock Detected! 290 291 Cycle in lock order graph: M0 => M1 => M2 => M0 292 293 for (m, n) in cycle: 294 Mutex n acquired here while holding Mutex m in thread T: 295 [ stack trace ] 296 297 Mutex m previously acquired by thread T here: 298 [ stack trace ] 299 300 for T in all threads: 301 Thread T was created here: 302 [ stack trace ] 303 ''' 304 305 # List of mutexes in the cycle, first and last repeated 306 nodes_in_order = [] 307 # Map mutex address -> readable alias 308 node_addr_to_name = {} 309 for counter, (m, n) in enumerate(edges): 310 nodes_in_order.append(m) 311 # For global or static variables, try to symbolize the mutex address. 312 symbol = symbolize_with_objdump(binary, m) 313 if symbol: 314 symbol += ' ' 315 node_addr_to_name[m] = 'Mutex M%d (%s0x%016x)' % (counter, symbol, m) 316 nodes_in_order.append(nodes_in_order[0]) 317 318 print('----------------\nPotential Deadlock Detected!\n') 319 print( 320 'Cycle in lock order graph: %s\n' % 321 (' => '.join([node_addr_to_name[n] for n in nodes_in_order])) 322 ) 323 324 # Set of threads involved in the lock inversion 325 thread_pids = set() 326 327 # For each edge in the cycle, print where the two mutexes were held 328 for (m, n) in edges: 329 thread_pid = graph.attributes(m, n)['thread_pid'] 330 thread_comm = graph.attributes(m, n)['thread_comm'] 331 first_mutex_stack_id = graph.attributes(m, n)['first_mutex_stack_id'] 332 second_mutex_stack_id = graph.attributes(m, n)['second_mutex_stack_id'] 333 thread_pids.add(thread_pid) 334 print( 335 '%s acquired here while holding %s in Thread %d (%s):' % ( 336 node_addr_to_name[n], node_addr_to_name[m], thread_pid, 337 thread_comm 338 ) 339 ) 340 print_stack_trace_fn(second_mutex_stack_id) 341 print('') 342 print( 343 '%s previously acquired by the same Thread %d (%s) here:' % 344 (node_addr_to_name[m], thread_pid, thread_comm) 345 ) 346 print_stack_trace_fn(first_mutex_stack_id) 347 print('') 348 349 # Print where the threads were created, if available 350 for thread_pid in thread_pids: 351 parent_pid, stack_id, parent_comm = thread_info.get( 352 thread_pid, (None, None, None) 353 ) 354 if parent_pid: 355 print( 356 'Thread %d created by Thread %d (%s) here: ' % 357 (thread_pid, parent_pid, parent_comm) 358 ) 359 print_stack_trace_fn(stack_id) 360 else: 361 print( 362 'Could not find stack trace where Thread %d was created' % 363 thread_pid 364 ) 365 print('') 366 367 368def symbolize_with_objdump(binary, addr): 369 ''' 370 Searches the binary for the address using objdump. Returns the symbol if 371 it is found, otherwise returns empty string. 372 ''' 373 try: 374 command = ( 375 'objdump -tT %s | grep %x | awk {\'print $NF\'} | c++filt' % 376 (binary, addr) 377 ) 378 output = subprocess.check_output(command, shell=True) 379 return output.decode('utf-8').strip() 380 except subprocess.CalledProcessError: 381 return '' 382 383 384def strlist(s): 385 '''Given a comma-separated string, returns a list of substrings''' 386 return s.strip().split(',') 387 388 389def main(): 390 examples = '''Examples: 391 deadlock_detector 181 # Analyze PID 181 392 393 deadlock_detector 181 --binary /lib/x86_64-linux-gnu/libpthread.so.0 394 # Analyze PID 181 and locks from this binary. 395 # If tracing a process that is running from 396 # a dynamically-linked binary, this argument 397 # is required and should be the path to the 398 # pthread library. 399 400 deadlock_detector 181 --verbose 401 # Analyze PID 181 and print statistics about 402 # the mutex wait graph. 403 404 deadlock_detector 181 --lock-symbols my_mutex_lock1,my_mutex_lock2 \\ 405 --unlock-symbols my_mutex_unlock1,my_mutex_unlock2 406 # Analyze PID 181 and trace custom mutex 407 # symbols instead of pthread mutexes. 408 409 deadlock_detector 181 --dump-graph graph.json 410 # Analyze PID 181 and dump the mutex wait 411 # graph to graph.json. 412 ''' 413 parser = argparse.ArgumentParser( 414 description=( 415 'Detect potential deadlocks (lock inversions) in a running binary.' 416 '\nMust be run as root.' 417 ), 418 formatter_class=argparse.RawDescriptionHelpFormatter, 419 epilog=examples, 420 ) 421 parser.add_argument('pid', type=int, help='Pid to trace') 422 # Binaries with `:` in the path will fail to attach uprobes on kernels 423 # running without this patch: https://lkml.org/lkml/2017/1/13/585. 424 # Symlinks to the binary without `:` in the path can get around this issue. 425 parser.add_argument( 426 '--binary', 427 type=str, 428 default='', 429 help='If set, trace the mutexes from the binary at this path. ' 430 'For statically-linked binaries, this argument is not required. ' 431 'For dynamically-linked binaries, this argument is required and ' 432 'should be the path of the pthread library the binary is using. ' 433 'Example: /lib/x86_64-linux-gnu/libpthread.so.0', 434 ) 435 parser.add_argument( 436 '--dump-graph', 437 type=str, 438 default='', 439 help='If set, this will dump the mutex graph to the specified file.', 440 ) 441 parser.add_argument( 442 '--verbose', 443 action='store_true', 444 help='Print statistics about the mutex wait graph.', 445 ) 446 parser.add_argument( 447 '--lock-symbols', 448 type=strlist, 449 default=['pthread_mutex_lock'], 450 help='Comma-separated list of lock symbols to trace. Default is ' 451 'pthread_mutex_lock. These symbols cannot be inlined in the binary.', 452 ) 453 parser.add_argument( 454 '--unlock-symbols', 455 type=strlist, 456 default=['pthread_mutex_unlock'], 457 help='Comma-separated list of unlock symbols to trace. Default is ' 458 'pthread_mutex_unlock. These symbols cannot be inlined in the binary.', 459 ) 460 args = parser.parse_args() 461 if not args.binary: 462 try: 463 args.binary = os.readlink('/proc/%d/exe' % args.pid) 464 except OSError as e: 465 print('%s. Is the process (pid=%d) running?' % (str(e), args.pid)) 466 sys.exit(1) 467 468 bpf = BPF(src_file=b'deadlock_detector.c') 469 470 # Trace where threads are created 471 bpf.attach_kretprobe(event=bpf.get_syscall_fnname('clone'), fn_name='trace_clone') 472 473 # We must trace unlock first, otherwise in the time we attached the probe 474 # on lock() and have not yet attached the probe on unlock(), a thread can 475 # acquire mutexes and release them, but the release events will not be 476 # traced, resulting in noisy reports. 477 for symbol in args.unlock_symbols: 478 try: 479 bpf.attach_uprobe( 480 name=args.binary, 481 sym=symbol, 482 fn_name='trace_mutex_release', 483 pid=args.pid, 484 ) 485 except Exception as e: 486 print('%s. Failed to attach to symbol: %s' % (str(e), symbol)) 487 sys.exit(1) 488 for symbol in args.lock_symbols: 489 try: 490 bpf.attach_uprobe( 491 name=args.binary, 492 sym=symbol, 493 fn_name='trace_mutex_acquire', 494 pid=args.pid, 495 ) 496 except Exception as e: 497 print('%s. Failed to attach to symbol: %s' % (str(e), symbol)) 498 sys.exit(1) 499 500 def print_stack_trace(stack_id): 501 '''Closure that prints the symbolized stack trace.''' 502 for addr in bpf.get_table('stack_traces').walk(stack_id): 503 line = bpf.sym(addr, args.pid) 504 # Try to symbolize with objdump if we cannot with bpf. 505 if line == '[unknown]': 506 symbol = symbolize_with_objdump(args.binary, addr) 507 if symbol: 508 line = symbol 509 print('@ %016x %s' % (addr, line)) 510 511 print('Tracing... Hit Ctrl-C to end.') 512 while True: 513 try: 514 # Map of child thread pid -> parent info 515 thread_info = { 516 child.value: (parent.parent_pid, parent.stack_id, parent.comm) 517 for child, parent in bpf.get_table('thread_to_parent').items() 518 } 519 520 # Mutex wait directed graph. Nodes are mutexes. Edge (A,B) exists 521 # if there exists some thread T where lock(A) was called and 522 # lock(B) was called before unlock(A) was called. 523 graph = DiGraph() 524 for key, leaf in bpf.get_table('edges').items(): 525 graph.add_edge( 526 key.mutex1, 527 key.mutex2, 528 thread_pid=leaf.thread_pid, 529 thread_comm=leaf.comm.decode('utf-8'), 530 first_mutex_stack_id=leaf.mutex1_stack_id, 531 second_mutex_stack_id=leaf.mutex2_stack_id, 532 ) 533 if args.verbose: 534 print( 535 'Mutexes: %d, Edges: %d' % 536 (len(graph.nodes()), len(graph.edges())) 537 ) 538 if args.dump_graph: 539 with open(args.dump_graph, 'w') as f: 540 data = graph.node_link_data() 541 f.write(json.dumps(data, indent=2)) 542 543 cycle = find_cycle(graph) 544 if cycle: 545 print_cycle( 546 args.binary, graph, cycle, thread_info, print_stack_trace 547 ) 548 sys.exit(1) 549 550 time.sleep(1) 551 except KeyboardInterrupt: 552 break 553 554 555if __name__ == '__main__': 556 main() 557