#!/usr/bin/env python3
#
# Copyright (C) 2022 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Checks dwarf CFI (unwinding) information by comparing it to disassembly.
It is only a simple heuristic check of stack pointer adjustments.
Fully inferring CFI from disassembly is not possible in general.
"""

import bisect
import collections
import os
import pathlib
import re
import subprocess
import sys
from typing import Dict, List, Optional, Set, Tuple


def _run_tool(args: list) -> str:
    """ Run an external tool and return its standard output as text.

    Raises subprocess.CalledProcessError if the tool exits with non-zero status.
    """
    proc = subprocess.run(args,
                          encoding='utf-8',
                          capture_output=True,
                          check=True)
    return proc.stdout


# One row of the line-number table: address, source file name, line number
# (kept as the text printed by the tool) and the row's flags column.
Source = collections.namedtuple("Source", ["addr", "file", "line", "flag"])


def parse_source(dump: str) -> List[Source]:
    """ Parse the text printed by `llvm-dwarfdump --debug-line`.

    Only line tables which reference at least one ".S" file are considered.
    Rows flagged as "end_sequence" are dropped.  The result is sorted.
    """
    section_re = re.compile(r"^debug_line\[0x[0-9a-f]+\]$", re.MULTILINE)
    # Capture the whole file index.  A repeated single-digit group such as
    # `(\d)+` would keep only the last digit, so index 11 would alias index 1
    # and the lookup of the full index from the line table would fail.
    filename_re = re.compile(r'file_names\[ *(\d+)\]:\n\s*name: "(.*)"', re.MULTILINE)
    line_re = re.compile(r'0x([0-9a-f]{16}) +(\d+) +\d+ +(\d+)'  # addr, line, column, file
                         r' +\d+ +\d +(.*)')                     # isa, discriminator, flag

    results = []
    for section in section_re.split(dump):
        files = {m[1]: m[2] for m in filename_re.finditer(section)}
        if not any(name.endswith(".S") for name in files.values()):
            continue  # This line table does not describe any assembly file.
        results.extend(Source(int(addr, 16), files[file_index], line, flag)
                       for addr, line, file_index, flag in line_re.findall(section))
    return sorted(src for src in results if "end_sequence" not in src.flag)


def get_source(lib: pathlib.Path) -> List[Source]:
    """ Get source-file and line-number for all hand-written assembly code. """

    return parse_source(_run_tool(["llvm-dwarfdump", "--debug-line", lib]))


# One FDE: start address, end address, and the raw dumped text of the block.
Fde = collections.namedtuple("Fde", ["addr", "end", "data"])


def parse_fde(dump: str) -> List[Fde]:
    """ Parse the text printed by `llvm-dwarfdump --debug-frame`.

    Blocks whose start address is zero are skipped.  The result is sorted.
    """
    section_re = re.compile(r"\n(?! |\n)", re.MULTILINE)  # New-line not followed by indent.
    # The dots between the two addresses are literal, so they are escaped.
    fde_re = re.compile(r".* FDE .* pc=([0-9a-f]+)\.\.\.([0-9a-f]+)")

    results = []
    for section in section_re.split(dump):
        m = fde_re.match(section)
        if not m:
            continue
        fde = Fde(int(m[1], 16), int(m[2], 16), section)
        if fde.addr != 0:
            results.append(fde)
    return sorted(results)


def get_fde(lib: pathlib.Path) -> List[Fde]:
    """ Get all unwinding FDE blocks (in dumped text-based format) """

    return parse_fde(_run_tool(["llvm-dwarfdump", "--debug-frame", lib]))


# One disassembled symbol: start address, symbol name, raw dumped text.
Asm = collections.namedtuple("Asm", ["addr", "name", "data"])


def parse_asm(dump: str) -> List[Asm]:
    """ Parse the text printed by `llvm-objdump --disassemble`.

    Returns one sorted entry per "<address> <symbol>:" block.
    """
    section_re = re.compile(r"\n(?! |\n)", re.MULTILINE)  # New-line not followed by indent.
    sym_re = re.compile(r"([0-9a-f]+) <(.+)>:")

    results = []
    for section in section_re.split(dump):
        sym = sym_re.match(section)
        if sym:
            results.append(Asm(int(sym[1], 16), sym[2], section))
    return sorted(results)


def get_asm(lib: pathlib.Path) -> List[Asm]:
    """ Get disassembly for all methods (in dumped text-based format) """

    return parse_asm(_run_tool(["llvm-objdump", "--disassemble", lib]))


# One CFA row: address and the textual CFA expression (e.g. "WSP+32").
Cfa = collections.namedtuple("Cfa", ["addr", "cfa"])


def get_cfa(fde: Fde) -> List[Cfa]:
    """ Extract individual CFA (SP+offset) entries from the FDE block """

    cfa_re = re.compile(r"0x([0-9a-f]+): CFA=([^\s:]+)")
    return [Cfa(int(addr, 16), cfa) for addr, cfa in cfa_re.findall(fde.data)]


# One instruction: address, instruction text, name of the enclosing symbol.
Inst = collections.namedtuple("Inst", ["addr", "inst", "symbol"])


def get_instructions(asm: Asm) -> List[Inst]:
    """ Extract individual instructions from disassembled code block """

    # Collapse runs of spaces/tabs so the instruction patterns can use single spaces.
    data = re.sub(r"[ \t]+", " ", asm.data)
    # Address, optional raw bytes printed as two-digit hex groups, then the instruction.
    inst_re = re.compile(r"([0-9a-f]+): +(?:[0-9a-f]{2} +)*(.*)")
    return [Inst(int(addr, 16), inst, asm.name) for addr, inst in inst_re.findall(data)]


# Stack offset at an address; the offset is None when it could not be parsed.
CfaOffset = collections.namedtuple("CfaOffset", ["addr", "offset"])


def get_dwarf_cfa_offsets(cfas: List[Cfa]) -> List[CfaOffset]:
    """ Parse textual CFA entries into integer stack offsets

    "SP"/"WSP" map to 0 and "SP+N"/"WSP+N" map to N.  Any other expression
    yields an offset of None.
    """
    result = []
    for addr, cfa in cfas:
        if cfa in ("WSP", "SP"):
            result.append(CfaOffset(addr, 0))
        elif cfa.startswith(("WSP+", "SP+")):
            result.append(CfaOffset(addr, int(cfa.split("+")[1])))
        else:
            result.append(CfaOffset(addr, None))
    return result


def get_infered_cfa_offsets(insts: List[Inst]) -> List[CfaOffset]:
    """ Heuristic to convert disassembly into stack offsets

    The offset starts at zero at the first instruction.  An entry is emitted
    at each address where the running offset differs from the previous entry.
    Returns an empty list when there are no instructions.
    """
    if not insts:
        return []

    # Regular expressions which find instructions that adjust stack pointer.
    rexprs = []
    def add(rexpr, adjust_offset):
        rexprs.append((re.compile(rexpr), adjust_offset))
    add(r"sub sp,(?: sp,)? #(\d+)", lambda m: int(m[1]))
    add(r"add sp,(?: sp,)? #(\d+)", lambda m: -int(m[1]))
    add(r"str \w+, \[sp, #-(\d+)\]!", lambda m: int(m[1]))
    add(r"ldr \w+, \[sp\], #(\d+)", lambda m: -int(m[1]))
    add(r"stp \w+, \w+, \[sp, #-(\d+)\]!", lambda m: int(m[1]))
    add(r"ldp \w+, \w+, \[sp\], #(\d+)", lambda m: -int(m[1]))
    # Register lists: 8 bytes per listed "d" register, 4 bytes per register otherwise.
    add(r"vpush \{([d0-9, ]*)\}", lambda m: 8 * len(m[1].split(",")))
    add(r"vpop \{([d0-9, ]*)\}", lambda m: -8 * len(m[1].split(",")))
    add(r"v?push(?:\.w)? \{([\w+, ]*)\}", lambda m: 4 * len(m[1].split(",")))
    add(r"v?pop(?:\.w)? \{([\w+, ]*)\}", lambda m: -4 * len(m[1].split(",")))

    # Regular expression which identifies branches.
    jmp_re = re.compile(r"cb\w* \w+, 0x(\w+)|(?:b|bl|b\w\w) 0x(\w+)")

    offset, future_offset = 0, {}
    result = [CfaOffset(insts[0].addr, offset)]
    for addr, inst, _symbol in insts:
        # Previous code branched here, so use that offset instead.
        # This likely identifies slow-path which is after return.
        if addr in future_offset:
            offset = future_offset[addr]

        # Add entry to output (only if the offset changed).
        if result[-1].offset != offset:
            result.append(CfaOffset(addr, offset))

        # Adjust offset if the instruction modifies stack pointer.
        for rexpr, adjust_offset in rexprs:
            m = rexpr.match(inst)
            if m:
                offset += adjust_offset(m)
                break  # First matched pattern wins.

        # Record branches. We only support forward edges for now.
        m = jmp_re.match(inst)
        if m:
            future_offset[int(m[m.lastindex], 16)] = offset
    return result


def check_fde(fde: Fde, insts: List[Inst], srcs: Dict[int, str],
              verbose: bool = False) -> Tuple[Optional[str], Set[int]]:
    """ Compare DWARF offsets to assembly-inferred offsets. Report differences.

    Args:
      fde: The FDE block to check.
      insts: Instructions covered by the FDE.
      srcs: Maps instruction address to a "file:line" description.
      verbose: Print one line per instruction with both offsets.

    Returns:
      A pair of (description of the first mismatch or None, set of all
      instruction addresses that were visited).
    """
    error, seen_addrs = None, set()
    cfas = get_cfa(fde)
    i, dwarf_cfa = 0, get_dwarf_cfa_offsets(cfas)
    j, infered_cfa = 0, get_infered_cfa_offsets(insts)
    for inst in insts:
        seen_addrs.add(inst.addr)
        # Advance both cursors to the last entry at or before this instruction.
        while i+1 < len(dwarf_cfa) and dwarf_cfa[i+1].addr <= inst.addr:
            i += 1
        while j+1 < len(infered_cfa) and infered_cfa[j+1].addr <= inst.addr:
            j += 1
        if verbose:
            print("{:08x}: dwarf={:4} infered={:4} {:40} // {}".format(
                inst.addr, str(dwarf_cfa[i].offset), str(infered_cfa[j].offset),
                inst.inst.strip(), srcs.get(inst.addr, "")))
        # A DWARF offset of None could not be parsed, so it is not compared.
        if dwarf_cfa[i].offset is not None and dwarf_cfa[i].offset != infered_cfa[j].offset:
            if inst.addr in srcs:  # Only report if it maps to source code (not padding or literals).
                error = error or "{:08x} {}".format(inst.addr, srcs.get(inst.addr, ""))
    return error, seen_addrs


def check_lib(lib: pathlib.Path):
    """ Check all hand-written assembly in the library and print any problems.

    Prints an "ERROR at ..." report (followed by a verbose listing) for each FDE
    whose offsets disagree with the disassembly, and a "Missing CFI for ..."
    line for each assembly source address not covered by any checked FDE.

    Raises:
      FileNotFoundError: If the library does not exist.
    """
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if not lib.exists():
        raise FileNotFoundError("Library not found: {}".format(lib))
    IGNORE = (
        "art_quick_throw_null_pointer_exception_from_signal",  # Starts with non-zero offset.
        "art_quick_generic_jni_trampoline",  # Saves/restores SP in other register.
        "nterp_op_",  # Uses calculated CFA due to dynamic stack size.
        "$d.",  # Data (literals) interleaved within code.
    )
    fdes = get_fde(lib)
    asms = collections.deque(get_asm(lib))
    srcs = {src.addr: src.file + ":" + src.line for src in get_source(lib)}
    seen = set()  # Used to verify that we have covered all assembly source lines.

    for fde in fdes:
        if fde.addr not in srcs:
            continue  # Ignore if it is not hand-written assembly.

        # Assembly instructions (one FDE can cover several assembly chunks).
        all_insts, name = [], None
        while asms and asms[0].addr < fde.end:
            asm = asms.popleft()
            if asm.addr < fde.addr:
                continue
            insts = get_instructions(asm)
            if asm.name.startswith(IGNORE):
                seen.update(inst.addr for inst in insts)
                continue
            all_insts.extend(insts)
            name = name or asm.name
        if not all_insts:
            continue  # No assembly

        # Compare DWARF data to assembly instructions
        error, seen_addrs = check_fde(fde, all_insts, srcs)
        if error:
            print("ERROR at " + name + " " + error)
            check_fde(fde, all_insts, srcs, True)
            print("")
        seen.update(seen_addrs)
    for addr in sorted(srcs.keys() - seen):
        print("Missing CFI for {:08x}: {}".format(addr, srcs[addr]))


def main(argv):
    """ Check libraries provided on the command line, or use the default build output """

    libs = list(argv[1:])
    if not libs:
        out = os.environ.get("OUT")
        if not out:
            sys.exit("error: no libraries given and the OUT environment variable is not set")
        libs.append(out + "/symbols/apex/com.android.art/lib/libart.so")
        libs.append(out + "/symbols/apex/com.android.art/lib64/libart.so")
    for lib in libs:
        check_lib(pathlib.Path(lib))


if __name__ == "__main__":
    main(sys.argv)