# Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import logging
import os
import re
import shutil

from autotest_lib.client.common_lib import utils as client_utils
from autotest_lib.client.common_lib.cros import dev_server
from autotest_lib.client.common_lib.cros import retry
from autotest_lib.client.cros import constants
from autotest_lib.server.cros.dynamic_suite.constants import JOB_BUILD_KEY
from autotest_lib.server.crashcollect import collect_log_file
from autotest_lib.server import utils

try:
    from chromite.lib import metrics
except ImportError:
    metrics = client_utils.metrics_mock


def generate_minidump_stacktrace(minidump_path):
    """
    Generates a stacktrace for the specified minidump.

    This function expects the debug symbols to reside under:
        /build/<board>/usr/lib/debug

    The trace is written next to the dump, as "<minidump_path>.txt".

    @param minidump_path: absolute path to minidump to be symbolicated.
    @raise client_utils.error.CmdError if minidump_stackwalk return code != 0.
    """
    symbol_dir = '%s/../../../lib/debug' % utils.get_server_dir()
    logging.info('symbol_dir: %s', symbol_dir)
    client_utils.run('minidump_stackwalk "%s" "%s" > "%s.txt"' %
                     (minidump_path, symbol_dir, minidump_path))


def _resolve_crashserver():
    """
    Attempts to find a devserver / crashserver that has capacity to
    symbolicate a crashdump.

    A metrics counter is incremented for both the success and the failure
    outcome.

    @raises DevServerException if no server with capacity could be found.
    @returns Hostname of resolved server, if found.
    """
    crashserver_name = dev_server.get_least_loaded_devserver(
            devserver_type=dev_server.CrashServer)
    if not crashserver_name:
        metrics.Counter('chromeos/autotest/crashcollect/could_not_resolve'
                        ).increment()
        raise dev_server.DevServerException(
                'No crash server has the capacity to symbolicate the dump.')

    metrics.Counter('chromeos/autotest/crashcollect/resolved'
                    ).increment(fields={'crash_server': crashserver_name})
    return crashserver_name


def _symbolicate_minidump_with_devserver(minidump_path, resultdir,
                                         crashserver_name):
    """
    Generates a stack trace for the specified minidump by consulting devserver.

    This function assumes the debug symbols have been staged on the devserver.
    On success the trace is written to "<minidump_path>.txt".

    @param minidump_path: absolute path to minidump to be symbolicated.
    @param resultdir: server job's result directory.
    @param crashserver_name: Name of crashserver to attempt to symbolicate with.
    @raise DevServerException upon failure, HTTP or otherwise.
    """
    # First, look up what build we tested.  If we can't find this, we can't
    # get the right debug symbols, so we might as well give up right now.
    keyvals = client_utils.read_keyval(resultdir)
    if JOB_BUILD_KEY not in keyvals:
        raise dev_server.DevServerException(
            'Cannot determine build being tested.')

    devserver = dev_server.CrashServer(crashserver_name)

    # Only the remote symbolication call is timed.
    with metrics.SecondsTimer(
            'chromeos/autotest/crashcollect/symbolicate_duration',
            fields={'crash_server': crashserver_name}):
        trace_text = devserver.symbolicate_dump(minidump_path,
                                                keyvals[JOB_BUILD_KEY])

    if not trace_text:
        raise dev_server.DevServerException('Unknown error!!')
    with open(minidump_path + '.txt', 'w') as trace_file:
        trace_file.write(trace_text)


def generate_stacktrace_for_file(minidump, host_resultdir):
    """
    Tries to generate a stack trace for the file located at |minidump|.

    Local symbolication is attempted first; if that fails, a crash server is
    used. Failures are logged rather than raised for the two expected error
    types (CmdError locally, DevServerException remotely).

    @param minidump: path to minidump file to generate the stacktrace for.
    @param host_resultdir: server job's result directory.
    """
    # First, try to symbolicate locally.
    try:
        logging.info('Trying to generate stack trace locally for %s', minidump)
        generate_minidump_stacktrace(minidump)
        logging.info('Generated stack trace for dump %s', minidump)
        return
    except client_utils.error.CmdError as err:
        logging.info('Failed to generate stack trace locally for '
                     'dump %s (rc=%d):\n%r',
                     minidump, err.result_obj.exit_status, err)

    # If that did not succeed, try to symbolicate using the dev server.
    try:
        logging.info('Generating stack trace using devserver for %s', minidump)
        crashserver_name = _resolve_crashserver()
        args = (minidump, host_resultdir, crashserver_name)
        is_timeout, _ = retry.timeout(_symbolicate_minidump_with_devserver,
                                      args=args,
                                      timeout_sec=600)
        if is_timeout:
            logging.info('Generating stack trace timed out for dump %s',
                         minidump)
            metrics.Counter(
                    'chromeos/autotest/crashcollect/symbolicate_timed_out'
            ).increment(fields={'crash_server': crashserver_name})
        else:
            logging.info('Generated stack trace for dump %s', minidump)
            return
    except dev_server.DevServerException as e:
        logging.info('Failed to generate stack trace on devserver for dump '
                     '%s:\n%r', minidump, e)

    # Symbolicating failed (this is also reached after a timeout).
    logging.warning('Failed to generate stack trace for %s (see info logs)',
                    minidump)


def find_and_generate_minidump_stacktraces(host_resultdir):
    """
    Finds all minidump files and generates a stack trace for each.

    Enumerates all files under the test results directory (recursively)
    and generates a stack trace file for the minidumps.  Minidump files are
    identified as files with .dmp extension.  The stack trace filename is
    composed by appending the .txt extension to the minidump filename.

    @param host_resultdir: Directory to walk looking for dmp files.

    @returns The list of all found minidump files. Each dump may or may not have
             been symbolized.
    """
    minidumps = []
    for minidump in _find_crashdumps(host_resultdir):
        generate_stacktrace_for_file(minidump, host_resultdir)
        minidumps.append(minidump)
    return minidumps


def _find_crashdumps(host_resultdir):
    """Find crashdumps.

    Generator yielding the path of every file ending in '.dmp' found by
    walking host_resultdir recursively.

    @param host_resultdir The result directory for this host for this test run.
    """
    for dirpath, _, filenames in os.walk(host_resultdir):
        for filename in filenames:
            if filename.endswith('.dmp'):
                yield os.path.join(dirpath, filename)


def _find_orphaned_crashdumps(host):
    """Return file paths of crashdumps on host.

    Globs for every entry directly under constants.CRASH_DIR on the host.

    @param host A host object of the device.
    """
    return host.list_files_glob(os.path.join(constants.CRASH_DIR, '*'))


def report_crashdumps(host):
    """Report on crashdumps for host.

    This is run when no tests failed.  We don't process crashdumps in this
    case because of devserver load, but they should still be reported.

    Each crashdump is logged as a warning and recorded as an INFO entry on
    host.job, first for dumps still on the host and then for dumps already
    present in the local result directory.

    @param host A host object of the device we're to pull crashes from.
    """
    for crashfile in _find_orphaned_crashdumps(host):
        logging.warning('Host crashdump exists: %s', crashfile)
        host.job.record('INFO', None, None,
                        'Host crashdump exists: %s' % (crashfile,))

    host_resultdir = _get_host_resultdir(host)
    for crashfile in _find_crashdumps(host_resultdir):
        logging.warning('Local crashdump exists: %s', crashfile)
        host.job.record('INFO', None, None,
                        'Local crashdump exists: %s' % (crashfile,))


def fetch_orphaned_crashdumps(host, infodir):
    """
    Copy all of the crashes in the crash directory over to the results folder.

    Collection is best-effort: any exception raised while collecting is logged
    and the files collected so far are returned.

    @param host A host object of the device we're to pull crashes from.
    @param infodir The directory to fetch crashdumps into.
    @return The list of minidumps that we pulled back from the host.
    """
    if not os.path.exists(infodir):
        os.mkdir(infodir)
    orphans = []
    try:
        for crashfile in _find_orphaned_crashdumps(host):
            logging.info('Collecting %s...', crashfile)
            collect_log_file(host, crashfile, infodir, clean=True)
            orphans.append(crashfile)
    except Exception as e:
        logging.warning('Collection of orphaned crash dumps failed %s', e)
    finally:
        # Delete infodir if we have no orphans
        if not orphans:
            logging.info('There are no orphaned crashes; deleting %s', infodir)
            # rmdir refuses to remove a non-empty directory (e.g. after a
            # partially failed collection).  Keep this cleanup best-effort
            # instead of letting OSError escape from the finally clause.
            try:
                os.rmdir(infodir)
            except OSError as e:
                logging.warning('Could not delete %s: %s', infodir, e)
    return orphans


def _copy_to_debug_dir(host_resultdir, filename):
    """
    Copies a file to the debug dir under host_resultdir.

    A failed copy (IOError) is logged as a warning and otherwise ignored.

    @param host_resultdir The result directory for this host for this test run.
    @param filename The full path of the file to copy to the debug folder.
    """
    debugdir = os.path.join(host_resultdir, 'debug')
    src = filename
    dst = os.path.join(debugdir, os.path.basename(filename))

    try:
        shutil.copyfile(src, dst)
        logging.info('Copied %s to %s', src, dst)
    except IOError:
        logging.warning('Failed to copy %s to %s', src, dst)


def _get_host_resultdir(host):
    """Get resultdir for host.

    @param host A host object of the device we're to pull crashes from.
    @return host.job.resultdir, or None when the host has no 'job' attribute
            or the job has no 'resultdir' attribute.
    """
    return getattr(getattr(host, 'job', None), 'resultdir', None)


def get_host_infodir(host):
    """Get infodir for host.

    The infodir is "crashinfo.<hostname>" under the host's result directory.

    NOTE(review): _get_host_resultdir() may return None, in which case the
    os.path.join() below fails; callers appear to assume a job with a
    resultdir is attached to the host.

    @param host A host object of the device we're to pull crashes from.
    """
    host_resultdir = _get_host_resultdir(host)
    return os.path.join(host_resultdir, 'crashinfo.%s' % host.hostname)


def get_site_crashdumps(host, test_start_time):
    """
    Copy all of the crashdumps from a host to the results directory.

    @param host The host object from which to pull crashes
    @param test_start_time When the test we just ran started. (Not used by
                           this function.)
    @return A list of all the minidumps: the orphaned crashdumps fetched from
            the host followed by the minidumps found in the result directory.
    """
    host_resultdir = _get_host_resultdir(host)
    infodir = get_host_infodir(host)

    orphans = fetch_orphaned_crashdumps(host, infodir)
    minidumps = find_and_generate_minidump_stacktraces(host_resultdir)

    # Record all crashdumps in status.log of the job:
    # - If one server job runs several client jobs we will only record
    # crashdumps in the status.log of the high level server job.
    # - We will record these crashdumps whether or not we successfully
    # symbolicate them.
    # The parentheses matter: 'and' binds tighter than 'or', so without them
    # a non-empty orphans list would reach host.job.record() even when
    # host.job is None.
    if host.job and (minidumps or orphans):
        host.job.record('INFO', None, None, 'Start crashcollection record')
        for minidump in minidumps:
            host.job.record('INFO', None, 'New Crash Dump', minidump)
        for orphan in orphans:
            host.job.record('INFO', None, 'Orphaned Crash Dump', orphan)
        host.job.record('INFO', None, None, 'End crashcollection record')

    orphans.extend(minidumps)

    for minidump in orphans:
        report_bug_from_crash(host, minidump)

    # We copy Chrome crash information to the debug dir to assist debugging.
    # Since orphans occurred on a previous run, they are most likely not
    # relevant to the current failure, so we don't copy them.
    for minidump in minidumps:
        minidump_no_ext = os.path.splitext(minidump)[0]
        _copy_to_debug_dir(host_resultdir, minidump_no_ext + '.dmp.txt')
        _copy_to_debug_dir(host_resultdir, minidump_no_ext + '.log')

    return orphans


def find_package_of(host, exec_name):
    """
    Find the package that an executable came from.

    @param host A host object that has the executable.
    @param exec_name Name of or path to executable.
    @return The name of the package that installed the executable, or '' when
            zero or more than one candidate package is found.
    """
    # Run "portageq owners" on "host" to determine which package owns
    # "exec_name."  Portageq output consists of package names followed by
    # tab-prefixed path names.  For example, owners of "python:"
    #
    # sys-devel/gdb-7.7.1-r2
    #         /usr/share/gdb/python
    # chromeos-base/dev-install-0.0.1-r711
    #         /usr/bin/python
    # dev-lang/python-2.7.3-r7
    #         /etc/env.d/python
    #
    # This gets piped into "xargs stat" to annotate each line with
    # information about the path, so we later can consider only packages
    # with executable files.  After annotation the above looks like:
    #
    # stat: cannot stat '@@@ sys-devel/gdb-7.7.1-r2 @@@': ...
    # stat: cannot stat '/usr/share/gdb/python': ...
    # stat: cannot stat '@@@ chromeos-base/dev-install-0.0.1-r711 @@@': ...
    # 755 -rwxr-xr-x /usr/bin/python
    # stat: cannot stat '@@@ dev-lang/python-2.7.3-r7 @@@': ...
    # 755 drwxr-xr-x /etc/env.d/python
    #
    # Package names are surrounded by "@@@" to facilitate parsing.  Lines
    # starting with an octal number were successfully annotated, because
    # the path existed on "host."
    # The above is then parsed to find packages which contain executable files
    # (not directories), in this case "chromeos-base/dev-install-0.0.1-r711."
    #
    # TODO(milleral): portageq can show scary looking error messages
    # in the debug logs via stderr.  We only look at stdout, so those
    # get filtered, but it would be good to silence them.
    #
    # NOTE(review): exec_name is interpolated into the shell command without
    # quoting; confirm callers only pass trusted names.
    cmd = ('portageq owners / ' + exec_name +
           r'| sed -e "s/^[^\t].*/@@@ & @@@/" -e "s/^\t//"'
           r'| tr \\n \\0'
           ' | xargs -0 -r stat -L -c "%a %A %n" 2>&1')
    portageq = host.run(cmd, ignore_status=True)

    # Parse into a set of names of packages containing an executable file.
    packages = set()
    pkg = ''
    pkg_re = re.compile(r'@@@ (.*) @@@')
    path_re = re.compile(r'^([0-7]{3,}) (.)')
    for line in portageq.stdout.splitlines():
        match = pkg_re.search(line)
        if match:
            # A package header; the path lines that follow belong to it.
            pkg = match.group(1)
            continue
        match = path_re.match(line)
        if match:
            # Any of the user/group/other execute bits counts as executable.
            isexec = int(match.group(1), 8) & 0o111
            # First character of the symbolic mode: '-' is a regular file.
            isfile = match.group(2) == '-'
            if pkg and isexec and isfile:
                packages.add(pkg)

    # If exactly one package found it must be the one we want, return it.
    if len(packages) == 1:
        return packages.pop()

    # TODO(milleral): Decide if it really is an error if not exactly one
    # package is found.
    # It is highly questionable as to if this should be left in the
    # production version of this code or not.
    if not packages:
        logging.warning('find_package_of() found no packages for "%s"',
                        exec_name)
    else:
        # Sorted so the log message is deterministic.
        logging.warning('find_package_of() found multiple packages for "%s": '
                        '%s', exec_name, ', '.join(sorted(packages)))
    return ''


def report_bug_from_crash(host, minidump_path):
    """
    Given a host to query and a minidump, file a bug about the crash.

    Currently this only logs which package the crash would be reported
    against; the package is looked up from the 'exec_name' entry of the
    '.meta' file that sits next to the minidump.

    @param host A host object that is where the dump came from
    @param minidump_path The path to the dump file that should be reported.
    """
    # TODO(milleral): Once this has actually been tested, remove the
    # try/except. In the meantime, let's make sure nothing dies because of
    # the fact that this code isn't very heavily tested.
    try:
        meta_path = os.path.splitext(minidump_path)[0] + '.meta'
        with open(meta_path, 'r') as f:
            for line in f:
                # Split on the first '=' only so that a value which itself
                # contains '=' is kept intact.
                parts = line.split('=', 1)
                if parts[0] == 'exec_name':
                    package = find_package_of(host, parts[1].strip())
                    if not package:
                        package = '<unknown package>'
                    logging.info('Would report crash on %s.', package)
                    break
    except Exception as e:
        logging.warning('Crash detection failed with: %s', e)