• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import logging
6import os
7import re
8import shutil
9from autotest_lib.client.common_lib import utils as client_utils
10from autotest_lib.client.common_lib.cros import dev_server
11from autotest_lib.client.common_lib.cros import retry
12from autotest_lib.client.cros import constants
13from autotest_lib.server.cros.dynamic_suite.constants import JOB_BUILD_KEY
14from autotest_lib.server.crashcollect import collect_log_file
15from autotest_lib.server import utils
16
17try:
18    from chromite.lib import metrics
19except ImportError:
20    metrics = client_utils.metrics_mock
21
22
def generate_minidump_stacktrace(minidump_path):
    """
    Symbolicates a minidump locally by running minidump_stackwalk.

    The output of the tool is redirected to <minidump_path>.txt.

    This function expects the debug symbols to reside under:
        /build/<board>/usr/lib/debug

    @param minidump_path: absolute path to minidump to be symbolicated.
    @raise client_utils.error.CmdError if minidump_stackwalk return code != 0.
    """
    # The symbol directory is addressed relative to the server directory.
    server_dir = utils.get_server_dir()
    symbol_dir = '%s/../../../lib/debug' % server_dir
    logging.info('symbol_dir: %s', symbol_dir)

    stackwalk_cmd = 'minidump_stackwalk "%s" "%s" > "%s.txt"' % (
            minidump_path, symbol_dir, minidump_path)
    client_utils.run(stackwalk_cmd)
37
38
def _resolve_crashserver():
    """
    Asks dev_server for the least loaded crashserver to symbolicate with.

    A counter metric is incremented for either outcome: 'resolved' (tagged
    with the chosen server) or 'could_not_resolve'.

    @raises DevServerException if no server with capacity could be found.
    @returns Hostname of resolved server, if found.
    """
    resolved = dev_server.get_least_loaded_devserver(
            devserver_type=dev_server.CrashServer)
    if resolved:
        metrics.Counter('chromeos/autotest/crashcollect/resolved'
                        ).increment(fields={'crash_server': resolved})
        return resolved

    # Nothing came back: count the miss, then report it to the caller.
    metrics.Counter('chromeos/autotest/crashcollect/could_not_resolve'
                    ).increment()
    raise dev_server.DevServerException(
            'No crash server has the capacity to symbolicate the dump.')
58
59
def _symbolicate_minidump_with_devserver(minidump_path, resultdir,
                                         crashserver_name):
    """
    Symbolicates a minidump remotely and stores the trace next to the dump.

    The build under test is read from the job keyvals in |resultdir| and
    passed to the crashserver together with the dump.  The returned text is
    written to <minidump_path>.txt.

    This function assumes the debug symbols have been staged on the devserver.

    @param minidump_path: absolute path to minidump to be symbolicated.
    @param resultdir: server job's result directory.
    @param crashserver_name: Name of crashserver to attempt to symbolicate with.
    @raise DevServerException upon failure, HTTP or otherwise.
    """
    # Without the build we cannot pick the right debug symbols, so bail out
    # before contacting the server at all.
    job_keyvals = client_utils.read_keyval(resultdir)
    if JOB_BUILD_KEY not in job_keyvals:
        raise dev_server.DevServerException(
            'Cannot determine build being tested.')
    build = job_keyvals[JOB_BUILD_KEY]

    crashserver = dev_server.CrashServer(crashserver_name)
    timer_fields = {'crash_server': crashserver_name}

    # Only the remote call itself is timed.
    with metrics.SecondsTimer(
            'chromeos/autotest/crashcollect/symbolicate_duration',
            fields=timer_fields):
        stack_trace = crashserver.symbolicate_dump(minidump_path, build)

    if not stack_trace:
        raise dev_server.DevServerException('Unknown error!!')

    trace_path = minidump_path + '.txt'
    with open(trace_path, 'w') as out:
        out.write(stack_trace)
91
def generate_stacktrace_for_file(minidump, host_resultdir):
    """
    Tries to generate a stack trace for the file located at |minidump|.

    Local symbolication is attempted first; if that fails with a CmdError,
    a crashserver is tried with a 600 second timeout.  CmdError and
    DevServerException failures are logged, not raised.

    @param minidump: path to minidump file to generate the stacktrace for.
    @param host_resultdir: server job's result directory.
    """
    # Attempt 1: symbolicate on this machine.
    logging.info('Trying to generate stack trace locally for %s', minidump)
    try:
        generate_minidump_stacktrace(minidump)
    except client_utils.error.CmdError as err:
        logging.info('Failed to generate stack trace locally for '
                     'dump %s (rc=%d):\n%r',
                     minidump, err.result_obj.exit_status, err)
    else:
        logging.info('Generated stack trace for dump %s', minidump)
        return

    # Attempt 2: hand the dump to a crashserver.
    try:
        logging.info('Generating stack trace using devserver for %s', minidump)
        crashserver_name = _resolve_crashserver()
        timed_out, _ = retry.timeout(
                _symbolicate_minidump_with_devserver,
                args=(minidump, host_resultdir, crashserver_name),
                timeout_sec=600)
        if not timed_out:
            logging.info('Generated stack trace for dump %s', minidump)
            return
        logging.info('Generating stack trace timed out for dump %s',
                     minidump)
        metrics.Counter(
                'chromeos/autotest/crashcollect/symbolicate_timed_out'
        ).increment(fields={'crash_server': crashserver_name})
    except dev_server.DevServerException as e:
        logging.info('Failed to generate stack trace on devserver for dump '
                     '%s:\n%r', minidump, e)

    # Both attempts failed (or the remote one timed out).
    logging.warning('Failed to generate stack trace for %s (see info logs)',
                    minidump)
133
def find_and_generate_minidump_stacktraces(host_resultdir):
    """
    Symbolicates every minidump found below the results directory.

    The directory is walked recursively; every file with the .dmp extension
    is treated as a minidump.  For each one a stack trace is requested, whose
    filename is the minidump filename with .txt appended.

    @param host_resultdir: Directory to walk looking for dmp files.

    @returns The list of all found minidump files. Each dump may or may not have
             been symbolized.
    """
    found = []
    for dump_path in _find_crashdumps(host_resultdir):
        # Symbolication is attempted before the dump is recorded as found.
        generate_stacktrace_for_file(dump_path, host_resultdir)
        found.append(dump_path)
    return found
153
154
155def _find_crashdumps(host_resultdir):
156    """Find crashdumps.
157
158    @param host_resultdir The result directory for this host for this test run.
159    """
160    for dir, subdirs, files in os.walk(host_resultdir):
161        for file in files:
162            if file.endswith('.dmp'):
163                yield os.path.join(dir, file)
164
165
def _find_orphaned_crashdumps(host):
    """Glob for everything directly inside the crash directory on host.

    @param host A host object of the device.
    @return The result of host.list_files_glob() for <CRASH_DIR>/*.
    """
    crash_glob = os.path.join(constants.CRASH_DIR, '*')
    return host.list_files_glob(crash_glob)
172
173
def report_crashdumps(host):
    """Log and record every crashdump known for host, without processing it.

    This is run when no tests failed.  We don't process crashdumps in this
    case because of devserver load, but they should still be reported.

    Crashdumps still on the device are reported first, then those already
    present in the local results directory.

    @param host A host object of the device we're to pull crashes from.
    """
    def _report(template, path):
        # Same message goes to the log and to the job status record.
        logging.warning(template, path)
        host.job.record('INFO', None, None, template % (path,))

    for remote_dump in _find_orphaned_crashdumps(host):
        _report('Host crashdump exists: %s', remote_dump)

    host_resultdir = _get_host_resultdir(host)
    for local_dump in _find_crashdumps(host_resultdir):
        _report('Local crashdump exists: %s', local_dump)
192
193
def fetch_orphaned_crashdumps(host, infodir):
    """
    Copy all of the crashes in the crash directory over to the results folder.

    Collection is best effort: any exception raised while listing or
    collecting crashes is logged and the crashes collected so far are
    returned.  When nothing was collected, |infodir| is removed again if it
    is empty; a directory that cannot be removed (e.g. because it already
    holds other files) is left in place with a warning instead of raising.

    If the host is not up, nothing is fetched and |infodir| is left as is.

    @param host A host object of the device we're to pull crashes from.
    @param infodir The directory to fetch crashdumps into.
    @return The list of minidumps that we pulled back from the host.
    """
    if not os.path.exists(infodir):
        os.mkdir(infodir)
    orphans = []

    if not host.check_cached_up_status():
        logging.warning('Host %s did not answer to ping, skip fetching '
                        'orphaned crashdumps.', host.hostname)
        return orphans

    try:
        for crash_path in _find_orphaned_crashdumps(host):
            logging.info('Collecting %s...', crash_path)
            collect_log_file(host, crash_path, infodir, clean=True)
            orphans.append(crash_path)
    except Exception as e:
        logging.warning('Collection of orphaned crash dumps failed %s', e)
    finally:
        # Delete infodir if we have no orphans
        if not orphans:
            logging.info('There are no orphaned crashes; deleting %s', infodir)
            try:
                os.rmdir(infodir)
            except OSError as e:
                # os.rmdir only removes empty directories.  A non-empty
                # infodir must not turn best-effort collection into a crash.
                logging.warning('Could not delete %s: %s', infodir, e)
    return orphans
224
225
226def _copy_to_debug_dir(host_resultdir, filename):
227    """
228    Copies a file to the debug dir under host_resultdir.
229
230    @param host_resultdir The result directory for this host for this test run.
231    @param filename The full path of the file to copy to the debug folder.
232    """
233    debugdir = os.path.join(host_resultdir, 'debug')
234    src = filename
235    dst = os.path.join(debugdir, os.path.basename(filename))
236
237    try:
238        shutil.copyfile(src, dst)
239        logging.info('Copied %s to %s', src, dst)
240    except IOError:
241        logging.warning('Failed to copy %s to %s', src, dst)
242
243
244def _get_host_resultdir(host):
245    """Get resultdir for host.
246
247    @param host A host object of the device we're to pull crashes from.
248    """
249    return getattr(getattr(host, 'job', None), 'resultdir', None)
250
251
def get_host_infodir(host):
    """Build the path <resultdir>/crashinfo.<hostname> for host.

    @param host A host object of the device we're to pull crashes from.
    """
    resultdir = _get_host_resultdir(host)
    infodir_name = 'crashinfo.%s' % host.hostname
    return os.path.join(resultdir, infodir_name)
259
260
def get_site_crashdumps(host, test_start_time):
    """
    Copy all of the crashdumps from a host to the results directory.

    Orphaned crashdumps are fetched from the host, stack traces are generated
    for every minidump found under the results directory, all dumps are
    recorded in the job status log, and the trace/log files of the new
    minidumps are copied to the debug directory.

    @param host The host object from which to pull crashes
    @param test_start_time When the test we just ran started.  Not used by
                           this function.
    @return A list of all the minidumps: the fetched orphans followed by the
            minidumps found under the results directory.
    """
    host_resultdir = _get_host_resultdir(host)
    infodir = get_host_infodir(host)

    orphans = fetch_orphaned_crashdumps(host, infodir)
    minidumps = find_and_generate_minidump_stacktraces(host_resultdir)

    # Record all crashdumps in status.log of the job:
    # - If one server job runs several client jobs we will only record
    # crashdumps in the status.log of the high level server job.
    # - We will record these crashdumps whether or not we successfully
    # symbolicate them.
    # The parentheses matter: "a and b or c" groups as "(a and b) or c",
    # which would dereference host.job even when it is unset.
    if host.job and (minidumps or orphans):
        host.job.record('INFO', None, None, 'Start crashcollection record')
        for minidump in minidumps:
            host.job.record('INFO', None, 'New Crash Dump', minidump)
        for orphan in orphans:
            host.job.record('INFO', None, 'Orphaned Crash Dump', orphan)
        host.job.record('INFO', None, None, 'End crashcollection record')

    # From here on |orphans| holds every dump, orphaned or new.
    orphans.extend(minidumps)

    for minidump in orphans:
        report_bug_from_crash(host, minidump)

    # We copy Chrome crash information to the debug dir to assist debugging.
    # Since orphans occurred on a previous run, they are most likely not
    # relevant to the current failure, so we don't copy them.
    for minidump in minidumps:
        minidump_no_ext = os.path.splitext(minidump)[0]
        _copy_to_debug_dir(host_resultdir, minidump_no_ext + '.dmp.txt')
        _copy_to_debug_dir(host_resultdir, minidump_no_ext + '.log')

    return orphans
302
303
def find_package_of(host, exec_name):
    """
    Find the package that an executable came from.

    @param host A host object that has the executable.
    @param exec_name Name of or path to executable.  It is shell-quoted
                     before being placed on the command line.
    @return The name of the package that installed the executable, or ''
            if zero or several candidate packages were found.
    """
    # Local import keeps this working on both Python 3 (shlex.quote) and
    # Python 2 (pipes.quote) without touching the module imports.
    try:
        from shlex import quote as shell_quote
    except ImportError:
        from pipes import quote as shell_quote

    # Run "portageq owners" on "host" to determine which package owns
    # "exec_name."  Portageq output consists of package names followed by
    # tab-prefixed path names.  For example, owners of "python:"
    #
    # sys-devel/gdb-7.7.1-r2
    #         /usr/share/gdb/python
    # chromeos-base/dev-install-0.0.1-r711
    #         /usr/bin/python
    # dev-lang/python-2.7.3-r7
    #         /etc/env.d/python
    #
    # This gets piped into "xargs stat" to annotate each line with
    # information about the path, so we later can consider only packages
    # with executable files.  After annotation the above looks like:
    #
    # stat: cannot stat '@@@ sys-devel/gdb-7.7.1-r2 @@@': ...
    # stat: cannot stat '/usr/share/gdb/python': ...
    # stat: cannot stat '@@@ chromeos-base/dev-install-0.0.1-r711 @@@': ...
    # 755 -rwxr-xr-x /usr/bin/python
    # stat: cannot stat '@@@ dev-lang/python-2.7.3-r7 @@@': ...
    # 755 drwxr-xr-x /etc/env.d/python
    #
    # Package names are surrounded by "@@@" to facilitate parsing.  Lines
    # starting with an octal number were successfully annotated, because
    # the path existed on "host."
    # The above is then parsed to find packages which contain executable files
    # (not directories), in this case "chromeos-base/dev-install-0.0.1-r711."
    #
    # TODO(milleral): portageq can show scary looking error messages
    # in the debug logs via stderr. We only look at stdout, so those
    # get filtered, but it would be good to silence them.
    #
    # exec_name is quoted so that spaces or shell metacharacters in it cannot
    # alter the pipeline; plain names are left unchanged by the quoting.
    cmd = ('portageq owners / ' + shell_quote(exec_name) +
            r'| sed -e "s/^[^\t].*/@@@ & @@@/" -e "s/^\t//"'
            r'| tr \\n \\0'
            ' | xargs -0 -r stat -L -c "%a %A %n" 2>&1')
    portageq = host.run(cmd, ignore_status=True)

    # Parse into a set of names of packages containing an executable file.
    packages = set()
    pkg = ''
    pkg_re = re.compile('@@@ (.*) @@@')
    path_re = re.compile('^([0-7]{3,}) (.)')
    for line in portageq.stdout.splitlines():
        match = pkg_re.search(line)
        if match:
            # A package header: following path lines belong to this package.
            pkg = match.group(1)
            continue
        match = path_re.match(line)
        if match:
            # Any execute bit set, and a regular file per the mode string.
            isexec = int(match.group(1), 8) & 0o111
            isfile = match.group(2) == '-'
            if pkg and isexec and isfile:
                packages.add(pkg)

    # If exactly one package found it must be the one we want, return it.
    if len(packages) == 1:
        return packages.pop()

    # TODO(milleral): Decide if it really is an error if not exactly one
    # package is found.
    # It is highly questionable as to if this should be left in the
    # production version of this code or not.
    if not packages:
        logging.warning('find_package_of() found no packages for "%s"',
                        exec_name)
    else:
        # Sorted so the log line is stable across runs.
        logging.warning('find_package_of() found multiple packages for "%s": '
                        '%s', exec_name, ', '.join(sorted(packages)))
    return ''
381
382
def report_bug_from_crash(host, minidump_path):
    """
    Given a host to query and a minidump, file a bug about the crash.

    The crash's .meta file (minidump path with its extension replaced) is
    scanned for the first "exec_name=<value>" line; the owning package is
    looked up on the host and logged.  No bug is actually filed yet.  Any
    exception is logged as a warning and swallowed.

    @param host A host object that is where the dump came from
    @param minidump_path The path to the dump file that should be reported.
    """
    # TODO(milleral): Once this has actually been tested, remove the
    # try/except. In the meantime, let's make sure nothing dies because of
    # the fact that this code isn't very heavily tested.
    try:
        meta_path = os.path.splitext(minidump_path)[0] + '.meta'
        with open(meta_path, 'r') as f:
            for line in f:
                # Split on the first '=' only, so a value that itself
                # contains '=' is kept whole.  Lines without '=' are skipped.
                key, sep, value = line.partition('=')
                if sep and key == 'exec_name':
                    package = find_package_of(host, value.strip())
                    if not package:
                        package = '<unknown package>'
                    logging.info('Would report crash on %s.', package)
                    break
    except Exception as e:
        logging.warning('Crash detection failed with: %s', e)
406