from __future__ import print_function

import collections
import logging
import os
import pipes
import random
import shutil
import time

import common
from autotest_lib.client.bin.result_tools import runner as result_tools_runner
from autotest_lib.client.common_lib import error
from autotest_lib.client.common_lib import global_config
from autotest_lib.client.cros import constants
from autotest_lib.server import utils

try:
    from chromite.lib import metrics
except ImportError:
    metrics = utils.metrics_mock


# The amortized max filesize to collect.  For example, if _MAX_FILESIZE is 10
# then we would collect a file with size 20 half the time, and a file with size
# 40 a quarter of the time, so that in the long run we are collecting files
# with this max size.
_MAX_FILESIZE = 64 * (2 ** 20)  # 64 MiB
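# As a worked example of the policy above: with _MAX_FILESIZE at 64 MiB, a
# 128 MiB file is collected with probability 64/128 = 0.5 and a 256 MiB file
# with probability 0.25, so the expected bytes collected per oversized file
# stay at 64 MiB.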

class _RemoteTempDir(object):

    """Context manager for temporary directory on remote host."""

    def __init__(self, host):
        self.host = host
        self.tmpdir = None

    def __repr__(self):
        return '<{cls} host={this.host!r}, tmpdir={this.tmpdir!r}>'.format(
            cls=type(self).__name__, this=self)

    def __enter__(self):
        self.tmpdir = (self.host
                       .run('mktemp -d', stdout_tee=None)
                       .stdout.strip())
        return self.tmpdir

    def __exit__(self, exc_type, exc_value, exc_tb):
        self.host.run('rm -rf %s' % (pipes.quote(self.tmpdir),))

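# A minimal usage sketch for _RemoteTempDir (mirroring
# _collect_log_file_with_tmpdir below; `host` and `log_path` stand in for a
# connected RemoteHost and a remote path):
#
#     with _RemoteTempDir(host) as tmpdir:
#         host.run('cp -rp %s %s' % (pipes.quote(log_path),
#                                    pipes.quote(tmpdir)))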

def _collect_log_file_with_summary(host, source_path, dest_path):
    """Collects a log file from the remote machine with directory summary.

    @param host: The RemoteHost to collect logs from.
    @param source_path: The remote path to collect the log file from.
    @param dest_path: A path (file or directory) to write the copied logs into.
    """
    # Build the test result directory summary.
    summary_created = result_tools_runner.run_on_client(host, source_path)

    skip_summary_collection = True
    try:
        host.get_file(source_path, dest_path, preserve_perm=False)
        skip_summary_collection = False
    finally:
        if summary_created:
            # If dest_path is a file, use its parent folder to store the
            # directory summary file.
            if os.path.isfile(dest_path):
                dest_path = os.path.dirname(dest_path)
            # If dest_path doesn't exist, that means get_file failed, so there
            # is no need to collect the directory summary file.
            skip_summary_collection |= not os.path.exists(dest_path)
            result_tools_runner.collect_last_summary(
                    host, source_path, dest_path,
                    skip_summary_collection=skip_summary_collection)


def collect_log_file(host, log_path, dest_path, use_tmp=False, clean=False,
                     clean_content=False):
    """Collects a log file from the remote machine.

    Log files are collected from the remote machine and written into the
    destination path. If dest_path is a directory, the log file will be named
    using the basename of the remote log path.

    Very large files will randomly not be collected, to alleviate network
    traffic in the case of widespread crashes dumping large core files. Note
    that this check only applies to the exact file passed as log_path. For
    example, if this is a directory, the size of the contents will not be
    checked.

    @param host: The RemoteHost to collect logs from
    @param log_path: The remote path to collect the log file from
    @param dest_path: A path (file or directory) to write the copied logs into
    @param use_tmp: If True, will first copy the logs to a temporary directory
                    on the host and download logs from there.
    @param clean: If True, remove log_path on the host after the upload
                  attempt even if it failed.
    @param clean_content: If True, remove files and directories under log_path
            on the host after the upload attempt even if it failed.

    """
    logging.info('Collecting %s...', log_path)
    if not host.check_cached_up_status():
        logging.warning('Host %s did not answer to ping; skipping collection '
                        'of log file %s.', host.hostname, log_path)
        return
    try:
        file_stats = _get_file_stats(host, log_path)
        if not file_stats:
            # Failed to get file stats; the file may not exist.
            return

        if (not result_tools_runner.ENABLE_RESULT_THROTTLING and
            random.random() > file_stats.collection_probability):
            logging.warning('Collection of %s skipped: '
                            'size=%s, collection_probability=%s',
                            log_path, file_stats.size,
                            file_stats.collection_probability)
        elif use_tmp:
            _collect_log_file_with_tmpdir(host, log_path, dest_path)
        else:
            _collect_log_file_with_summary(host, log_path, dest_path)
    except Exception as e:
        logging.exception('Non-critical failure: collection of %s failed: %s',
                          log_path, e)
    finally:
        if clean_content:
            path_to_delete = os.path.join(pipes.quote(log_path), '*')
        elif clean:
            path_to_delete = pipes.quote(log_path)
        if clean or clean_content:
            host.run('rm -rf %s' % path_to_delete, ignore_status=True)
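
# A minimal call sketch for collect_log_file; the destination directory here
# is hypothetical:
#
#     collect_log_file(host, constants.LOG_DIR, '/tmp/results/var',
#                      use_tmp=True)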


_FileStats = collections.namedtuple('_FileStats',
                                    'size collection_probability')


def _collect_log_file_with_tmpdir(host, log_path, dest_path):
    """Collect log file from host through a temp directory on the host.

    @param host: The RemoteHost to collect logs from.
    @param log_path: The remote path to collect the log file from.
    @param dest_path: A path (file or directory) to write the copied logs into.

    """
    with _RemoteTempDir(host) as tmpdir:
        host.run('cp -rp %s %s' % (pipes.quote(log_path), pipes.quote(tmpdir)))
        source_path = os.path.join(tmpdir, os.path.basename(log_path))

        _collect_log_file_with_summary(host, source_path, dest_path)


def _get_file_stats(host, path):
    """Get the stats of a file from host.

    @param host: Instance of Host subclass with run().
    @param path: Path of file to check.
    @returns: _FileStats namedtuple with file size and collection probability,
            or None if the file stats could not be determined.
    """
    cmd = 'ls -ld %s | cut -d" " -f5' % (pipes.quote(path),)
    output = None
    file_size = 0
    try:
        output = host.run(cmd).stdout
    except error.CmdError as e:
        logging.warning('Getting size of file %r on host %r failed: %s. '
                        'Defaulting its size to 0.', path, host, e)
    try:
        if output is not None:
            file_size = int(output)
    except ValueError:
        logging.warning('Failed to convert size string "%s" for %s on host %r. '
                        'File may not exist.', output, path, host)
        return

    if file_size == 0:
        return _FileStats(0, 1.0)
    else:
        collection_probability = _MAX_FILESIZE / float(file_size)
        return _FileStats(file_size, collection_probability)
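
# For reference on the command above: for a single file, `ls -ld` typically
# prints a line like '-rw-r--r-- 1 root root 4096 Jan 1 00:00 /var/log/foo'
# (hypothetical file), so field 5 of the space-split output is the size in
# bytes.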


# Import any site hooks for the crashdump and crashinfo collection.
get_site_crashdumps = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "get_site_crashdumps",
    lambda host, test_start_time: None)
get_site_crashinfo = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "get_site_crashinfo",
    lambda host, test_start_time: None)
report_crashdumps = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "report_crashdumps",
    lambda host: None)
fetch_orphaned_crashdumps = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "fetch_orphaned_crashdumps",
    lambda host, host_resultdir: None)
get_host_infodir = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "get_host_infodir",
    lambda host: None)
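
# When the site_crashcollect module is absent, import_site_function falls back
# to the no-op lambdas passed above, so these hooks are safe to call on a
# stock checkout.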


@metrics.SecondsTimerDecorator(
        'chromeos/autotest/autoserv/get_crashdumps_duration')
def get_crashdumps(host, test_start_time):
    """Collect crashdumps from the host via the site hook.

    @param host: The RemoteHost to collect crashdumps from.
    @param test_start_time: The time the test started.
    """
    get_site_crashdumps(host, test_start_time)


@metrics.SecondsTimerDecorator(
        'chromeos/autotest/autoserv/get_crashinfo_duration')
def get_crashinfo(host, test_start_time):
    """Collect crash information from the host.

    @param host: The RemoteHost to collect crash information from.
    @param test_start_time: The time the test started.
    """
    logging.info("Collecting crash information...")

    # get_crashdumps collects orphaned crashdumps and symbolicates all
    # collected crashdumps. Symbolication could also happen during a postjob
    # task, at which time some crashdumps could have already been pulled back
    # from the machine, so it doesn't necessarily need to wait for the machine
    # to come up.
    get_crashdumps(host, test_start_time)

    if wait_for_machine_to_recover(host):
        # Run any site-specific collection.
        get_site_crashinfo(host, test_start_time)

        crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')
        collect_messages(host)
        collect_command(host, "dmesg", os.path.join(crashinfo_dir, "dmesg"))
        collect_uncollected_logs(host)

        # Collect everything in /var/log.
        log_path = os.path.join(crashinfo_dir, 'var')
        os.makedirs(log_path)
        collect_log_file(host, constants.LOG_DIR, log_path)

        # Collect console-ramoops.  The filename changed in linux-3.19, so
        # collect all the files in the pstore dirs.
        log_path = os.path.join(crashinfo_dir, 'pstore')
        for pstore_dir in constants.LOG_PSTORE_DIRS:
            collect_log_file(host, pstore_dir, log_path, use_tmp=True,
                             clean_content=True)
        # Collect i915_error_state, only available on Intel systems.
        # i915 contains the Intel graphics state. It might contain useful data
        # when a DUT hangs, times out or crashes.
        log_path = os.path.join(
                crashinfo_dir, os.path.basename(constants.LOG_I915_ERROR_STATE))
        collect_log_file(host, constants.LOG_I915_ERROR_STATE,
                         log_path, use_tmp=True)


# Load the default number of hours to wait before giving up on crash
# collection.
HOURS_TO_WAIT = global_config.global_config.get_config_value(
    'SERVER', 'crash_collection_hours_to_wait', type=float, default=4.0)


def wait_for_machine_to_recover(host, hours_to_wait=HOURS_TO_WAIT):
    """Wait for a machine (possibly down) to become accessible again.

    @param host: A RemoteHost instance to wait on
    @param hours_to_wait: Number of hours to wait before giving up

    @returns: True if the machine comes back up, False otherwise
    """
    current_time = time.strftime("%b %d %H:%M:%S", time.localtime())
    if host.is_up():
        logging.info("%s already up, collecting crash info", host.hostname)
        return True

    logging.info("Waiting %s hours for %s to come up (%s)",
                 hours_to_wait, host.hostname, current_time)
    if not host.wait_up(timeout=hours_to_wait * 3600):
        (metrics.Counter('chromeos/autotest/errors/collect_crashinfo_timeout')
         .increment())
        logging.warning("%s down, unable to collect crash info",
                        host.hostname)
        return False
    else:
        logging.info("%s is back up, collecting crash info", host.hostname)
        return True


def get_crashinfo_dir(host, dir_prefix):
    """Find, and if necessary create, a directory to store crashinfo in.

    @param host: The RemoteHost object that crashinfo will be collected from
    @param dir_prefix: Prefix of the directory name.

    @returns: The path to an existing directory for writing crashinfo into
    """
    host_resultdir = getattr(getattr(host, "job", None), "resultdir", None)
    if host_resultdir:
        infodir = host_resultdir
    else:
        infodir = os.path.abspath(os.getcwd())
    infodir = os.path.join(infodir, "%s.%s" % (dir_prefix, host.hostname))
    if not os.path.exists(infodir):
        os.mkdir(infodir)
    return infodir
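
# For example, with dir_prefix 'crashinfo' and a (hypothetical) host named
# 'dut1', crash info lands in <resultdir>/crashinfo.dut1.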


def collect_command(host, command, dest_path):
    """Collects the result of a command on the remote machine.

    The standard output of the command will be collected and written into the
    destination path. The destination path is assumed to be a filename and
    not a directory.

    @param host: The RemoteHost to collect from
    @param command: A shell command to run on the remote machine and capture
        the output from.
    @param dest_path: A file path to write the results of the log into
    """
    logging.info("Collecting '%s' ...", command)
    try:
        result = host.run(command, stdout_tee=None).stdout
        utils.open_write_close(dest_path, result)
    except Exception as e:
        logging.warning("Collection of '%s' failed:\n%s", command, e)


def collect_uncollected_logs(host):
    """Collects any leftover uncollected logs from the client.

    @param host: The RemoteHost to collect from
    """
    if host.job:
        try:
            logs = host.job.get_client_logs()
            for hostname, remote_path, local_path in logs:
                if hostname == host.hostname:
                    logging.info('Retrieving logs from %s:%s into %s',
                                 hostname, remote_path, local_path)
                    collect_log_file(host, remote_path + '/', local_path + '/')
        except Exception as e:
            logging.warning('Error while trying to collect stranded '
                            'Autotest client logs: %s', e)


def collect_messages(host):
    """Collects the 'new' contents of /var/log/messages.

    If host.VAR_LOG_MESSAGES_COPY_PATH is on the remote machine, collects
    the contents of /var/log/messages excluding whatever initial contents
    are already present in host.VAR_LOG_MESSAGES_COPY_PATH. If it is not
    present, simply collects the entire contents of /var/log/messages.

    @param host: The RemoteHost to collect from
    """
    crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')

    try:
        # Paths to the messages files.
        messages = os.path.join(crashinfo_dir, "messages")
        messages_raw = os.path.join(crashinfo_dir, "messages.raw")
        messages_at_start = os.path.join(crashinfo_dir, "messages.at_start")

        # Grab the files from the remote host.
        collect_log_file(host, host.VAR_LOG_MESSAGES_COPY_PATH,
                         messages_at_start)
        collect_log_file(host, "/var/log/messages", messages_raw)

        # Figure out how much of messages.raw to skip.
        if os.path.exists(messages_at_start):
            # The first line of the messages at start should match the first
            # line of the current messages; if they don't match, then messages
            # has been erased or rotated and we just grab all of it.
            first_line_at_start = utils.read_one_line(messages_at_start)
            first_line_now = utils.read_one_line(messages_raw)
            if first_line_at_start != first_line_now:
                size_at_start = 0
            else:
                size_at_start = os.path.getsize(messages_at_start)
        else:
            size_at_start = 0
        with open(messages_raw) as raw_messages_file, \
                open(messages, "w") as messages_file:
            raw_messages_file.seek(size_at_start)
            shutil.copyfileobj(raw_messages_file, messages_file)

        # Get rid of the "raw" versions of messages.
        os.remove(messages_raw)
        if os.path.exists(messages_at_start):
            os.remove(messages_at_start)
    except Exception as e:
        logging.warning("Error while collecting /var/log/messages: %s", e)
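
# A worked example of the skip logic in collect_messages: if messages.at_start
# is 2048 bytes and its first line still matches messages.raw, only the bytes
# from offset 2048 onward are copied into the final messages file.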
389