• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1import collections
2import logging
3import os
4import pipes
5import random
6import shutil
7import time
8
9import common
10from autotest_lib.client.bin.result_tools import runner as result_tools_runner
11from autotest_lib.client.common_lib import error
12from autotest_lib.client.common_lib import global_config
13from autotest_lib.client.cros import constants
14from autotest_lib.server import utils
15
16try:
17    from chromite.lib import metrics
18except ImportError:
19    metrics = utils.metrics_mock
20
21
# The amortized max filesize to collect.  For example, if _MAX_FILESIZE is 10
# then we would collect a file with size 20 half the time, and a file with size
# 40 a quarter of the time, so that in the long run we are collecting files
# with this max size.
# Used by _get_file_stats to compute a per-file collection probability.
_MAX_FILESIZE = 64 * (2 ** 20)  # 64 MiB, in bytes
27
class _RemoteTempDir(object):

    """Context manager that owns a temporary directory on a remote host.

    Entering the context creates the directory with mktemp and returns its
    path; leaving the context removes it (and anything copied into it).
    """

    def __init__(self, host):
        # Host object providing run(); the directory lives on this host.
        self.host = host
        # Remote path of the directory; None until __enter__ runs.
        self.tmpdir = None

    def __repr__(self):
        return '<{cls} host={this.host!r}, tmpdir={this.tmpdir!r}>'.format(
            cls=type(self).__name__, this=self)

    def __enter__(self):
        result = self.host.run('mktemp -d', stdout_tee=None)
        self.tmpdir = result.stdout.strip()
        return self.tmpdir

    def __exit__(self, exc_type, exc_value, exc_tb):
        # Best-effort cleanup; quote the path so shell metacharacters in the
        # mktemp output cannot break the command.
        self.host.run('rm -rf %s' % (pipes.quote(self.tmpdir),))
48
49
def _collect_log_file_with_summary(host, source_path, dest_path):
    """Collect a log file from the remote machine plus a directory summary.

    @param host: The RemoteHost to collect logs from.
    @param source_path: The remote path to collect the log file from.
    @param dest_path: A path (file or directory) to write the copied logs into.
    """
    # Build a summary of the remote result directory before downloading it.
    summary_created = result_tools_runner.run_on_client(host, source_path)

    get_file_succeeded = False
    try:
        host.get_file(source_path, dest_path, preserve_perm=False)
        get_file_succeeded = True
    finally:
        if summary_created:
            # The summary file sits next to the copied logs, so when
            # dest_path names a file, use its parent directory instead.
            summary_dest = dest_path
            if os.path.isfile(summary_dest):
                summary_dest = os.path.dirname(summary_dest)
            # A missing destination means get_file failed, so there is
            # nothing to attach a summary to.
            skip = (not get_file_succeeded
                    or not os.path.exists(summary_dest))
            result_tools_runner.collect_last_summary(
                    host, source_path, summary_dest,
                    skip_summary_collection=skip)
76
77
def collect_log_file(host, log_path, dest_path, use_tmp=False, clean=False,
                     clean_content=False):
    """Collect a log file from the remote machine.

    Log files are collected from the remote machine and written into the
    destination path. If dest_path is a directory, the log file will be named
    using the basename of the remote log path.

    Very large files will randomly not be collected, to alleviate network
    traffic in the case of widespread crashes dumping large core files. Note
    that this check only applies to the exact file passed as log_path. For
    example, if this is a directory, the size of the contents will not be
    checked.

    @param host: The RemoteHost to collect logs from
    @param log_path: The remote path to collect the log file from
    @param dest_path: A path (file or directory) to write the copied logs into
    @param use_tmp: If True, will first copy the logs to a temporary directory
                    on the host and download logs from there.
    @param clean: If True, remove dest_path after upload attempt even if it
                  failed.
    @param clean_content: If True, remove files and directories in dest_path
            after upload attempt even if it failed.

    """
    logging.info('Collecting %s...', log_path)
    # Skip hosts that are not reachable; there is nothing to collect.
    if not host.check_cached_up_status():
        logging.warning('Host %s did not answer to ping, skip collecting log '
                        'file %s.', host.hostname, log_path)
        return
    try:
        file_stats = _get_file_stats(host, log_path)
        if not file_stats:
            # Failed to get file stat, the file may not exist.
            return

        # When result throttling is off, probabilistically skip very large
        # files (see _MAX_FILESIZE) to keep average transfer size bounded.
        throttling_disabled = not result_tools_runner.ENABLE_RESULT_THROTTLING
        if (throttling_disabled and
                random.random() > file_stats.collection_probability):
            logging.warning('Collection of %s skipped:'
                            'size=%s, collection_probability=%s',
                            log_path, file_stats.size,
                            file_stats.collection_probability)
        elif use_tmp:
            _collect_log_file_with_tmpdir(host, log_path, dest_path)
        else:
            _collect_log_file_with_summary(host, log_path, dest_path)
    except Exception as e:
        # Log collection is best-effort; never fail the caller.
        logging.exception('Non-critical failure: collection of %s failed: %s',
                          log_path, e)
    finally:
        # Remote cleanup runs even when collection failed or was skipped.
        if clean_content:
            # Remove only the directory's contents, keeping the directory.
            host.run('rm -rf %s' % os.path.join(pipes.quote(log_path), '*'),
                     ignore_status=True)
        elif clean:
            host.run('rm -rf %s' % pipes.quote(log_path), ignore_status=True)
134
135
# Result of _get_file_stats: the remote file's size in bytes and the
# probability with which collect_log_file should actually fetch it.
_FileStats = collections.namedtuple('_FileStats',
                                    'size collection_probability')
138
139
def _collect_log_file_with_tmpdir(host, log_path, dest_path):
    """Collect a log file from host by staging it in a remote temp directory.

    @param host: The RemoteHost to collect logs from.
    @param log_path: The remote path to collect the log file from.
    @param dest_path: A path (file or directory) to write the copied logs into.

    """
    with _RemoteTempDir(host) as tmpdir:
        # Snapshot the logs into the temp dir first, then download the copy;
        # the temp dir is removed when the context manager exits.
        host.run('cp -rp %s %s' % (pipes.quote(log_path), pipes.quote(tmpdir)))
        staged_path = os.path.join(tmpdir, os.path.basename(log_path))
        _collect_log_file_with_summary(host, staged_path, dest_path)
153
154
def _get_file_stats(host, path):
    """Get the size and collection probability of a file on host.

    @param host: Instance of Host subclass with run().
    @param path: Path of file to check.
    @returns: _FileStats namedtuple with file size and collection probability,
            or None if the size output could not be parsed (file may not
            exist).
    """
    # Field 5 of `ls -ld` is the size in bytes.
    size_cmd = 'ls -ld %s | cut -d" " -f5' % (pipes.quote(path),)
    raw_size = None
    try:
        raw_size = host.run(size_cmd).stdout
    except error.CmdError as e:
        logging.warning('Getting size of file %r on host %r failed: %s. '
                        'Default its size to 0', path, host, e)

    size_in_bytes = 0
    if raw_size is not None:
        try:
            size_in_bytes = int(raw_size)
        except ValueError:
            logging.warning(
                    'Failed to convert size string "%s" for %s on host %r. '
                    'File may not exist.', raw_size, path, host)
            return

    if size_in_bytes == 0:
        # Zero-size (or stat-failed) files are always collected.
        return _FileStats(0, 1.0)
    # Larger files get proportionally smaller odds of being collected.
    return _FileStats(size_in_bytes, _MAX_FILESIZE / float(size_in_bytes))
183
184
# import any site hooks for the crashdump and crashinfo collection
# Each hook resolves to the function of the same name in
# autotest_lib.server.site_crashcollect when that module is importable;
# otherwise it falls back to the given no-op lambda.
get_site_crashdumps = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "get_site_crashdumps",
    lambda host, test_start_time: None)
get_site_crashinfo = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "get_site_crashinfo",
    lambda host, test_start_time: None)
report_crashdumps = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "report_crashdumps",
    lambda host: None)
fetch_orphaned_crashdumps = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "fetch_orphaned_crashdumps",
    lambda host, host_resultdir: None)
get_host_infodir = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "get_host_infodir",
    lambda host: None)
201
202
@metrics.SecondsTimerDecorator(
        'chromeos/autotest/autoserv/get_crashdumps_duration')
def get_crashdumps(host, test_start_time):
    """Collect crashdumps from host via the site-specific hook.

    Timed via the autoserv get_crashdumps_duration metric.

    @param host: The RemoteHost to collect crashdumps from.
    @param test_start_time: Time the test started; passed through to the
            site hook.
    """
    get_site_crashdumps(host, test_start_time)
207
208
@metrics.SecondsTimerDecorator(
        'chromeos/autotest/autoserv/get_crashinfo_duration')
def get_crashinfo(host, test_start_time):
    """Collect crash information and logs from host.

    Collects crashdumps, site-specific crash info, dmesg output, any
    uncollected client logs, /var/log, pstore console-ramoops files and
    i915_error_state into the host's crashinfo directory.

    @param host: The RemoteHost to collect crash information from.
    @param test_start_time: Time the test started; passed through to the
            crashdump collection hooks.
    """
    logging.info("Collecting crash information...")

    # get_crashdumps collects orphaned crashdumps and symbolicates all
    # collected crashdumps. Symbolicating could happen
    # during a postjob task as well, at which time some crashdumps could have
    # already been pulled back from machine. So it doesn't necessarily need
    # to wait for the machine to come up.
    get_crashdumps(host, test_start_time)

    # Everything below needs the machine to be reachable.
    if wait_for_machine_to_recover(host):
        # run any site-specific collection
        get_site_crashinfo(host, test_start_time)

        crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')
        collect_messages(host)
        collect_command(host, "dmesg", os.path.join(crashinfo_dir, "dmesg"))
        collect_uncollected_logs(host)

        # Collect everything in /var/log.
        log_path = os.path.join(crashinfo_dir, 'var')
        os.makedirs(log_path)
        collect_log_file(host, constants.LOG_DIR, log_path)

        # Collect console-ramoops.  The filename has changed in linux-3.19,
        # so collect all the files in the pstore dirs.
        log_path = os.path.join(crashinfo_dir, 'pstore')
        for pstore_dir in constants.LOG_PSTORE_DIRS:
            collect_log_file(host, pstore_dir, log_path, use_tmp=True,
                             clean_content=True)
        # Collect i915_error_state, only available on intel systems.
        # i915 contains the Intel graphics state. It might contain useful data
        # when a DUT hangs, times out or crashes.
        log_path = os.path.join(
                crashinfo_dir, os.path.basename(constants.LOG_I915_ERROR_STATE))
        collect_log_file(host, constants.LOG_I915_ERROR_STATE,
                         log_path, use_tmp=True)
248
249
# Load default for number of hours to wait before giving up on crash collection.
# Read from the [SERVER] crash_collection_hours_to_wait config value; falls
# back to 4 hours when unset.
HOURS_TO_WAIT = global_config.global_config.get_config_value(
    'SERVER', 'crash_collection_hours_to_wait', type=float, default=4.0)
253
254
def wait_for_machine_to_recover(host, hours_to_wait=HOURS_TO_WAIT):
    """Wait for a machine (possibly down) to become accessible again.

    @param host: A RemoteHost instance to wait on
    @param hours_to_wait: Number of hours to wait before giving up

    @returns: True if the machine comes back up, False otherwise
    """
    # Timestamp the start of the wait for the log message below.
    start_stamp = time.strftime("%b %d %H:%M:%S", time.localtime())

    # Fast path: nothing to wait for.
    if host.is_up():
        logging.info("%s already up, collecting crash info", host.hostname)
        return True

    logging.info("Waiting %s hours for %s to come up (%s)",
                 hours_to_wait, host.hostname, start_stamp)
    if host.wait_up(timeout=hours_to_wait * 3600):
        logging.info("%s is back up, collecting crash info", host.hostname)
        return True

    # Timed out: record the failure in monitoring and give up.
    (metrics.Counter('chromeos/autotest/errors/collect_crashinfo_timeout')
     .increment())
    logging.warning("%s down, unable to collect crash info",
                    host.hostname)
    return False
279
280
def get_crashinfo_dir(host, dir_prefix):
    """Find and if necessary create a directory to store crashinfo in.

    @param host: The RemoteHost object that crashinfo will be collected from
    @param dir_prefix: Prefix of directory name.

    @returns: The path to an existing directory for writing crashinfo into
    """
    # Prefer the job's result directory; fall back to the current working
    # directory when the host has no job (or the job has no resultdir).
    job = getattr(host, "job", None)
    base_dir = getattr(job, "resultdir", None)
    if not base_dir:
        base_dir = os.path.abspath(os.getcwd())
    infodir = os.path.join(base_dir, "%s.%s" % (dir_prefix, host.hostname))
    if not os.path.exists(infodir):
        os.mkdir(infodir)
    return infodir
298
299
def collect_command(host, command, dest_path):
    """Collects the result of a command on the remote machine.

    The standard output of the command will be collected and written into the
    destination path. The destination path is assumed to be filename and
    not a directory.

    Failures are logged as warnings; this function never raises.

    @param host: The RemoteHost to collect from
    @param command: A shell command to run on the remote machine and capture
        the output from.
    @param dest_path: A file path to write the results of the log into
    """
    logging.info("Collecting '%s' ...", command)
    try:
        result = host.run(command, stdout_tee=None).stdout
        utils.open_write_close(dest_path, result)
    # "except Exception, e" is Python 2-only syntax; use "as e" for
    # consistency with the rest of the file and Python 3 compatibility.
    except Exception as e:
        logging.warning("Collection of '%s' failed:\n%s", command, e)
318
319
def collect_uncollected_logs(host):
    """Collects any leftover uncollected logs from the client.

    Does nothing when the host has no job attached. Failures are logged as
    warnings; this function never raises.

    @param host: The RemoteHost to collect from
    """
    if host.job:
        try:
            logs = host.job.get_client_logs()
            for hostname, remote_path, local_path in logs:
                # Only fetch logs that belong to this particular host.
                if hostname == host.hostname:
                    logging.info('Retrieving logs from %s:%s into %s',
                                 hostname, remote_path, local_path)
                    collect_log_file(host, remote_path + '/', local_path + '/')
        # "except Exception, e" is Python 2-only syntax; use "as e" for
        # consistency with the rest of the file and Python 3 compatibility.
        except Exception as e:
            logging.warning('Error while trying to collect stranded '
                            'Autotest client logs: %s', e)
336
337
def collect_messages(host):
    """Collects the 'new' contents of /var/log/messages.

    If host.VAR_LOG_MESSAGE_COPY_PATH is on the remote machine, collects
    the contents of /var/log/messages excluding whatever initial contents
    are already present in host.VAR_LOG_MESSAGE_COPY_PATH. If it is not
    present, simply collects the entire contents of /var/log/messages.

    @param host: The RemoteHost to collect from
    """
    crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')

    try:
        # paths to the messages files
        messages = os.path.join(crashinfo_dir, "messages")
        messages_raw = os.path.join(crashinfo_dir, "messages.raw")
        messages_at_start = os.path.join(crashinfo_dir, "messages.at_start")

        # grab the files from the remote host
        collect_log_file(host, host.VAR_LOG_MESSAGES_COPY_PATH,
                         messages_at_start)
        collect_log_file(host, "/var/log/messages", messages_raw)

        # figure out how much of messages.raw to skip
        if os.path.exists(messages_at_start):
            # the first lines of the messages at start should match the
            # first lines of the current messages; if they don't then messages
            # has been erased or rotated and we just grab all of it
            first_line_at_start = utils.read_one_line(messages_at_start)
            first_line_now = utils.read_one_line(messages_raw)
            if first_line_at_start != first_line_now:
                size_at_start = 0
            else:
                size_at_start = os.path.getsize(messages_at_start)
        else:
            size_at_start = 0

        # Use context managers so the handles are closed even if the copy
        # fails partway through (the original leaked raw_messages_file if
        # opening the output file raised).
        with open(messages_raw) as raw_messages_file, \
                open(messages, "w") as messages_file:
            raw_messages_file.seek(size_at_start)
            shutil.copyfileobj(raw_messages_file, messages_file)

        # get rid of the "raw" versions of messages
        os.remove(messages_raw)
        if os.path.exists(messages_at_start):
            os.remove(messages_at_start)
    # "except Exception, e" is Python 2-only syntax; use "as e" for
    # consistency with the rest of the file and Python 3 compatibility.
    except Exception as e:
        logging.warning("Error while collecting /var/log/messages: %s", e)
387