• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1import collections
2import logging
3import os
4import pipes
5import random
6import shutil
7import time
8
9from autotest_lib.client.common_lib import error
10from autotest_lib.client.common_lib import global_config
11from autotest_lib.client.cros import constants
12from autotest_lib.server import utils
13
14try:
15    from chromite.lib import metrics
16except ImportError:
17    metrics = utils.metrics_mock
18
19
# The amortized max filesize to collect.  For example, if _MAX_FILESIZE is 10
# then we would collect a file with size 20 half the time, and a file with size
# 40 a quarter of the time, so that in the long run we are collecting files
# with this max size.  (See _get_file_stats for the probability computation.)
_MAX_FILESIZE = 64 * (2 ** 20)  # 64 MiB
25
26
class _RemoteTempDir(object):

    """Context manager creating (and cleaning up) a temp dir on a remote host.

    On entry a directory is created with mktemp(1) on the host and its path
    is returned; on exit the directory is removed recursively.
    """

    def __init__(self, host):
        # Host object providing run(); tmpdir stays None until __enter__.
        self.host = host
        self.tmpdir = None

    def __repr__(self):
        return '<{cls} host={this.host!r}, tmpdir={this.tmpdir!r}>'.format(
            cls=type(self).__name__, this=self)

    def __enter__(self):
        result = self.host.run('mktemp -d', stdout_tee=None)
        self.tmpdir = result.stdout.strip()
        return self.tmpdir

    def __exit__(self, exc_type, exc_value, exc_tb):
        # Best-effort cleanup; runs whether or not the body raised.
        self.host.run('rm -rf %s' % (pipes.quote(self.tmpdir),))
47
48
def collect_log_file(host, log_path, dest_path, use_tmp=False, clean=False):
    """Collects a log file from the remote machine.

    Log files are collected from the remote machine and written into the
    destination path. If dest_path is a directory, the log file will be named
    using the basename of the remote log path.

    Very large files will randomly not be collected, to alleviate network
    traffic in the case of widespread crashes dumping large core files. Note
    that this check only applies to the exact file passed as log_path. For
    example, if this is a directory, the size of the contents will not be
    checked.

    @param host: The RemoteHost to collect logs from
    @param log_path: The remote path to collect the log file from
    @param dest_path: A path (file or directory) to write the copied logs into
    @param use_tmp: If True, will first copy the logs to a temporary directory
                    on the host and download logs from there.
    @param clean: If True, remove log_path on the remote host after the
                  collection attempt, even if it failed.

    """
    logging.info('Collecting %s...', log_path)
    try:
        file_stats = _get_file_stats(host, log_path)
        if random.random() > file_stats.collection_probability:
            # Skip oversized files probabilistically; see _MAX_FILESIZE.
            logging.warning('Collection of %s skipped: '
                            'size=%s, collection_probability=%s',
                            log_path, file_stats.size,
                            file_stats.collection_probability)
        elif use_tmp:
            _collect_log_file_with_tmpdir(host, log_path, dest_path)
        else:
            host.get_file(log_path, dest_path, preserve_perm=False)
    except Exception as e:
        # Log collection is best-effort; never fail the caller over it.
        logging.warning('Collection of %s failed: %s', log_path, e)
    finally:
        if clean:
            host.run('rm -rf %s' % (pipes.quote(log_path),))
89
90
# Size of a remote file plus the probability with which it should be
# collected (see _get_file_stats).
_FileStats = collections.namedtuple(
    '_FileStats', ['size', 'collection_probability'])
93
94
def _collect_log_file_with_tmpdir(host, log_path, dest_path):
    """Collect log file from host through a temp directory on the host.

    @param host: The RemoteHost to collect logs from.
    @param log_path: The remote path to collect the log file from.
    @param dest_path: A path (file or directory) to write the copies logs into.

    """
    with _RemoteTempDir(host) as tmpdir:
        # Snapshot the log into the temp dir first, then download the copy.
        copy_cmd = 'cp -rp %s %s' % (pipes.quote(log_path),
                                     pipes.quote(tmpdir))
        host.run(copy_cmd)
        staged_path = os.path.join(tmpdir, os.path.basename(log_path))
        host.get_file(staged_path, dest_path, preserve_perm=False)
107
108
def _get_file_stats(host, path):
    """Get the stats of a file from host.

    @param host: Instance of Host subclass with run().
    @param path: Path of file to check.
    @returns: _FileStats namedtuple with file size and collection probability.
    """
    cmd = 'ls -ld %s | cut -d" " -f5' % (pipes.quote(path),)
    try:
        file_size = int(host.run(cmd).stdout)
    except (error.CmdError, ValueError) as e:
        # ValueError covers empty or non-numeric output from the remote
        # pipeline (e.g. ls column alignment yielding an empty cut field);
        # previously this escaped and aborted collection entirely.
        logging.warning('Getting size of file %r on host %r failed: %s',
                        path, host, e)
        file_size = 0
    if file_size == 0:
        # Unknown or empty file: always collect.
        return _FileStats(0, 1.0)
    # Larger files are collected with probability inversely proportional to
    # size, keeping the amortized collected size near _MAX_FILESIZE.
    collection_probability = _MAX_FILESIZE / float(file_size)
    return _FileStats(file_size, collection_probability)
128
129
# Import any site hooks for the crashdump and crashinfo collection.
# Each name falls back to a no-op lambda (matching the hook's signature)
# when no site-specific implementation is available.
get_site_crashdumps = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "get_site_crashdumps",
    lambda host, test_start_time: None)
get_site_crashinfo = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "get_site_crashinfo",
    lambda host, test_start_time: None)
report_crashdumps = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "report_crashdumps",
    lambda host: None)
fetch_orphaned_crashdumps = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "fetch_orphaned_crashdumps",
    lambda host, host_resultdir: None)
get_host_infodir = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "get_host_infodir",
    lambda host: None)
146
147
@metrics.SecondsTimerDecorator(
        'chromeos/autotest/autoserv/get_crashdumps_duration')
def get_crashdumps(host, test_start_time):
    """Collect crashdumps from the host via the site-specific hook.

    Duration is reported to the get_crashdumps_duration metric.

    @param host: The RemoteHost to collect crashdumps from.
    @param test_start_time: Start time of the test, passed through to the
            site hook (a no-op if no site implementation exists).
    """
    get_site_crashdumps(host, test_start_time)
152
153
@metrics.SecondsTimerDecorator(
        'chromeos/autotest/autoserv/get_crashinfo_duration')
def get_crashinfo(host, test_start_time):
    """Collect crash information and logs from the host.

    Collects crashdumps first (does not require the DUT to be up), then,
    if the machine recovers within the configured wait, gathers
    site-specific crashinfo, dmesg, /var/log, console-ramoops and the i915
    error state into the host's crashinfo directory.

    @param host: The RemoteHost to collect from.
    @param test_start_time: Start time of the test, passed to the crashdump
            and site crashinfo hooks.
    """
    logging.info("Collecting crash information...")

    # get_crashdumps collects orphaned crashdumps and symbolicates all
    # collected crashdumps. Symbolicating could happen
    # during a postjob task as well, at which time some crashdumps could have
    # already been pulled back from machine. So it doesn't necessarily need
    # to wait for the machine to come up.
    get_crashdumps(host, test_start_time)

    if wait_for_machine_to_recover(host):
        # run any site-specific collection
        get_site_crashinfo(host, test_start_time)

        crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')
        collect_messages(host)
        collect_command(host, "dmesg", os.path.join(crashinfo_dir, "dmesg"))
        collect_uncollected_logs(host)

        # Collect everything in /var/log.
        log_path = os.path.join(crashinfo_dir, 'var')
        os.makedirs(log_path)
        collect_log_file(host, constants.LOG_DIR, log_path)

        # Collect console-ramoops (kernel console from before a reboot);
        # clean=True removes it from the DUT after collection.
        log_path = os.path.join(
                crashinfo_dir, os.path.basename(constants.LOG_CONSOLE_RAMOOPS))
        collect_log_file(host, constants.LOG_CONSOLE_RAMOOPS, log_path,
                         clean=True)
        # Collect i915_error_state, only available on intel systems.
        # i915 contains the Intel graphics state. It might contain useful data
        # when a DUT hangs, times out or crashes.
        log_path = os.path.join(
                crashinfo_dir, os.path.basename(constants.LOG_I915_ERROR_STATE))
        collect_log_file(host, constants.LOG_I915_ERROR_STATE,
                         log_path, use_tmp=True)
192
193
# Number of hours to wait for a down machine before giving up on crash
# collection; read from [SERVER] crash_collection_hours_to_wait (default 4.0).
HOURS_TO_WAIT = global_config.global_config.get_config_value(
    'SERVER', 'crash_collection_hours_to_wait', type=float, default=4.0)
197
198
def wait_for_machine_to_recover(host, hours_to_wait=HOURS_TO_WAIT):
    """Wait for a machine (possibly down) to become accessible again.

    @param host: A RemoteHost instance to wait on
    @param hours_to_wait: Number of hours to wait before giving up

    @returns: True if the machine comes back up, False otherwise
    """
    wait_start_stamp = time.strftime("%b %d %H:%M:%S", time.localtime())
    # Fast path: nothing to wait for.
    if host.is_up():
        logging.info("%s already up, collecting crash info", host.hostname)
        return True

    logging.info("Waiting %s hours for %s to come up (%s)",
                 hours_to_wait, host.hostname, wait_start_stamp)
    if host.wait_up(timeout=hours_to_wait * 3600):
        logging.info("%s is back up, collecting crash info", host.hostname)
        return True

    # Machine never came back; record the timeout for monitoring.
    (metrics.Counter('chromeos/autotest/errors/collect_crashinfo_timeout')
     .increment())
    logging.warning("%s down, unable to collect crash info",
                    host.hostname)
    return False
223
224
def get_crashinfo_dir(host, dir_prefix):
    """Find and if necessary create a directory to store crashinfo in.

    @param host: The RemoteHost object that crashinfo will be collected from
    @param dir_prefix: Prefix of directory name.

    @returns: The path to an existing directory for writing crashinfo into
    """
    # Prefer the job's result directory; fall back to the current directory
    # when the host has no job (or the job has no resultdir).
    job = getattr(host, "job", None)
    base_dir = getattr(job, "resultdir", None)
    if not base_dir:
        base_dir = os.path.abspath(os.getcwd())
    info_dir = os.path.join(base_dir, "%s.%s" % (dir_prefix, host.hostname))
    if not os.path.exists(info_dir):
        os.mkdir(info_dir)
    return info_dir
242
243
def collect_command(host, command, dest_path):
    """Collects the result of a command on the remote machine.

    The standard output of the command will be collected and written into the
    destination path. The destination path is assumed to be filename and
    not a directory.

    @param host: The RemoteHost to collect from
    @param command: A shell command to run on the remote machine and capture
        the output from.
    @param dest_path: A file path to write the results of the log into
    """
    logging.info("Collecting '%s' ...", command)
    try:
        result = host.run(command, stdout_tee=None).stdout
        utils.open_write_close(dest_path, result)
    except Exception as e:
        # Best-effort: log and continue so one failed command doesn't
        # abort the rest of crashinfo collection.
        logging.warning("Collection of '%s' failed:\n%s", command, e)
262
263
def collect_uncollected_logs(host):
    """Collects any leftover uncollected logs from the client.

    Asks the host's job for client log locations and collects only those
    recorded for this host's hostname.

    @param host: The RemoteHost to collect from
    """
    if host.job:
        try:
            logs = host.job.get_client_logs()
            for hostname, remote_path, local_path in logs:
                if hostname == host.hostname:
                    logging.info('Retrieving logs from %s:%s into %s',
                                 hostname, remote_path, local_path)
                    # Trailing slashes make the copy directory-to-directory.
                    collect_log_file(host, remote_path + '/', local_path + '/')
        except Exception as e:
            logging.warning('Error while trying to collect stranded '
                            'Autotest client logs: %s', e)
280
281
def collect_messages(host):
    """Collects the 'new' contents of /var/log/messages.

    If host.VAR_LOG_MESSAGES_COPY_PATH is on the remote machine, collects
    the contents of /var/log/messages excluding whatever initial contents
    are already present in host.VAR_LOG_MESSAGES_COPY_PATH. If it is not
    present, simply collects the entire contents of /var/log/messages.

    @param host: The RemoteHost to collect from
    """
    crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')

    try:
        # Local paths for the collected messages files.
        messages = os.path.join(crashinfo_dir, "messages")
        messages_raw = os.path.join(crashinfo_dir, "messages.raw")
        messages_at_start = os.path.join(crashinfo_dir, "messages.at_start")

        # Grab the files from the remote host.
        collect_log_file(host, host.VAR_LOG_MESSAGES_COPY_PATH,
                         messages_at_start)
        collect_log_file(host, "/var/log/messages", messages_raw)

        # Figure out how much of messages.raw to skip.
        if os.path.exists(messages_at_start):
            # The first line of the messages at start should match the
            # first line of the current messages; if they don't then messages
            # has been erased or rotated and we just grab all of it.
            first_line_at_start = utils.read_one_line(messages_at_start)
            first_line_now = utils.read_one_line(messages_raw)
            if first_line_at_start != first_line_now:
                size_at_start = 0
            else:
                size_at_start = os.path.getsize(messages_at_start)
        else:
            size_at_start = 0

        # Context managers guarantee the handles are closed even if the
        # seek/copy raises (the old code leaked them on error).
        with open(messages_raw) as raw_messages_file:
            with open(messages, "w") as messages_file:
                raw_messages_file.seek(size_at_start)
                shutil.copyfileobj(raw_messages_file, messages_file)

        # Get rid of the "raw" versions of messages.
        os.remove(messages_raw)
        if os.path.exists(messages_at_start):
            os.remove(messages_at_start)
    except Exception as e:
        logging.warning("Error while collecting /var/log/messages: %s", e)
331