import collections
import logging
import os
import pipes
import random
import shutil
import time

from autotest_lib.client.common_lib import error
from autotest_lib.client.common_lib import global_config
from autotest_lib.client.cros import constants
from autotest_lib.server import utils

try:
    from chromite.lib import metrics
except ImportError:
    metrics = utils.metrics_mock


# The amortized max filesize to collect. For example, if _MAX_FILESIZE is 10
# then we would collect a file with size 20 half the time, and a file with size
# 40 a quarter of the time, so that in the long run we are collecting files
# with this max size.
_MAX_FILESIZE = 64 * (2 ** 20)  # 64 MiB


class _RemoteTempDir(object):

    """Context manager for temporary directory on remote host."""

    def __init__(self, host):
        self.host = host
        self.tmpdir = None

    def __repr__(self):
        return '<{cls} host={this.host!r}, tmpdir={this.tmpdir!r}>'.format(
            cls=type(self).__name__, this=self)

    def __enter__(self):
        self.tmpdir = (self.host
                       .run('mktemp -d', stdout_tee=None)
                       .stdout.strip())
        return self.tmpdir

    def __exit__(self, exc_type, exc_value, exc_tb):
        self.host.run('rm -rf %s' % (pipes.quote(self.tmpdir),))


def collect_log_file(host, log_path, dest_path, use_tmp=False, clean=False):
    """Collects a log file from the remote machine.

    Log files are collected from the remote machine and written into the
    destination path. If dest_path is a directory, the log file will be named
    using the basename of the remote log path.

    Very large files will randomly not be collected, to alleviate network
    traffic in the case of widespread crashes dumping large core files. Note
    that this check only applies to the exact file passed as log_path. For
    example, if this is a directory, the size of the contents will not be
    checked.

    @param host: The RemoteHost to collect logs from
    @param log_path: The remote path to collect the log file from
    @param dest_path: A path (file or directory) to write the copied logs into
    @param use_tmp: If True, will first copy the logs to a temporary directory
                    on the host and download logs from there.
    @param clean: If True, remove log_path after upload attempt even if it
                  failed.

    """
    logging.info('Collecting %s...', log_path)
    try:
        file_stats = _get_file_stats(host, log_path)
        if random.random() > file_stats.collection_probability:
            # Deliberately skipped: see the _MAX_FILESIZE comment above.
            logging.warning('Collection of %s skipped: '
                            'size=%s, collection_probability=%s',
                            log_path, file_stats.size,
                            file_stats.collection_probability)
        elif use_tmp:
            _collect_log_file_with_tmpdir(host, log_path, dest_path)
        else:
            source_path = log_path
            host.get_file(source_path, dest_path, preserve_perm=False)
    except Exception as e:
        # Best-effort collection: log and continue so one bad file does not
        # abort the rest of crash info gathering.
        logging.warning('Collection of %s failed: %s', log_path, e)
    finally:
        if clean:
            host.run('rm -rf %s' % (pipes.quote(log_path),))


_FileStats = collections.namedtuple('_FileStats',
                                    'size collection_probability')


def _collect_log_file_with_tmpdir(host, log_path, dest_path):
    """Collect log file from host through a temp directory on the host.

    @param host: The RemoteHost to collect logs from.
    @param log_path: The remote path to collect the log file from.
    @param dest_path: A path (file or directory) to write the copied logs into.

    """
    with _RemoteTempDir(host) as tmpdir:
        host.run('cp -rp %s %s' % (pipes.quote(log_path), pipes.quote(tmpdir)))
        source_path = os.path.join(tmpdir, os.path.basename(log_path))
        host.get_file(source_path, dest_path, preserve_perm=False)


def _get_file_stats(host, path):
    """Get the stats of a file from host.

    @param host: Instance of Host subclass with run().
    @param path: Path of file to check.
    @returns: _FileStats namedtuple with file size and collection probability.
    """
    # Field 5 of `ls -ld` is the size in bytes.
    cmd = 'ls -ld %s | cut -d" " -f5' % (pipes.quote(path),)
    try:
        file_size = int(host.run(cmd).stdout)
    except error.CmdError as e:
        logging.warning('Getting size of file %r on host %r failed: %s',
                        path, host, e)
        file_size = 0
    if file_size == 0:
        return _FileStats(0, 1.0)
    else:
        # Probability > 1.0 for small files simply means "always collect".
        collection_probability = _MAX_FILESIZE / float(file_size)
        return _FileStats(file_size, collection_probability)


# import any site hooks for the crashdump and crashinfo collection
get_site_crashdumps = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "get_site_crashdumps",
    lambda host, test_start_time: None)
get_site_crashinfo = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "get_site_crashinfo",
    lambda host, test_start_time: None)
report_crashdumps = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "report_crashdumps",
    lambda host: None)
fetch_orphaned_crashdumps = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "fetch_orphaned_crashdumps",
    lambda host, host_resultdir: None)
get_host_infodir = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "get_host_infodir",
    lambda host: None)


@metrics.SecondsTimerDecorator(
        'chromeos/autotest/autoserv/get_crashdumps_duration')
def get_crashdumps(host, test_start_time):
    get_site_crashdumps(host, test_start_time)


@metrics.SecondsTimerDecorator(
        'chromeos/autotest/autoserv/get_crashinfo_duration')
def get_crashinfo(host, test_start_time):
    logging.info("Collecting crash information...")

    # get_crashdumps collects orphaned crashdumps and symbolicates all
    # collected crashdumps. Symbolicating could happen
    # during a postjob task as well, at which time some crashdumps could have
    # already been pulled back from machine. So it doesn't necessarily need
    # to wait for the machine to come up.
    get_crashdumps(host, test_start_time)

    if wait_for_machine_to_recover(host):
        # run any site-specific collection
        get_site_crashinfo(host, test_start_time)

        crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')
        collect_messages(host)
        collect_command(host, "dmesg", os.path.join(crashinfo_dir, "dmesg"))
        collect_uncollected_logs(host)

        # Collect everything in /var/log.
        log_path = os.path.join(crashinfo_dir, 'var')
        os.makedirs(log_path)
        collect_log_file(host, constants.LOG_DIR, log_path)

        # Collect console-ramoops
        log_path = os.path.join(
                crashinfo_dir, os.path.basename(constants.LOG_CONSOLE_RAMOOPS))
        collect_log_file(host, constants.LOG_CONSOLE_RAMOOPS, log_path,
                         clean=True)
        # Collect i915_error_state, only available on intel systems.
        # i915 contains the Intel graphics state. It might contain useful data
        # when a DUT hangs, times out or crashes.
        log_path = os.path.join(
                crashinfo_dir, os.path.basename(constants.LOG_I915_ERROR_STATE))
        collect_log_file(host, constants.LOG_I915_ERROR_STATE,
                         log_path, use_tmp=True)


# Load default for number of hours to wait before giving up on crash collection.
HOURS_TO_WAIT = global_config.global_config.get_config_value(
    'SERVER', 'crash_collection_hours_to_wait', type=float, default=4.0)


def wait_for_machine_to_recover(host, hours_to_wait=HOURS_TO_WAIT):
    """Wait for a machine (possibly down) to become accessible again.

    @param host: A RemoteHost instance to wait on
    @param hours_to_wait: Number of hours to wait before giving up

    @returns: True if the machine comes back up, False otherwise
    """
    current_time = time.strftime("%b %d %H:%M:%S", time.localtime())
    if host.is_up():
        logging.info("%s already up, collecting crash info", host.hostname)
        return True

    logging.info("Waiting %s hours for %s to come up (%s)",
                 hours_to_wait, host.hostname, current_time)
    if not host.wait_up(timeout=hours_to_wait * 3600):
        (metrics.Counter('chromeos/autotest/errors/collect_crashinfo_timeout')
         .increment())
        logging.warning("%s down, unable to collect crash info",
                        host.hostname)
        return False
    else:
        logging.info("%s is back up, collecting crash info", host.hostname)
        return True


def get_crashinfo_dir(host, dir_prefix):
    """Find and if necessary create a directory to store crashinfo in.

    @param host: The RemoteHost object that crashinfo will be collected from
    @param dir_prefix: Prefix of directory name.

    @returns: The path to an existing directory for writing crashinfo into
    """
    host_resultdir = getattr(getattr(host, "job", None), "resultdir", None)
    if host_resultdir:
        infodir = host_resultdir
    else:
        infodir = os.path.abspath(os.getcwd())
    infodir = os.path.join(infodir, "%s.%s" % (dir_prefix, host.hostname))
    if not os.path.exists(infodir):
        os.mkdir(infodir)
    return infodir


def collect_command(host, command, dest_path):
    """Collects the result of a command on the remote machine.

    The standard output of the command will be collected and written into the
    destination path. The destination path is assumed to be filename and
    not a directory.

    @param host: The RemoteHost to collect from
    @param command: A shell command to run on the remote machine and capture
        the output from.
    @param dest_path: A file path to write the results of the log into
    """
    logging.info("Collecting '%s' ...", command)
    try:
        result = host.run(command, stdout_tee=None).stdout
        utils.open_write_close(dest_path, result)
    except Exception as e:
        logging.warning("Collection of '%s' failed:\n%s", command, e)


def collect_uncollected_logs(host):
    """Collects any leftover uncollected logs from the client.

    @param host: The RemoteHost to collect from
    """
    if host.job:
        try:
            logs = host.job.get_client_logs()
            for hostname, remote_path, local_path in logs:
                if hostname == host.hostname:
                    logging.info('Retrieving logs from %s:%s into %s',
                                 hostname, remote_path, local_path)
                    collect_log_file(host, remote_path + '/', local_path + '/')
        except Exception as e:
            logging.warning('Error while trying to collect stranded '
                            'Autotest client logs: %s', e)


def collect_messages(host):
    """Collects the 'new' contents of /var/log/messages.

    If host.VAR_LOG_MESSAGE_COPY_PATH is on the remote machine, collects
    the contents of /var/log/messages excluding whatever initial contents
    are already present in host.VAR_LOG_MESSAGE_COPY_PATH. If it is not
    present, simply collects the entire contents of /var/log/messages.

    @param host: The RemoteHost to collect from
    """
    crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')

    try:
        # paths to the messages files
        messages = os.path.join(crashinfo_dir, "messages")
        messages_raw = os.path.join(crashinfo_dir, "messages.raw")
        messages_at_start = os.path.join(crashinfo_dir, "messages.at_start")

        # grab the files from the remote host
        collect_log_file(host, host.VAR_LOG_MESSAGES_COPY_PATH,
                         messages_at_start)
        collect_log_file(host, "/var/log/messages", messages_raw)

        # figure out how much of messages.raw to skip
        if os.path.exists(messages_at_start):
            # the first lines of the messages at start should match the
            # first lines of the current messages; if they don't then messages
            # has been erased or rotated and we just grab all of it
            first_line_at_start = utils.read_one_line(messages_at_start)
            first_line_now = utils.read_one_line(messages_raw)
            if first_line_at_start != first_line_now:
                size_at_start = 0
            else:
                size_at_start = os.path.getsize(messages_at_start)
        else:
            size_at_start = 0
        # copy the new tail of messages.raw into the final messages file;
        # context managers guarantee the handles are closed even on error
        with open(messages_raw) as raw_messages_file:
            with open(messages, "w") as messages_file:
                raw_messages_file.seek(size_at_start)
                shutil.copyfileobj(raw_messages_file, messages_file)

        # get rid of the "raw" versions of messages
        os.remove(messages_raw)
        if os.path.exists(messages_at_start):
            os.remove(messages_at_start)
    except Exception as e:
        logging.warning("Error while collecting /var/log/messages: %s", e)