import collections
import logging
import os
import pipes
import random
import shutil
import time

import common
from autotest_lib.client.bin.result_tools import runner as result_tools_runner
from autotest_lib.client.common_lib import error
from autotest_lib.client.common_lib import global_config
from autotest_lib.client.cros import constants
from autotest_lib.server import utils

try:
    from chromite.lib import metrics
except ImportError:
    metrics = utils.metrics_mock


# The amortized max filesize to collect. For example, if _MAX_FILESIZE is 10,
# then we would collect a file of size 20 half the time, and a file of size
# 40 a quarter of the time, so that in the long run we are collecting files
# with this max size.
_MAX_FILESIZE = 64 * (2 ** 20)  # 64 MiB


class _RemoteTempDir(object):

    """Context manager for a temporary directory on a remote host."""

    def __init__(self, host):
        self.host = host
        self.tmpdir = None

    def __repr__(self):
        return '<{cls} host={this.host!r}, tmpdir={this.tmpdir!r}>'.format(
                cls=type(self).__name__, this=self)

    def __enter__(self):
        self.tmpdir = (self.host
                       .run('mktemp -d', stdout_tee=None)
                       .stdout.strip())
        return self.tmpdir

    def __exit__(self, exc_type, exc_value, exc_tb):
        self.host.run('rm -rf %s' % (pipes.quote(self.tmpdir),))


def _collect_log_file_with_summary(host, source_path, dest_path):
    """Collects a log file from the remote machine with a directory summary.

    @param host: The RemoteHost to collect logs from.
    @param source_path: The remote path to collect the log file from.
    @param dest_path: A path (file or directory) to write the copied logs
                      into.
    """
    # Build the test result directory summary.
    summary_created = result_tools_runner.run_on_client(host, source_path)

    skip_summary_collection = True
    try:
        host.get_file(source_path, dest_path, preserve_perm=False)
        skip_summary_collection = False
    finally:
        if summary_created:
            # If dest_path is a file, use its parent folder to store the
            # directory summary file.
            if os.path.isfile(dest_path):
                dest_path = os.path.dirname(dest_path)
            # If dest_path doesn't exist, get_file failed, so there is no
            # need to collect the directory summary file.
            skip_summary_collection |= not os.path.exists(dest_path)
            result_tools_runner.collect_last_summary(
                    host, source_path, dest_path,
                    skip_summary_collection=skip_summary_collection)
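

# A minimal usage sketch for _RemoteTempDir, kept as a comment so module
# import behavior is unchanged ('some_host' is a hypothetical RemoteHost
# with a run() method, and the path is illustrative):
#
#     with _RemoteTempDir(some_host) as tmpdir:
#         some_host.run('cp -rp /var/log/eventlog %s' % pipes.quote(tmpdir))
#     # On exit the remote directory is removed, even if the copy failed.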


def collect_log_file(host, log_path, dest_path, use_tmp=False, clean=False,
                     clean_content=False):
    """Collects a log file from the remote machine.

    Log files are collected from the remote machine and written into the
    destination path. If dest_path is a directory, the log file will be named
    using the basename of the remote log path.

    Very large files will randomly not be collected, to alleviate network
    traffic in the case of widespread crashes dumping large core files. Note
    that this check only applies to the exact file passed as log_path. For
    example, if this is a directory, the size of its contents will not be
    checked.

    @param host: The RemoteHost to collect logs from.
    @param log_path: The remote path to collect the log file from.
    @param dest_path: A path (file or directory) to write the copied logs
                      into.
    @param use_tmp: If True, will first copy the logs to a temporary directory
                    on the host and download logs from there.
    @param clean: If True, remove the remote log_path after the collection
                  attempt, even if it failed.
    @param clean_content: If True, remove the files and directories under the
                          remote log_path after the collection attempt, even
                          if it failed.

    """
    logging.info('Collecting %s...', log_path)
    if not host.check_cached_up_status():
        logging.warning('Host %s did not answer to ping, skip collecting log '
                        'file %s.', host.hostname, log_path)
        return
    try:
        file_stats = _get_file_stats(host, log_path)
        if not file_stats:
            # Failed to get file stats; the file may not exist.
            return

        if (not result_tools_runner.ENABLE_RESULT_THROTTLING and
                random.random() > file_stats.collection_probability):
            logging.warning('Collection of %s skipped: '
                            'size=%s, collection_probability=%s',
                            log_path, file_stats.size,
                            file_stats.collection_probability)
        elif use_tmp:
            _collect_log_file_with_tmpdir(host, log_path, dest_path)
        else:
            _collect_log_file_with_summary(host, log_path, dest_path)
    except Exception as e:
        logging.exception('Non-critical failure: collection of %s failed: %s',
                          log_path, e)
    finally:
        if clean_content:
            path_to_delete = os.path.join(pipes.quote(log_path), '*')
        elif clean:
            path_to_delete = pipes.quote(log_path)
        if clean or clean_content:
            host.run('rm -rf %s' % path_to_delete, ignore_status=True)


_FileStats = collections.namedtuple('_FileStats',
                                    'size collection_probability')


def _collect_log_file_with_tmpdir(host, log_path, dest_path):
    """Collects a log file from the host through a temp directory on the host.

    @param host: The RemoteHost to collect logs from.
    @param log_path: The remote path to collect the log file from.
    @param dest_path: A path (file or directory) to write the copied logs
                      into.

    """
    with _RemoteTempDir(host) as tmpdir:
        host.run('cp -rp %s %s' % (pipes.quote(log_path),
                                   pipes.quote(tmpdir)))
        source_path = os.path.join(tmpdir, os.path.basename(log_path))

        _collect_log_file_with_summary(host, source_path, dest_path)
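

# Example call (a sketch; 'dut' is a hypothetical RemoteHost and the paths
# are illustrative). Collect a pstore directory into a local results
# directory, copying via a remote temp dir and wiping the remote contents
# afterwards:
#
#     collect_log_file(dut, '/sys/fs/pstore', '/tmp/results/pstore',
#                      use_tmp=True, clean_content=True)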


def _get_file_stats(host, path):
    """Gets the stats of a file from the host.

    @param host: Instance of Host subclass with run().
    @param path: Path of the file to check.
    @returns: _FileStats namedtuple with file size and collection probability,
              or None if the size could not be determined.
    """
    cmd = 'ls -ld %s | cut -d" " -f5' % (pipes.quote(path),)
    output = None
    file_size = 0
    try:
        output = host.run(cmd).stdout
    except error.CmdError as e:
        logging.warning('Getting size of file %r on host %r failed: %s. '
                        'Defaulting its size to 0.', path, host, e)
    try:
        if output is not None:
            file_size = int(output)
    except ValueError:
        logging.warning('Failed to convert size string "%s" for %s on host '
                        '%r. File may not exist.', output, path, host)
        return

    if file_size == 0:
        return _FileStats(0, 1.0)
    else:
        collection_probability = _MAX_FILESIZE / float(file_size)
        return _FileStats(file_size, collection_probability)


# Import any site hooks for the crashdump and crashinfo collection.
get_site_crashdumps = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "get_site_crashdumps",
    lambda host, test_start_time: None)
get_site_crashinfo = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "get_site_crashinfo",
    lambda host, test_start_time: None)
report_crashdumps = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "report_crashdumps",
    lambda host: None)
fetch_orphaned_crashdumps = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect",
    "fetch_orphaned_crashdumps",
    lambda host, host_resultdir: None)
get_host_infodir = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "get_host_infodir",
    lambda host: None)


@metrics.SecondsTimerDecorator(
        'chromeos/autotest/autoserv/get_crashdumps_duration')
def get_crashdumps(host, test_start_time):
    get_site_crashdumps(host, test_start_time)


@metrics.SecondsTimerDecorator(
        'chromeos/autotest/autoserv/get_crashinfo_duration')
def get_crashinfo(host, test_start_time):
    logging.info("Collecting crash information...")

    # get_crashdumps collects orphaned crashdumps and symbolicates all
    # collected crashdumps. Symbolicating could happen during a postjob task
    # as well, at which time some crashdumps could have already been pulled
    # back from the machine. So it doesn't necessarily need to wait for the
    # machine to come up.
    get_crashdumps(host, test_start_time)

    if wait_for_machine_to_recover(host):
        # Run any site-specific collection.
        get_site_crashinfo(host, test_start_time)

        crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')
        collect_messages(host)
        collect_command(host, "dmesg", os.path.join(crashinfo_dir, "dmesg"))
        collect_uncollected_logs(host)

        # Collect everything in /var/log.
        log_path = os.path.join(crashinfo_dir, 'var')
        os.makedirs(log_path)
        collect_log_file(host, constants.LOG_DIR, log_path)

        # Collect console-ramoops. The filename has changed in linux-3.19,
        # so collect all the files in the pstore dirs.
        log_path = os.path.join(crashinfo_dir, 'pstore')
        for pstore_dir in constants.LOG_PSTORE_DIRS:
            collect_log_file(host, pstore_dir, log_path, use_tmp=True,
                             clean_content=True)
        # Collect i915_error_state, only available on Intel systems.
        # i915 contains the Intel graphics state. It might contain useful
        # data when a DUT hangs, times out or crashes.
        log_path = os.path.join(
                crashinfo_dir,
                os.path.basename(constants.LOG_I915_ERROR_STATE))
        collect_log_file(host, constants.LOG_I915_ERROR_STATE,
                         log_path, use_tmp=True)


# Load the default number of hours to wait before giving up on crash
# collection.
HOURS_TO_WAIT = global_config.global_config.get_config_value(
    'SERVER', 'crash_collection_hours_to_wait', type=float, default=4.0)
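

# Worked example of the throttling math in _get_file_stats (illustrative
# numbers, not measured data): with _MAX_FILESIZE = 64 MiB, a 128 MiB file
# gets collection_probability = 64/128 = 0.5 and is collected roughly half
# the time, while a 256 MiB file is collected roughly a quarter of the time.
# Files of 64 MiB or less get a probability >= 1.0 and are always collected,
# since random.random() is strictly less than 1.0.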


def wait_for_machine_to_recover(host, hours_to_wait=HOURS_TO_WAIT):
    """Waits for a machine (possibly down) to become accessible again.

    @param host: A RemoteHost instance to wait on.
    @param hours_to_wait: Number of hours to wait before giving up.

    @returns: True if the machine comes back up, False otherwise.
    """
    current_time = time.strftime("%b %d %H:%M:%S", time.localtime())
    if host.is_up():
        logging.info("%s already up, collecting crash info", host.hostname)
        return True

    logging.info("Waiting %s hours for %s to come up (%s)",
                 hours_to_wait, host.hostname, current_time)
    if not host.wait_up(timeout=hours_to_wait * 3600):
        (metrics.Counter('chromeos/autotest/errors/collect_crashinfo_timeout')
         .increment())
        logging.warning("%s down, unable to collect crash info",
                        host.hostname)
        return False
    else:
        logging.info("%s is back up, collecting crash info", host.hostname)
        return True


def get_crashinfo_dir(host, dir_prefix):
    """Finds and, if necessary, creates a directory to store crashinfo in.

    @param host: The RemoteHost object that crashinfo will be collected from.
    @param dir_prefix: Prefix of the directory name.

    @returns: The path to an existing directory for writing crashinfo into.
    """
    host_resultdir = getattr(getattr(host, "job", None), "resultdir", None)
    if host_resultdir:
        infodir = host_resultdir
    else:
        infodir = os.path.abspath(os.getcwd())
    infodir = os.path.join(infodir, "%s.%s" % (dir_prefix, host.hostname))
    if not os.path.exists(infodir):
        os.mkdir(infodir)
    return infodir


def collect_command(host, command, dest_path):
    """Collects the result of a command on the remote machine.

    The standard output of the command will be collected and written into the
    destination path. The destination path is assumed to be a filename and
    not a directory.

    @param host: The RemoteHost to collect from.
    @param command: A shell command to run on the remote machine and capture
                    the output from.
    @param dest_path: A file path to write the results of the log into.
    """
    logging.info("Collecting '%s' ...", command)
    try:
        result = host.run(command, stdout_tee=None).stdout
        utils.open_write_close(dest_path, result)
    except Exception as e:
        logging.warning("Collection of '%s' failed:\n%s", command, e)


def collect_uncollected_logs(host):
    """Collects any leftover uncollected logs from the client.

    @param host: The RemoteHost to collect from.
    """
    if host.job:
        try:
            logs = host.job.get_client_logs()
            for hostname, remote_path, local_path in logs:
                if hostname == host.hostname:
                    logging.info('Retrieving logs from %s:%s into %s',
                                 hostname, remote_path, local_path)
                    collect_log_file(host, remote_path + '/',
                                     local_path + '/')
        except Exception as e:
            logging.warning('Error while trying to collect stranded '
                            'Autotest client logs: %s', e)
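

# Example (a sketch; 'dut' is a hypothetical RemoteHost). Capture the kernel
# ring buffer into the crashinfo directory, the same way get_crashinfo does:
#
#     collect_command(dut, 'dmesg',
#                     os.path.join(get_crashinfo_dir(dut, 'crashinfo'),
#                                  'dmesg'))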


def collect_messages(host):
    """Collects the 'new' contents of /var/log/messages.

    If host.VAR_LOG_MESSAGES_COPY_PATH is present on the remote machine,
    collects the contents of /var/log/messages excluding whatever initial
    contents are already present in host.VAR_LOG_MESSAGES_COPY_PATH. If it
    is not present, simply collects the entire contents of
    /var/log/messages.

    @param host: The RemoteHost to collect from.
    """
    crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')

    try:
        # Paths to the messages files.
        messages = os.path.join(crashinfo_dir, "messages")
        messages_raw = os.path.join(crashinfo_dir, "messages.raw")
        messages_at_start = os.path.join(crashinfo_dir, "messages.at_start")

        # Grab the files from the remote host.
        collect_log_file(host, host.VAR_LOG_MESSAGES_COPY_PATH,
                         messages_at_start)
        collect_log_file(host, "/var/log/messages", messages_raw)

        # Figure out how much of messages.raw to skip.
        if os.path.exists(messages_at_start):
            # The first line of messages.at_start should match the first
            # line of the current messages; if it doesn't, then messages has
            # been erased or rotated and we just grab all of it.
            first_line_at_start = utils.read_one_line(messages_at_start)
            first_line_now = utils.read_one_line(messages_raw)
            if first_line_at_start != first_line_now:
                size_at_start = 0
            else:
                size_at_start = os.path.getsize(messages_at_start)
        else:
            size_at_start = 0
        with open(messages_raw) as raw_messages_file, \
             open(messages, "w") as messages_file:
            raw_messages_file.seek(size_at_start)
            shutil.copyfileobj(raw_messages_file, messages_file)

        # Get rid of the "raw" versions of messages.
        os.remove(messages_raw)
        if os.path.exists(messages_at_start):
            os.remove(messages_at_start)
    except Exception as e:
        logging.warning("Error while collecting /var/log/messages: %s", e)
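

# Sketch of the trimming performed above (illustrative): if
# messages.at_start holds the first N bytes of the current /var/log/messages
# (i.e. the first lines match), the final 'messages' file contains only the
# bytes past offset N, that is, the lines logged since the copy was taken.
# Otherwise the whole file is kept.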