from __future__ import print_function

import collections
import logging
import os
import pipes
import random
import shutil
import time

import common
from autotest_lib.client.bin.result_tools import runner as result_tools_runner
from autotest_lib.client.common_lib import error
from autotest_lib.client.common_lib import global_config
from autotest_lib.client.cros import constants
from autotest_lib.server import utils

try:
    from chromite.lib import metrics
except ImportError:
    metrics = utils.metrics_mock


# The amortized max filesize to collect. For example, if _MAX_FILESIZE is 10
# then we would collect a file of size 20 half the time, and a file of size
# 40 a quarter of the time, so that in the long run we are collecting files
# with this max size.
_MAX_FILESIZE = 64 * (2 ** 20)  # 64 MiB
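
# Illustrative sketch (hypothetical helper, not used in this module): the
# throttling decision in collect_log_file() below amounts to comparing a
# uniform random draw against _MAX_FILESIZE / file_size.
#
#     def _would_collect(file_size):
#         probability = min(1.0, _MAX_FILESIZE / float(file_size))
#         return random.random() <= probability
#
# With _MAX_FILESIZE = 64 MiB, a 128 MiB file is collected with probability
# 0.5 and a 256 MiB file with probability 0.25, keeping the expected number
# of bytes collected per file at or below _MAX_FILESIZE.
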
class _RemoteTempDir(object):

    """Context manager for a temporary directory on a remote host."""

    def __init__(self, host):
        self.host = host
        self.tmpdir = None

    def __repr__(self):
        return '<{cls} host={this.host!r}, tmpdir={this.tmpdir!r}>'.format(
            cls=type(self).__name__, this=self)

    def __enter__(self):
        self.tmpdir = (self.host
                       .run('mktemp -d', stdout_tee=None)
                       .stdout.strip())
        return self.tmpdir

    def __exit__(self, exc_type, exc_value, exc_tb):
        self.host.run('rm -rf %s' % (pipes.quote(self.tmpdir),))


def _collect_log_file_with_summary(host, source_path, dest_path):
    """Collects a log file from the remote machine with a directory summary.

    @param host: The RemoteHost to collect logs from.
    @param source_path: The remote path to collect the log file from.
    @param dest_path: A path (file or directory) to write the copied logs
            into.
    """
    # Build the test result directory summary.
    summary_created = result_tools_runner.run_on_client(host, source_path)

    skip_summary_collection = True
    try:
        host.get_file(source_path, dest_path, preserve_perm=False)
        skip_summary_collection = False
    finally:
        if summary_created:
            # If dest_path is a file, use its parent folder to store the
            # directory summary file.
            if os.path.isfile(dest_path):
                dest_path = os.path.dirname(dest_path)
            # If dest_path doesn't exist, get_file failed and there is no
            # need to collect the directory summary file.
            skip_summary_collection |= not os.path.exists(dest_path)
            result_tools_runner.collect_last_summary(
                    host, source_path, dest_path,
                    skip_summary_collection=skip_summary_collection)


def collect_log_file(host, log_path, dest_path, use_tmp=False, clean=False,
                     clean_content=False):
    """Collects a log file from the remote machine.

    Log files are collected from the remote machine and written into the
    destination path. If dest_path is a directory, the log file will be named
    using the basename of the remote log path.

    Very large files will randomly not be collected, to alleviate network
    traffic in the case of widespread crashes dumping large core files. Note
    that this check only applies to the exact file passed as log_path. For
    example, if log_path is a directory, the size of its contents will not
    be checked.

    @param host: The RemoteHost to collect logs from.
    @param log_path: The remote path to collect the log file from.
    @param dest_path: A path (file or directory) to write the copied logs
            into.
    @param use_tmp: If True, first copy the logs to a temporary directory on
            the host and download the logs from there.
    @param clean: If True, remove log_path after the upload attempt, even if
            it failed.
    @param clean_content: If True, remove the files and directories under
            log_path after the upload attempt, even if it failed.
    """
    logging.info('Collecting %s...', log_path)
    if not host.check_cached_up_status():
        logging.warning('Host %s did not answer ping; skipping collection of '
                        'log file %s.', host.hostname, log_path)
        return
    try:
        file_stats = _get_file_stats(host, log_path)
        if not file_stats:
            # Failed to get the file stats; the file may not exist.
            return

        if (not result_tools_runner.ENABLE_RESULT_THROTTLING and
                random.random() > file_stats.collection_probability):
            logging.warning('Collection of %s skipped: '
                            'size=%s, collection_probability=%s',
                            log_path, file_stats.size,
                            file_stats.collection_probability)
        elif use_tmp:
            _collect_log_file_with_tmpdir(host, log_path, dest_path)
        else:
            _collect_log_file_with_summary(host, log_path, dest_path)
    except Exception as e:
        logging.exception('Non-critical failure: collection of %s failed: %s',
                          log_path, e)
    finally:
        if clean_content:
            path_to_delete = os.path.join(pipes.quote(log_path), '*')
        elif clean:
            path_to_delete = pipes.quote(log_path)
        if clean or clean_content:
            host.run('rm -rf %s' % path_to_delete, ignore_status=True)
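
# Example invocation (sketch; the paths are illustrative): copy a remote log
# into a local results directory via a temporary directory on the host, then
# wipe the contents under the remote path whether or not the upload
# succeeded.
#
#     collect_log_file(host, '/var/log/eventlog', '/tmp/results/eventlog',
#                      use_tmp=True, clean_content=True)
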
_FileStats = collections.namedtuple('_FileStats',
                                    'size collection_probability')


def _collect_log_file_with_tmpdir(host, log_path, dest_path):
    """Collects a log file from the host through a temp directory on the host.

    @param host: The RemoteHost to collect logs from.
    @param log_path: The remote path to collect the log file from.
    @param dest_path: A path (file or directory) to write the copied logs
            into.
    """
    with _RemoteTempDir(host) as tmpdir:
        host.run('cp -rp %s %s' % (pipes.quote(log_path), pipes.quote(tmpdir)))
        source_path = os.path.join(tmpdir, os.path.basename(log_path))

        _collect_log_file_with_summary(host, source_path, dest_path)


def _get_file_stats(host, path):
    """Gets the stats of a file on the host.

    @param host: Instance of Host subclass with run().
    @param path: Path of the file to check.
    @returns: A _FileStats namedtuple with the file size and collection
            probability, or None if the size string could not be parsed.
    """
    cmd = 'ls -ld %s | cut -d" " -f5' % (pipes.quote(path),)
    output = None
    file_size = 0
    try:
        output = host.run(cmd).stdout
    except error.CmdError as e:
        logging.warning('Getting size of file %r on host %r failed: %s. '
                        'Defaulting its size to 0.', path, host, e)
    try:
        if output is not None:
            file_size = int(output)
    except ValueError:
        logging.warning('Failed to convert size string "%s" for %s on host '
                        '%r. The file may not exist.', output, path, host)
        return None

    if file_size == 0:
        return _FileStats(0, 1.0)
    else:
        collection_probability = _MAX_FILESIZE / float(file_size)
        return _FileStats(file_size, collection_probability)
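
# Shape of the output _get_file_stats() parses (illustrative; the exact
# columns depend on the remote ls implementation):
#
#     $ ls -ld /var/log/messages
#     -rw-r--r-- 1 root root 421517 Jan  1 00:00 /var/log/messages
#
# Field 5 ("421517") is the size in bytes. `ls -ld` is used so that when the
# path is a directory, the directory entry itself is reported rather than a
# listing of its contents.
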
# Import any site hooks for the crashdump and crashinfo collection.
get_site_crashdumps = utils.import_site_function(
        __file__, "autotest_lib.server.site_crashcollect",
        "get_site_crashdumps",
        lambda host, test_start_time: None)
get_site_crashinfo = utils.import_site_function(
        __file__, "autotest_lib.server.site_crashcollect",
        "get_site_crashinfo",
        lambda host, test_start_time: None)
report_crashdumps = utils.import_site_function(
        __file__, "autotest_lib.server.site_crashcollect",
        "report_crashdumps",
        lambda host: None)
fetch_orphaned_crashdumps = utils.import_site_function(
        __file__, "autotest_lib.server.site_crashcollect",
        "fetch_orphaned_crashdumps",
        lambda host, host_resultdir: None)
get_host_infodir = utils.import_site_function(
        __file__, "autotest_lib.server.site_crashcollect",
        "get_host_infodir",
        lambda host: None)


@metrics.SecondsTimerDecorator(
        'chromeos/autotest/autoserv/get_crashdumps_duration')
def get_crashdumps(host, test_start_time):
    get_site_crashdumps(host, test_start_time)


@metrics.SecondsTimerDecorator(
        'chromeos/autotest/autoserv/get_crashinfo_duration')
def get_crashinfo(host, test_start_time):
    logging.info("Collecting crash information...")

    # get_crashdumps collects orphaned crashdumps and symbolicates all
    # collected crashdumps. Symbolicating could happen during a postjob task
    # as well, at which point some crashdumps could have already been pulled
    # back from the machine, so it doesn't necessarily need to wait for the
    # machine to come up.
    get_crashdumps(host, test_start_time)

    if wait_for_machine_to_recover(host):
        # Run any site-specific collection.
        get_site_crashinfo(host, test_start_time)

        crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')
        collect_messages(host)
        collect_command(host, "dmesg", os.path.join(crashinfo_dir, "dmesg"))
        collect_uncollected_logs(host)

        # Collect everything in /var/log.
        log_path = os.path.join(crashinfo_dir, 'var')
        os.makedirs(log_path)
        collect_log_file(host, constants.LOG_DIR, log_path)

        # Collect console-ramoops. The filename changed in linux-3.19, so
        # collect all the files in the pstore dirs.
        log_path = os.path.join(crashinfo_dir, 'pstore')
        for pstore_dir in constants.LOG_PSTORE_DIRS:
            collect_log_file(host, pstore_dir, log_path, use_tmp=True,
                             clean_content=True)
        # Collect i915_error_state, only available on Intel systems.
        # i915 contains the Intel graphics state. It might contain useful
        # data when a DUT hangs, times out or crashes.
        log_path = os.path.join(
            crashinfo_dir, os.path.basename(constants.LOG_I915_ERROR_STATE))
        collect_log_file(host, constants.LOG_I915_ERROR_STATE,
                         log_path, use_tmp=True)


# Load the default number of hours to wait before giving up on crash
# collection.
HOURS_TO_WAIT = global_config.global_config.get_config_value(
        'SERVER', 'crash_collection_hours_to_wait', type=float, default=4.0)


def wait_for_machine_to_recover(host, hours_to_wait=HOURS_TO_WAIT):
    """Waits for a machine (possibly down) to become accessible again.

    @param host: A RemoteHost instance to wait on.
    @param hours_to_wait: Number of hours to wait before giving up.

    @returns: True if the machine comes back up, False otherwise.
    """
    current_time = time.strftime("%b %d %H:%M:%S", time.localtime())
    if host.is_up():
        logging.info("%s already up, collecting crash info", host.hostname)
        return True

    logging.info("Waiting %s hours for %s to come up (%s)",
                 hours_to_wait, host.hostname, current_time)
    if not host.wait_up(timeout=hours_to_wait * 3600):
        (metrics.Counter('chromeos/autotest/errors/collect_crashinfo_timeout')
         .increment())
        logging.warning("%s down, unable to collect crash info",
                        host.hostname)
        return False
    else:
        logging.info("%s is back up, collecting crash info", host.hostname)
        return True


def get_crashinfo_dir(host, dir_prefix):
    """Finds and, if necessary, creates a directory to store crashinfo in.

    @param host: The RemoteHost object that crashinfo will be collected from.
    @param dir_prefix: Prefix of the directory name.

    @returns: The path to an existing directory for writing crashinfo into.
    """
    host_resultdir = getattr(getattr(host, "job", None), "resultdir", None)
    if host_resultdir:
        infodir = host_resultdir
    else:
        infodir = os.path.abspath(os.getcwd())
    infodir = os.path.join(infodir, "%s.%s" % (dir_prefix, host.hostname))
    if not os.path.exists(infodir):
        os.mkdir(infodir)
    return infodir


def collect_command(host, command, dest_path):
    """Collects the result of a command on the remote machine.

    The standard output of the command will be collected and written into
    the destination path. The destination path is assumed to be a filename,
    not a directory.

    @param host: The RemoteHost to collect from.
    @param command: A shell command to run on the remote machine and capture
            the output from.
    @param dest_path: A file path to write the results of the log into.
    """
    logging.info("Collecting '%s' ...", command)
    try:
        result = host.run(command, stdout_tee=None).stdout
        utils.open_write_close(dest_path, result)
    except Exception as e:
        logging.warning("Collection of '%s' failed:\n%s", command, e)


def collect_uncollected_logs(host):
    """Collects any leftover uncollected logs from the client.

    @param host: The RemoteHost to collect from.
    """
    if host.job:
        try:
            logs = host.job.get_client_logs()
            for hostname, remote_path, local_path in logs:
                if hostname == host.hostname:
                    logging.info('Retrieving logs from %s:%s into %s',
                                 hostname, remote_path, local_path)
                    collect_log_file(host, remote_path + '/',
                                     local_path + '/')
        except Exception as e:
            logging.warning('Error while trying to collect stranded '
                            'Autotest client logs: %s', e)
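
# Shape of the data collect_uncollected_logs() consumes (values are
# illustrative): host.job.get_client_logs() returns (hostname, remote_path,
# local_path) tuples, e.g.
#
#     [('chromeos1-row1-rack1-host1',
#       '/usr/local/autotest/results/default',
#       '/tmp/results/default')]
#
# Only entries whose hostname matches the host being collected from are
# transferred.
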
def collect_messages(host):
    """Collects the 'new' contents of /var/log/messages.

    If host.VAR_LOG_MESSAGES_COPY_PATH is present on the remote machine,
    collects the contents of /var/log/messages excluding whatever initial
    contents are already present in host.VAR_LOG_MESSAGES_COPY_PATH. If it
    is not present, simply collects the entire contents of
    /var/log/messages.

    @param host: The RemoteHost to collect from.
    """
    crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')

    try:
        # Paths to the messages files.
        messages = os.path.join(crashinfo_dir, "messages")
        messages_raw = os.path.join(crashinfo_dir, "messages.raw")
        messages_at_start = os.path.join(crashinfo_dir, "messages.at_start")

        # Grab the files from the remote host.
        collect_log_file(host, host.VAR_LOG_MESSAGES_COPY_PATH,
                         messages_at_start)
        collect_log_file(host, "/var/log/messages", messages_raw)

        # Figure out how much of messages.raw to skip.
        if os.path.exists(messages_at_start):
            # The first line of the messages-at-start copy should match the
            # first line of the current messages; if they don't match, then
            # messages has been erased or rotated and we just grab all of it.
            first_line_at_start = utils.read_one_line(messages_at_start)
            first_line_now = utils.read_one_line(messages_raw)
            if first_line_at_start != first_line_now:
                size_at_start = 0
            else:
                size_at_start = os.path.getsize(messages_at_start)
        else:
            size_at_start = 0
        with open(messages_raw) as raw_messages_file, \
                open(messages, "w") as messages_file:
            raw_messages_file.seek(size_at_start)
            shutil.copyfileobj(raw_messages_file, messages_file)

        # Get rid of the "raw" versions of messages.
        os.remove(messages_raw)
        if os.path.exists(messages_at_start):
            os.remove(messages_at_start)
    except Exception as e:
        logging.warning("Error while collecting /var/log/messages: %s", e)
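
# Worked example of the trimming above (hypothetical sizes): if
# messages.at_start is 1,000 bytes and its first line matches the first line
# of messages.raw, the resulting "messages" file is messages.raw with the
# first 1,000 bytes skipped, i.e. only what was logged after the copy was
# taken. If the first lines differ, the log was rotated or cleared, so
# size_at_start stays 0 and the whole file is kept.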