1# Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 2# Use of this source code is governed by a BSD-style license that can be 3# found in the LICENSE file. 4 5import collections, logging, os, re, shutil, time 6 7from autotest_lib.client.bin import utils 8from autotest_lib.client.common_lib import error 9from autotest_lib.client.cros import cros_logging, sys_power 10from autotest_lib.client.cros import power_utils 11from autotest_lib.client.cros import power_status 12#pylint: disable=W0611 13from autotest_lib.client.cros import flimflam_test_path 14import flimflam 15 16class Suspender(object): 17 """Class for suspend/resume measurements. 18 19 Public attributes: 20 disconnect_3G_time: Amount of seconds it took to disable 3G. 21 successes[]: List of timing measurement dicts from successful suspends. 22 failures[]: List of SuspendFailure exceptions from failed suspends. 23 device_times[]: List of individual device suspend/resume time dicts. 24 25 Public methods: 26 suspend: Do a suspend/resume cycle. Return timing measurement dict. 27 28 Private attributes: 29 _logs: Array of /var/log/messages lines since start of suspend cycle. 30 _log_file: Open file descriptor at the end of /var/log/messages. 31 _logdir: Directory to store firmware logs in case of errors. 32 _suspend: Set to the sys_power suspend function to use. 33 _throw: Set to have SuspendFailure exceptions raised to the caller. 34 _reset_pm_print_times: Set to deactivate pm_print_times after the test. 35 _restart_tlsdated: Set to restart tlsdated after the test. 36 37 Private methods: 38 __init__: Shuts off tlsdated for duration of test, disables 3G 39 __del__: Restore tlsdated (must run eventually, but GC delay no problem) 40 _set_pm_print_times: Enable/disable kernel device suspend timing output. 41 _check_failure_log: Check /sys/.../suspend_stats for new failures. 42 _ts: Returns a timestamp from /var/run/power_manager/last_resume_timings 43 _hwclock_ts: Read RTC timestamp left on resume in hwclock-on-resume 44 _device_resume_time: Read seconds overall device resume took from logs. 45 _individual_device_times: Reads individual device suspend/resume times. 46 _identify_driver: Return the driver name of a device (or "unknown"). 47 """ 48 49 # board-specific "time to suspend" values determined empirically 50 # TODO: migrate to separate file with http://crosbug.com/38148 51 _DEFAULT_SUSPEND_DELAY = 5 52 _SUSPEND_DELAY = { 53 # TODO: Reevaluate this when http://crosbug.com/38460 is fixed 54 'daisy': 6, 55 'daisy_spring': 6, 56 'peach_pit': 6, 57 58 # TODO: Reevaluate these when http://crosbug.com/38225 is fixed 59 'x86-mario': 6, 60 'x86-alex': 5, 61 62 # Lumpy and Stumpy need high values, because it seems to mitigate their 63 # RTC interrupt problem. See http://crosbug.com/36004 64 'lumpy': 5, 65 'stumpy': 5, 66 67 # RTS5209 card reader has a really bad staging driver, can take ~1 sec 68 'butterfly': 4, 69 70 # Hard disk sync and overall just slow 71 'parrot': 8, 72 'kiev': 9, 73 } 74 75 # alarm/not_before value guaranteed to raise SpuriousWakeup in _hwclock_ts 76 _ALARM_FORCE_EARLY_WAKEUP = 2147483647 77 78 # File written by send_metrics_on_resume containing timing information about 79 # the last resume. 80 _TIMINGS_FILE = '/var/run/power_manager/root/last_resume_timings' 81 82 # Amount of lines to dump from the eventlog on a SpuriousWakeup. Should be 83 # enough to include ACPI Wake Reason... 10 should be far on the safe side. 84 _RELEVANT_EVENTLOG_LINES = 10 85 86 # Sanity check value to catch overlong resume times (from missed RTC wakes) 87 _MAX_RESUME_TIME = 10 88 89 # File written by powerd_suspend containing the hwclock time at resume. 90 HWCLOCK_FILE = '/var/run/power_manager/root/hwclock-on-resume' 91 92 # File read by powerd to decide on the state to suspend (mem or freeze). 93 _SUSPEND_STATE_PREF_FILE = 'suspend_to_idle' 94 95 def __init__(self, logdir, method=sys_power.do_suspend, 96 throw=False, device_times=False, suspend_state=''): 97 """ 98 Prepare environment for suspending. 99 @param suspend_state: Suspend state to enter into. It can be 100 'mem' or 'freeze' or an empty string. If 101 the suspend state is an empty string, 102 system suspends to the default pref. 103 """ 104 self.disconnect_3G_time = 0 105 self.successes = [] 106 self.failures = [] 107 self._logdir = logdir 108 self._suspend = method 109 self._throw = throw 110 self._reset_pm_print_times = False 111 self._restart_tlsdated = False 112 self._log_file = None 113 self._suspend_state = suspend_state 114 if device_times: 115 self.device_times = [] 116 117 # stop tlsdated, make sure we/hwclock have /dev/rtc for ourselves 118 if utils.system_output('initctl status tlsdated').find('start') != -1: 119 utils.system('initctl stop tlsdated') 120 self._restart_tlsdated = True 121 # give process's file descriptors time to asynchronously tear down 122 time.sleep(0.1) 123 124 # prime powerd_suspend RTC timestamp saving and make sure hwclock works 125 utils.open_write_close(self.HWCLOCK_FILE, '') 126 hwclock_output = utils.system_output('hwclock -r --debug --utc', 127 ignore_status=True) 128 if not re.search('Using.*/dev interface to.*clock', hwclock_output): 129 raise error.TestError('hwclock cannot find rtc: ' + hwclock_output) 130 131 # activate device suspend timing debug output 132 if hasattr(self, 'device_times'): 133 if not int(utils.read_one_line('/sys/power/pm_print_times')): 134 self._set_pm_print_times(True) 135 self._reset_pm_print_times = True 136 137 # Shut down 3G to remove its variability from suspend time measurements 138 flim = flimflam.FlimFlam() 139 service = flim.FindCellularService(0) 140 if service: 141 logging.info('Found 3G interface, disconnecting.') 142 start_time = time.time() 143 (success, status) = flim.DisconnectService( 144 service=service, wait_timeout=60) 145 if success: 146 logging.info('3G disconnected successfully.') 147 self.disconnect_3G_time = time.time() - start_time 148 else: 149 logging.error('Could not disconnect: %s.', status) 150 self.disconnect_3G_time = -1 151 152 self._configure_suspend_state() 153 154 def _configure_suspend_state(self): 155 """Configure the suspend state as requested.""" 156 if self._suspend_state: 157 available_suspend_states = utils.read_one_line('/sys/power/state') 158 if self._suspend_state not in available_suspend_states: 159 raise error.TestNAError('Invalid suspend state: ' + 160 self._suspend_state) 161 # Check the current state. If it is same as the one requested, 162 # we don't want to call PowerPrefChanger(restarts powerd). 163 if self._suspend_state == power_utils.get_sleep_state(): 164 return 165 should_freeze = '1' if self._suspend_state == 'freeze' else '0' 166 new_prefs = {self._SUSPEND_STATE_PREF_FILE: should_freeze} 167 self._power_pref_changer = power_utils.PowerPrefChanger(new_prefs) 168 169 def _set_pm_print_times(self, on): 170 """Enable/disable extra suspend timing output from powerd to syslog.""" 171 if utils.system('echo %s > /sys/power/pm_print_times' % int(bool(on)), 172 ignore_status=True): 173 logging.warning('Failed to set pm_print_times to %s', bool(on)) 174 del self.device_times 175 self._reset_pm_print_times = False 176 else: 177 logging.info('Device resume times set to %s', bool(on)) 178 179 180 def _get_board(self): 181 """Remove _freon from get_board if found.""" 182 return (utils.get_board().replace("_freon", "")) 183 184 185 def _reset_logs(self): 186 """Throw away cached log lines and reset log pointer to current end.""" 187 if self._log_file: 188 self._log_file.close() 189 self._log_file = open('/var/log/messages') 190 self._log_file.seek(0, os.SEEK_END) 191 self._logs = [] 192 193 194 def _update_logs(self, retries=11): 195 """ 196 Read all lines logged since last reset into log cache. Block until last 197 powerd_suspend resume message was read, raise if it takes too long. 198 """ 199 finished_regex = re.compile(r'powerd_suspend\[\d+\]: Resume finished') 200 for retry in xrange(retries + 1): 201 lines = self._log_file.readlines() 202 if lines: 203 if self._logs and self._logs[-1][-1] != '\n': 204 # Reassemble line that was cut in the middle 205 self._logs[-1] += lines.pop(0) 206 self._logs += lines 207 for line in reversed(self._logs): 208 if (finished_regex.search(line)): 209 return 210 time.sleep(0.005 * 2**retry) 211 212 raise error.TestError("Sanity check failed: did not try to suspend.") 213 214 215 def _ts(self, name, retries=11): 216 """Searches logs for last timestamp with a given suspend message.""" 217 # Occasionally need to retry due to races from process wakeup order 218 for retry in xrange(retries + 1): 219 try: 220 f = open(self._TIMINGS_FILE) 221 for line in f: 222 words = line.split('=') 223 if name == words[0]: 224 try: 225 timestamp = float(words[1]) 226 except ValueError: 227 logging.warning('Invalid timestamp: %s', line) 228 timestamp = 0 229 return timestamp 230 except IOError: 231 pass 232 time.sleep(0.005 * 2**retry) 233 234 raise error.TestError('Could not find %s entry.' % name) 235 236 237 def _hwclock_ts(self, not_before, retries=3): 238 """Read the RTC resume timestamp saved by powerd_suspend.""" 239 for retry in xrange(retries + 1): 240 early_wakeup = False 241 if os.path.exists(self.HWCLOCK_FILE): 242 # TODO(crbug.com/733773): Still fragile see bug. 243 match = re.search(r'(.+)(\.\d+)[+-]\d+:?\d+$', 244 utils.read_file(self.HWCLOCK_FILE), re.DOTALL) 245 if match: 246 timeval = time.strptime(match.group(1), 247 "%Y-%m-%d %H:%M:%S") 248 seconds = time.mktime(timeval) 249 seconds += float(match.group(2)) 250 logging.debug('RTC resume timestamp read: %f', seconds) 251 if seconds >= not_before: 252 return seconds 253 early_wakeup = True 254 time.sleep(0.05 * retry) 255 if early_wakeup: 256 logging.debug('Early wakeup, dumping eventlog if it exists:\n') 257 elog = utils.system_output('mosys eventlog list | tail -n %d' % 258 self._RELEVANT_EVENTLOG_LINES, ignore_status=True) 259 wake_elog = (['unknown'] + re.findall(r'Wake Source.*', elog))[-1] 260 for line in reversed(self._logs): 261 match = re.search(r'PM1_STS: WAK.*', line) 262 if match: 263 wake_syslog = match.group(0) 264 break 265 else: 266 wake_syslog = 'unknown' 267 for b, e, s in sys_power.SpuriousWakeupError.S3_WHITELIST: 268 if (re.search(b, utils.get_board()) and 269 re.search(e, wake_elog) and re.search(s, wake_syslog)): 270 logging.warning('Whitelisted spurious wake in S3: %s | %s', 271 wake_elog, wake_syslog) 272 return None 273 raise sys_power.SpuriousWakeupError('Spurious wake in S3: %s | %s' 274 % (wake_elog, wake_syslog)) 275 if self._get_board() in ['lumpy', 'stumpy', 'kiev']: 276 logging.debug('RTC read failure (crosbug/36004), dumping nvram:\n' + 277 utils.system_output('mosys nvram dump', ignore_status=True)) 278 return None 279 raise error.TestError('Broken RTC timestamp: ' + 280 utils.read_file(self.HWCLOCK_FILE)) 281 282 283 def _firmware_resume_time(self): 284 """Calculate seconds for firmware resume from logged TSC. (x86 only)""" 285 if utils.get_arch() not in ['i686', 'x86_64']: 286 # TODO: support this on ARM somehow 287 return 0 288 regex_freeze = re.compile(r'PM: resume from suspend-to-idle') 289 regex_tsc = re.compile(r'TSC at resume: (\d+)$') 290 freq = 1000 * int(utils.read_one_line( 291 '/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq')) 292 for line in reversed(self._logs): 293 match_freeze = regex_freeze.search(line) 294 if match_freeze: 295 logging.info('fw resume time zero due to suspend-to-idle\n') 296 return 0 297 match_tsc = regex_tsc.search(line) 298 if match_tsc: 299 return float(match_tsc.group(1)) / freq 300 301 raise error.TestError('Failed to find TSC resume value in syslog.') 302 303 304 def _device_resume_time(self): 305 """Read amount of seconds for overall device resume from syslog.""" 306 regex = re.compile(r'PM: resume of devices complete after ([0-9.]+)') 307 for line in reversed(self._logs): 308 match = regex.search(line) 309 if match: 310 return float(match.group(1)) / 1000 311 312 raise error.TestError('Failed to find device resume time in syslog.') 313 314 def _get_phase_times(self): 315 phase_times = [] 316 regex = re.compile(r'PM: (\w+ )?(resume|suspend) of devices complete') 317 for line in self._logs: 318 match = regex.search(line) 319 if match: 320 ts = cros_logging.extract_kernel_timestamp(line) 321 phase = match.group(1) 322 if not phase: 323 phase = 'REG' 324 phase_times.append((phase.upper(), ts)) 325 return sorted(phase_times, key = lambda entry: entry[1]) 326 327 def _get_phase(self, ts, phase_table, dev): 328 for entry in phase_table: 329 #checking if timestamp was before that phase's cutoff 330 if ts < entry[1]: 331 return entry[0] 332 raise error.TestError('Device %s has a timestamp after all devices %s', 333 dev, 'had already resumed') 334 335 def _individual_device_times(self, start_resume): 336 """Return dict of individual device suspend and resume times.""" 337 self.device_times.append(dict()) 338 dev_details = collections.defaultdict(dict) 339 regex = re.compile(r'call ([^ ]+)\+ returned 0 after ([0-9]+) usecs') 340 phase_table = self._get_phase_times() 341 for line in self._logs: 342 match = regex.search(line) 343 if match: 344 device = match.group(1).replace(':', '-') 345 key = 'seconds_dev_' + device 346 secs = float(match.group(2)) / 1e6 347 ts = cros_logging.extract_kernel_timestamp(line) 348 if ts > start_resume: 349 key += '_resume' 350 else: 351 key += '_suspend' 352 #looking if we're in a special phase 353 phase = self._get_phase(ts, phase_table, device) 354 dev = dev_details[key] 355 if phase in dev: 356 logging.warning('Duplicate %s entry for device %s, +%f', phase, 357 device, secs) 358 dev[phase] += secs 359 else: 360 dev[phase] = secs 361 362 for dev_key, dev in dev_details.iteritems(): 363 total_secs = sum(dev.values()) 364 self.device_times[-1][dev_key] = total_secs 365 report = '%s: %f TOT' % (dev_key, total_secs) 366 for phase in dev.keys(): 367 if phase is 'REG': 368 continue 369 report += ', %f %s' % (dev[phase], phase) 370 logging.debug(report) 371 372 def _identify_driver(self, device): 373 """Return the driver name of a device (or "unknown").""" 374 for path, subdirs, _ in os.walk('/sys/devices'): 375 if device in subdirs: 376 node = os.path.join(path, device, 'driver') 377 if not os.path.exists(node): 378 return "unknown" 379 return os.path.basename(os.path.realpath(node)) 380 else: 381 return "unknown" 382 383 384 def _check_for_errors(self, ignore_kernel_warns): 385 """Find and identify suspend errors. 386 387 @param ignore_kernel_warns: Ignore kernel errors. 388 389 @returns: True iff we should retry. 390 391 @raises: 392 sys_power.KernelError: for non-whitelisted kernel failures. 393 sys_power.SuspendTimeout: took too long to enter suspend. 394 sys_power.SpuriousWakeupError: woke too soon from suspend. 395 sys_power.SuspendFailure: unidentified failure. 396 """ 397 warning_regex = re.compile(r' kernel: \[.*WARNING:') 398 abort_regex = re.compile(r' kernel: \[.*Freezing of tasks abort' 399 r'| powerd_suspend\[.*Cancel suspend at kernel' 400 r'| kernel: \[.*PM: Wakeup pending, aborting suspend') 401 # rsyslogd can put this out of order with dmesg, so track in variable 402 fail_regex = re.compile(r'powerd_suspend\[\d+\]: Error') 403 failed = False 404 405 # TODO(scottz): warning_monitor crosbug.com/38092 406 log_len = len(self._logs) 407 for i in xrange(log_len): 408 line = self._logs[i] 409 if warning_regex.search(line): 410 # match the source file from the WARNING line, and the 411 # actual error text by peeking one or two lines below that 412 src = cros_logging.strip_timestamp(line) 413 text = '' 414 if i+1 < log_len: 415 text = cros_logging.strip_timestamp(self._logs[i + 1]) 416 if i+2 < log_len: 417 text += '\n' + cros_logging.strip_timestamp( 418 self._logs[i + 2]) 419 for p1, p2 in sys_power.KernelError.WHITELIST: 420 if re.search(p1, src) and re.search(p2, text): 421 logging.info('Whitelisted KernelError: %s', src) 422 break 423 else: 424 if ignore_kernel_warns: 425 logging.warn('Non-whitelisted KernelError: %s', src) 426 else: 427 raise sys_power.KernelError("%s\n%s" % (src, text)) 428 if abort_regex.search(line): 429 wake_source = 'unknown' 430 match = re.search(r'last active wakeup source: (.*)$', 431 '\n'.join(self._logs[i-5:i+3]), re.MULTILINE) 432 if match: 433 wake_source = match.group(1) 434 driver = self._identify_driver(wake_source) 435 for b, w in sys_power.SpuriousWakeupError.S0_WHITELIST: 436 if (re.search(b, utils.get_board()) and 437 re.search(w, wake_source)): 438 logging.warning('Whitelisted spurious wake before ' 439 'S3: %s | %s', wake_source, driver) 440 return True 441 if "rtc" in driver: 442 raise sys_power.SuspendTimeout('System took too ' 443 'long to suspend.') 444 raise sys_power.SpuriousWakeupError('Spurious wake ' 445 'before S3: %s | %s' % (wake_source, driver)) 446 if fail_regex.search(line): 447 failed = True 448 449 if failed: 450 raise sys_power.SuspendFailure('Unidentified problem.') 451 return False 452 453 def suspend(self, duration=10, ignore_kernel_warns=False): 454 """ 455 Do a single suspend for 'duration' seconds. Estimates the amount of time 456 it takes to suspend for a board (see _SUSPEND_DELAY), so the actual RTC 457 wakeup delay will be longer. Returns None on errors, or raises the 458 exception when _throw is set. Returns a dict of general measurements, 459 or a tuple (general_measurements, individual_device_times) when 460 _device_times is set. 461 462 @param duration: time in seconds to do a suspend prior to waking. 463 @param ignore_kernel_warns: Ignore kernel errors. Defaults to false. 464 """ 465 466 if power_utils.get_sleep_state() == 'freeze': 467 self._s0ix_residency_stats = power_status.S0ixResidencyStats() 468 469 try: 470 iteration = len(self.failures) + len(self.successes) + 1 471 # Retry suspend in case we hit a known (whitelisted) bug 472 for _ in xrange(10): 473 self._reset_logs() 474 utils.system('sync') 475 board_delay = self._SUSPEND_DELAY.get(self._get_board(), 476 self._DEFAULT_SUSPEND_DELAY) 477 try: 478 alarm = self._suspend(duration + board_delay) 479 except sys_power.SpuriousWakeupError: 480 # might be another error, we check for it ourselves below 481 alarm = self._ALARM_FORCE_EARLY_WAKEUP 482 483 if os.path.exists('/sys/firmware/log'): 484 for msg in re.findall(r'^.*ERROR.*$', 485 utils.read_file('/sys/firmware/log'), re.M): 486 for board, pattern in sys_power.FirmwareError.WHITELIST: 487 if (re.search(board, utils.get_board()) and 488 re.search(pattern, msg)): 489 logging.info('Whitelisted FW error: ' + msg) 490 break 491 else: 492 firmware_log = os.path.join(self._logdir, 493 'firmware.log.' + str(iteration)) 494 shutil.copy('/sys/firmware/log', firmware_log) 495 logging.info('Saved firmware log: ' + firmware_log) 496 raise sys_power.FirmwareError(msg.strip('\r\n ')) 497 498 self._update_logs() 499 if not self._check_for_errors(ignore_kernel_warns): 500 hwclock_ts = self._hwclock_ts(alarm) 501 if hwclock_ts: 502 break 503 504 else: 505 raise error.TestWarn('Ten tries failed due to whitelisted bug') 506 507 # calculate general measurements 508 start_resume = self._ts('start_resume_time') 509 kernel_down = (self._ts('end_suspend_time') - 510 self._ts('start_suspend_time')) 511 kernel_up = self._ts('end_resume_time') - start_resume 512 devices_up = self._device_resume_time() 513 total_up = hwclock_ts - alarm 514 firmware_up = self._firmware_resume_time() 515 board_up = total_up - kernel_up - firmware_up 516 try: 517 cpu_up = self._ts('cpu_ready_time', 0) - start_resume 518 except error.TestError: 519 # can be missing on non-SMP machines 520 cpu_up = None 521 if total_up > self._MAX_RESUME_TIME: 522 raise error.TestError('Sanity check failed: missed RTC wakeup.') 523 524 logging.info('Success(%d): %g down, %g up, %g board, %g firmware, ' 525 '%g kernel, %g cpu, %g devices', 526 iteration, kernel_down, total_up, board_up, 527 firmware_up, kernel_up, cpu_up, devices_up) 528 529 if hasattr(self, '_s0ix_residency_stats'): 530 s0ix_residency_secs = \ 531 self._s0ix_residency_stats.\ 532 get_accumulated_residency_secs() 533 if not s0ix_residency_secs: 534 raise sys_power.S0ixResidencyNotChanged( 535 'S0ix residency did not change.') 536 logging.info('S0ix residency : %d secs.', s0ix_residency_secs) 537 538 self.successes.append({ 539 'seconds_system_suspend': kernel_down, 540 'seconds_system_resume': total_up, 541 'seconds_system_resume_firmware': firmware_up + board_up, 542 'seconds_system_resume_firmware_cpu': firmware_up, 543 'seconds_system_resume_firmware_ec': board_up, 544 'seconds_system_resume_kernel': kernel_up, 545 'seconds_system_resume_kernel_cpu': cpu_up, 546 'seconds_system_resume_kernel_dev': devices_up, 547 }) 548 549 if hasattr(self, 'device_times'): 550 self._individual_device_times(start_resume) 551 return (self.successes[-1], self.device_times[-1]) 552 else: 553 return self.successes[-1] 554 555 except sys_power.SuspendFailure as ex: 556 message = '%s(%d): %s' % (type(ex).__name__, iteration, ex) 557 logging.error(message) 558 self.failures.append(ex) 559 if self._throw: 560 if type(ex).__name__ in ['KernelError', 'SuspendTimeout']: 561 raise error.TestWarn(message) 562 else: 563 raise error.TestFail(message) 564 return None 565 566 567 def finalize(self): 568 """Restore normal environment (not turning 3G back on for now...)""" 569 if os.path.exists(self.HWCLOCK_FILE): 570 os.remove(self.HWCLOCK_FILE) 571 if self._restart_tlsdated: 572 utils.system('initctl start tlsdated') 573 if self._reset_pm_print_times: 574 self._set_pm_print_times(False) 575 if hasattr(self, '_power_pref_changer'): 576 self._power_pref_changer.finalize() 577 578 579 def __del__(self): 580 self.finalize() 581