1# Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 2# Use of this source code is governed by a BSD-style license that can be 3# found in the LICENSE file. 4 5import logging, os, re, shutil, time 6 7from autotest_lib.client.bin import utils 8from autotest_lib.client.common_lib import error 9from autotest_lib.client.cros import cros_logging, sys_power 10#pylint: disable=W0611 11from autotest_lib.client.cros import flimflam_test_path 12import flimflam 13 14class Suspender(object): 15 """Class for suspend/resume measurements. 16 17 Public attributes: 18 disconnect_3G_time: Amount of seconds it took to disable 3G. 19 successes[]: List of timing measurement dicts from successful suspends. 20 failures[]: List of SuspendFailure exceptions from failed suspends. 21 device_times[]: List of individual device suspend/resume time dicts. 22 23 Public methods: 24 suspend: Do a suspend/resume cycle. Return timing measurement dict. 25 26 Private attributes: 27 _logs: Array of /var/log/messages lines since start of suspend cycle. 28 _log_file: Open file descriptor at the end of /var/log/messages. 29 _logdir: Directory to store firmware logs in case of errors. 30 _suspend: Set to the sys_power suspend function to use. 31 _throw: Set to have SuspendFailure exceptions raised to the caller. 32 _reset_pm_print_times: Set to deactivate pm_print_times after the test. 33 _restart_tlsdated: Set to restart tlsdated after the test. 34 35 Private methods: 36 __init__: Shuts off tlsdated for duration of test, disables 3G 37 __del__: Restore tlsdated (must run eventually, but GC delay no problem) 38 _set_pm_print_times: Enable/disable kernel device suspend timing output. 39 _check_failure_log: Check /sys/.../suspend_stats for new failures. 40 _ts: Returns a timestamp from /var/run/power_manager/last_resume_timings 41 _hwclock_ts: Read RTC timestamp left on resume in hwclock-on-resume 42 _device_resume_time: Read seconds overall device resume took from logs. 43 _individual_device_times: Reads individual device suspend/resume times. 44 _identify_driver: Return the driver name of a device (or "unknown"). 45 """ 46 47 # board-specific "time to suspend" values determined empirically 48 # TODO: migrate to separate file with http://crosbug.com/38148 49 _DEFAULT_SUSPEND_DELAY = 5 50 _SUSPEND_DELAY = { 51 # TODO: Reevaluate this when http://crosbug.com/38460 is fixed 52 'daisy': 6, 53 'daisy_spring': 6, 54 'peach_pit': 6, 55 56 # TODO: Reevaluate these when http://crosbug.com/38225 is fixed 57 'x86-mario': 6, 58 'x86-alex': 5, 59 60 # Lumpy and Stumpy need high values, because it seems to mitigate their 61 # RTC interrupt problem. See http://crosbug.com/36004 62 'lumpy': 5, 63 'stumpy': 5, 64 65 # RTS5209 card reader has a really bad staging driver, can take ~1 sec 66 'butterfly': 4, 67 68 # Hard disk sync and overall just slow 69 'parrot': 8, 70 'kiev': 9, 71 } 72 73 # alarm/not_before value guaranteed to raise SpuriousWakeup in _hwclock_ts 74 _ALARM_FORCE_EARLY_WAKEUP = 2147483647 75 76 # File written by send_metrics_on_resume containing timing information about 77 # the last resume. 78 _TIMINGS_FILE = '/var/run/power_manager/root/last_resume_timings' 79 80 # Amount of lines to dump from the eventlog on a SpuriousWakeup. Should be 81 # enough to include ACPI Wake Reason... 10 should be far on the safe side. 82 _RELEVANT_EVENTLOG_LINES = 10 83 84 # Sanity check value to catch overlong resume times (from missed RTC wakes) 85 _MAX_RESUME_TIME = 10 86 87 # File written by powerd_suspend containing the hwclock time at resume. 88 HWCLOCK_FILE = '/var/run/power_manager/root/hwclock-on-resume' 89 90 def __init__(self, logdir, method=sys_power.do_suspend, 91 throw=False, device_times=False): 92 """Prepare environment for suspending.""" 93 self.disconnect_3G_time = 0 94 self.successes = [] 95 self.failures = [] 96 self._logdir = logdir 97 self._suspend = method 98 self._throw = throw 99 self._reset_pm_print_times = False 100 self._restart_tlsdated = False 101 self._log_file = None 102 if device_times: 103 self.device_times = [] 104 105 # stop tlsdated, make sure we/hwclock have /dev/rtc for ourselves 106 if utils.system_output('initctl status tlsdated').find('start') != -1: 107 utils.system('initctl stop tlsdated') 108 self._restart_tlsdated = True 109 # give process's file descriptors time to asynchronously tear down 110 time.sleep(0.1) 111 112 # prime powerd_suspend RTC timestamp saving and make sure hwclock works 113 utils.open_write_close(self.HWCLOCK_FILE, '') 114 hwclock_output = utils.system_output('hwclock -r --debug --utc', 115 ignore_status=True) 116 if not re.search('Using.*/dev interface to.*clock', hwclock_output): 117 raise error.TestError('hwclock cannot find rtc: ' + hwclock_output) 118 119 # activate device suspend timing debug output 120 if hasattr(self, 'device_times'): 121 if not int(utils.read_one_line('/sys/power/pm_print_times')): 122 self._set_pm_print_times(True) 123 self._reset_pm_print_times = True 124 125 # Shut down 3G to remove its variability from suspend time measurements 126 flim = flimflam.FlimFlam() 127 service = flim.FindCellularService(0) 128 if service: 129 logging.info('Found 3G interface, disconnecting.') 130 start_time = time.time() 131 (success, status) = flim.DisconnectService( 132 service=service, wait_timeout=60) 133 if success: 134 logging.info('3G disconnected successfully.') 135 self.disconnect_3G_time = time.time() - start_time 136 else: 137 logging.error('Could not disconnect: %s.', status) 138 self.disconnect_3G_time = -1 139 140 141 def _set_pm_print_times(self, on): 142 """Enable/disable extra suspend timing output from powerd to syslog.""" 143 if utils.system('echo %s > /sys/power/pm_print_times' % int(bool(on)), 144 ignore_status=True): 145 logging.warning('Failed to set pm_print_times to %s', bool(on)) 146 del self.device_times 147 self._reset_pm_print_times = False 148 else: 149 logging.info('Device resume times set to %s', bool(on)) 150 151 152 def _get_board(self): 153 """Remove _freon from get_board if found.""" 154 return (utils.get_board().replace("_freon", "")) 155 156 157 def _reset_logs(self): 158 """Throw away cached log lines and reset log pointer to current end.""" 159 if self._log_file: 160 self._log_file.close() 161 self._log_file = open('/var/log/messages') 162 self._log_file.seek(0, os.SEEK_END) 163 self._logs = [] 164 165 166 def _update_logs(self, retries=11): 167 """ 168 Read all lines logged since last reset into log cache. Block until last 169 powerd_suspend resume message was read, raise if it takes too long. 170 """ 171 finished_regex = re.compile(r'powerd_suspend\[\d+\]: Resume finished') 172 for retry in xrange(retries + 1): 173 lines = self._log_file.readlines() 174 if lines: 175 if self._logs and self._logs[-1][-1] != '\n': 176 # Reassemble line that was cut in the middle 177 self._logs[-1] += lines.pop(0) 178 self._logs += lines 179 for line in reversed(self._logs): 180 if (finished_regex.search(line)): 181 return 182 time.sleep(0.005 * 2**retry) 183 184 raise error.TestError("Sanity check failed: did not try to suspend.") 185 186 187 def _ts(self, name, retries=11): 188 """Searches logs for last timestamp with a given suspend message.""" 189 # Occasionally need to retry due to races from process wakeup order 190 for retry in xrange(retries + 1): 191 try: 192 f = open(self._TIMINGS_FILE) 193 for line in f: 194 words = line.split('=') 195 if name == words[0]: 196 try: 197 timestamp = float(words[1]) 198 except ValueError: 199 logging.warning('Invalid timestamp: %s', line) 200 timestamp = 0 201 return timestamp 202 except IOError: 203 pass 204 time.sleep(0.005 * 2**retry) 205 206 raise error.TestError('Could not find %s entry.' % name) 207 208 209 def _hwclock_ts(self, not_before, retries=3): 210 """Read the RTC resume timestamp saved by powerd_suspend.""" 211 for retry in xrange(retries + 1): 212 early_wakeup = False 213 if os.path.exists(self.HWCLOCK_FILE): 214 match = re.search(r'([0-9]+) seconds since .+ (-?[0-9.]+) sec', 215 utils.read_file(self.HWCLOCK_FILE), re.DOTALL) 216 if match: 217 seconds = int(match.group(1)) + float(match.group(2)) 218 logging.debug('RTC resume timestamp read: %f', seconds) 219 if seconds >= not_before: 220 return seconds 221 early_wakeup = True 222 time.sleep(0.05 * retry) 223 if early_wakeup: 224 logging.debug('Early wakeup, dumping eventlog if it exists:\n') 225 elog = utils.system_output('mosys eventlog list | tail -n %d' % 226 self._RELEVANT_EVENTLOG_LINES, ignore_status=True) 227 wake_elog = (['unknown'] + re.findall(r'Wake Source.*', elog))[-1] 228 for line in reversed(self._logs): 229 match = re.search(r'PM1_STS: WAK.*', line) 230 if match: 231 wake_syslog = match.group(0) 232 break 233 else: 234 wake_syslog = 'unknown' 235 for b, e, s in sys_power.SpuriousWakeupError.S3_WHITELIST: 236 if (re.search(b, utils.get_board()) and 237 re.search(e, wake_elog) and re.search(s, wake_syslog)): 238 logging.warning('Whitelisted spurious wake in S3: %s | %s', 239 wake_elog, wake_syslog) 240 return None 241 raise sys_power.SpuriousWakeupError('Spurious wake in S3: %s | %s' 242 % (wake_elog, wake_syslog)) 243 if self._get_board() in ['lumpy', 'stumpy', 'kiev']: 244 logging.debug('RTC read failure (crosbug/36004), dumping nvram:\n' + 245 utils.system_output('mosys nvram dump', ignore_status=True)) 246 return None 247 raise error.TestError('Broken RTC timestamp: ' + 248 utils.read_file(self.HWCLOCK_FILE)) 249 250 251 def _firmware_resume_time(self): 252 """Calculate seconds for firmware resume from logged TSC. (x86 only)""" 253 if utils.get_arch() not in ['i686', 'x86_64']: 254 # TODO: support this on ARM somehow 255 return 0 256 regex = re.compile(r'TSC at resume: (\d+)$') 257 freq = 1000 * int(utils.read_one_line( 258 '/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq')) 259 for line in reversed(self._logs): 260 match = regex.search(line) 261 if match: 262 return float(match.group(1)) / freq 263 264 raise error.TestError('Failed to find TSC resume value in syslog.') 265 266 267 def _device_resume_time(self): 268 """Read amount of seconds for overall device resume from syslog.""" 269 regex = re.compile(r'PM: resume of devices complete after ([0-9.]+)') 270 for line in reversed(self._logs): 271 match = regex.search(line) 272 if match: 273 return float(match.group(1)) / 1000 274 275 raise error.TestError('Failed to find device resume time in syslog.') 276 277 278 def _individual_device_times(self, start_resume): 279 """Return dict of individual device suspend and resume times.""" 280 self.device_times.append(dict()) 281 regex = re.compile(r'call ([^ ]+)\+ returned 0 after ([0-9]+) usecs') 282 for line in self._logs: 283 match = regex.search(line) 284 if match: 285 key = 'seconds_dev_' + match.group(1).replace(':', '-') 286 secs = float(match.group(2)) / 1e6 287 if cros_logging.extract_kernel_timestamp(line) > start_resume: 288 key += '_resume' 289 else: 290 key += '_suspend' 291 if key in self.device_times[-1]: 292 logging.warning('Duplicate entry for %s: +%f', key, secs) 293 self.device_times[-1][key] += secs 294 else: 295 logging.debug('%s: %f', key, secs) 296 self.device_times[-1][key] = secs 297 298 299 def _identify_driver(self, device): 300 """Return the driver name of a device (or "unknown").""" 301 for path, subdirs, _ in os.walk('/sys/devices'): 302 if device in subdirs: 303 node = os.path.join(path, device, 'driver') 304 if not os.path.exists(node): 305 return "unknown" 306 return os.path.basename(os.path.realpath(node)) 307 else: 308 return "unknown" 309 310 311 def _check_for_errors(self): 312 """Find and identify suspend errors. Return True iff we should retry.""" 313 warning_regex = re.compile(r' kernel: \[.*WARNING:') 314 abort_regex = re.compile(r' kernel: \[.*Freezing of tasks abort' 315 r'| powerd_suspend\[.*Cancel suspend at kernel' 316 r'| kernel: \[.*PM: Wakeup pending, aborting suspend') 317 # rsyslogd can put this out of order with dmesg, so track in variable 318 fail_regex = re.compile(r'powerd_suspend\[\d+\]: Error') 319 failed = False 320 321 # TODO(scottz): warning_monitor crosbug.com/38092 322 for i in xrange(len(self._logs)): 323 line = self._logs[i] 324 if warning_regex.search(line): 325 # match the source file from the WARNING line, and the 326 # actual error text by peeking one or two lines below that 327 src = cros_logging.strip_timestamp(line) 328 text = cros_logging.strip_timestamp(self._logs[i + 1]) + '\n' \ 329 + cros_logging.strip_timestamp(self._logs[i + 2]) 330 for p1, p2 in sys_power.KernelError.WHITELIST: 331 if re.search(p1, src) and re.search(p2, text): 332 logging.info('Whitelisted KernelError: %s', src) 333 break 334 else: 335 raise sys_power.KernelError("%s\n%s" % (src, text)) 336 if abort_regex.search(line): 337 wake_source = 'unknown' 338 match = re.search(r'last active wakeup source: (.*)$', 339 '\n'.join(self._logs[i-5:i+3]), re.MULTILINE) 340 if match: 341 wake_source = match.group(1) 342 driver = self._identify_driver(wake_source) 343 for b, w in sys_power.SpuriousWakeupError.S0_WHITELIST: 344 if (re.search(b, utils.get_board()) and 345 re.search(w, wake_source)): 346 logging.warning('Whitelisted spurious wake before ' 347 'S3: %s | %s', wake_source, driver) 348 return True 349 if "rtc" in driver: 350 raise sys_power.SuspendTimeout('System took too ' 351 'long to suspend.') 352 raise sys_power.SpuriousWakeupError('Spurious wake ' 353 'before S3: %s | %s' % (wake_source, driver)) 354 if fail_regex.search(line): 355 failed = True 356 if failed: 357 raise sys_power.SuspendFailure('Unidentified problem.') 358 return False 359 360 361 def suspend(self, duration=10): 362 """ 363 Do a single suspend for 'duration' seconds. Estimates the amount of time 364 it takes to suspend for a board (see _SUSPEND_DELAY), so the actual RTC 365 wakeup delay will be longer. Returns None on errors, or raises the 366 exception when _throw is set. Returns a dict of general measurements, 367 or a tuple (general_measurements, individual_device_times) when 368 _device_times is set. 369 370 @param duration: time in seconds to do a suspend prior to waking. 371 """ 372 try: 373 iteration = len(self.failures) + len(self.successes) + 1 374 375 # Retry suspend in case we hit a known (whitelisted) bug 376 for _ in xrange(10): 377 self._reset_logs() 378 utils.system('sync') 379 board_delay = self._SUSPEND_DELAY.get(self._get_board(), 380 self._DEFAULT_SUSPEND_DELAY) 381 try: 382 alarm = self._suspend(duration + board_delay) 383 except sys_power.SpuriousWakeupError: 384 # might be another error, we check for it ourselves below 385 alarm = self._ALARM_FORCE_EARLY_WAKEUP 386 387 if os.path.exists('/sys/firmware/log'): 388 for msg in re.findall(r'^.*ERROR.*$', 389 utils.read_file('/sys/firmware/log'), re.M): 390 for board, pattern in sys_power.FirmwareError.WHITELIST: 391 if (re.search(board, utils.get_board()) and 392 re.search(pattern, msg)): 393 logging.info('Whitelisted FW error: ' + msg) 394 break 395 else: 396 firmware_log = os.path.join(self._logdir, 397 'firmware.log.' + str(iteration)) 398 shutil.copy('/sys/firmware/log', firmware_log) 399 logging.info('Saved firmware log: ' + firmware_log) 400 raise sys_power.FirmwareError(msg.strip('\r\n ')) 401 402 self._update_logs() 403 if not self._check_for_errors(): 404 hwclock_ts = self._hwclock_ts(alarm) 405 if hwclock_ts: 406 break 407 else: 408 raise error.TestWarn('Ten tries failed due to whitelisted bug') 409 410 # calculate general measurements 411 start_resume = self._ts('start_resume_time') 412 kernel_down = (self._ts('end_suspend_time') - 413 self._ts('start_suspend_time')) 414 kernel_up = self._ts('end_resume_time') - start_resume 415 devices_up = self._device_resume_time() 416 total_up = hwclock_ts - alarm 417 firmware_up = self._firmware_resume_time() 418 board_up = total_up - kernel_up - firmware_up 419 try: 420 cpu_up = self._ts('cpu_ready_time', 0) - start_resume 421 except error.TestError: 422 # can be missing on non-SMP machines 423 cpu_up = None 424 if total_up > self._MAX_RESUME_TIME: 425 raise error.TestError('Sanity check failed: missed RTC wakeup.') 426 427 logging.info('Success(%d): %g down, %g up, %g board, %g firmware, ' 428 '%g kernel, %g cpu, %g devices', 429 iteration, kernel_down, total_up, board_up, 430 firmware_up, kernel_up, cpu_up, devices_up) 431 self.successes.append({ 432 'seconds_system_suspend': kernel_down, 433 'seconds_system_resume': total_up, 434 'seconds_system_resume_firmware': firmware_up + board_up, 435 'seconds_system_resume_firmware_cpu': firmware_up, 436 'seconds_system_resume_firmware_ec': board_up, 437 'seconds_system_resume_kernel': kernel_up, 438 'seconds_system_resume_kernel_cpu': cpu_up, 439 'seconds_system_resume_kernel_dev': devices_up, 440 }) 441 442 if hasattr(self, 'device_times'): 443 self._individual_device_times(start_resume) 444 return (self.successes[-1], self.device_times[-1]) 445 else: 446 return self.successes[-1] 447 448 except sys_power.SuspendFailure as ex: 449 message = '%s(%d): %s' % (type(ex).__name__, iteration, ex) 450 logging.error(message) 451 self.failures.append(ex) 452 if self._throw: 453 if type(ex).__name__ in ['KernelError', 'SuspendTimeout']: 454 raise error.TestWarn(message) 455 else: 456 raise error.TestFail(message) 457 return None 458 459 460 def finalize(self): 461 """Restore normal environment (not turning 3G back on for now...)""" 462 if os.path.exists(self.HWCLOCK_FILE): 463 os.remove(self.HWCLOCK_FILE) 464 if self._restart_tlsdated: 465 utils.system('initctl start tlsdated') 466 if self._reset_pm_print_times: 467 self._set_pm_print_times(False) 468 469 470 def __del__(self): 471 self.finalize() 472