• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import collections, logging, os, re, shutil, time
6
7from autotest_lib.client.bin import utils
8from autotest_lib.client.common_lib import error
9from autotest_lib.client.cros import cros_logging, sys_power
10from autotest_lib.client.cros import power_utils
11from autotest_lib.client.cros import power_status
12#pylint: disable=W0611
13from autotest_lib.client.cros import flimflam_test_path
14import flimflam
15
16class Suspender(object):
17    """Class for suspend/resume measurements.
18
19    Public attributes:
20        disconnect_3G_time: Amount of seconds it took to disable 3G.
21        successes[]: List of timing measurement dicts from successful suspends.
22        failures[]: List of SuspendFailure exceptions from failed suspends.
23        device_times[]: List of individual device suspend/resume time dicts.
24
25    Public methods:
26        suspend: Do a suspend/resume cycle. Return timing measurement dict.
27
28    Private attributes:
29        _logs: Array of /var/log/messages lines since start of suspend cycle.
30        _log_file: Open file descriptor at the end of /var/log/messages.
31        _logdir: Directory to store firmware logs in case of errors.
32        _suspend: Set to the sys_power suspend function to use.
33        _throw: Set to have SuspendFailure exceptions raised to the caller.
34        _reset_pm_print_times: Set to deactivate pm_print_times after the test.
35        _restart_tlsdated: Set to restart tlsdated after the test.
36
37    Private methods:
38        __init__: Shuts off tlsdated for duration of test, disables 3G
39        __del__: Restore tlsdated (must run eventually, but GC delay no problem)
40        _set_pm_print_times: Enable/disable kernel device suspend timing output.
41        _check_failure_log: Check /sys/.../suspend_stats for new failures.
42        _ts: Returns a timestamp from /var/run/power_manager/last_resume_timings
43        _hwclock_ts: Read RTC timestamp left on resume in hwclock-on-resume
44        _device_resume_time: Read seconds overall device resume took from logs.
45        _individual_device_times: Reads individual device suspend/resume times.
46        _identify_driver: Return the driver name of a device (or "unknown").
47    """
48
49    # board-specific "time to suspend" values determined empirically
50    # TODO: migrate to separate file with http://crosbug.com/38148
51    _DEFAULT_SUSPEND_DELAY = 5
52    _SUSPEND_DELAY = {
53        # TODO: Reevaluate this when http://crosbug.com/38460 is fixed
54        'daisy': 6,
55        'daisy_spring': 6,
56        'peach_pit': 6,
57
58        # TODO: Reevaluate these when http://crosbug.com/38225 is fixed
59        'x86-mario': 6,
60        'x86-alex': 5,
61
62        # Lumpy and Stumpy need high values, because it seems to mitigate their
63        # RTC interrupt problem. See http://crosbug.com/36004
64        'lumpy': 5,
65        'stumpy': 5,
66
67        # RTS5209 card reader has a really bad staging driver, can take ~1 sec
68        'butterfly': 4,
69
70        # Hard disk sync and overall just slow
71        'parrot': 8,
72        'kiev': 9,
73    }
74
75    # alarm/not_before value guaranteed to raise SpuriousWakeup in _hwclock_ts
76    _ALARM_FORCE_EARLY_WAKEUP = 2147483647
77
78    # File written by send_metrics_on_resume containing timing information about
79    # the last resume.
80    _TIMINGS_FILE = '/var/run/power_manager/root/last_resume_timings'
81
82    # Amount of lines to dump from the eventlog on a SpuriousWakeup. Should be
83    # enough to include ACPI Wake Reason... 10 should be far on the safe side.
84    _RELEVANT_EVENTLOG_LINES = 10
85
86    # Sanity check value to catch overlong resume times (from missed RTC wakes)
87    _MAX_RESUME_TIME = 10
88
89    # File written by powerd_suspend containing the hwclock time at resume.
90    HWCLOCK_FILE = '/var/run/power_manager/root/hwclock-on-resume'
91
92    # File read by powerd to decide on the state to suspend (mem or freeze).
93    _SUSPEND_STATE_PREF_FILE = 'suspend_to_idle'
94
95    def __init__(self, logdir, method=sys_power.do_suspend,
96                 throw=False, device_times=False, suspend_state=''):
97        """
98        Prepare environment for suspending.
99        @param suspend_state: Suspend state to enter into. It can be
100                              'mem' or 'freeze' or an empty string. If
101                              the suspend state is an empty string,
102                              system suspends to the default pref.
103        """
104        self.disconnect_3G_time = 0
105        self.successes = []
106        self.failures = []
107        self._logdir = logdir
108        self._suspend = method
109        self._throw = throw
110        self._reset_pm_print_times = False
111        self._restart_tlsdated = False
112        self._log_file = None
113        self._suspend_state = suspend_state
114        if device_times:
115            self.device_times = []
116
117        # stop tlsdated, make sure we/hwclock have /dev/rtc for ourselves
118        if utils.system_output('initctl status tlsdated').find('start') != -1:
119            utils.system('initctl stop tlsdated')
120            self._restart_tlsdated = True
121            # give process's file descriptors time to asynchronously tear down
122            time.sleep(0.1)
123
124        # prime powerd_suspend RTC timestamp saving and make sure hwclock works
125        utils.open_write_close(self.HWCLOCK_FILE, '')
126        hwclock_output = utils.system_output('hwclock -r --debug --utc',
127                                             ignore_status=True)
128        if not re.search('Using.*/dev interface to.*clock', hwclock_output):
129            raise error.TestError('hwclock cannot find rtc: ' + hwclock_output)
130
131        # activate device suspend timing debug output
132        if hasattr(self, 'device_times'):
133            if not int(utils.read_one_line('/sys/power/pm_print_times')):
134                self._set_pm_print_times(True)
135                self._reset_pm_print_times = True
136
137        # Shut down 3G to remove its variability from suspend time measurements
138        flim = flimflam.FlimFlam()
139        service = flim.FindCellularService(0)
140        if service:
141            logging.info('Found 3G interface, disconnecting.')
142            start_time = time.time()
143            (success, status) = flim.DisconnectService(
144                    service=service, wait_timeout=60)
145            if success:
146                logging.info('3G disconnected successfully.')
147                self.disconnect_3G_time = time.time() - start_time
148            else:
149                logging.error('Could not disconnect: %s.', status)
150                self.disconnect_3G_time = -1
151
152        self._configure_suspend_state()
153
154    def _configure_suspend_state(self):
155        """Configure the suspend state as requested."""
156        if self._suspend_state:
157            available_suspend_states = utils.read_one_line('/sys/power/state')
158            if self._suspend_state not in available_suspend_states:
159                raise error.TestNAError('Invalid suspend state: ' +
160                                        self._suspend_state)
161            # Check the current state. If it is same as the one requested,
162            # we don't want to call PowerPrefChanger(restarts powerd).
163            if self._suspend_state == power_utils.get_sleep_state():
164                return
165            should_freeze = '1' if self._suspend_state == 'freeze' else '0'
166            new_prefs = {self._SUSPEND_STATE_PREF_FILE: should_freeze}
167            self._power_pref_changer = power_utils.PowerPrefChanger(new_prefs)
168
169    def _set_pm_print_times(self, on):
170        """Enable/disable extra suspend timing output from powerd to syslog."""
171        if utils.system('echo %s > /sys/power/pm_print_times' % int(bool(on)),
172                ignore_status=True):
173            logging.warning('Failed to set pm_print_times to %s', bool(on))
174            del self.device_times
175            self._reset_pm_print_times = False
176        else:
177            logging.info('Device resume times set to %s', bool(on))
178
179
180    def _get_board(self):
181        """Remove _freon from get_board if found."""
182        return (utils.get_board().replace("_freon", ""))
183
184
185    def _reset_logs(self):
186        """Throw away cached log lines and reset log pointer to current end."""
187        if self._log_file:
188            self._log_file.close()
189        self._log_file = open('/var/log/messages')
190        self._log_file.seek(0, os.SEEK_END)
191        self._logs = []
192
193
194    def _update_logs(self, retries=11):
195        """
196        Read all lines logged since last reset into log cache. Block until last
197        powerd_suspend resume message was read, raise if it takes too long.
198        """
199        finished_regex = re.compile(r'powerd_suspend\[\d+\]: Resume finished')
200        for retry in xrange(retries + 1):
201            lines = self._log_file.readlines()
202            if lines:
203                if self._logs and self._logs[-1][-1] != '\n':
204                    # Reassemble line that was cut in the middle
205                    self._logs[-1] += lines.pop(0)
206                self._logs += lines
207            for line in reversed(self._logs):
208                if (finished_regex.search(line)):
209                    return
210            time.sleep(0.005 * 2**retry)
211
212        raise error.TestError("Sanity check failed: did not try to suspend.")
213
214
215    def _ts(self, name, retries=11):
216        """Searches logs for last timestamp with a given suspend message."""
217        # Occasionally need to retry due to races from process wakeup order
218        for retry in xrange(retries + 1):
219            try:
220                f = open(self._TIMINGS_FILE)
221                for line in f:
222                    words = line.split('=')
223                    if name == words[0]:
224                        try:
225                            timestamp = float(words[1])
226                        except ValueError:
227                            logging.warning('Invalid timestamp: %s', line)
228                            timestamp = 0
229                        return timestamp
230            except IOError:
231                pass
232            time.sleep(0.005 * 2**retry)
233
234        raise error.TestError('Could not find %s entry.' % name)
235
236
237    def _hwclock_ts(self, not_before, retries=3):
238        """Read the RTC resume timestamp saved by powerd_suspend."""
239        for retry in xrange(retries + 1):
240            early_wakeup = False
241            if os.path.exists(self.HWCLOCK_FILE):
242                # TODO(crbug.com/733773): Still fragile see bug.
243                match = re.search(r'(.+)(\.\d+)[+-]\d+:?\d+$',
244                                  utils.read_file(self.HWCLOCK_FILE), re.DOTALL)
245                if match:
246                    timeval = time.strptime(match.group(1),
247                                            "%Y-%m-%d %H:%M:%S")
248                    seconds = time.mktime(timeval)
249                    seconds += float(match.group(2))
250                    logging.debug('RTC resume timestamp read: %f', seconds)
251                    if seconds >= not_before:
252                        return seconds
253                    early_wakeup = True
254            time.sleep(0.05 * retry)
255        if early_wakeup:
256            logging.debug('Early wakeup, dumping eventlog if it exists:\n')
257            elog = utils.system_output('mosys eventlog list | tail -n %d' %
258                    self._RELEVANT_EVENTLOG_LINES, ignore_status=True)
259            wake_elog = (['unknown'] + re.findall(r'Wake Source.*', elog))[-1]
260            for line in reversed(self._logs):
261                match = re.search(r'PM1_STS: WAK.*', line)
262                if match:
263                    wake_syslog = match.group(0)
264                    break
265            else:
266                wake_syslog = 'unknown'
267            for b, e, s in sys_power.SpuriousWakeupError.S3_WHITELIST:
268                if (re.search(b, utils.get_board()) and
269                        re.search(e, wake_elog) and re.search(s, wake_syslog)):
270                    logging.warning('Whitelisted spurious wake in S3: %s | %s',
271                                    wake_elog, wake_syslog)
272                    return None
273            raise sys_power.SpuriousWakeupError('Spurious wake in S3: %s | %s'
274                    % (wake_elog, wake_syslog))
275        if self._get_board() in ['lumpy', 'stumpy', 'kiev']:
276            logging.debug('RTC read failure (crosbug/36004), dumping nvram:\n' +
277                    utils.system_output('mosys nvram dump', ignore_status=True))
278            return None
279        raise error.TestError('Broken RTC timestamp: ' +
280                              utils.read_file(self.HWCLOCK_FILE))
281
282
283    def _firmware_resume_time(self):
284        """Calculate seconds for firmware resume from logged TSC. (x86 only)"""
285        if utils.get_arch() not in ['i686', 'x86_64']:
286            # TODO: support this on ARM somehow
287            return 0
288        regex_freeze = re.compile(r'PM: resume from suspend-to-idle')
289        regex_tsc = re.compile(r'TSC at resume: (\d+)$')
290        freq = 1000 * int(utils.read_one_line(
291                '/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq'))
292        for line in reversed(self._logs):
293            match_freeze = regex_freeze.search(line)
294            if match_freeze:
295                logging.info('fw resume time zero due to suspend-to-idle\n')
296                return 0
297            match_tsc = regex_tsc.search(line)
298            if match_tsc:
299                return float(match_tsc.group(1)) / freq
300
301        raise error.TestError('Failed to find TSC resume value in syslog.')
302
303
304    def _device_resume_time(self):
305        """Read amount of seconds for overall device resume from syslog."""
306        regex = re.compile(r'PM: resume of devices complete after ([0-9.]+)')
307        for line in reversed(self._logs):
308            match = regex.search(line)
309            if match:
310                return float(match.group(1)) / 1000
311
312        raise error.TestError('Failed to find device resume time in syslog.')
313
314    def _get_phase_times(self):
315        phase_times = []
316        regex = re.compile(r'PM: (\w+ )?(resume|suspend) of devices complete')
317        for line in self._logs:
318          match = regex.search(line)
319          if match:
320            ts = cros_logging.extract_kernel_timestamp(line)
321            phase = match.group(1)
322            if not phase:
323              phase = 'REG'
324            phase_times.append((phase.upper(), ts))
325        return sorted(phase_times, key = lambda entry: entry[1])
326
327    def _get_phase(self, ts, phase_table, dev):
328      for entry in phase_table:
329        #checking if timestamp was before that phase's cutoff
330        if ts < entry[1]:
331          return entry[0]
332      raise error.TestError('Device %s has a timestamp after all devices %s',
333                            dev, 'had already resumed')
334
335    def _individual_device_times(self, start_resume):
336        """Return dict of individual device suspend and resume times."""
337        self.device_times.append(dict())
338        dev_details = collections.defaultdict(dict)
339        regex = re.compile(r'call ([^ ]+)\+ returned 0 after ([0-9]+) usecs')
340        phase_table = self._get_phase_times()
341        for line in self._logs:
342          match = regex.search(line)
343          if match:
344            device = match.group(1).replace(':', '-')
345            key = 'seconds_dev_' + device
346            secs = float(match.group(2)) / 1e6
347            ts = cros_logging.extract_kernel_timestamp(line)
348            if ts > start_resume:
349              key += '_resume'
350            else:
351              key += '_suspend'
352            #looking if we're in a special phase
353            phase = self._get_phase(ts, phase_table, device)
354            dev = dev_details[key]
355            if phase in dev:
356              logging.warning('Duplicate %s entry for device %s, +%f', phase,
357                              device, secs)
358              dev[phase] += secs
359            else:
360              dev[phase] = secs
361
362        for dev_key, dev in dev_details.iteritems():
363          total_secs = sum(dev.values())
364          self.device_times[-1][dev_key] = total_secs
365          report = '%s: %f TOT' % (dev_key, total_secs)
366          for phase in dev.keys():
367            if phase is 'REG':
368              continue
369            report += ', %f %s' % (dev[phase], phase)
370          logging.debug(report)
371
372    def _identify_driver(self, device):
373        """Return the driver name of a device (or "unknown")."""
374        for path, subdirs, _ in os.walk('/sys/devices'):
375            if device in subdirs:
376                node = os.path.join(path, device, 'driver')
377                if not os.path.exists(node):
378                    return "unknown"
379                return os.path.basename(os.path.realpath(node))
380        else:
381            return "unknown"
382
383
384    def _check_for_errors(self, ignore_kernel_warns):
385        """Find and identify suspend errors.
386
387        @param ignore_kernel_warns: Ignore kernel errors.
388
389        @returns: True iff we should retry.
390
391        @raises:
392          sys_power.KernelError: for non-whitelisted kernel failures.
393          sys_power.SuspendTimeout: took too long to enter suspend.
394          sys_power.SpuriousWakeupError: woke too soon from suspend.
395          sys_power.SuspendFailure: unidentified failure.
396        """
397        warning_regex = re.compile(r' kernel: \[.*WARNING:')
398        abort_regex = re.compile(r' kernel: \[.*Freezing of tasks abort'
399                r'| powerd_suspend\[.*Cancel suspend at kernel'
400                r'| kernel: \[.*PM: Wakeup pending, aborting suspend')
401        # rsyslogd can put this out of order with dmesg, so track in variable
402        fail_regex = re.compile(r'powerd_suspend\[\d+\]: Error')
403        failed = False
404
405        # TODO(scottz): warning_monitor crosbug.com/38092
406        log_len = len(self._logs)
407        for i in xrange(log_len):
408            line = self._logs[i]
409            if warning_regex.search(line):
410                # match the source file from the WARNING line, and the
411                # actual error text by peeking one or two lines below that
412                src = cros_logging.strip_timestamp(line)
413                text = ''
414                if i+1 < log_len:
415                    text = cros_logging.strip_timestamp(self._logs[i + 1])
416                if i+2 < log_len:
417                    text += '\n' + cros_logging.strip_timestamp(
418                        self._logs[i + 2])
419                for p1, p2 in sys_power.KernelError.WHITELIST:
420                    if re.search(p1, src) and re.search(p2, text):
421                        logging.info('Whitelisted KernelError: %s', src)
422                        break
423                else:
424                    if ignore_kernel_warns:
425                        logging.warn('Non-whitelisted KernelError: %s', src)
426                    else:
427                        raise sys_power.KernelError("%s\n%s" % (src, text))
428            if abort_regex.search(line):
429                wake_source = 'unknown'
430                match = re.search(r'last active wakeup source: (.*)$',
431                        '\n'.join(self._logs[i-5:i+3]), re.MULTILINE)
432                if match:
433                    wake_source = match.group(1)
434                driver = self._identify_driver(wake_source)
435                for b, w in sys_power.SpuriousWakeupError.S0_WHITELIST:
436                    if (re.search(b, utils.get_board()) and
437                            re.search(w, wake_source)):
438                        logging.warning('Whitelisted spurious wake before '
439                                        'S3: %s | %s', wake_source, driver)
440                        return True
441                if "rtc" in driver:
442                    raise sys_power.SuspendTimeout('System took too '
443                                                   'long to suspend.')
444                raise sys_power.SpuriousWakeupError('Spurious wake '
445                        'before S3: %s | %s' % (wake_source, driver))
446            if fail_regex.search(line):
447                failed = True
448
449        if failed:
450            raise sys_power.SuspendFailure('Unidentified problem.')
451        return False
452
453    def suspend(self, duration=10, ignore_kernel_warns=False):
454        """
455        Do a single suspend for 'duration' seconds. Estimates the amount of time
456        it takes to suspend for a board (see _SUSPEND_DELAY), so the actual RTC
457        wakeup delay will be longer. Returns None on errors, or raises the
458        exception when _throw is set. Returns a dict of general measurements,
459        or a tuple (general_measurements, individual_device_times) when
460        _device_times is set.
461
462        @param duration: time in seconds to do a suspend prior to waking.
463        @param ignore_kernel_warns: Ignore kernel errors.  Defaults to false.
464        """
465
466        if power_utils.get_sleep_state() == 'freeze':
467            self._s0ix_residency_stats = power_status.S0ixResidencyStats()
468
469        try:
470            iteration = len(self.failures) + len(self.successes) + 1
471            # Retry suspend in case we hit a known (whitelisted) bug
472            for _ in xrange(10):
473                self._reset_logs()
474                utils.system('sync')
475                board_delay = self._SUSPEND_DELAY.get(self._get_board(),
476                        self._DEFAULT_SUSPEND_DELAY)
477                try:
478                    alarm = self._suspend(duration + board_delay)
479                except sys_power.SpuriousWakeupError:
480                    # might be another error, we check for it ourselves below
481                    alarm = self._ALARM_FORCE_EARLY_WAKEUP
482
483                if os.path.exists('/sys/firmware/log'):
484                    for msg in re.findall(r'^.*ERROR.*$',
485                            utils.read_file('/sys/firmware/log'), re.M):
486                        for board, pattern in sys_power.FirmwareError.WHITELIST:
487                            if (re.search(board, utils.get_board()) and
488                                    re.search(pattern, msg)):
489                                logging.info('Whitelisted FW error: ' + msg)
490                                break
491                        else:
492                            firmware_log = os.path.join(self._logdir,
493                                    'firmware.log.' + str(iteration))
494                            shutil.copy('/sys/firmware/log', firmware_log)
495                            logging.info('Saved firmware log: ' + firmware_log)
496                            raise sys_power.FirmwareError(msg.strip('\r\n '))
497
498                self._update_logs()
499                if not self._check_for_errors(ignore_kernel_warns):
500                    hwclock_ts = self._hwclock_ts(alarm)
501                    if hwclock_ts:
502                        break
503
504            else:
505                raise error.TestWarn('Ten tries failed due to whitelisted bug')
506
507            # calculate general measurements
508            start_resume = self._ts('start_resume_time')
509            kernel_down = (self._ts('end_suspend_time') -
510                           self._ts('start_suspend_time'))
511            kernel_up = self._ts('end_resume_time') - start_resume
512            devices_up = self._device_resume_time()
513            total_up = hwclock_ts - alarm
514            firmware_up = self._firmware_resume_time()
515            board_up = total_up - kernel_up - firmware_up
516            try:
517                cpu_up = self._ts('cpu_ready_time', 0) - start_resume
518            except error.TestError:
519                # can be missing on non-SMP machines
520                cpu_up = None
521            if total_up > self._MAX_RESUME_TIME:
522                raise error.TestError('Sanity check failed: missed RTC wakeup.')
523
524            logging.info('Success(%d): %g down, %g up, %g board, %g firmware, '
525                         '%g kernel, %g cpu, %g devices',
526                         iteration, kernel_down, total_up, board_up,
527                         firmware_up, kernel_up, cpu_up, devices_up)
528
529            if hasattr(self, '_s0ix_residency_stats'):
530                s0ix_residency_secs = \
531                        self._s0ix_residency_stats.\
532                                get_accumulated_residency_secs()
533                if not s0ix_residency_secs:
534                    raise sys_power.S0ixResidencyNotChanged(
535                        'S0ix residency did not change.')
536                logging.info('S0ix residency : %d secs.', s0ix_residency_secs)
537
538            self.successes.append({
539                'seconds_system_suspend': kernel_down,
540                'seconds_system_resume': total_up,
541                'seconds_system_resume_firmware': firmware_up + board_up,
542                'seconds_system_resume_firmware_cpu': firmware_up,
543                'seconds_system_resume_firmware_ec': board_up,
544                'seconds_system_resume_kernel': kernel_up,
545                'seconds_system_resume_kernel_cpu': cpu_up,
546                'seconds_system_resume_kernel_dev': devices_up,
547                })
548
549            if hasattr(self, 'device_times'):
550                self._individual_device_times(start_resume)
551                return (self.successes[-1], self.device_times[-1])
552            else:
553                return self.successes[-1]
554
555        except sys_power.SuspendFailure as ex:
556            message = '%s(%d): %s' % (type(ex).__name__, iteration, ex)
557            logging.error(message)
558            self.failures.append(ex)
559            if self._throw:
560                if type(ex).__name__ in ['KernelError', 'SuspendTimeout']:
561                    raise error.TestWarn(message)
562                else:
563                    raise error.TestFail(message)
564            return None
565
566
567    def finalize(self):
568        """Restore normal environment (not turning 3G back on for now...)"""
569        if os.path.exists(self.HWCLOCK_FILE):
570            os.remove(self.HWCLOCK_FILE)
571            if self._restart_tlsdated:
572                utils.system('initctl start tlsdated')
573            if self._reset_pm_print_times:
574                self._set_pm_print_times(False)
575        if hasattr(self, '_power_pref_changer'):
576            self._power_pref_changer.finalize()
577
578
579    def __del__(self):
580        self.finalize()
581