• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright 2016 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import json
6import logging
7import os
8import time
9
10import common
11from autotest_lib.client.common_lib import error
12from autotest_lib.client.common_lib import global_config
13from autotest_lib.client.common_lib import hosts
14from autotest_lib.client.common_lib.cros import retry
15from autotest_lib.server import afe_utils
16from autotest_lib.server import crashcollect
17from autotest_lib.server.hosts import repair
18from autotest_lib.server.hosts import cros_firmware
19
# _DEV_MODE_ALLOWED_POOLS - The set of pools that are allowed to be
# in dev mode (usually, those should be unmanaged devices)
#
_DEV_MODE_ALLOWED_POOLS = set(
    global_config.global_config.get_config_value(
            'CROS',
            'pools_dev_mode_allowed',
            type=str,
            default='',
            allow_blank=True).split(','))

# Setting to suppress dev mode check; primarily used for moblab where all
# DUT's are in dev mode.
_DEV_MODE_ALWAYS_ALLOWED = global_config.global_config.get_config_value(
            'CROS',
            'dev_mode_allowed',
            type=bool,
            default=False)

# Triggers for the 'au', 'powerwash', and 'usb' repair actions.
# These are also used as dependencies in the `CrosHost` repair
# sequence, as follows:
#
# au:
#   - triggers: _CROS_AU_TRIGGERS
#   - depends on: _CROS_USB_TRIGGERS + _CROS_POWERWASH_TRIGGERS
#
# powerwash:
#   - triggers: _CROS_POWERWASH_TRIGGERS + _CROS_AU_TRIGGERS
#   - depends on: _CROS_USB_TRIGGERS
#
# usb:
#   - triggers: _CROS_USB_TRIGGERS + _CROS_POWERWASH_TRIGGERS +
#               _CROS_AU_TRIGGERS
#   - no dependencies
#
# N.B. AC power detection depends on software on the DUT, and there
# have been bugs where detection failed even though the DUT really
# did have power.  So, we make the 'power' verifier a trigger for
# reinstall repair actions, too.
#
# TODO(jrbarnette):  AU repair can't fix all problems reported by
# the 'cros' verifier; it's listed as an AU trigger as a
# simplification.  The ultimate fix is to split the 'cros' verifier
# into smaller individual verifiers.
_CROS_AU_TRIGGERS = ('power', 'rwfw', 'python', 'cros',)
_CROS_POWERWASH_TRIGGERS = ('tpm', 'good_au', 'ext4',)
_CROS_USB_TRIGGERS = ('ssh', 'writable',)
68
69
class ACPowerVerifier(hosts.Verifier):
    """Check for AC power and a reasonable battery charge."""

    def verify(self, host):
        """Fail unless the DUT reports AC power and >= 50% battery.

        @param host: Host object to check.
        @raises hosts.AutoservVerifyError: if line power is reported
                as unplugged, or the battery charge is below 50%.
        """
        # Temporarily work around a problem caused by some old FSI
        # builds that don't have the power_supply_info command by
        # ignoring failures.  The repair triggers believe that this
        # verifier can't be fixed by re-installing, which means if a DUT
        # gets stuck with one of those old builds, it can't be repaired.
        #
        # TODO(jrbarnette): This is for crbug.com/599158; we need a
        # better solution.
        try:
            info = host.get_power_supply_info()
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit are no longer swallowed by the workaround.
            logging.exception('get_power_supply_info() failed')
            return
        try:
            if info['Line Power']['online'] != 'yes':
                raise hosts.AutoservVerifyError(
                        'AC power is not plugged in')
        except KeyError:
            logging.info('Cannot determine AC power status - '
                         'skipping check.')
        try:
            # ValueError guards against a malformed percentage string;
            # treat it like a missing key and skip the check.
            if float(info['Battery']['percentage']) < 50.0:
                raise hosts.AutoservVerifyError(
                        'Battery is less than 50%')
        except (KeyError, ValueError):
            logging.info('Cannot determine battery status - '
                         'skipping check.')

    @property
    def description(self):
        return 'The DUT is plugged in to AC power'
105
106
class WritableVerifier(hosts.Verifier):
    """
    Confirm the stateful file systems are writable.

    Linux reacts to certain unexpected file system errors (including
    hardware errors in block devices) by remounting the file system
    read-only.  This verifier confirms that hasn't happened to either
    of the file systems that must be writable for critical operations
    such as AU:
      * The (unencrypted) stateful system which includes
        /mnt/stateful_partition.
      * The encrypted stateful partition, which includes /var.

    Various bind mounts aren't checked; they're expected to fail the
    same way as their underlying main mounts.  Whether the Linux
    kernel can guarantee that is untested...
    """

    # N.B. Order matters here:  Encrypted stateful is loop-mounted from
    # a file in unencrypted stateful, so we don't test for errors in
    # encrypted stateful if unencrypted fails.
    _TEST_DIRECTORIES = ['/mnt/stateful_partition', '/var/tmp']

    def verify(self, host):
        """Create and remove a probe file in each test directory."""
        # Deliberately bail out at the first failure; see the comment
        # on _TEST_DIRECTORIES for the ordering rationale.
        for test_dir in self._TEST_DIRECTORIES:
            probe = os.path.join(test_dir, 'writable_test')
            touch_and_remove = 'touch %s && rm %s' % (probe, probe)
            result = host.run(command=touch_and_remove, ignore_status=True)
            if result.exit_status != 0:
                raise hosts.AutoservVerifyError(
                        'Can\'t create a file in %s' % test_dir)

    @property
    def description(self):
        return 'The stateful filesystems are writable'
145
146
class EXT4fsErrorVerifier(hosts.Verifier):
    """
    Confirm we have not seen critical file system kernel errors.
    """
    def verify(self, host):
        """Scan dmesg for critical EXT4 errors on the stateful device.

        @param host: Host object to check.
        @raises hosts.AutoservVerifyError: if dmesg shows an EXT4-fs
                error on the stateful device, or the generic
                "Data will be lost" kernel error.
        """
        # Determine the device backing /mnt/stateful_partition so we
        # can grep dmesg for errors of the form
        # "EXT4-fs error (device sda1):".
        command = ("cut -d ' ' -f 5,9 /proc/$$/mountinfo | "
                   "grep -e '^/mnt/stateful_partition ' | "
                   "cut -d ' ' -f 2 | cut -d '/' -f 3")
        device = host.run(command=command, ignore_status=True).stdout.strip()
        if not device:
            # Bug fix: this message previously sat on the `else` of the
            # "Data will be lost" check below, so it was logged on
            # every healthy DUT.  It belongs here, where detection of
            # the stateful mount actually failed.
            logging.error('Could not determine stateful mount.')
        else:
            command = ('dmesg | grep -E "EXT4-fs error \\(device %s\\):"'
                       % device)
            output = host.run(command=command, ignore_status=True).stdout
            if output:
                sample = output.splitlines()[0]
                message = 'Saw file system error: %s' % sample
                raise hosts.AutoservVerifyError(message)
        # Check for other critical FS errors.
        command = 'dmesg | grep "This should not happen!!  Data will be lost"'
        output = host.run(command=command, ignore_status=True).stdout
        if output:
            message = 'Saw file system error: Data will be lost'
            raise hosts.AutoservVerifyError(message)

    @property
    def description(self):
        return 'Did not find critical file system errors'
174
175
class UpdateSuccessVerifier(hosts.Verifier):
    """
    Checks that the DUT successfully finished its last provision job.

    Any update (e.g. a Provision job) begins by creating a marker file
    named `host.PROVISION_FAILED` in a part of the stateful partition
    that a successful update removes.  So, if the marker file is still
    present, the most recent update must have failed.

    This verifier fails exactly when that marker file still exists.
    """
    def verify(self, host):
        """Fail if the provision-failed marker file is present."""
        marker_check = 'test -f %s' % host.PROVISION_FAILED
        result = host.run(marker_check, ignore_status=True)
        # `test -f` exits 0 when the file exists, i.e. the last update
        # did not complete successfully.
        if result.exit_status == 0:
            raise hosts.AutoservVerifyError(
                    'Last AU on this DUT failed')

    @property
    def description(self):
        return 'The most recent AU attempt on this DUT succeeded'
199
200
class TPMStatusVerifier(hosts.Verifier):
    """Verify that the host's TPM is in a good state."""

    def verify(self, host):
        """Check `cryptohome --action=status` output for TPM health.

        @param host: Host object to check.
        @raises hosts.AutoservVerifyError: if the TPM is disabled,
                can't be connected to, or can't load its SRK or SRK
                public key.
        """
        # This cryptohome command emits status information in JSON format. It
        # looks something like this:
        # {
        #    "installattrs": {
        #       ...
        #    },
        #    "mounts": [ {
        #       ...
        #    } ],
        #    "tpm": {
        #       "being_owned": false,
        #       "can_connect": true,
        #       "can_decrypt": false,
        #       "can_encrypt": false,
        #       "can_load_srk": true,
        #       "can_load_srk_pubkey": true,
        #       "enabled": true,
        #       "has_context": true,
        #       "has_cryptohome_key": false,
        #       "has_key_handle": false,
        #       "last_error": 0,
        #       "owned": true
        #    }
        # }
        output = host.run('cryptohome --action=status').stdout.strip()
        try:
            status = json.loads(output)
        except ValueError:
            # Fixed message typo: "Crytohome" -> "Cryptohome".
            logging.info('Cannot determine the Cryptohome valid status - '
                         'skipping check.')
            return
        try:
            tpm = status['tpm']
            if not tpm['enabled']:
                raise hosts.AutoservVerifyError(
                        'TPM is not enabled -- Hardware is not working.')
            if not tpm['can_connect']:
                raise hosts.AutoservVerifyError(
                        ('TPM connect failed -- '
                         'last_error=%d.' % tpm['last_error']))
            if tpm['owned'] and not tpm['can_load_srk']:
                raise hosts.AutoservVerifyError(
                        'Cannot load the TPM SRK')
            if tpm['can_load_srk'] and not tpm['can_load_srk_pubkey']:
                raise hosts.AutoservVerifyError(
                        'Cannot load the TPM SRK public key')
        except KeyError:
            # Fixed message typo: "Crytohome" -> "Cryptohome".
            logging.info('Cannot determine the Cryptohome valid status - '
                         'skipping check.')

    @property
    def description(self):
        return 'The host\'s TPM is available and working'
258
259
class PythonVerifier(hosts.Verifier):
    """Confirm the presence of a working Python interpreter."""

    def verify(self, host):
        """Run a trivial Python import on the DUT and check the result."""
        result = host.run('python -c "import cPickle"',
                          ignore_status=True)
        if result.exit_status == 0:
            return
        message = 'The python interpreter is broken'
        # Exit status 127 means the shell couldn't find `python` at
        # all; double-check with `which` before blaming powerwash.
        if result.exit_status == 127:
            search = host.run('which python', ignore_status=True)
            if search.exit_status != 0 or not search.stdout:
                message = ('Python is missing; may be caused by '
                           'powerwash')
        raise hosts.AutoservVerifyError(message)

    @property
    def description(self):
        return 'Python on the host is installed and working'
278
279
class DevModeVerifier(hosts.Verifier):
    """Verify that the host is not in dev mode."""

    def verify(self, host):
        """Fail if the DUT booted in dev mode and isn't exempted."""
        # Dev mode is acceptable when globally allowed (e.g. moblab)
        # or when the DUT belongs to one of the exempted pools.
        if _DEV_MODE_ALWAYS_ALLOWED:
            return
        host_info = host.host_info_store.get()
        if host_info.pools & _DEV_MODE_ALLOWED_POOLS:
            return

        devsw_boot = host.run(
                'crossystem devsw_boot', ignore_status=True).stdout
        if devsw_boot != '0':
            raise hosts.AutoservVerifyError('The host is in dev mode')

    @property
    def description(self):
        return 'The host should not be in dev mode'
297
298
class JetstreamServicesVerifier(hosts.Verifier):
    """Verify that Jetstream services are running."""

    # Retry for b/62576902
    @retry.retry(error.AutoservError, timeout_min=1, delay_sec=10)
    def verify(self, host):
        """Check the ap-controller upstart job and its process."""
        # upstart_status() raises AutoservRunError when the service is
        # unknown; translate that, and a stopped service, into verify
        # failures.
        try:
            service_up = host.upstart_status('ap-controller')
        except error.AutoservRunError:
            raise hosts.AutoservVerifyError(
                'ap-controller service not found')
        if not service_up:
            raise hosts.AutoservVerifyError(
                'ap-controller service is not running')

        try:
            host.run('pgrep ap-controller')
        except error.AutoservRunError:
            raise hosts.AutoservVerifyError(
                'ap-controller process is not running')

    @property
    def description(self):
        return 'Jetstream services must be running'
322
323
class ServoSysRqRepair(hosts.RepairAction):
    """
    Repair a Chrome device by sending a system request to the kernel.

    Sending 3 times the Alt+VolUp+x key combination (aka sysrq-x)
    will ask the kernel to panic itself and reboot while conserving
    the kernel logs in console ramoops.
    """

    def repair(self, host):
        """Press sysrq-x three times via servo, then wait for boot.

        @param host: Host object with servo support.
        @raises hosts.AutoservRepairError: if the host has no servo,
                a key press fails, or the DUT stays offline.
        """
        if not host.servo:
            raise hosts.AutoservRepairError(
                    '%s has no servo support.' % host.hostname)
        # Press 3 times Alt+VolUp+X
        # no checking DUT health between each press as
        # killing Chrome is not really likely to fix the DUT SSH.
        for _ in range(3):
            try:
                host.servo.sysrq_x()
            # Fixed the Python 2-only `except X, ex:` syntax; `as` is
            # valid on Python 2.6+ and Python 3 (PEP 3110).
            except error.TestFail as ex:
                raise hosts.AutoservRepairError(
                      'cannot press sysrq-x: %s.' % str(ex))
            # less than 5 seconds between presses.
            time.sleep(2.0)

        if host.wait_up(host.BOOT_TIMEOUT):
            # Collect logs once we regain ssh access before clobbering them.
            local_log_dir = crashcollect.get_crashinfo_dir(host, 'after_sysrq')
            host.collect_logs('/var/log', local_log_dir, ignore_errors=True)
            # Collect crash info.
            crashcollect.get_crashinfo(host, None)
            return
        raise hosts.AutoservRepairError(
                '%s is still offline after sysrq-x.' % host.hostname)

    @property
    def description(self):
        return 'Reset the DUT via keyboard sysrq-x'
362
363
class ServoResetRepair(hosts.RepairAction):
    """Repair a Chrome device by resetting it with servo."""

    def repair(self, host):
        """Power-cycle the DUT through servo and wait for it to boot."""
        if not host.servo:
            raise hosts.AutoservRepairError(
                    '%s has no servo support.' % host.hostname)
        host.servo.get_power_state_controller().reset()
        if not host.wait_up(host.BOOT_TIMEOUT):
            raise hosts.AutoservRepairError(
                    '%s is still offline after servo reset.' % host.hostname)
        # The DUT came back up: collect its logs and crash info before
        # anything else clobbers them.
        crash_dir = crashcollect.get_crashinfo_dir(host, 'after_reset')
        host.collect_logs('/var/log', crash_dir, ignore_errors=True)
        crashcollect.get_crashinfo(host, None)

    @property
    def description(self):
        return 'Reset the DUT via servo'
385
386
class CrosRebootRepair(repair.RebootRepair):
    """Repair a CrOS target by clearing dev mode and rebooting it."""

    def repair(self, host):
        """Clear GBB flags (best effort), then reboot the host."""
        # The reboot must happen whether or not set_gbb_flags succeeds,
        # hence ignore_status on the flag-clearing command.
        host.run('/usr/share/vboot/bin/set_gbb_flags.sh 0',
                 ignore_status=True)
        super(CrosRebootRepair, self).repair(host)

    @property
    def description(self):
        return 'Reset GBB flags and Reboot the host'
400
401
class AutoUpdateRepair(hosts.RepairAction):
    """
    Repair by re-installing a test image using autoupdate.

    Attempts to install the DUT's designated "stable test image"
    through the standard autoupdate-based installation flow.
    """

    def repair(self, host):
        """Install the stable image on `host` via autoupdate."""
        afe_utils.machine_install_and_update_labels(host, repair=True)

    @property
    def description(self):
        return 'Re-install the stable build via AU'
416
417
class PowerWashRepair(AutoUpdateRepair):
    """
    Powerwash the DUT, then re-install using autoupdate.

    Performs a powerwash on the DUT, then attempts the same stable
    test image installation as `AutoUpdateRepair`.
    """

    def repair(self, host):
        """Trigger a powerwash reboot, then run the AU repair."""
        # Writing this marker makes the next boot run a "fast safe"
        # powerwash of the stateful partition.
        host.run('echo "fast safe" > '
                 '/mnt/stateful_partition/factory_install_reset')
        host.reboot(timeout=host.POWERWASH_BOOT_TIMEOUT, wait=True)
        super(PowerWashRepair, self).repair(host)

    @property
    def description(self):
        return 'Powerwash and then re-install the stable build via AU'
435
436
class ServoInstallRepair(hosts.RepairAction):
    """
    Reinstall a test image from USB using servo.

    Uses servo to install the DUT's designated "stable test image"
    from servo-attached USB storage.
    """

    def repair(self, host):
        """Stage the stable image onto the servo USB key and install it."""
        if not host.servo:
            raise hosts.AutoservRepairError(
                    '%s has no servo support.' % host.hostname)
        staged_image = host.stage_image_for_servo()
        host.servo_install(staged_image)

    @property
    def description(self):
        return 'Reinstall from USB using servo'
454
455
class JetstreamRepair(hosts.RepairAction):
    """Repair by restarting Jetstream services."""

    def repair(self, host):
        """Restart Jetstream services via the host's service cleanup."""
        host.cleanup_services()

    @property
    def description(self):
        return 'Restart Jetstream services'
465
466
def _cros_verify_dag():
    """Return the verification DAG for a `CrosHost`."""
    FirmwareStatusVerifier = cros_firmware.FirmwareStatusVerifier
    FirmwareVersionVerifier = cros_firmware.FirmwareVersionVerifier
    # Every check other than 'ssh' requires the DUT to be reachable.
    needs_ssh = ('ssh',)
    return (
        (repair.SshVerifier,         'ssh',      ()),
        (DevModeVerifier,            'devmode',  needs_ssh),
        (ACPowerVerifier,            'power',    needs_ssh),
        (EXT4fsErrorVerifier,        'ext4',     needs_ssh),
        (WritableVerifier,           'writable', needs_ssh),
        (TPMStatusVerifier,          'tpm',      needs_ssh),
        (UpdateSuccessVerifier,      'good_au',  needs_ssh),
        (FirmwareStatusVerifier,     'fwstatus', needs_ssh),
        (FirmwareVersionVerifier,    'rwfw',     needs_ssh),
        (PythonVerifier,             'python',   needs_ssh),
        (repair.LegacyHostVerifier,  'cros',     needs_ssh),
    )
485
486
def _cros_basic_repair_actions():
    """Return the basic repair actions for a `CrosHost`"""
    FirmwareRepair = cros_firmware.FirmwareRepair
    return (
        # RPM cycling has to come before Servo reset:  a DUT with a
        # dead battery needs AC power reattached before a servo reset
        # can help.
        (repair.RPMCycleRepair, 'rpm', (), ('ssh', 'power',)),
        (ServoSysRqRepair, 'sysrq', (), ('ssh',)),
        (ServoResetRepair, 'servoreset', (), ('ssh',)),

        # N.B. FirmwareRepair can't fix a 'good_au' failure directly,
        # since it doesn't remove the flag file that triggers that
        # failure.  It's still listed as a repair trigger because the
        # last update may have failed because of the firmware, and the
        # repair steps below need to be able to trust the firmware.
        (FirmwareRepair, 'firmware', (), ('ssh', 'fwstatus', 'good_au',)),

        (CrosRebootRepair, 'reboot', ('ssh',), ('devmode', 'writable',)),
    )
508
509
def _cros_extended_repair_actions(au_triggers=_CROS_AU_TRIGGERS,
                                  powerwash_triggers=_CROS_POWERWASH_TRIGGERS,
                                  usb_triggers=_CROS_USB_TRIGGERS):
    """Return the extended repair actions for a `CrosHost`"""

    # The 'au', 'powerwash', and 'usb' actions form a ladder: each one
    # can repair progressively more verifiers than the one before, and
    # each depends on the verifiers the stronger actions repair.
    au_deps = usb_triggers + powerwash_triggers
    powerwash_fixes = powerwash_triggers + au_triggers
    usb_fixes = usb_triggers + powerwash_fixes
    return (
        (AutoUpdateRepair, 'au', au_deps, au_triggers),
        (PowerWashRepair, 'powerwash', usb_triggers, powerwash_fixes),
        (ServoInstallRepair, 'usb', (), usb_fixes),
    )
529
530
def _cros_repair_actions():
    """Return the repair actions for a `CrosHost`."""
    # Basic actions first: they're cheaper and less destructive than
    # the extended (reinstall) actions.
    return _cros_basic_repair_actions() + _cros_extended_repair_actions()
536
537
def create_cros_repair_strategy():
    """Return a `RepairStrategy` for a `CrosHost`."""
    return hosts.RepairStrategy(_cros_verify_dag(), _cros_repair_actions())
543
544
def _moblab_verify_dag():
    """Return the verification DAG for a `MoblabHost`."""
    FirmwareVersionVerifier = cros_firmware.FirmwareVersionVerifier
    # As with the CrOS DAG, everything but 'ssh' depends on 'ssh'.
    needs_ssh = ('ssh',)
    return (
        (repair.SshVerifier,         'ssh',     ()),
        (ACPowerVerifier,            'power',   needs_ssh),
        (FirmwareVersionVerifier,    'rwfw',    needs_ssh),
        (PythonVerifier,             'python',  needs_ssh),
        (repair.LegacyHostVerifier,  'cros',    needs_ssh),
    )
556
557
def _moblab_repair_actions():
    """Return the repair actions for a `MoblabHost`."""
    return (
        (repair.RPMCycleRepair, 'rpm', (), ('ssh', 'power',)),
        (AutoUpdateRepair, 'au', ('ssh',), _CROS_AU_TRIGGERS),
    )
565
566
def create_moblab_repair_strategy():
    """
    Return a `RepairStrategy` for a `MoblabHost`.

    Moblab uses a subset of the CrOS verify and repair steps.  Several
    pieces are dropped because they're not expected to be meaningful;
    a few are dropped for more specific reasons:

    'tpm':  Moblab DUTs don't run the tests that matter to this
        verifier.  TODO(jrbarnette)  This assertion is unproven.

    'good_au':  This verifier can't pass, because the Moblab AU
        procedure doesn't properly delete CrosHost.PROVISION_FAILED.
        TODO(jrbarnette) We should refactor _machine_install() so that
        it can be different for Moblab.

    'firmware':  Moblab DUTs shouldn't be in FAFT pools, so we don't try
        this.

    'powerwash':  Powerwash on Moblab causes trouble with deleting the
        DHCP leases file, so we skip it.
    """
    return hosts.RepairStrategy(_moblab_verify_dag(),
                                _moblab_repair_actions())
592
593
def _jetstream_repair_actions():
    """Return the repair actions for a `JetstreamHost`."""
    # Jetstream adds its service check to the AU triggers, and slots a
    # service-restart repair between the basic and extended actions.
    au_triggers = _CROS_AU_TRIGGERS + ('jetstream_services',)
    jetstream_action = (
        (JetstreamRepair, 'jetstream_repair',
         _CROS_USB_TRIGGERS + _CROS_POWERWASH_TRIGGERS, au_triggers),
    )
    return (_cros_basic_repair_actions() + jetstream_action +
            _cros_extended_repair_actions(au_triggers=au_triggers))
605
606
def _jetstream_verify_dag():
    """Return the verification DAG for a `JetstreamHost`."""
    jetstream_verifiers = (
        (JetstreamServicesVerifier, 'jetstream_services', ('ssh',)),
    )
    return _cros_verify_dag() + jetstream_verifiers
613
614
def create_jetstream_repair_strategy():
    """
    Return a `RepairStrategy` for a `JetstreamHost`.

    The Jetstream strategy is the CrOS verify and repair, extended
    with the JetstreamServicesVerifier and its repair action.
    """
    return hosts.RepairStrategy(_jetstream_verify_dag(),
                                _jetstream_repair_actions())
625