• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Lint as: python2, python3
2# Copyright 2016 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""
7Repair actions and verifiers relating to CrOS firmware.
8
This contains the repair actions and verifiers needed to find problems
with the firmware installed on ChromeOS DUTs, and when necessary, to
fix problems by updating or re-installing the firmware.
12
13The operations in the module support two distinct use cases:
14  * DUTs used for FAFT tests can in some cases have problems with
15    corrupted firmware.  The module supplies `FirmwareStatusVerifier`
16    to check for corruption, and supplies `FaftFirmwareRepair` to
17    re-install firmware of current faft stable_version via servo
18    when needed.
  * DUTs used for general testing normally should be running a
    designated "stable" firmware version.  This module supplies
    `FirmwareVersionVerifier` to detect and automatically update
    firmware that is out-of-date from the designated version.  This
    module also supplies `GeneralFirmwareRepair` to re-install, via
    servo, the firmware tied to the current stable_version image when
    needed.
25
26For purposes of the operations in the module, we distinguish three kinds
27of DUT, based on pool assignments:
28  * DUTs used for general testing.  These DUTs automatically check for
29    and install the stable firmware using `FirmwareVersionVerifier`.
  * DUTs in pools used for FAFT testing.  These check for bad firmware
    builds with `FirmwareStatusVerifier`, and will fix problems using
    `FaftFirmwareRepair`.  These DUTs don't check for or install the
    stable firmware.
34  * DUTs not in general pools, and not used for FAFT.  These DUTs
35    are expected to be managed by separate processes and are excluded
36    from all of the verification and repair code in this module.
37"""
38
39# pylint: disable=missing-docstring
40
41from __future__ import absolute_import
42from __future__ import division
43from __future__ import print_function
44
45import json
46import logging
47
48import common
49from autotest_lib.client.common_lib import global_config
50from autotest_lib.client.common_lib import hosts
51from autotest_lib.server import afe_utils
52from autotest_lib.server.hosts import repair_utils
53from autotest_lib.server.hosts import cros_constants
54
55from autotest_lib.utils.frozen_chromite.lib import timeout_util
56import six
57
58
# _FIRMWARE_REPAIR_POOLS - The set of pools that should be
# managed by `FirmwareStatusVerifier` and `FaftFirmwareRepair`.
#
# The value comes from the comma-separated `pools_support_firmware_repair`
# option in the `CROS` section of the global config.  Whitespace around
# each pool name is stripped and empty entries are dropped, so a value
# such as 'pool_a, pool_b' (or an empty option) can't yield names like
# ' pool_b' or '' that would never match a real pool.
#
_FIRMWARE_REPAIR_POOLS = {
        pool.strip()
        for pool in global_config.global_config.get_config_value(
                'CROS',
                'pools_support_firmware_repair',
                type=str).split(',')
        if pool.strip()
}
67
68
def _is_firmware_testing_device(host):
    """
    Check if a host is dedicated for firmware testing.

    A host is treated as a firmware testing device when at least one of
    the pools recorded in its host info is in `_FIRMWARE_REPAIR_POOLS`.
    Such a DUT should be managed by `FirmwareStatusVerifier` and
    `FaftFirmwareRepair`, but not by `FirmwareVersionVerifier` and
    `GeneralFirmwareRepair`.

    @param host   The host to be checked; its `host_info_store` is read.
    @return True if the host should use `FirmwareStatusVerifier`
            and `FaftFirmwareRepair`; False otherwise.
    """
    host_pools = host.host_info_store.get().pools
    shared_pools = host_pools & _FIRMWARE_REPAIR_POOLS
    return bool(shared_pools)
82
83
def _is_firmware_update_supported(host):
    """
    Tell whether a DUT should be kept on the standard stable firmware.

    DUTs used for general testing in the lab (e.g. the `bvt` pool) have
    their firmware kept up-to-date by `FirmwareVersionVerifier`.  Pools
    dedicated to firmware testing follow a different policy, so a DUT
    is eligible for the standard update exactly when it is not a
    firmware testing device.

    @param host   The host to be checked for update policy.
    @return True if the host should use `FirmwareVersionVerifier`;
            False otherwise.
    """
    managed_as_faft_device = _is_firmware_testing_device(host)
    return not managed_as_faft_device
100
101
def _get_available_firmware(host, model):
    """Get the available RW firmware version given the model.

    Runs `chromeos-firmwareupdate --manifest` on the host and looks up
    `<key>.host.versions.rw` in the JSON it prints.  `<key>` is `model`
    when the manifest has more than one top-level entry; otherwise the
    manifest's only top-level key is used, whatever `model` is.

    Any unexpected output (non-zero exit status, output that isn't valid
    JSON, or a manifest that doesn't have the expected nesting) yields
    `None` rather than an exception, so that callers can treat it as
    "version can't be determined".

    @param host     The host to get available firmware for.
    @param model    The model name to get corresponding firmware version.
    @return The available RW firmware version if found, else, None.
    """
    result = host.run('chromeos-firmwareupdate --manifest', ignore_status=True)
    if result.exit_status != 0:
        return None

    try:
        manifest = json.loads(result.stdout) or {}
    except (TypeError, ValueError) as e:
        logging.error('Failed to parse firmware manifest: %s', e)
        return None
    if not isinstance(manifest, dict):
        return None

    # The manifest is a JSON in .model.host.versions.rw
    key = model if len(manifest) > 1 else next(iter(manifest), '')
    node = manifest
    # Walk the path as explicit keys (not a '.'-joined string), so a
    # model name containing '.' is still looked up as a single key.
    for name in (key, 'host', 'versions', 'rw'):
        if not isinstance(node, dict):
            return None
        node = node.get(name, {})
    return node or None
121
122
class FirmwareStatusVerifier(hosts.Verifier):
    """
    Verify that a host's firmware is in a good state.

    DUTs that run firmware tests can end up with corrupted firmware.
    For those DUTs this verifier dumps the A and B firmware sections
    from flash and checks them with `vbutil_firmware`; a failed check
    means the firmware should be re-flashed using servo.  DUTs that are
    not firmware testing devices pass without any check.
    """

    @timeout_util.TimeoutDecorator(cros_constants.VERIFY_TIMEOUT_SEC)
    def verify(self, host):
        """
        Check both firmware slots of a firmware testing DUT.

        @param host  The host whose firmware is to be checked.
        @raises hosts.AutoservVerifyError if `vbutil_firmware` reports
                a failure for slot A or slot B.
        """
        if not _is_firmware_testing_device(host):
            return

        # Reads the AP firmware and dumps the sections that we're
        # interested in into a scratch directory on the DUT.
        dump_cmd = ('mkdir /tmp/verify_firmware; '
                    'cd /tmp/verify_firmware; '
                    'for section in VBLOCK_A VBLOCK_B FW_MAIN_A FW_MAIN_B; '
                    'do flashrom -p host -r -i $section:$section; '
                    'done')
        # Template that verifies one firmware block; both placeholders
        # take the slot letter.
        check_cmd = ('vbutil_firmware --verify /tmp/verify_firmware/VBLOCK_%c'
                     ' --signpubkey /usr/share/vboot/devkeys/root_key.vbpubk'
                     ' --fv /tmp/verify_firmware/FW_MAIN_%c')
        try:
            host.run(dump_cmd)
            for slot in 'AB':
                outcome = host.run(check_cmd % (slot, slot),
                                   ignore_status=True)
                if outcome.exit_status:
                    raise hosts.AutoservVerifyError(
                            'Firmware %c is in a bad state.' % slot)
        finally:
            # The scratch directory is removed on success and failure.
            host.run('rm -rf /tmp/verify_firmware')

    @property
    def description(self):
        return 'Firmware on this DUT is clean'
162
163
class FirmwareRepair(hosts.RepairAction):
    """
    Reinstall the firmware image using servo.

    This class holds the helpers shared by the firmware repair actions
    in this module: looking up the build to install, and the two ways
    of installing firmware via servo (`_run_faft_repair` and
    `_run_general_repair`).  `FaftFirmwareRepair` and
    `GeneralFirmwareRepair` subclass it and decide which helpers to use.
    """

    def _get_faft_stable_build(self, host):
        """Return the stable FAFT version looked up from the host info."""
        info = host.host_info_store.get()
        return afe_utils.get_stable_faft_version_v2(info)

    def _get_os_stable_build(self, host):
        """Return the name of the host's current stable OS repair image."""
        # Use firmware in current stable os build.
        return host.get_cros_repair_image_name()

    def _run_faft_repair(self, host, build):
        """Install firmware from `build` using `host.firmware_install`."""
        host.firmware_install(build)

    def _run_general_repair(self, host, build):
        """
        Flash the firmware shipped in the OS image on the servo usbkey.

        @param host   The host to be repaired.
        @param build  The OS build expected to be preloaded on the usbkey.
        @raises hosts.AutoservRepairError if the usbkey doesn't hold
                `build`, or if a signed bios image variant exists while
                the DUT phase is EVT, DVT or empty.
        """
        # As GeneralFirmwareRepair is the last repair action, we expect
        # stable_version os image is loaded on usbkey during other repair
        # action runs. And there is also no point to repeat and waste time if
        # download image to usbkey failed in other repair actions.
        if host._servo_host.validate_image_usbkey() != build:
            raise hosts.AutoservRepairError('%s is expected to be preloaded,'
                      ' however it\'s not found on the usbkey' % build,
                      'image not loaded on usbkey')
        ec_image, bios_image = host._servo_host.prepare_repair_firmware_image()

        # For early-phase (EVT/DVT) devices, or devices whose phase is
        # missing, we skip this repair when a signed variant exists, as
        # it's hard to decide which image to use if the DUT does not boot.
        info = host.host_info_store.get()
        phase = info.get_label_value('phase')
        # bios_image may be empty (see the `if bios_image:` check below),
        # so only look for the 'signed' marker when there is an image.
        if (bios_image and 'signed' in bios_image
                    and phase.lower() in ('evt', 'dvt', '')):
            raise hosts.AutoservRepairError(
                    'Could not determine which firmware image to use'
                    ' due to signed firmware image variant exists but'
                    ' DUT phase is earlier than PVT or missing; Phase'
                    ' from inventory: %s' % phase,
                    'Can not determine variant for EVT device')

        # Before flashing firmware, record the build in the health profile.
        if host.health_profile:
            host.health_profile.set_firmware_stable_version(build)

        if ec_image:
            logging.info('Attempting to flash ec firmware...')
            host.servo.program_ec(ec_image, copy_image=False)
        if bios_image:
            logging.info('Attempting to flash bios firmware...')
            host._servo_host.flash_ap_firmware_via_servo(bios_image)

        logging.info('Cold resetting DUT through servo...')
        host.servo.get_power_state_controller().reset()
        host.wait_up(timeout=host.BOOT_TIMEOUT)
        # flash firmware via servo will turn DUT into dev mode, so disable
        # dev mode and reset gbb flag here.
        host.run('/usr/share/vboot/bin/set_gbb_flags.sh 0', ignore_status=True)
        host.run('crossystem disable_dev_request=1', ignore_status=True)
        host.reboot()
227
228
class FaftFirmwareRepair(FirmwareRepair):
    """
    Reinstall the firmware for DUTs in faft related pool.

    The faft stable_version is preferred; when there is none, the
    firmware from the OS stable_version image is flashed instead.
    """

    def repair(self, host):
        """
        Re-install firmware on `host` via servo.

        @param host  The host to be repaired.
        @raises hosts.AutoservRepairError if neither a faft nor an OS
                stable_version can be found.
        """
        repair_utils.require_servo(host, ignore_state=True)

        faft_build = self._get_faft_stable_build(host)
        if faft_build:
            self._run_faft_repair(host, faft_build)
            return

        logging.info('Cannot find faft stable_version, falling back to'
                     ' use firmware on OS stable_version.')
        os_build = self._get_os_stable_build(host)
        if os_build:
            self._run_general_repair(host, os_build)
            return
        raise hosts.AutoservRepairError(
                'Failed to find stable_version from host_info.',
                'cannot find stable_version')

    def _is_applicable(self, host):
        # Only DUTs in the firmware testing pools use this action.
        return _is_firmware_testing_device(host)

    @property
    def description(self):
        return 'Re-install the stable firmware(faft) via servo'
255
256
class GeneralFirmwareRepair(FirmwareRepair):
    """Reinstall the firmware for non-faft DUTs.

    We need a different RepairAction for non firmware testing DUTs
    because we only want to try re-installing firmware if all other
    RepairActions could not restore ssh capability to the DUT.
    """

    def repair(self, host):
        """
        Flash the firmware from the OS stable_version image via servo.

        @param host  The host to be repaired.
        @raises hosts.AutoservRepairError if no stable_version is found.
        """
        repair_utils.require_servo(host, ignore_state=True)
        build = self._get_os_stable_build(host)
        if not build:
            raise hosts.AutoservRepairError(
                    'Failed to find stable_version from host_info.',
                    'cannot find stable_version')
        self._run_general_repair(host, build)

    def _is_applicable(self, host):
        """
        Decide whether flashing firmware via servo should be attempted.

        The action applies only when all of the following hold:
          * the host is not a firmware testing device;
          * `host.servo` is available;
          * a device health profile is available;
          * the profile's repair fail count is at least 2;
          * the candidate build was not already flashed successfully by
            this action, and has not failed more than twice.

        @param host  The host being considered for repair.
        @return True if the action should run; False otherwise.
        """
        if _is_firmware_testing_device(host):
            return False
        if not host.servo:
            logging.info(
                    'The current servo state of %s is not met the'
                    ' minimum requirement to flash firmware.', host.hostname)
            # Flashing needs `host.servo`; without it the action is not
            # applicable, rather than something to attempt and fail.
            return False
        # Flashing firmware via servo is considered an expensive operation,
        # so we check repair data from previous repairs to determine if
        # firmware repair is needed.
        dhp = host.health_profile
        if not dhp:
            logging.info('Device health profile is not available, cannot'
                         ' determine if firmware repair is needed.')
            return False
        repair_fail_count = dhp.get_repair_fail_count()
        if repair_fail_count < 2:
            # We want to start with a more conservative strategy, so only try
            # this action on DUTs that failed repair at least twice.
            # @TODO(xianuowang@) adjust or remove this threshold.
            logging.info(
                    'Firmware repair will only applies to DUT that'
                    ' failed at least two AdminRepair, current fail'
                    ' count: %s', repair_fail_count)
            return False
        flashed_build = dhp.get_firmware_stable_version()
        candidate_build = self._get_os_stable_build(host)
        # If we had a successful firmware flash in this repair loop,
        # there is no need to retry flashing the same firmware build.
        if (dhp.get_succeed_repair_action(self.tag) > 0
                    and flashed_build == candidate_build):
            logging.info(
                    'Firmware from %s has been already installed on %s,'
                    ' no need to retry.', flashed_build, host.hostname)
            return False
        if (dhp.get_failed_repair_action(self.tag) > 2
                    and flashed_build == candidate_build):
            logging.info(
                    'Firmware from %s has been attempted and failed 3 '
                    'times, no need to retry.', flashed_build)
            return False
        return True

    @property
    def description(self):
        return 'Re-install the stable firmware(non-faft) via servo'
319
320
class FirmwareVersionVerifier(hosts.Verifier):
    """
    Check for a firmware update, and apply it if appropriate.

    This verifier checks to ensure that either the firmware on the DUT
    is up-to-date, or that the target firmware can be installed from the
    currently running build.

    Failure occurs when all of the following apply:
     1. The DUT is not excluded from updates.  For example, DUTs used
        for FAFT testing use `FaftFirmwareRepair` instead.
     2. The DUT's board has an assigned stable firmware version.
     3. The DUT is not running the assigned stable firmware.
     4. The firmware supplied in the running OS build is not the
        assigned stable firmware.

    If the DUT needs an upgrade and the currently running OS build
    supplies the necessary firmware, the verifier installs the new
    firmware using `chromeos-firmwareupdate`.  Failure to install will
    cause the verifier to fail.

    This verifier nominally breaks the rule that "verifiers must succeed
    quickly", since it can invoke `reboot()` during the success code
    path.  We're doing it anyway for two reasons:
      * The time between updates will typically be measured in months,
        so the amortized cost is low.
      * The reason we distinguish repair from verify is to allow
        rescheduling work immediately while the expensive repair happens
        out-of-band.  But a firmware update will likely hit all DUTs at
        once, so it's pointless to pass the buck to repair.

    N.B. This verifier is a trigger for all repair actions that install
    the stable repair image.  If the firmware is out-of-date, but the
    stable repair image does *not* contain the proper firmware version,
    _the target DUT will fail repair, and will be unable to fix itself_.
    """

    @staticmethod
    def _get_rw_firmware(host):
        """
        Return the firmware id reported by `crossystem fwid`.

        Surrounding whitespace is stripped, so the value can be compared
        for equality with the stable version even if the command output
        carries a trailing newline.

        @param host  The host to query.
        @return The firmware id string, or None if the command failed.
        """
        result = host.run('crossystem fwid', ignore_status=True)
        if result.exit_status == 0:
            return result.stdout.strip()
        else:
            return None

    @staticmethod
    def _check_hardware_match(version_a, version_b):
        """
        Check that two firmware versions identify the same hardware.

        Firmware version strings look like this:
            Google_Gnawty.5216.239.34
        The part before the numbers identifies the hardware for which
        the firmware was built.  This function checks that the hardware
        identified by `version_a` and `version_b` is the same.

        This is a confidence check to protect us from installing the wrong
        firmware on a DUT when a board label has somehow gone astray.

        @param version_a  First firmware version for the comparison.
        @param version_b  Second firmware version for the comparison.
        @raises hosts.AutoservVerifyError if the hardware parts differ.
        """
        hardware_a = version_a.split('.')[0]
        hardware_b = version_b.split('.')[0]
        if hardware_a != hardware_b:
            message = 'Hardware/Firmware mismatch updating %s to %s'
            raise hosts.AutoservVerifyError(
                    message % (version_a, version_b))

    def _is_stable_image_installed(self, host):
        """Verify that ChromeOS image on host is a stable version.

        This check verifies that the device booted from the stable image,
        to protect us from installing the firmware from a
        bad/broken/untested image.  A bad image can have a broken updater
        or corrupted firmware.

        The representation version looks like:
                nocturne-release/R89-13728.0.0
        The version reported by the host is compared to the version
        provided as the stable image by the host-info file.

        Despite the predicate-style name, nothing is returned; a
        mismatch is reported by raising.

        @param host  CrosHost instance.
        @raises hosts.AutoservNonCriticalVerifyError if the versions
                differ.
        """
        os_from_host = host.get_release_builder_path()
        os_from_host_info = host.get_cros_repair_image_name()
        if os_from_host != os_from_host_info:
            raise hosts.AutoservNonCriticalVerifyError(
                    'Firmware update can be run only from stable image.'
                    ' Expected version:"%s", actually: "%s"' %
                    (os_from_host_info, os_from_host))

    @timeout_util.TimeoutDecorator(cros_constants.VERIFY_TIMEOUT_SEC)
    def verify(self, host):
        """
        Check the DUT's firmware version, updating it when possible.

        @param host  The host to be verified.
        @raises hosts.AutoservVerifyError if the model is unknown, if
                the needed firmware isn't supplied by the running OS
                build, or if the update fails.
        @raises hosts.AutoservNonCriticalVerifyError if the DUT isn't
                running the stable OS image.
        """
        # Test 1 - The DUT is not excluded from updates.
        if not _is_firmware_update_supported(host):
            return
        # Test 2 - The DUT has an assigned stable firmware version.
        info = host.host_info_store.get()
        if info.model is None:
            raise hosts.AutoservVerifyError(
                    'Can not verify firmware version. '
                    'No model label value found')

        stable_firmware = None
        try:
            stable_firmware = afe_utils.get_stable_firmware_version_v2(info)
        except Exception as e:
            # A failed lookup is treated the same as "no version assigned".
            logging.exception('Failed lookup to AFE for stable fw version '
                              ' with exception: %s', e)

        if stable_firmware is None:
            logging.debug('Expected FW version not found')
            # This DUT doesn't have a firmware update target
            return
        logging.debug('Expected FW version: %s', stable_firmware)
        # For tests 3 and 4:  If the output from `crossystem` or
        # `chromeos-firmwareupdate` isn't what we expect, we log an
        # error, but don't fail:  We don't want DUTs unable to test a
        # build merely because of a bug or change in either of those
        # commands.

        # Test 3 - The DUT is not running the target stable firmware.
        current_firmware = self._get_rw_firmware(host)
        if current_firmware is None:
            logging.error('DUT firmware version can\'t be determined.')
            return
        logging.debug('Current FW version: %s', current_firmware)
        if current_firmware == stable_firmware:
            return
        # Test 4 - The firmware supplied in the running OS build is not
        # the assigned stable firmware.
        available_firmware = _get_available_firmware(host, info.model)
        if available_firmware is None:
            logging.error('Supplied firmware version in OS can\'t be '
                          'determined.')
            return
        # Raises if the DUT isn't booted from the stable OS image.
        self._is_stable_image_installed(host)
        if available_firmware != stable_firmware:
            raise hosts.AutoservVerifyError(
                    'DUT firmware requires update from %s to %s' %
                    (current_firmware, stable_firmware))
        # Time to update the firmware.
        logging.info('Updating firmware from %s to %s',
                     current_firmware, stable_firmware)
        self._check_hardware_match(current_firmware, stable_firmware)
        try:
            host.run('chromeos-firmwareupdate --mode=autoupdate')
            host.reboot()
        except Exception:
            message = ('chromeos-firmwareupdate failed: from '
                       '%s to %s')
            logging.exception(message, current_firmware, stable_firmware)
            raise hosts.AutoservVerifyError(
                    message % (current_firmware, stable_firmware))
        # Confirm that the DUT really came back on the target firmware.
        final_firmware = self._get_rw_firmware(host)
        if final_firmware != stable_firmware:
            message = ('chromeos-firmwareupdate failed: tried upgrade '
                       'to %s, now running %s instead')
            raise hosts.AutoservVerifyError(
                    message % (stable_firmware, final_firmware))

    @property
    def description(self):
        return 'The firmware on this DUT is up-to-date'
485