• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import glob
6import logging
7import os
8import re
9import urllib2
10import urlparse
11
12from autotest_lib.client.bin import utils
13from autotest_lib.client.common_lib import error, global_config
14from autotest_lib.client.common_lib.cros import dev_server
15from autotest_lib.server import autotest
16from autotest_lib.server import utils as server_utils
17from autotest_lib.server.cros.dynamic_suite import constants as ds_constants
18from autotest_lib.server.cros.dynamic_suite import tools
19from chromite.lib import retry_util
20
21try:
22    from chromite.lib import metrics
23except ImportError:
24    metrics = utils.metrics_mock
25
26
27def _metric_name(base_name):
28    return 'chromeos/autotest/provision/' + base_name
29
30
# Update engine states, as reported on the CURRENT_OP line of
# `update_engine_client -status` (see
# `ChromiumOSUpdater.check_update_status()`).
UPDATER_IDLE = 'UPDATE_STATUS_IDLE'
UPDATER_NEED_REBOOT = 'UPDATE_STATUS_UPDATED_NEED_REBOOT'
# A list of update engine client states that occur after an update is triggered.
UPDATER_PROCESSING_UPDATE = ['UPDATE_STATUS_CHECKING_FORUPDATE',
                             'UPDATE_STATUS_UPDATE_AVAILABLE',
                             'UPDATE_STATUS_DOWNLOADING',
                             'UPDATE_STATUS_FINALIZING']


# Names of helper scripts.  `stateful_update` is looked up under
# /usr/local/bin on the DUT, or fetched from the devserver's static
# directory (see `ChromiumOSUpdater._get_remote_script()`).
_STATEFUL_UPDATE_SCRIPT = 'stateful_update'
_QUICK_PROVISION_SCRIPT = 'quick-provision'

# Path of the update engine client binary on the DUT.
_UPDATER_BIN = '/usr/bin/update_engine_client'
# NOTE(review): presumably the DUT logs collected after a failed
# update; no use of this list is visible in this part of the file.
_UPDATER_LOGS = ['/var/log/messages', '/var/log/update_engine']

# Partition numbers of the two boot slots:  'kernel' is the index
# passed to `cgpt show -i`, and 'root' is compared against the
# trailing number of the device reported by `rootdev -s`.
_KERNEL_A = {'name': 'KERN-A', 'kernel': 2, 'root': 3}
_KERNEL_B = {'name': 'KERN-B', 'kernel': 4, 'root': 5}

# Time to wait for new kernel to be marked successful after
# auto update.
_KERNEL_UPDATE_TIMEOUT = 120
53
54
# PROVISION_FAILED - A flag file to indicate provision failures.  The
# file is created at the start of any AU procedure (see
# `ChromiumOSUpdater._prepare_host()`).  The file's location in
# stateful means that on successful update it will be removed.  Thus, if
# this file exists, it indicates that we've tried and failed in a
# previous attempt to update.
PROVISION_FAILED = '/var/tmp/provision_failed'


# A flag file used to enable special handling in lab DUTs.  Some
# parts of the system in Chromium OS test images will behave in ways
# convenient to the test lab when this file is present.  Generally,
# we create this immediately after any update completes.
_LAB_MACHINE_FILE = '/mnt/stateful_partition/.labmachine'


# _TARGET_VERSION - A file containing the new version to which we plan
# to update.  This file is used by the CrOS shutdown code to detect and
# handle certain version downgrade cases.  Specifically:  Downgrading
# may trigger an unwanted powerwash in the target build when the
# following conditions are met:
#  * Source build is a v4.4 kernel with R69-10756.0.0 or later.
#  * Target build predates the R69-10756.0.0 cutoff.
# When this file is present and indicates a downgrade, the OS shutdown
# code on the DUT knows how to prevent the powerwash.
_TARGET_VERSION = '/run/update_target_version'


# _REBOOT_FAILURE_MESSAGE - This is the standard message text returned
# when the Host.reboot() method fails.  The source of this text comes
# from `wait_for_restart()` in client/common_lib/hosts/base_classes.py.
# It is used as a classifier pattern by `HostUpdateError`.

_REBOOT_FAILURE_MESSAGE = 'Host did not return from reboot'
88
89
class RootFSUpdateError(error.TestFail):
    """Raised when the RootFS fails to update.

    Also used in this module for failures to trigger an update, for an
    update engine in an unexpected state, and for rollback failures.
    """
92
93
class StatefulUpdateError(error.TestFail):
    """Raised when the stateful partition fails to update.

    This includes the case where the `stateful_update` script cannot be
    found on, or installed onto, the DUT.
    """
96
97
class _AttributedUpdateError(error.TestFail):
    """Base class for update failures that name a likely culprit.

    Subclasses supply two class attributes:
      * `_SUMMARY`: a fixed string naming the failure category.
      * `_CLASSIFIERS`: a sequence of `(pattern, label)` pairs; each
        pattern is a regular expression matched against the start of
        the failure message.
    """

    def __init__(self, attribution, msg):
        """Initialize the exception.

        @param attribution: text saying what is being blamed.
        @param msg: description of the failure itself; retained so
            that it can be classified later.
        """
        full_text = '%s: %s' % (attribution, msg)
        super(_AttributedUpdateError, self).__init__(full_text)
        self._message = msg

    def _classify(self):
        """Return the label of the first matching classifier, or `None`."""
        labels = (label for pattern, label in self._CLASSIFIERS
                  if re.match(pattern, self._message))
        return next(labels, None)

    @property
    def failure_summary(self):
        """Summarize this error for metrics reporting."""
        label = self._classify()
        if not label:
            return self._SUMMARY
        return '%s: %s' % (self._SUMMARY, label)
120
121
class HostUpdateError(_AttributedUpdateError):
    """Update failure blamed on the DUT itself.

    Raise this when the failure most plausibly stems from a condition
    that already existed on the DUT before the update started, for
    example a hardware problem or a bug in the DUT's software.
    """

    DUT_DOWN = 'No answer to ssh'

    _SUMMARY = 'DUT failed prior to update'
    _CLASSIFIERS = [
        (DUT_DOWN, DUT_DOWN),
        (_REBOOT_FAILURE_MESSAGE, 'Reboot failed'),
    ]

    def __init__(self, hostname, msg):
        """Initialize the exception.

        @param hostname: name of the DUT being blamed.
        @param msg: description of the failure.
        """
        attribution = 'Error on %s prior to update' % hostname
        super(HostUpdateError, self).__init__(attribution, msg)
141
142
class DevServerError(_AttributedUpdateError):
    """Update failure blamed on the devserver.

    Raise this when the devserver that serves the target image is the
    most plausible cause of the failure.
    """

    _SUMMARY = 'Devserver failed prior to update'
    _CLASSIFIERS = []

    def __init__(self, devserver, msg):
        """Initialize the exception.

        @param devserver: the devserver being blamed.
        @param msg: description of the failure.
        """
        attribution = 'Devserver error on %s' % devserver
        super(DevServerError, self).__init__(attribution, msg)
156
157
class ImageInstallError(_AttributedUpdateError):
    """Update failure while installing the image from the devserver.

    Raise this when the DUT fails to download and install the target
    image, and the fault could lie with either the devserver or the
    DUT.
    """

    _SUMMARY = 'Image failed to download and install'
    _CLASSIFIERS = []

    def __init__(self, hostname, devserver, msg):
        """Initialize the exception.

        @param hostname: name of the DUT receiving the image.
        @param devserver: the devserver providing the image.
        @param msg: description of the failure.
        """
        attribution = ('Download and install failed from %s onto %s'
                       % (devserver, hostname))
        super(ImageInstallError, self).__init__(attribution, msg)
173
174
class NewBuildUpdateError(_AttributedUpdateError):
    """Failure updating a DUT attributable to the target build.

    This class of exception should be raised when updating to a new
    build fails, and the most likely cause of the failure is a bug in
    the newly installed target build.
    """

    # Standard failure messages, meant to be passed as the `msg`
    # argument when raising this exception.
    CHROME_FAILURE = 'Chrome failed to reach login screen'
    UPDATE_ENGINE_FAILURE = ('update-engine failed to call '
                             'chromeos-setgoodkernel')
    ROLLBACK_FAILURE = 'System rolled back to previous build'

    _SUMMARY = 'New build failed'
    # NOTE(review): `failure_summary` is overridden below and never
    # consults these classifiers; they only matter if `_classify()` is
    # called directly.
    _CLASSIFIERS = [
        (CHROME_FAILURE, 'Chrome did not start'),
        (UPDATE_ENGINE_FAILURE, 'update-engine did not start'),
        (ROLLBACK_FAILURE, ROLLBACK_FAILURE),
    ]

    def __init__(self, update_version, msg):
        """Initialize the exception.

        @param update_version: version of the build being blamed.
        @param msg: description of the failure.
        """
        super(NewBuildUpdateError, self).__init__(
            'Failure in build %s' % update_version, msg)

    @property
    def failure_summary(self):
        #pylint: disable=missing-docstring
        # Overrides the base class property: every failure of this type
        # is reported under one fixed string, whatever the message.
        return 'Build failed to work after installing'
203
204
205def _url_to_version(update_url):
206    """Return the version based on update_url.
207
208    @param update_url: url to the image to update to.
209
210    """
211    # The Chrome OS version is generally the last element in the URL. The only
212    # exception is delta update URLs, which are rooted under the version; e.g.,
213    # http://.../update/.../0.14.755.0/au/0.14.754.0. In this case we want to
214    # strip off the au section of the path before reading the version.
215    return re.sub('/au/.*', '',
216                  urlparse.urlparse(update_url).path).split('/')[-1].strip()
217
218
def url_to_image_name(update_url):
    """Extract the image name from an update URL.

    The image name is made of the last two elements of the URL path, so
    that
        http://172.22.50.205:8082/update/lumpy-release/R27-3837.0.0
    yields lumpy-release/R27-3837.0.0

    @param update_url: url to the image to update to.
    @returns a string representing the image name in the update_url.

    """
    path_elements = urlparse.urlparse(update_url).path.split('/')
    return '/'.join(path_elements[-2:])
231
232
def get_update_failure_reason(exception):
    """Translate an update exception into a metrics failure reason.

    `exception` is expected to be one raised by a failure of
    `ChromiumOSUpdater.run_update`.  A falsy `exception` yields `None`;
    otherwise the result is a string describing the failure.

    The set of possible return strings is deliberately small, so that
    the value can be used in Monarch metrics without concern for the
    cardinality of the field.

    @param exception  Exception to be converted to a failure reason.

    @return A string suitable for use in Monarch metrics, or `None`.
    """
    if not exception:
        return None
    if isinstance(exception, _AttributedUpdateError):
        return exception.failure_summary
    return 'Unknown Error: %s' % type(exception).__name__
256
257
def _get_devserver_build_from_update_url(update_url):
    """Get the devserver and build from the update url.

    The `image_url_pattern` value from the `CROS` section of the global
    config is turned into a regular expression by replacing each `%s`
    placeholder with a capturing group of non-whitespace characters.

    @param update_url: The url for update.
        Eg: http://devserver:port/update/build.

    @return: A tuple of the groups captured from `update_url`; for a
        two-placeholder pattern this is (devserver url, build).

    @raises ValueError: If the update_url doesn't match the expected pattern.
    @raises ValueError: If no global_config was found, or it doesn't contain an
        image_url_pattern.
    """
    pattern = global_config.global_config.get_config_value(
            'CROS', 'image_url_pattern', type=str, default='')
    if not pattern:
        raise ValueError('Cannot parse update_url, the global config needs '
                'an image_url_pattern.')
    # Raw string:  `\S` is not a valid escape in an ordinary string
    # literal and only worked by accident.
    re_pattern = pattern.replace('%s', r'(\S+)')
    parts = re.search(re_pattern, update_url)
    if not parts or len(parts.groups()) < 2:
        raise ValueError('%s is not an update url' % update_url)
    return parts.groups()
281
282
283def _list_image_dir_contents(update_url):
284    """Lists the contents of the devserver for a given build/update_url.
285
286    @param update_url: An update url. Eg: http://devserver:port/update/build.
287    """
288    if not update_url:
289        logging.warning('Need update_url to list contents of the devserver.')
290        return
291    error_msg = 'Cannot check contents of devserver, update url %s' % update_url
292    try:
293        devserver_url, build = _get_devserver_build_from_update_url(update_url)
294    except ValueError as e:
295        logging.warning('%s: %s', error_msg, e)
296        return
297    devserver = dev_server.ImageServer(devserver_url)
298    try:
299        devserver.list_image_dir(build)
300    # The devserver will retry on URLError to avoid flaky connections, but will
301    # eventually raise the URLError if it persists. All HTTPErrors get
302    # converted to DevServerExceptions.
303    except (dev_server.DevServerException, urllib2.URLError) as e:
304        logging.warning('%s: %s', error_msg, e)
305
306
def _get_metric_fields(update_url):
    """Compute the metric fields describing an update URL.

    The fields are used when sending autoupdate metrics.  If the build
    name in the URL cannot be parsed, the build-related fields are left
    as empty strings.

    @param update_url  Metrics fields will be calculated from this URL.

    @return A dict with keys `dev_server`, `board`, `build_type` and
            `milestone`.
    """
    image_name = url_to_image_name(update_url)
    try:
        parsed = server_utils.ParseBuildName(image_name)
    except server_utils.ParseBuildNameException:
        logging.warning('Unable to parse build name %s for metrics. '
                        'Continuing anyway.', image_name)
        parsed = ('', '', '', None)
    board, build_type, milestone, _ = parsed
    fields = {'dev_server': dev_server.get_resolved_hostname(update_url)}
    fields['board'] = board
    fields['build_type'] = build_type
    fields['milestone'] = milestone
    return fields
328
329
330# TODO(garnold) This implements shared updater functionality needed for
331# supporting the autoupdate_EndToEnd server-side test. We should probably
332# migrate more of the existing ChromiumOSUpdater functionality to it as we
333# expand non-CrOS support in other tests.
334class ChromiumOSUpdater(object):
335    """Chromium OS specific DUT update functionality."""
336
337    def __init__(self, update_url, host=None, interactive=True,
338                 use_quick_provision=False):
339        """Initializes the object.
340
341        @param update_url: The URL we want the update to use.
342        @param host: A client.common_lib.hosts.Host implementation.
343        @param interactive: Bool whether we are doing an interactive update.
344        @param use_quick_provision: Whether we should attempt to perform
345            the update using the quick-provision script.
346        """
347        self.update_url = update_url
348        self.host = host
349        self.interactive = interactive
350        self.update_version = _url_to_version(update_url)
351        self._use_quick_provision = use_quick_provision
352
353
354    def _run(self, cmd, *args, **kwargs):
355        """Abbreviated form of self.host.run(...)"""
356        return self.host.run(cmd, *args, **kwargs)
357
358
359    def check_update_status(self):
360        """Returns the current update engine state.
361
362        We use the `update_engine_client -status' command and parse the line
363        indicating the update state, e.g. "CURRENT_OP=UPDATE_STATUS_IDLE".
364        """
365        update_status = self.host.run(command='%s -status | grep CURRENT_OP' %
366                                      _UPDATER_BIN)
367        return update_status.stdout.strip().split('=')[-1]
368
369
370    def _rootdev(self, options=''):
371        """Returns the stripped output of rootdev <options>.
372
373        @param options: options to run rootdev.
374
375        """
376        return self._run('rootdev %s' % options).stdout.strip()
377
378
379    def get_kernel_state(self):
380        """Returns the (<active>, <inactive>) kernel state as a pair.
381
382        @raise RootFSUpdateError if the DUT reports a root partition
383                number that isn't one of the known valid values.
384        """
385        active_root = int(re.findall('\d+\Z', self._rootdev('-s'))[0])
386        if active_root == _KERNEL_A['root']:
387            return _KERNEL_A, _KERNEL_B
388        elif active_root == _KERNEL_B['root']:
389            return _KERNEL_B, _KERNEL_A
390        else:
391            raise RootFSUpdateError(
392                    'Encountered unknown root partition: %s' % active_root)
393
394
395    def _cgpt(self, flag, kernel):
396        """Return numeric cgpt value for the specified flag, kernel, device."""
397        return int(self._run('cgpt show -n -i %d %s $(rootdev -s -d)' % (
398            kernel['kernel'], flag)).stdout.strip())
399
400
401    def _get_next_kernel(self):
402        """Return the kernel that has priority for the next boot."""
403        priority_a = self._cgpt('-P', _KERNEL_A)
404        priority_b = self._cgpt('-P', _KERNEL_B)
405        if priority_a > priority_b:
406            return _KERNEL_A
407        else:
408            return _KERNEL_B
409
410
411    def _get_kernel_success(self, kernel):
412        """Return boolean success flag for the specified kernel.
413
        @param kernel: information of the given kernel, either _KERNEL_A
            or _KERNEL_B.
        """
        return self._cgpt('-S', kernel) != 0


    def _get_kernel_tries(self, kernel):
        """Return tries count for the specified kernel.
421        """Return tries count for the specified kernel.
422
423        @param kernel: information of the given kernel, either _KERNEL_A
424            or _KERNEL_B.
425        """
426        return self._cgpt('-T', kernel)
427
428
429    def _get_last_update_error(self):
430        """Get the last autoupdate error code."""
431        command_result = self._run(
432                 '%s --last_attempt_error' % _UPDATER_BIN)
433        return command_result.stdout.strip().replace('\n', ', ')
434
435
436    def _base_update_handler_no_retry(self, run_args):
437        """Base function to handle a remote update ssh call.
438
439        @param run_args: Dictionary of args passed to ssh_host.run function.
440
441        @throws: intercepts and re-throws all exceptions
442        """
443        try:
444            self.host.run(**run_args)
445        except Exception as e:
446            logging.debug('exception in update handler: %s', e)
447            raise e
448
449
450    def _base_update_handler(self, run_args, err_msg_prefix=None):
451        """Handle a remote update ssh call, possibly with retries.
452
453        @param run_args: Dictionary of args passed to ssh_host.run function.
454        @param err_msg_prefix: Prefix of the exception error message.
455        """
456        def exception_handler(e):
457            """Examines exceptions and returns True if the update handler
458            should be retried.
459
460            @param e: the exception intercepted by the retry util.
461            """
462            return (isinstance(e, error.AutoservSSHTimeout) or
463                    (isinstance(e, error.GenericHostRunError) and
464                     hasattr(e, 'description') and
465                     (re.search('ERROR_CODE=37', e.description) or
466                      re.search('generic error .255.', e.description))))
467
468        try:
469            # Try the update twice (arg 2 is max_retry, not including the first
470            # call).  Some exceptions may be caught by the retry handler.
471            retry_util.GenericRetry(exception_handler, 1,
472                                    self._base_update_handler_no_retry,
473                                    run_args)
474        except Exception as e:
475            message = err_msg_prefix + ': ' + str(e)
476            raise RootFSUpdateError(message)
477
478
479    def _wait_for_update_service(self):
480        """Ensure that the update engine daemon is running, possibly
481        by waiting for it a bit in case the DUT just rebooted and the
482        service hasn't started yet.
483        """
484        def handler(e):
485            """Retry exception handler.
486
487            Assumes that the error is due to the update service not having
488            started yet.
489
490            @param e: the exception intercepted by the retry util.
491            """
492            if isinstance(e, error.AutoservRunError):
493                logging.debug('update service check exception: %s\n'
494                              'retrying...', e)
495                return True
496            else:
497                return False
498
499        # Retry at most three times, every 5s.
500        status = retry_util.GenericRetry(handler, 3,
501                                         self.check_update_status,
502                                         sleep=5)
503
504        # Expect the update engine to be idle.
505        if status != UPDATER_IDLE:
506            raise RootFSUpdateError(
507                    'Update engine status is %s (%s was expected).'
508                    % (status, UPDATER_IDLE))
509
510
511    def _reset_update_engine(self):
512        """Resets the host to prepare for a clean update regardless of state."""
513        self._run('stop ui || true')
514        self._run('stop update-engine || true')
515        self._run('start update-engine')
516        self._wait_for_update_service()
517
518
519    def _reset_stateful_partition(self):
520        """Clear any pending stateful update request."""
521        self._run('%s --stateful_change=reset 2>&1'
522                  % self._get_stateful_update_script())
523        self._run('rm -f %s' % _TARGET_VERSION)
524
525
526    def _set_target_version(self):
527        """Set the "target version" for the update."""
528        version_number = self.update_version.split('-')[1]
529        self._run('echo %s > %s' % (version_number, _TARGET_VERSION))
530
531
532    def _revert_boot_partition(self):
533        """Revert the boot partition."""
534        part = self._rootdev('-s')
535        logging.warning('Reverting update; Boot partition will be %s', part)
536        return self._run('/postinst %s 2>&1' % part)
537
538
539    def _verify_kernel_state(self):
540        """Verify that the next kernel to boot is correct for update.
541
542        This tests that the kernel state is correct for a successfully
543        downloaded and installed update.  That is, the next kernel to
544        boot must be the currently inactive kernel.
545
546        @raise RootFSUpdateError if the DUT next kernel isn't the
547                expected next kernel.
548        """
549        inactive_kernel = self.get_kernel_state()[1]
550        next_kernel = self._get_next_kernel()
551        if next_kernel != inactive_kernel:
552            raise RootFSUpdateError(
553                    'Update failed.  The kernel for next boot is %s, '
554                    'but %s was expected.'
555                    % (next_kernel['name'], inactive_kernel['name']))
556        return inactive_kernel
557
558
559    def _verify_update_completed(self):
560        """Verifies that an update has completed.
561
562        @raise RootFSUpdateError if the DUT doesn't indicate that
563                download is complete and the DUT is ready for reboot.
564        """
565        status = self.check_update_status()
566        if status != UPDATER_NEED_REBOOT:
567            error_msg = ''
568            if status == UPDATER_IDLE:
569                error_msg = 'Update error: %s' % self._get_last_update_error()
570            raise RootFSUpdateError(
571                    'Update engine status is %s (%s was expected).  %s'
572                    % (status, UPDATER_NEED_REBOOT, error_msg))
573        return self._verify_kernel_state()
574
575
576    def trigger_update(self):
577        """Triggers a background update."""
578        # If this function is called immediately after reboot (which it
579        # can be), there is no guarantee that the update engine is up
580        # and running yet, so wait for it.
581        self._wait_for_update_service()
582
583        autoupdate_cmd = ('%s --check_for_update --omaha_url=%s' %
584                          (_UPDATER_BIN, self.update_url))
585        run_args = {'command': autoupdate_cmd}
586        err_prefix = 'Failed to trigger an update on %s. ' % self.host.hostname
587        logging.info('Triggering update via: %s', autoupdate_cmd)
588        metric_fields = {'success': False}
589        try:
590            self._base_update_handler(run_args, err_prefix)
591            metric_fields['success'] = True
592        finally:
593            c = metrics.Counter('chromeos/autotest/autoupdater/trigger')
594            metric_fields.update(_get_metric_fields(self.update_url))
595            c.increment(fields=metric_fields)
596
597
598    def update_image(self):
599        """Updates the device root FS and kernel and verifies success."""
600        autoupdate_cmd = ('%s --update --omaha_url=%s' %
601                          (_UPDATER_BIN, self.update_url))
602        if not self.interactive:
603            autoupdate_cmd = '%s --interactive=false' % autoupdate_cmd
604        run_args = {'command': autoupdate_cmd, 'timeout': 3600}
605        err_prefix = ('Failed to install device image using payload at %s '
606                      'on %s. ' % (self.update_url, self.host.hostname))
607        logging.info('Updating image via: %s', autoupdate_cmd)
608        metric_fields = {'success': False}
609        try:
610            self._base_update_handler(run_args, err_prefix)
611            metric_fields['success'] = True
612        finally:
613            c = metrics.Counter('chromeos/autotest/autoupdater/update')
614            metric_fields.update(_get_metric_fields(self.update_url))
615            c.increment(fields=metric_fields)
616        return self._verify_update_completed()
617
618
619    def _get_remote_script(self, script_name):
620        """Ensure that `script_name` is present on the DUT.
621
622        The given script (e.g. `stateful_update`) may be present in the
623        stateful partition under /usr/local/bin, or we may have to
624        download it from the devserver.
625
626        Determine whether the script is present or must be downloaded
627        and download if necessary.  Then, return a command fragment
628        sufficient to run the script from whereever it now lives on the
629        DUT.
630
631        @param script_name  The name of the script as expected in
632                            /usr/local/bin and on the devserver.
633        @return A string with the command (minus arguments) that will
634                run the target script.
635        """
636        remote_script = '/usr/local/bin/%s' % script_name
637        if self.host.path_exists(remote_script):
638            return remote_script
639        remote_tmp_script = '/tmp/%s' % script_name
640        server_name = urlparse.urlparse(self.update_url)[1]
641        script_url = 'http://%s/static/%s' % (server_name, script_name)
642        fetch_script = (
643            'curl -o %s %s && head -1 %s | grep "^#!" | sed "s/#!//"') % (
644                   remote_tmp_script, script_url, remote_tmp_script)
645        script_interpreter = self._run(fetch_script,
646                                       ignore_status=True).stdout.strip()
647        if not script_interpreter:
648            return None
649        return '%s %s' % (script_interpreter, remote_tmp_script)
650
651
652    def _get_stateful_update_script(self):
653        """Returns a command to run the stateful update script.
654
655        Find `stateful_update` on the target or install it, as
656        necessary.  If installation fails, raise an exception.
657
658        @raise StatefulUpdateError if the script can't be found or
659            installed.
660        @return A string that can be joined with arguments to run the
661            `stateful_update` command on the DUT.
662        """
663        script_command = self._get_remote_script(_STATEFUL_UPDATE_SCRIPT)
664        if not script_command:
665            raise StatefulUpdateError('Could not install %s on DUT'
666                                      % _STATEFUL_UPDATE_SCRIPT)
667        return script_command
668
669
670    def rollback_rootfs(self, powerwash):
671        """Triggers rollback and waits for it to complete.
672
673        @param powerwash: If true, powerwash as part of rollback.
674
675        @raise RootFSUpdateError if anything went wrong.
676        """
677        version = self.host.get_release_version()
678        # Introduced can_rollback in M36 (build 5772). # etc/lsb-release matches
679        # X.Y.Z. This version split just pulls the first part out.
680        try:
681            build_number = int(version.split('.')[0])
682        except ValueError:
683            logging.error('Could not parse build number.')
684            build_number = 0
685
686        if build_number >= 5772:
687            can_rollback_cmd = '%s --can_rollback' % _UPDATER_BIN
688            logging.info('Checking for rollback.')
689            try:
690                self._run(can_rollback_cmd)
691            except error.AutoservRunError as e:
692                raise RootFSUpdateError("Rollback isn't possible on %s: %s" %
693                                        (self.host.hostname, str(e)))
694
695        rollback_cmd = '%s --rollback --follow' % _UPDATER_BIN
696        if not powerwash:
697            rollback_cmd += ' --nopowerwash'
698
699        logging.info('Performing rollback.')
700        try:
701            self._run(rollback_cmd)
702        except error.AutoservRunError as e:
703            raise RootFSUpdateError('Rollback failed on %s: %s' %
704                                    (self.host.hostname, str(e)))
705
706        self._verify_update_completed()
707
708
709    def update_stateful(self, clobber=True):
710        """Updates the stateful partition.
711
712        @param clobber: If True, a clean stateful installation.
713
714        @raise StatefulUpdateError if the update script fails to
715                complete successfully.
716        """
717        logging.info('Updating stateful partition...')
718        statefuldev_url = self.update_url.replace('update', 'static')
719
720        # Attempt stateful partition update; this must succeed so that the newly
721        # installed host is testable after update.
722        statefuldev_cmd = [self._get_stateful_update_script(), statefuldev_url]
723        if clobber:
724            statefuldev_cmd.append('--stateful_change=clean')
725
726        statefuldev_cmd.append('2>&1')
727        try:
728            self._run(' '.join(statefuldev_cmd), timeout=1200)
729        except error.AutoservRunError:
730            raise StatefulUpdateError(
731                    'Failed to perform stateful update on %s' %
732                    self.host.hostname)
733
734
735    def verify_boot_expectations(self, expected_kernel, rollback_message):
736        """Verifies that we fully booted given expected kernel state.
737
738        This method both verifies that we booted using the correct kernel
739        state and that the OS has marked the kernel as good.
740
741        @param expected_kernel: kernel that we are verifying with,
742            i.e. I expect to be booted onto partition 4 etc. See output of
743            get_kernel_state.
744        @param rollback_message: string include in except message text
745            if we booted with the wrong partition.
746
747        @raise NewBuildUpdateError if any of the various checks fail.
748        """
749        # Figure out the newly active kernel.
750        active_kernel = self.get_kernel_state()[0]
751
752        # Check for rollback due to a bad build.
753        if active_kernel != expected_kernel:
754
755            # Kernel crash reports should be wiped between test runs, but
756            # may persist from earlier parts of the test, or from problems
757            # with provisioning.
758            #
759            # Kernel crash reports will NOT be present if the crash happened
760            # before encrypted stateful is mounted.
761            #
762            # TODO(dgarrett): Integrate with server/crashcollect.py at some
763            # point.
764            kernel_crashes = glob.glob('/var/spool/crash/kernel.*.kcrash')
765            if kernel_crashes:
766                rollback_message += ': kernel_crash'
767                logging.debug('Found %d kernel crash reports:',
768                              len(kernel_crashes))
769                # The crash names contain timestamps that may be useful:
770                #   kernel.20131207.005945.0.kcrash
771                for crash in kernel_crashes:
772                    logging.debug('  %s', os.path.basename(crash))
773
774            # Print out some information to make it easier to debug
775            # the rollback.
776            logging.debug('Dumping partition table.')
777            self._run('cgpt show $(rootdev -s -d)')
778            logging.debug('Dumping crossystem for firmware debugging.')
779            self._run('crossystem --all')
780            raise NewBuildUpdateError(self.update_version, rollback_message)
781
782        # Make sure chromeos-setgoodkernel runs.
783        try:
784            utils.poll_for_condition(
785                lambda: (self._get_kernel_tries(active_kernel) == 0
786                         and self._get_kernel_success(active_kernel)),
787                exception=RootFSUpdateError(),
788                timeout=_KERNEL_UPDATE_TIMEOUT, sleep_interval=5)
789        except RootFSUpdateError:
790            services_status = self._run('status system-services').stdout
791            if services_status != 'system-services start/running\n':
792                event = NewBuildUpdateError.CHROME_FAILURE
793            else:
794                event = NewBuildUpdateError.UPDATE_ENGINE_FAILURE
795            raise NewBuildUpdateError(self.update_version, event)
796
797
798    def _prepare_host(self):
799        """Make sure the target DUT is working and ready for update.
800
801        Initially, the target DUT's state is unknown.  The DUT is
802        expected to be online, but we strive to be forgiving if Chrome
803        and/or the update engine aren't fully functional.
804        """
805        # Summary of work, and the rationale:
806        #  1. Reboot, because it's a good way to clear out problems.
807        #  2. Touch the PROVISION_FAILED file, to allow repair to detect
808        #     failure later.
809        #  3. Run the hook for host class specific preparation.
810        #  4. Stop Chrome, because the system is designed to eventually
811        #     reboot if Chrome is stuck in a crash loop.
812        #  5. Force `update-engine` to start, because if Chrome failed
813        #     to start properly, the status of the `update-engine` job
814        #     will be uncertain.
815        if not self.host.is_up():
816            raise HostUpdateError(self.host.hostname,
817                                  HostUpdateError.DUT_DOWN)
818        self._reset_stateful_partition()
819        self.host.reboot(timeout=self.host.REBOOT_TIMEOUT)
820        self._run('touch %s' % PROVISION_FAILED)
821        self.host.prepare_for_update()
822        self._reset_update_engine()
823        logging.info('Updating from version %s to %s.',
824                     self.host.get_release_version(),
825                     self.update_version)
826
827
828    def _verify_devserver(self):
829        """Check that our chosen devserver is still working.
830
831        @raise DevServerError if the devserver fails any sanity check.
832        """
833        server = 'http://%s' % urlparse.urlparse(self.update_url)[1]
834        try:
835            if not dev_server.ImageServer.devserver_healthy(server):
836                raise DevServerError(
837                        server, 'Devserver is not healthy')
838        except Exception as e:
839            raise DevServerError(
840                    server, 'Devserver is not up and available')
841
842
843    def _install_via_update_engine(self):
844        """Install an updating using the production AU flow.
845
846        This uses the standard AU flow and the `stateful_update` script
847        to download and install a root FS, kernel and stateful
848        filesystem content.
849
850        @return The kernel expected to be booted next.
851        """
852        logging.info('Installing image using update_engine.')
853        expected_kernel = self.update_image()
854        self.update_stateful()
855        self._set_target_version()
856        return expected_kernel
857
858
859    def _install_via_quick_provision(self):
860        """Install an updating using the `quick-provision` script.
861
862        This uses the `quick-provision` script to download and install
863        a root FS, kernel and stateful filesystem content.
864
865        @return The kernel expected to be booted next.
866        """
867        if not self._use_quick_provision:
868            return None
869        build_re = global_config.global_config.get_config_value(
870                'CROS', 'quick_provision_build_regex', type=str, default='')
871        image_name = url_to_image_name(self.update_url)
872        if not build_re or re.match(build_re, image_name) is None:
873            logging.info('Not eligible for quick-provision.')
874            return None
875        logging.info('Installing image using quick-provision.')
876        provision_command = self._get_remote_script(_QUICK_PROVISION_SCRIPT)
877        server_name = urlparse.urlparse(self.update_url)[1]
878        static_url = 'http://%s/static' % server_name
879        command = '%s --noreboot %s %s' % (
880                      provision_command, image_name, static_url)
881        try:
882            self._run(command)
883            self._set_target_version()
884            return self._verify_kernel_state()
885        except Exception:
886            # N.B.  We handle only `Exception` here.  Non-Exception
887            # classes (such as KeyboardInterrupt) are handled by our
888            # caller.
889            logging.exception('quick-provision script failed; '
890                              'will fall back to update_engine.')
891            self._revert_boot_partition()
892            self._reset_stateful_partition()
893            self._reset_update_engine()
894            return None
895
896
897    def _install_update(self):
898        """Install the requested image on the DUT, but don't start it.
899
900        This downloads and installs a root FS, kernel and stateful
901        filesystem content.  This does not reboot the DUT, so the update
902        is merely pending when the method returns.
903
904        @return The kernel expected to be booted next.
905        """
906        logging.info('Installing image at %s onto %s',
907                     self.update_url, self.host.hostname)
908        try:
909            return (self._install_via_quick_provision()
910                    or self._install_via_update_engine())
911        except:
912            # N.B. This handling code includes non-Exception classes such
913            # as KeyboardInterrupt.  We need to clean up, but we also must
914            # re-raise.
915            self._revert_boot_partition()
916            self._reset_stateful_partition()
917            self._reset_update_engine()
918            # Collect update engine logs in the event of failure.
919            if self.host.job:
920                logging.info('Collecting update engine logs due to failure...')
921                self.host.get_file(
922                        _UPDATER_LOGS, self.host.job.sysinfo.sysinfodir,
923                        preserve_perm=False)
924            _list_image_dir_contents(self.update_url)
925            raise
926
927
928    def _complete_update(self, expected_kernel):
929        """Finish the update, and confirm that it succeeded.
930
931        Initial condition is that the target build has been downloaded
932        and installed on the DUT, but has not yet been booted.  This
933        function is responsible for rebooting the DUT, and checking that
934        the new build is running successfully.
935
936        @param expected_kernel: kernel expected to be active after reboot.
937        """
938        # Regarding the 'crossystem' command below: In some cases,
939        # the update flow puts the TPM into a state such that it
940        # fails verification.  We don't know why.  However, this
941        # call papers over the problem by clearing the TPM during
942        # the reboot.
943        #
944        # We ignore failures from 'crossystem'.  Although failure
945        # here is unexpected, and could signal a bug, the point of
946        # the exercise is to paper over problems; allowing this to
947        # fail would defeat the purpose.
948        self._run('crossystem clear_tpm_owner_request=1',
949                  ignore_status=True)
950        self.host.reboot(timeout=self.host.REBOOT_TIMEOUT)
951
952        # Touch the lab machine file to leave a marker that
953        # distinguishes this image from other test images.
954        # Afterwards, we must re-run the autoreboot script because
955        # it depends on the _LAB_MACHINE_FILE.
956        autoreboot_cmd = ('FILE="%s" ; [ -f "$FILE" ] || '
957                          '( touch "$FILE" ; start autoreboot )')
958        self._run(autoreboot_cmd % _LAB_MACHINE_FILE)
959        self.verify_boot_expectations(
960                expected_kernel, NewBuildUpdateError.ROLLBACK_FAILURE)
961
962        logging.debug('Cleaning up old autotest directories.')
963        try:
964            installed_autodir = autotest.Autotest.get_installed_autodir(
965                    self.host)
966            self._run('rm -rf ' + installed_autodir)
967        except autotest.AutodirNotFoundError:
968            logging.debug('No autotest installed directory found.')
969
970
971    def run_update(self):
972        """Perform a full update of a DUT in the test lab.
973
974        This downloads and installs the root FS and stateful partition
975        content needed for the update specified in `self.host` and
976        `self.update_url`.  The update is performed according to the
977        requirements for provisioning a DUT for testing the requested
978        build.
979
980        At the end of the procedure, metrics are reported describing the
981        outcome of the operation.
982
983        @returns A tuple of the form `(image_name, attributes)`, where
984            `image_name` is the name of the image installed, and
985            `attributes` is new attributes to be applied to the DUT.
986        """
987        server_name = dev_server.get_resolved_hostname(self.update_url)
988        metrics.Counter(_metric_name('install')).increment(
989                fields={'devserver': server_name})
990
991        self._verify_devserver()
992
993        try:
994            self._prepare_host()
995        except _AttributedUpdateError:
996            raise
997        except Exception as e:
998            logging.exception('Failure preparing host prior to update.')
999            raise HostUpdateError(self.host.hostname, str(e))
1000
1001        try:
1002            expected_kernel = self._install_update()
1003        except _AttributedUpdateError:
1004            raise
1005        except Exception as e:
1006            logging.exception('Failure during download and install.')
1007            server_name = dev_server.get_resolved_hostname(self.update_url)
1008            raise ImageInstallError(self.host.hostname, server_name, str(e))
1009
1010        try:
1011            self._complete_update(expected_kernel)
1012        except _AttributedUpdateError:
1013            raise
1014        except Exception as e:
1015            logging.exception('Failure from build after update.')
1016            raise NewBuildUpdateError(self.update_version, str(e))
1017
1018        image_name = url_to_image_name(self.update_url)
1019        # update_url is different from devserver url needed to stage autotest
1020        # packages, therefore, resolve a new devserver url here.
1021        devserver_url = dev_server.ImageServer.resolve(
1022                image_name, self.host.hostname).url()
1023        repo_url = tools.get_package_url(devserver_url, image_name)
1024        return image_name, {ds_constants.JOB_REPO_URL: repo_url}
1025