# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import glob
import logging
import os
import re
import urllib2
import urlparse

from autotest_lib.client.bin import utils
from autotest_lib.client.common_lib import error, global_config
from autotest_lib.client.common_lib.cros import dev_server
from autotest_lib.server import autotest
from autotest_lib.server import utils as server_utils
from autotest_lib.server.cros.dynamic_suite import constants as ds_constants
from autotest_lib.server.cros.dynamic_suite import tools
from chromite.lib import retry_util

# Fall back to the mock metrics implementation when chromite's metrics
# module cannot be imported.
try:
    from chromite.lib import metrics
except ImportError:
    metrics = utils.metrics_mock


def _metric_name(base_name):
    """Return the full provisioning metric name for `base_name`.

    @param base_name: Final path component of the metric name.

    @return `base_name` prefixed with 'chromeos/autotest/provision/'.
    """
    return 'chromeos/autotest/provision/' + base_name


# Update engine status strings, as reported by
# `update_engine_client -status` in the "CURRENT_OP=" line.
UPDATER_IDLE = 'UPDATE_STATUS_IDLE'
UPDATER_NEED_REBOOT = 'UPDATE_STATUS_UPDATED_NEED_REBOOT'
# A list of update engine client states that occur after an update is triggered.
UPDATER_PROCESSING_UPDATE = ['UPDATE_STATUS_CHECKING_FORUPDATE',
                             'UPDATE_STATUS_UPDATE_AVAILABLE',
                             'UPDATE_STATUS_DOWNLOADING',
                             'UPDATE_STATUS_FINALIZING']


# Names of the helper scripts, as found in /usr/local/bin on the DUT or
# under /static on the devserver.
_STATEFUL_UPDATE_SCRIPT = 'stateful_update'
_QUICK_PROVISION_SCRIPT = 'quick-provision'

_UPDATER_BIN = '/usr/bin/update_engine_client'
_UPDATER_LOGS = ['/var/log/messages', '/var/log/update_engine']

# Kernel/root partition numbers for the two boot slots.
_KERNEL_A = {'name': 'KERN-A', 'kernel': 2, 'root': 3}
_KERNEL_B = {'name': 'KERN-B', 'kernel': 4, 'root': 5}

# Time to wait for new kernel to be marked successful after
# auto update.
_KERNEL_UPDATE_TIMEOUT = 120


# PROVISION_FAILED - A flag file to indicate provision failures.  The
# file is created at the start of any AU procedure (see
# `ChromiumOSUpdater._prepare_host()`).  The file's location in
# stateful means that on successful update it will be removed.  Thus, if
# this file exists, it indicates that we've tried and failed in a
# previous attempt to update.
PROVISION_FAILED = '/var/tmp/provision_failed'


# A flag file used to enable special handling in lab DUTs.  Some
# parts of the system in Chromium OS test images will behave in ways
# convenient to the test lab when this file is present.  Generally,
# we create this immediately after any update completes.
_LAB_MACHINE_FILE = '/mnt/stateful_partition/.labmachine'


# _TARGET_VERSION - A file containing the new version to which we plan
# to update.  This file is used by the CrOS shutdown code to detect and
# handle certain version downgrade cases.  Specifically:  Downgrading
# may trigger an unwanted powerwash in the target build when the
# following conditions are met:
#  * Source build is a v4.4 kernel with R69-10756.0.0 or later.
#  * Target build predates the R69-10756.0.0 cutoff.
# When this file is present and indicates a downgrade, the OS shutdown
# code on the DUT knows how to prevent the powerwash.
_TARGET_VERSION = '/run/update_target_version'


# _REBOOT_FAILURE_MESSAGE - This is the standard message text returned
# when the Host.reboot() method fails.  The source of this text comes
# from `wait_for_restart()` in client/common_lib/hosts/base_classes.py.
_REBOOT_FAILURE_MESSAGE = 'Host did not return from reboot'


class RootFSUpdateError(error.TestFail):
    """Raised when the RootFS fails to update."""


class StatefulUpdateError(error.TestFail):
    """Raised when the stateful partition fails to update."""


class _AttributedUpdateError(error.TestFail):
    """Update failure with an attributed cause.

    Subclasses must define two class attributes:
      * `_SUMMARY`: A fixed string naming the category of failure.
      * `_CLASSIFIERS`: A list of `(regex, classification)` pairs used
        to refine the summary based on the failure message.
    """

    def __init__(self, attribution, msg):
        """Initialize the exception.

        @param attribution: Text naming the party blamed for the failure;
                used as the prefix of the exception message.
        @param msg: Text describing the failure itself.
        """
        super(_AttributedUpdateError, self).__init__(
            '%s: %s' % (attribution, msg))
        self._message = msg

    def _classify(self):
        """Return the classification of the first matching classifier.

        Patterns are applied with `re.match`, i.e. anchored at the start
        of the failure message.

        @return The classification string, or `None` if nothing matches.
        """
        for err_pattern, classification in self._CLASSIFIERS:
            if re.match(err_pattern, self._message):
                return classification
        return None

    @property
    def failure_summary(self):
        """Summarize this error for metrics reporting."""
        classification = self._classify()
        if classification:
            return '%s: %s' % (self._SUMMARY, classification)
        else:
            return self._SUMMARY


class HostUpdateError(_AttributedUpdateError):
    """Failure updating a DUT attributable to the DUT.

    This class of exception should be raised when the most likely cause
    of failure was a condition existing on the DUT prior to the update,
    such as a hardware problem, or a bug in the software on the DUT.
    """

    DUT_DOWN = 'No answer to ssh'

    _SUMMARY = 'DUT failed prior to update'
    _CLASSIFIERS = [
        (DUT_DOWN, DUT_DOWN),
        (_REBOOT_FAILURE_MESSAGE, 'Reboot failed'),
    ]

    def __init__(self, hostname, msg):
        """Initialize the exception.

        @param hostname: Name of the DUT blamed for the failure.
        @param msg: Text describing the failure.
        """
        super(HostUpdateError, self).__init__(
            'Error on %s prior to update' % hostname, msg)


class DevServerError(_AttributedUpdateError):
    """Failure updating a DUT attributable to the devserver.

    This class of exception should be raised when the most likely cause
    of failure was the devserver serving the target image for update.
    """

    _SUMMARY = 'Devserver failed prior to update'
    _CLASSIFIERS = []

    def __init__(self, devserver, msg):
        """Initialize the exception.

        @param devserver: Name or URL of the devserver blamed.
        @param msg: Text describing the failure.
        """
        super(DevServerError, self).__init__(
            'Devserver error on %s' % devserver, msg)


class ImageInstallError(_AttributedUpdateError):
    """Failure updating a DUT when installing from the devserver.

    This class of exception should be raised when the target DUT fails
    to download and install the target image from the devserver, and
    either the devserver or the DUT might be at fault.
    """

    _SUMMARY = 'Image failed to download and install'
    _CLASSIFIERS = []

    def __init__(self, hostname, devserver, msg):
        """Initialize the exception.

        @param hostname: Name of the DUT being updated.
        @param devserver: Name of the devserver serving the image.
        @param msg: Text describing the failure.
        """
        super(ImageInstallError, self).__init__(
            'Download and install failed from %s onto %s'
            % (devserver, hostname), msg)


class NewBuildUpdateError(_AttributedUpdateError):
    """Failure updating a DUT attributable to the target build.

    This class of exception should be raised when updating to a new
    build fails, and the most likely cause of the failure is a bug in
    the newly installed target build.
    """

    CHROME_FAILURE = 'Chrome failed to reach login screen'
    UPDATE_ENGINE_FAILURE = ('update-engine failed to call '
                             'chromeos-setgoodkernel')
    ROLLBACK_FAILURE = 'System rolled back to previous build'

    _SUMMARY = 'New build failed'
    _CLASSIFIERS = [
        (CHROME_FAILURE, 'Chrome did not start'),
        (UPDATE_ENGINE_FAILURE, 'update-engine did not start'),
        (ROLLBACK_FAILURE, ROLLBACK_FAILURE),
    ]

    def __init__(self, update_version, msg):
        """Initialize the exception.

        @param update_version: Version of the build blamed for the failure.
        @param msg: Text describing the failure.
        """
        super(NewBuildUpdateError, self).__init__(
            'Failure in build %s' % update_version, msg)

    @property
    def failure_summary(self):
        """Return a fixed summary, bypassing `_SUMMARY`/`_CLASSIFIERS`."""
        return 'Build failed to work after installing'


def _url_to_version(update_url):
    """Return the version based on update_url.

    @param update_url: url to the image to update to.

    @return The last path element of the URL (after removing any
            trailing '/au/...' section), stripped of whitespace.
    """
    # The Chrome OS version is generally the last element in the URL. The only
    # exception is delta update URLs, which are rooted under the version; e.g.,
    # http://.../update/.../0.14.755.0/au/0.14.754.0. In this case we want to
    # strip off the au section of the path before reading the version.
    return re.sub('/au/.*', '',
                  urlparse.urlparse(update_url).path).split('/')[-1].strip()


def url_to_image_name(update_url):
    """Return the image name based on update_url.

    From a URL like:
        http://172.22.50.205:8082/update/lumpy-release/R27-3837.0.0
    return lumpy-release/R27-3837.0.0

    @param update_url: url to the image to update to.
    @returns a string representing the image name in the update_url.

    """
    return '/'.join(urlparse.urlparse(update_url).path.split('/')[-2:])


def get_update_failure_reason(exception):
    """Convert an exception into a failure reason for metrics.

    The passed in `exception` should be one raised by failure of
    `ChromiumOSUpdater.run_update`.  The returned string will describe
    the failure.  If the input exception value is not a truish value
    the return value will be `None`.

    The number of possible return strings is restricted to a limited
    enumeration of values so that the string may be safely used in
    Monarch metrics without worrying about cardinality of the range of
    string values.

    @param exception  Exception to be converted to a failure reason.

    @return A string suitable for use in Monarch metrics, or `None`.
    """
    if exception:
        if isinstance(exception, _AttributedUpdateError):
            return exception.failure_summary
        else:
            return 'Unknown Error: %s' % type(exception).__name__
    return None


def _get_devserver_build_from_update_url(update_url):
    """Get the devserver and build from the update url.

    @param update_url: The url for update.
        Eg: http://devserver:port/update/build.

    @return: A tuple of the groups captured from `update_url` by the
        configured image_url_pattern; expected to be
        (devserver url, build).

    @raises ValueError: If the update_url doesn't match the expected pattern.
    @raises ValueError: If no global_config was found, or it doesn't contain an
        image_url_pattern.
    """
    pattern = global_config.global_config.get_config_value(
            'CROS', 'image_url_pattern', type=str, default='')
    if not pattern:
        raise ValueError('Cannot parse update_url, the global config needs '
                         'an image_url_pattern.')
    # Turn each '%s' placeholder of the configured pattern into a regex
    # capture group.
    re_pattern = pattern.replace('%s', r'(\S+)')
    parts = re.search(re_pattern, update_url)
    if not parts or len(parts.groups()) < 2:
        raise ValueError('%s is not an update url' % update_url)
    return parts.groups()


def _list_image_dir_contents(update_url):
    """Lists the contents of the devserver for a given build/update_url.

    This is best effort: every failure is logged as a warning and
    otherwise ignored.

    @param update_url: An update url. Eg: http://devserver:port/update/build.
    """
    if not update_url:
        logging.warning('Need update_url to list contents of the devserver.')
        return
    error_msg = 'Cannot check contents of devserver, update url %s' % update_url
    try:
        devserver_url, build = _get_devserver_build_from_update_url(update_url)
    except ValueError as e:
        logging.warning('%s: %s', error_msg, e)
        return
    devserver = dev_server.ImageServer(devserver_url)
    try:
        devserver.list_image_dir(build)
    # The devserver will retry on URLError to avoid flaky connections, but will
    # eventually raise the URLError if it persists. All HTTPErrors get
    # converted to DevServerExceptions.
    except (dev_server.DevServerException, urllib2.URLError) as e:
        logging.warning('%s: %s', error_msg, e)


def _get_metric_fields(update_url):
    """Return a dict of metric fields.

    This is used for sending autoupdate metrics for the given update URL.

    @param update_url  Metrics fields will be calculated from this URL.

    @return A dict with the keys 'dev_server', 'board', 'build_type'
            and 'milestone'.  The last three are empty strings when
            the build name cannot be parsed.
    """
    build_name = url_to_image_name(update_url)
    try:
        board, build_type, milestone, _ = server_utils.ParseBuildName(
            build_name)
    except server_utils.ParseBuildNameException:
        logging.warning('Unable to parse build name %s for metrics. '
                        'Continuing anyway.', build_name)
        board, build_type, milestone = ('', '', '')
    return {
        'dev_server': dev_server.get_resolved_hostname(update_url),
        'board': board,
        'build_type': build_type,
        'milestone': milestone,
    }


# TODO(garnold) This implements shared updater functionality needed for
# supporting the autoupdate_EndToEnd server-side test. We should probably
# migrate more of the existing ChromiumOSUpdater functionality to it as we
# expand non-CrOS support in other tests.
class ChromiumOSUpdater(object):
    """Chromium OS specific DUT update functionality."""

    def __init__(self, update_url, host=None, interactive=True,
                 use_quick_provision=False):
        """Initializes the object.

        @param update_url: The URL we want the update to use.
        @param host: A client.common_lib.hosts.Host implementation.
        @param interactive: Bool whether we are doing an interactive update.
        @param use_quick_provision: Whether we should attempt to perform
            the update using the quick-provision script.
        """
        self.update_url = update_url
        self.host = host
        self.interactive = interactive
        self.update_version = _url_to_version(update_url)
        self._use_quick_provision = use_quick_provision


    def _run(self, cmd, *args, **kwargs):
        """Abbreviated form of self.host.run(...)"""
        return self.host.run(cmd, *args, **kwargs)


    def check_update_status(self):
        """Returns the current update engine state.

        We use the `update_engine_client -status' command and parse the line
        indicating the update state, e.g. "CURRENT_OP=UPDATE_STATUS_IDLE".
        """
        update_status = self.host.run(command='%s -status | grep CURRENT_OP' %
                                      _UPDATER_BIN)
        return update_status.stdout.strip().split('=')[-1]


    def _rootdev(self, options=''):
        """Returns the stripped output of rootdev <options>.

        @param options: options to run rootdev.

        """
        return self._run('rootdev %s' % options).stdout.strip()


    def get_kernel_state(self):
        """Returns the (<active>, <inactive>) kernel state as a pair.

        @raise RootFSUpdateError if the DUT reports a root partition
                number that isn't one of the known valid values, or if
                no partition number can be found in the `rootdev`
                output.
        """
        rootdev = self._rootdev('-s')
        # The partition number is the run of digits ending the device name.
        match = re.search(r'\d+\Z', rootdev)
        if match is None:
            raise RootFSUpdateError(
                    'Could not determine root partition from: %s' % rootdev)
        active_root = int(match.group(0))
        if active_root == _KERNEL_A['root']:
            return _KERNEL_A, _KERNEL_B
        elif active_root == _KERNEL_B['root']:
            return _KERNEL_B, _KERNEL_A
        else:
            raise RootFSUpdateError(
                    'Encountered unknown root partition: %s' % active_root)


    def _cgpt(self, flag, kernel):
        """Return numeric cgpt value for the specified flag, kernel, device.

        @param flag: The `cgpt show` option selecting the value to read
                (this file uses '-P', '-S' and '-T').
        @param kernel: information of the given kernel, either _KERNEL_A
                or _KERNEL_B.
        """
        return int(self._run('cgpt show -n -i %d %s $(rootdev -s -d)' % (
            kernel['kernel'], flag)).stdout.strip())


    def _get_next_kernel(self):
        """Return the kernel that has priority for the next boot.

        When both kernels report equal priority, _KERNEL_B is returned.
        """
        priority_a = self._cgpt('-P', _KERNEL_A)
        priority_b = self._cgpt('-P', _KERNEL_B)
        if priority_a > priority_b:
            return _KERNEL_A
        else:
            return _KERNEL_B


    def _get_kernel_success(self, kernel):
        """Return boolean success flag for the specified kernel.

        @param kernel: information of the given kernel, either _KERNEL_A
            or _KERNEL_B.
        """
        return self._cgpt('-S', kernel) != 0


    def _get_kernel_tries(self, kernel):
        """Return tries count for the specified kernel.

        @param kernel: information of the given kernel, either _KERNEL_A
            or _KERNEL_B.
        """
        return self._cgpt('-T', kernel)


    def _get_last_update_error(self):
        """Get the last autoupdate error code.

        @return The output of `update_engine_client --last_attempt_error`
                with newlines replaced by ', '.
        """
        command_result = self._run(
                 '%s --last_attempt_error' % _UPDATER_BIN)
        return command_result.stdout.strip().replace('\n', ', ')


    def _base_update_handler_no_retry(self, run_args):
        """Base function to handle a remote update ssh call.

        @param run_args: Dictionary of args passed to ssh_host.run function.

        @throws: intercepts and re-throws all exceptions
        """
        try:
            self.host.run(**run_args)
        except Exception as e:
            logging.debug('exception in update handler: %s', e)
            # Bare `raise` keeps the original traceback.
            raise


    def _base_update_handler(self, run_args, err_msg_prefix=None):
        """Handle a remote update ssh call, possibly with retries.

        @param run_args: Dictionary of args passed to ssh_host.run function.
        @param err_msg_prefix: Prefix of the exception error message.  If
                `None`, the message is just the text of the underlying
                exception.

        @raise RootFSUpdateError if the command still fails after the
                retry.
        """
        def exception_handler(e):
            """Examines exceptions and returns True if the update handler
            should be retried.

            @param e: the exception intercepted by the retry util.
            """
            return (isinstance(e, error.AutoservSSHTimeout) or
                    (isinstance(e, error.GenericHostRunError) and
                     hasattr(e, 'description') and
                     (re.search('ERROR_CODE=37', e.description) or
                      re.search('generic error .255.', e.description))))

        try:
            # Try the update twice (arg 2 is max_retry, not including the first
            # call).  Some exceptions may be caught by the retry handler.
            retry_util.GenericRetry(exception_handler, 1,
                                    self._base_update_handler_no_retry,
                                    run_args)
        except Exception as e:
            if err_msg_prefix is None:
                message = str(e)
            else:
                message = err_msg_prefix + ': ' + str(e)
            raise RootFSUpdateError(message)


    def _wait_for_update_service(self):
        """Ensure that the update engine daemon is running, possibly
        by waiting for it a bit in case the DUT just rebooted and the
        service hasn't started yet.

        @raise RootFSUpdateError if the update engine isn't idle.
        """
        def handler(e):
            """Retry exception handler.

            Assumes that the error is due to the update service not having
            started yet.

            @param e: the exception intercepted by the retry util.
            """
            if isinstance(e, error.AutoservRunError):
                logging.debug('update service check exception: %s\n'
                              'retrying...', e)
                return True
            else:
                return False

        # Retry at most three times, every 5s.
        status = retry_util.GenericRetry(handler, 3,
                                         self.check_update_status,
                                         sleep=5)

        # Expect the update engine to be idle.
        if status != UPDATER_IDLE:
            raise RootFSUpdateError(
                    'Update engine status is %s (%s was expected).'
                    % (status, UPDATER_IDLE))


    def _reset_update_engine(self):
        """Resets the host to prepare for a clean update regardless of state."""
        self._run('stop ui || true')
        self._run('stop update-engine || true')
        self._run('start update-engine')
        self._wait_for_update_service()


    def _reset_stateful_partition(self):
        """Clear any pending stateful update request."""
        self._run('%s --stateful_change=reset 2>&1'
                  % self._get_stateful_update_script())
        self._run('rm -f %s' % _TARGET_VERSION)


    def _set_target_version(self):
        """Set the "target version" for the update."""
        # NOTE(review): this takes the text after the first '-' (e.g.
        # '3837.0.0' from 'R27-3837.0.0') and raises IndexError if the
        # version contains no '-' -- confirm whether such versions occur.
        version_number = self.update_version.split('-')[1]
        self._run('echo %s > %s' % (version_number, _TARGET_VERSION))


    def _revert_boot_partition(self):
        """Revert the boot partition.

        Runs `/postinst` on the DUT against the currently active root
        device.

        @return The result of running the `/postinst` command.
        """
        part = self._rootdev('-s')
        logging.warning('Reverting update; Boot partition will be %s', part)
        return self._run('/postinst %s 2>&1' % part)


    def _verify_kernel_state(self):
        """Verify that the next kernel to boot is correct for update.

        This tests that the kernel state is correct for a successfully
        downloaded and installed update.  That is, the next kernel to
        boot must be the currently inactive kernel.

        @raise RootFSUpdateError if the DUT next kernel isn't the
                expected next kernel.

        @return The currently inactive kernel, which is expected to be
                booted next.
        """
        inactive_kernel = self.get_kernel_state()[1]
        next_kernel = self._get_next_kernel()
        if next_kernel != inactive_kernel:
            raise RootFSUpdateError(
                    'Update failed.  The kernel for next boot is %s, '
                    'but %s was expected.'
                    % (next_kernel['name'], inactive_kernel['name']))
        return inactive_kernel


    def _verify_update_completed(self):
        """Verifies that an update has completed.

        @raise RootFSUpdateError if the DUT doesn't indicate that
                download is complete and the DUT is ready for reboot.

        @return The kernel expected to be booted next.
        """
        status = self.check_update_status()
        if status != UPDATER_NEED_REBOOT:
            error_msg = ''
            if status == UPDATER_IDLE:
                error_msg = 'Update error: %s' % self._get_last_update_error()
            raise RootFSUpdateError(
                    'Update engine status is %s (%s was expected).  %s'
                    % (status, UPDATER_NEED_REBOOT, error_msg))
        return self._verify_kernel_state()


    def trigger_update(self):
        """Triggers a background update.

        A success/failure counter metric is reported whether or not the
        trigger command succeeds.
        """
        # If this function is called immediately after reboot (which it
        # can be), there is no guarantee that the update engine is up
        # and running yet, so wait for it.
        self._wait_for_update_service()

        autoupdate_cmd = ('%s --check_for_update --omaha_url=%s' %
                          (_UPDATER_BIN, self.update_url))
        run_args = {'command': autoupdate_cmd}
        err_prefix = 'Failed to trigger an update on %s. ' % self.host.hostname
        logging.info('Triggering update via: %s', autoupdate_cmd)
        metric_fields = {'success': False}
        try:
            self._base_update_handler(run_args, err_prefix)
            metric_fields['success'] = True
        finally:
            c = metrics.Counter('chromeos/autotest/autoupdater/trigger')
            metric_fields.update(_get_metric_fields(self.update_url))
            c.increment(fields=metric_fields)


    def update_image(self):
        """Updates the device root FS and kernel and verifies success.

        A success/failure counter metric is reported whether or not the
        update command succeeds.

        @return The kernel expected to be booted next.
        """
        autoupdate_cmd = ('%s --update --omaha_url=%s' %
                          (_UPDATER_BIN, self.update_url))
        if not self.interactive:
            autoupdate_cmd = '%s --interactive=false' % autoupdate_cmd
        run_args = {'command': autoupdate_cmd, 'timeout': 3600}
        err_prefix = ('Failed to install device image using payload at %s '
                      'on %s. ' % (self.update_url, self.host.hostname))
        logging.info('Updating image via: %s', autoupdate_cmd)
        metric_fields = {'success': False}
        try:
            self._base_update_handler(run_args, err_prefix)
            metric_fields['success'] = True
        finally:
            c = metrics.Counter('chromeos/autotest/autoupdater/update')
            metric_fields.update(_get_metric_fields(self.update_url))
            c.increment(fields=metric_fields)
        return self._verify_update_completed()


    def _get_remote_script(self, script_name):
        """Ensure that `script_name` is present on the DUT.

        The given script (e.g. `stateful_update`) may be present in the
        stateful partition under /usr/local/bin, or we may have to
        download it from the devserver.

        Determine whether the script is present or must be downloaded
        and download if necessary.  Then, return a command fragment
        sufficient to run the script from wherever it now lives on the
        DUT.

        @param script_name  The name of the script as expected in
                            /usr/local/bin and on the devserver.
        @return A string with the command (minus arguments) that will
                run the target script, or `None` if the script could
                not be downloaded or has no '#!' line.
        """
        remote_script = '/usr/local/bin/%s' % script_name
        if self.host.path_exists(remote_script):
            return remote_script
        remote_tmp_script = '/tmp/%s' % script_name
        server_name = urlparse.urlparse(self.update_url)[1]
        script_url = 'http://%s/static/%s' % (server_name, script_name)
        # Download the script, then extract the interpreter from its
        # '#!' line; the script is run via that interpreter explicitly.
        fetch_script = (
            'curl -o %s %s && head -1 %s | grep "^#!" | sed "s/#!//"') % (
                remote_tmp_script, script_url, remote_tmp_script)
        script_interpreter = self._run(fetch_script,
                                       ignore_status=True).stdout.strip()
        if not script_interpreter:
            return None
        return '%s %s' % (script_interpreter, remote_tmp_script)


    def _get_stateful_update_script(self):
        """Returns a command to run the stateful update script.

        Find `stateful_update` on the target or install it, as
        necessary.  If installation fails, raise an exception.

        @raise StatefulUpdateError if the script can't be found or
                installed.
        @return A string that can be joined with arguments to run the
                `stateful_update` command on the DUT.
        """
        script_command = self._get_remote_script(_STATEFUL_UPDATE_SCRIPT)
        if not script_command:
            raise StatefulUpdateError('Could not install %s on DUT'
                                      % _STATEFUL_UPDATE_SCRIPT)
        return script_command


    def rollback_rootfs(self, powerwash):
        """Triggers rollback and waits for it to complete.

        @param powerwash: If true, powerwash as part of rollback.

        @raise RootFSUpdateError if anything went wrong.
        """
        version = self.host.get_release_version()
        # Introduced can_rollback in M36 (build 5772).  etc/lsb-release
        # matches X.Y.Z.  This version split just pulls the first part out.
        try:
            build_number = int(version.split('.')[0])
        except ValueError:
            logging.error('Could not parse build number.')
            build_number = 0

        if build_number >= 5772:
            can_rollback_cmd = '%s --can_rollback' % _UPDATER_BIN
            logging.info('Checking for rollback.')
            try:
                self._run(can_rollback_cmd)
            except error.AutoservRunError as e:
                raise RootFSUpdateError("Rollback isn't possible on %s: %s" %
                                        (self.host.hostname, str(e)))

        rollback_cmd = '%s --rollback --follow' % _UPDATER_BIN
        if not powerwash:
            rollback_cmd += ' --nopowerwash'

        logging.info('Performing rollback.')
        try:
            self._run(rollback_cmd)
        except error.AutoservRunError as e:
            raise RootFSUpdateError('Rollback failed on %s: %s' %
                                    (self.host.hostname, str(e)))

        self._verify_update_completed()


    def update_stateful(self, clobber=True):
        """Updates the stateful partition.

        @param clobber: If True, a clean stateful installation.

        @raise StatefulUpdateError if the update script fails to
                complete successfully.
        """
        logging.info('Updating stateful partition...')
        # NOTE(review): this replaces every occurrence of 'update' in the
        # URL, not just the path component -- confirm that hostnames and
        # build names never contain that text.
        statefuldev_url = self.update_url.replace('update', 'static')

        # Attempt stateful partition update; this must succeed so that the newly
        # installed host is testable after update.
        statefuldev_cmd = [self._get_stateful_update_script(), statefuldev_url]
        if clobber:
            statefuldev_cmd.append('--stateful_change=clean')

        statefuldev_cmd.append('2>&1')
        try:
            self._run(' '.join(statefuldev_cmd), timeout=1200)
        except error.AutoservRunError:
            raise StatefulUpdateError(
                    'Failed to perform stateful update on %s' %
                    self.host.hostname)


    def verify_boot_expectations(self, expected_kernel, rollback_message):
        """Verifies that we fully booted given expected kernel state.

        This method both verifies that we booted using the correct kernel
        state and that the OS has marked the kernel as good.

        @param expected_kernel: kernel that we are verifying with,
            i.e. I expect to be booted onto partition 4 etc. See output of
            get_kernel_state.
        @param rollback_message: string include in except message text
            if we booted with the wrong partition.

        @raise NewBuildUpdateError if any of the various checks fail.
        """
        # Figure out the newly active kernel.
        active_kernel = self.get_kernel_state()[0]

        # Check for rollback due to a bad build.
        if active_kernel != expected_kernel:

            # Kernel crash reports should be wiped between test runs, but
            # may persist from earlier parts of the test, or from problems
            # with provisioning.
            #
            # Kernel crash reports will NOT be present if the crash happened
            # before encrypted stateful is mounted.
            #
            # The reports live on the DUT, so they must be listed there
            # rather than on the machine running this code.
            #
            # TODO(dgarrett): Integrate with server/crashcollect.py at some
            # point.
            crash_listing = self._run(
                    'ls /var/spool/crash/kernel.*.kcrash 2>/dev/null',
                    ignore_status=True).stdout
            kernel_crashes = crash_listing.split()
            if kernel_crashes:
                rollback_message += ': kernel_crash'
                logging.debug('Found %d kernel crash reports:',
                              len(kernel_crashes))
                # The crash names contain timestamps that may be useful:
                #   kernel.20131207.005945.0.kcrash
                for crash in kernel_crashes:
                    logging.debug('  %s', os.path.basename(crash))

            # Print out some information to make it easier to debug
            # the rollback.
            logging.debug('Dumping partition table.')
            self._run('cgpt show $(rootdev -s -d)')
            logging.debug('Dumping crossystem for firmware debugging.')
            self._run('crossystem --all')
            raise NewBuildUpdateError(self.update_version, rollback_message)

        # Make sure chromeos-setgoodkernel runs.
        try:
            utils.poll_for_condition(
                lambda: (self._get_kernel_tries(active_kernel) == 0
                         and self._get_kernel_success(active_kernel)),
                exception=RootFSUpdateError(),
                timeout=_KERNEL_UPDATE_TIMEOUT, sleep_interval=5)
        except RootFSUpdateError:
            # Blame Chrome if system-services never started; otherwise
            # blame update-engine.
            services_status = self._run('status system-services').stdout
            if services_status != 'system-services start/running\n':
                event = NewBuildUpdateError.CHROME_FAILURE
            else:
                event = NewBuildUpdateError.UPDATE_ENGINE_FAILURE
            raise NewBuildUpdateError(self.update_version, event)


    def _prepare_host(self):
        """Make sure the target DUT is working and ready for update.

        Initially, the target DUT's state is unknown.  The DUT is
        expected to be online, but we strive to be forgiving if Chrome
        and/or the update engine aren't fully functional.

        @raise HostUpdateError if the DUT is not up.
        """
        # Summary of work, and the rationale:
        #  1. Reboot, because it's a good way to clear out problems.
        #  2. Touch the PROVISION_FAILED file, to allow repair to detect
        #     failure later.
        #  3. Run the hook for host class specific preparation.
        #  4. Stop Chrome, because the system is designed to eventually
        #     reboot if Chrome is stuck in a crash loop.
        #  5. Force `update-engine` to start, because if Chrome failed
        #     to start properly, the status of the `update-engine` job
        #     will be uncertain.
        if not self.host.is_up():
            raise HostUpdateError(self.host.hostname,
                                  HostUpdateError.DUT_DOWN)
        self._reset_stateful_partition()
        self.host.reboot(timeout=self.host.REBOOT_TIMEOUT)
        self._run('touch %s' % PROVISION_FAILED)
        self.host.prepare_for_update()
        self._reset_update_engine()
        logging.info('Updating from version %s to %s.',
                     self.host.get_release_version(),
                     self.update_version)


    def _verify_devserver(self):
        """Check that our chosen devserver is still working.

        @raise DevServerError if the devserver fails any sanity check.
        """
        server = 'http://%s' % urlparse.urlparse(self.update_url)[1]
        try:
            healthy = dev_server.ImageServer.devserver_healthy(server)
        except Exception:
            raise DevServerError(
                    server, 'Devserver is not up and available')
        # Checked outside the `try` so that this error is not caught and
        # replaced by the "not up and available" message above.
        if not healthy:
            raise DevServerError(
                    server, 'Devserver is not healthy')


    def _install_via_update_engine(self):
        """Install an update using the production AU flow.

        This uses the standard AU flow and the `stateful_update` script
        to download and install a root FS, kernel and stateful
        filesystem content.

        @return The kernel expected to be booted next.
        """
        logging.info('Installing image using update_engine.')
        expected_kernel = self.update_image()
        self.update_stateful()
        self._set_target_version()
        return expected_kernel


    def _install_via_quick_provision(self):
        """Install an update using the `quick-provision` script.

        This uses the `quick-provision` script to download and install
        a root FS, kernel and stateful filesystem content.

        @return The kernel expected to be booted next, or `None` if
                quick-provision was not used or failed, in which case
                the caller should fall back to update_engine.
        """
        if not self._use_quick_provision:
            return None
        build_re = global_config.global_config.get_config_value(
                'CROS', 'quick_provision_build_regex', type=str, default='')
        image_name = url_to_image_name(self.update_url)
        if not build_re or re.match(build_re, image_name) is None:
            logging.info('Not eligible for quick-provision.')
            return None
        logging.info('Installing image using quick-provision.')
        provision_command = self._get_remote_script(_QUICK_PROVISION_SCRIPT)
        if not provision_command:
            # Nothing has been installed yet, so there is nothing to
            # revert; just let the caller fall back.
            logging.warning('quick-provision script is not available; '
                            'will fall back to update_engine.')
            return None
        server_name = urlparse.urlparse(self.update_url)[1]
        static_url = 'http://%s/static' % server_name
        command = '%s --noreboot %s %s' % (
                provision_command, image_name, static_url)
        try:
            self._run(command)
            self._set_target_version()
            return self._verify_kernel_state()
        except Exception:
            # N.B. We handle only `Exception` here.  Non-Exception
            # classes (such as KeyboardInterrupt) are handled by our
            # caller.
            logging.exception('quick-provision script failed; '
                              'will fall back to update_engine.')
            self._revert_boot_partition()
            self._reset_stateful_partition()
            self._reset_update_engine()
            return None


    def _install_update(self):
        """Install the requested image on the DUT, but don't start it.

        This downloads and installs a root FS, kernel and stateful
        filesystem content.  This does not reboot the DUT, so the update
        is merely pending when the method returns.

        @return The kernel expected to be booted next.
        """
        logging.info('Installing image at %s onto %s',
                     self.update_url, self.host.hostname)
        try:
            return (self._install_via_quick_provision()
                    or self._install_via_update_engine())
        except:
            # N.B. This handling code includes non-Exception classes such
            # as KeyboardInterrupt.  We need to clean up, but we also must
            # re-raise.
            self._revert_boot_partition()
            self._reset_stateful_partition()
            self._reset_update_engine()
            # Collect update engine logs in the event of failure.
            if self.host.job:
                logging.info('Collecting update engine logs due to failure...')
                self.host.get_file(
                        _UPDATER_LOGS, self.host.job.sysinfo.sysinfodir,
                        preserve_perm=False)
            _list_image_dir_contents(self.update_url)
            raise


    def _complete_update(self, expected_kernel):
        """Finish the update, and confirm that it succeeded.

        Initial condition is that the target build has been downloaded
        and installed on the DUT, but has not yet been booted.  This
        function is responsible for rebooting the DUT, and checking that
        the new build is running successfully.

        @param expected_kernel: kernel expected to be active after reboot.
        """
        # Regarding the 'crossystem' command below:  In some cases,
        # the update flow puts the TPM into a state such that it
        # fails verification.  We don't know why.  However, this
        # call papers over the problem by clearing the TPM during
        # the reboot.
        #
        # We ignore failures from 'crossystem'.  Although failure
        # here is unexpected, and could signal a bug, the point of
        # the exercise is to paper over problems; allowing this to
        # fail would defeat the purpose.
        self._run('crossystem clear_tpm_owner_request=1',
                  ignore_status=True)
        self.host.reboot(timeout=self.host.REBOOT_TIMEOUT)

        # Touch the lab machine file to leave a marker that
        # distinguishes this image from other test images.
        # Afterwards, we must re-run the autoreboot script because
        # it depends on the _LAB_MACHINE_FILE.
        autoreboot_cmd = ('FILE="%s" ; [ -f "$FILE" ] || '
                          '( touch "$FILE" ; start autoreboot )')
        self._run(autoreboot_cmd % _LAB_MACHINE_FILE)
        self.verify_boot_expectations(
                expected_kernel, NewBuildUpdateError.ROLLBACK_FAILURE)

        logging.debug('Cleaning up old autotest directories.')
        try:
            installed_autodir = autotest.Autotest.get_installed_autodir(
                    self.host)
            self._run('rm -rf ' + installed_autodir)
        except autotest.AutodirNotFoundError:
            logging.debug('No autotest installed directory found.')


    def run_update(self):
        """Perform a full update of a DUT in the test lab.

        This downloads and installs the root FS and stateful partition
        content needed for the update specified in `self.host` and
        `self.update_url`.  The update is performed according to the
        requirements for provisioning a DUT for testing the requested
        build.

        At the start of the procedure, an 'install' counter metric is
        incremented for the chosen devserver.

        Failures in each phase that are not already attributed are
        re-raised as `HostUpdateError` (preparing the host),
        `ImageInstallError` (download and install) or
        `NewBuildUpdateError` (booting the new build).

        @returns A tuple of the form `(image_name, attributes)`, where
            `image_name` is the name of the image installed, and
            `attributes` is new attributes to be applied to the DUT.
        """
        server_name = dev_server.get_resolved_hostname(self.update_url)
        metrics.Counter(_metric_name('install')).increment(
                fields={'devserver': server_name})

        self._verify_devserver()

        try:
            self._prepare_host()
        except _AttributedUpdateError:
            raise
        except Exception as e:
            logging.exception('Failure preparing host prior to update.')
            raise HostUpdateError(self.host.hostname, str(e))

        try:
            expected_kernel = self._install_update()
        except _AttributedUpdateError:
            raise
        except Exception as e:
            logging.exception('Failure during download and install.')
            server_name = dev_server.get_resolved_hostname(self.update_url)
            raise ImageInstallError(self.host.hostname, server_name, str(e))

        try:
            self._complete_update(expected_kernel)
        except _AttributedUpdateError:
            raise
        except Exception as e:
            logging.exception('Failure from build after update.')
            raise NewBuildUpdateError(self.update_version, str(e))

        image_name = url_to_image_name(self.update_url)
        # update_url is different from devserver url needed to stage autotest
        # packages, therefore, resolve a new devserver url here.
        devserver_url = dev_server.ImageServer.resolve(
                image_name, self.host.hostname).url()
        repo_url = tools.get_package_url(devserver_url, image_name)
        return image_name, {ds_constants.JOB_REPO_URL: repo_url}