1# Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 2# Use of this source code is governed by a BSD-style license that can be 3# found in the LICENSE file. 4 5import glob 6import logging 7import os 8import re 9import sys 10import urllib2 11import urlparse 12 13from autotest_lib.client.bin import utils 14from autotest_lib.client.common_lib import error, global_config 15from autotest_lib.client.common_lib.cros import dev_server 16from autotest_lib.server import autotest 17from autotest_lib.server import utils as server_utils 18from autotest_lib.server.cros.dynamic_suite import constants as ds_constants 19from autotest_lib.server.cros.dynamic_suite import tools 20from chromite.lib import retry_util 21 22try: 23 from chromite.lib import metrics 24except ImportError: 25 metrics = utils.metrics_mock 26 27 28def _metric_name(base_name): 29 return 'chromeos/autotest/provision/' + base_name 30 31 32# Local stateful update path is relative to the CrOS source directory. 33UPDATER_IDLE = 'UPDATE_STATUS_IDLE' 34UPDATER_NEED_REBOOT = 'UPDATE_STATUS_UPDATED_NEED_REBOOT' 35# A list of update engine client states that occur after an update is triggered. 36UPDATER_PROCESSING_UPDATE = ['UPDATE_STATUS_CHECKING_FOR_UPDATE', 37 'UPDATE_STATUS_UPDATE_AVAILABLE', 38 'UPDATE_STATUS_DOWNLOADING', 39 'UPDATE_STATUS_FINALIZING', 40 'UPDATE_STATUS_VERIFYING', 41 'UPDATE_STATUS_REPORTING_ERROR_EVENT', 42 'UPDATE_STATUS_ATTEMPTING_ROLLBACK'] 43 44 45_STATEFUL_UPDATE_SCRIPT = 'stateful_update' 46_QUICK_PROVISION_SCRIPT = 'quick-provision' 47 48_UPDATER_BIN = '/usr/bin/update_engine_client' 49_UPDATER_LOGS = ['/var/log/messages', '/var/log/update_engine'] 50 51_KERNEL_A = {'name': 'KERN-A', 'kernel': 2, 'root': 3} 52_KERNEL_B = {'name': 'KERN-B', 'kernel': 4, 'root': 5} 53 54# Time to wait for new kernel to be marked successful after 55# auto update. 56_KERNEL_UPDATE_TIMEOUT = 120 57 58 59# PROVISION_FAILED - A flag file to indicate provision failures. The 60# file is created at the start of any AU procedure (see 61# `ChromiumOSUpdater._prepare_host()`). The file's location in 62# stateful means that on successul update it will be removed. Thus, if 63# this file exists, it indicates that we've tried and failed in a 64# previous attempt to update. 65PROVISION_FAILED = '/var/tmp/provision_failed' 66 67 68# A flag file used to enable special handling in lab DUTs. Some 69# parts of the system in Chromium OS test images will behave in ways 70# convenient to the test lab when this file is present. Generally, 71# we create this immediately after any update completes. 72_LAB_MACHINE_FILE = '/mnt/stateful_partition/.labmachine' 73 74 75# _TARGET_VERSION - A file containing the new version to which we plan 76# to update. This file is used by the CrOS shutdown code to detect and 77# handle certain version downgrade cases. Specifically: Downgrading 78# may trigger an unwanted powerwash in the target build when the 79# following conditions are met: 80# * Source build is a v4.4 kernel with R69-10756.0.0 or later. 81# * Target build predates the R69-10756.0.0 cutoff. 82# When this file is present and indicates a downgrade, the OS shutdown 83# code on the DUT knows how to prevent the powerwash. 84_TARGET_VERSION = '/run/update_target_version' 85 86 87# _REBOOT_FAILURE_MESSAGE - This is the standard message text returned 88# when the Host.reboot() method fails. The source of this text comes 89# from `wait_for_restart()` in client/common_lib/hosts/base_classes.py. 90 91_REBOOT_FAILURE_MESSAGE = 'Host did not return from reboot' 92 93 94DEVSERVER_PORT = '8082' 95GS_CACHE_PORT = '8888' 96 97 98class RootFSUpdateError(error.TestFail): 99 """Raised when the RootFS fails to update.""" 100 101 102class StatefulUpdateError(error.TestFail): 103 """Raised when the stateful partition fails to update.""" 104 105 106class _AttributedUpdateError(error.TestFail): 107 """Update failure with an attributed cause.""" 108 109 def __init__(self, attribution, msg): 110 super(_AttributedUpdateError, self).__init__( 111 '%s: %s' % (attribution, msg)) 112 self._message = msg 113 114 def _classify(self): 115 for err_pattern, classification in self._CLASSIFIERS: 116 if re.match(err_pattern, self._message): 117 return classification 118 return None 119 120 @property 121 def failure_summary(self): 122 """Summarize this error for metrics reporting.""" 123 classification = self._classify() 124 if classification: 125 return '%s: %s' % (self._SUMMARY, classification) 126 else: 127 return self._SUMMARY 128 129 130class HostUpdateError(_AttributedUpdateError): 131 """Failure updating a DUT attributable to the DUT. 132 133 This class of exception should be raised when the most likely cause 134 of failure was a condition existing on the DUT prior to the update, 135 such as a hardware problem, or a bug in the software on the DUT. 136 """ 137 138 DUT_DOWN = 'No answer to ssh' 139 140 _SUMMARY = 'DUT failed prior to update' 141 _CLASSIFIERS = [ 142 (DUT_DOWN, DUT_DOWN), 143 (_REBOOT_FAILURE_MESSAGE, 'Reboot failed'), 144 ] 145 146 def __init__(self, hostname, msg): 147 super(HostUpdateError, self).__init__( 148 'Error on %s prior to update' % hostname, msg) 149 150 151class DevServerError(_AttributedUpdateError): 152 """Failure updating a DUT attributable to the devserver. 153 154 This class of exception should be raised when the most likely cause 155 of failure was the devserver serving the target image for update. 156 """ 157 158 _SUMMARY = 'Devserver failed prior to update' 159 _CLASSIFIERS = [] 160 161 def __init__(self, devserver, msg): 162 super(DevServerError, self).__init__( 163 'Devserver error on %s' % devserver, msg) 164 165 166class ImageInstallError(_AttributedUpdateError): 167 """Failure updating a DUT when installing from the devserver. 168 169 This class of exception should be raised when the target DUT fails 170 to download and install the target image from the devserver, and 171 either the devserver or the DUT might be at fault. 172 """ 173 174 _SUMMARY = 'Image failed to download and install' 175 _CLASSIFIERS = [] 176 177 def __init__(self, hostname, devserver, msg): 178 super(ImageInstallError, self).__init__( 179 'Download and install failed from %s onto %s' 180 % (devserver, hostname), msg) 181 182 183class NewBuildUpdateError(_AttributedUpdateError): 184 """Failure updating a DUT attributable to the target build. 185 186 This class of exception should be raised when updating to a new 187 build fails, and the most likely cause of the failure is a bug in 188 the newly installed target build. 189 """ 190 191 CHROME_FAILURE = 'Chrome failed to reach login screen' 192 UPDATE_ENGINE_FAILURE = ('update-engine failed to call ' 193 'chromeos-setgoodkernel') 194 ROLLBACK_FAILURE = 'System rolled back to previous build' 195 196 _SUMMARY = 'New build failed' 197 _CLASSIFIERS = [ 198 (CHROME_FAILURE, 'Chrome did not start'), 199 (UPDATE_ENGINE_FAILURE, 'update-engine did not start'), 200 (ROLLBACK_FAILURE, ROLLBACK_FAILURE), 201 ] 202 203 def __init__(self, update_version, msg): 204 super(NewBuildUpdateError, self).__init__( 205 'Failure in build %s' % update_version, msg) 206 207 @property 208 def failure_summary(self): 209 #pylint: disable=missing-docstring 210 return 'Build failed to work after installing' 211 212 213def _url_to_version(update_url): 214 """Return the version based on update_url. 215 216 @param update_url: url to the image to update to. 217 218 """ 219 # The Chrome OS version is generally the last element in the URL. The only 220 # exception is delta update URLs, which are rooted under the version; e.g., 221 # http://.../update/.../0.14.755.0/au/0.14.754.0. In this case we want to 222 # strip off the au section of the path before reading the version. 223 return re.sub('/au/.*', '', 224 urlparse.urlparse(update_url).path).split('/')[-1].strip() 225 226 227def url_to_image_name(update_url): 228 """Return the image name based on update_url. 229 230 From a URL like: 231 http://172.22.50.205:8082/update/lumpy-release/R27-3837.0.0 232 return lumpy-release/R27-3837.0.0 233 234 @param update_url: url to the image to update to. 235 @returns a string representing the image name in the update_url. 236 237 """ 238 return '/'.join(urlparse.urlparse(update_url).path.split('/')[-2:]) 239 240 241def get_update_failure_reason(exception): 242 """Convert an exception into a failure reason for metrics. 243 244 The passed in `exception` should be one raised by failure of 245 `ChromiumOSUpdater.run_update`. The returned string will describe 246 the failure. If the input exception value is not a truish value 247 the return value will be `None`. 248 249 The number of possible return strings is restricted to a limited 250 enumeration of values so that the string may be safely used in 251 Monarch metrics without worrying about cardinality of the range of 252 string values. 253 254 @param exception Exception to be converted to a failure reason. 255 256 @return A string suitable for use in Monarch metrics, or `None`. 257 """ 258 if exception: 259 if isinstance(exception, _AttributedUpdateError): 260 return exception.failure_summary 261 else: 262 return 'Unknown Error: %s' % type(exception).__name__ 263 return None 264 265 266def _get_devserver_build_from_update_url(update_url): 267 """Get the devserver and build from the update url. 268 269 @param update_url: The url for update. 270 Eg: http://devserver:port/update/build. 271 272 @return: A tuple of (devserver url, build) or None if the update_url 273 doesn't match the expected pattern. 274 275 @raises ValueError: If the update_url doesn't match the expected pattern. 276 @raises ValueError: If no global_config was found, or it doesn't contain an 277 image_url_pattern. 278 """ 279 pattern = global_config.global_config.get_config_value( 280 'CROS', 'image_url_pattern', type=str, default='') 281 if not pattern: 282 raise ValueError('Cannot parse update_url, the global config needs ' 283 'an image_url_pattern.') 284 re_pattern = pattern.replace('%s', '(\S+)') 285 parts = re.search(re_pattern, update_url) 286 if not parts or len(parts.groups()) < 2: 287 raise ValueError('%s is not an update url' % update_url) 288 return parts.groups() 289 290 291def _list_image_dir_contents(update_url): 292 """Lists the contents of the devserver for a given build/update_url. 293 294 @param update_url: An update url. Eg: http://devserver:port/update/build. 295 """ 296 if not update_url: 297 logging.warning('Need update_url to list contents of the devserver.') 298 return 299 error_msg = 'Cannot check contents of devserver, update url %s' % update_url 300 try: 301 devserver_url, build = _get_devserver_build_from_update_url(update_url) 302 except ValueError as e: 303 logging.warning('%s: %s', error_msg, e) 304 return 305 devserver = dev_server.ImageServer(devserver_url) 306 try: 307 devserver.list_image_dir(build) 308 # The devserver will retry on URLError to avoid flaky connections, but will 309 # eventually raise the URLError if it persists. All HTTPErrors get 310 # converted to DevServerExceptions. 311 except (dev_server.DevServerException, urllib2.URLError) as e: 312 logging.warning('%s: %s', error_msg, e) 313 314 315def _get_metric_fields(update_url): 316 """Return a dict of metric fields. 317 318 This is used for sending autoupdate metrics for the given update URL. 319 320 @param update_url Metrics fields will be calculated from this URL. 321 """ 322 build_name = url_to_image_name(update_url) 323 try: 324 board, build_type, milestone, _ = server_utils.ParseBuildName( 325 build_name) 326 except server_utils.ParseBuildNameException: 327 logging.warning('Unable to parse build name %s for metrics. ' 328 'Continuing anyway.', build_name) 329 board, build_type, milestone = ('', '', '') 330 return { 331 'dev_server': dev_server.get_resolved_hostname(update_url), 332 'board': board, 333 'build_type': build_type, 334 'milestone': milestone, 335 } 336 337 338# TODO(garnold) This implements shared updater functionality needed for 339# supporting the autoupdate_EndToEnd server-side test. We should probably 340# migrate more of the existing ChromiumOSUpdater functionality to it as we 341# expand non-CrOS support in other tests. 342class ChromiumOSUpdater(object): 343 """Chromium OS specific DUT update functionality.""" 344 345 def __init__(self, update_url, host=None, interactive=True, 346 use_quick_provision=False): 347 """Initializes the object. 348 349 @param update_url: The URL we want the update to use. 350 @param host: A client.common_lib.hosts.Host implementation. 351 @param interactive: Bool whether we are doing an interactive update. 352 @param use_quick_provision: Whether we should attempt to perform 353 the update using the quick-provision script. 354 """ 355 self.update_url = update_url 356 self.host = host 357 self.interactive = interactive 358 self.update_version = _url_to_version(update_url) 359 self._use_quick_provision = use_quick_provision 360 361 362 def _run(self, cmd, *args, **kwargs): 363 """Abbreviated form of self.host.run(...)""" 364 return self.host.run(cmd, *args, **kwargs) 365 366 367 def check_update_status(self): 368 """Returns the current update engine state. 369 370 We use the `update_engine_client -status' command and parse the line 371 indicating the update state, e.g. "CURRENT_OP=UPDATE_STATUS_IDLE". 372 """ 373 update_status = self.host.run(command='%s -status | grep CURRENT_OP' % 374 _UPDATER_BIN) 375 return update_status.stdout.strip().split('=')[-1] 376 377 378 def _rootdev(self, options=''): 379 """Returns the stripped output of rootdev <options>. 380 381 @param options: options to run rootdev. 382 383 """ 384 return self._run('rootdev %s' % options).stdout.strip() 385 386 387 def get_kernel_state(self): 388 """Returns the (<active>, <inactive>) kernel state as a pair. 389 390 @raise RootFSUpdateError if the DUT reports a root partition 391 number that isn't one of the known valid values. 392 """ 393 active_root = int(re.findall('\d+\Z', self._rootdev('-s'))[0]) 394 if active_root == _KERNEL_A['root']: 395 return _KERNEL_A, _KERNEL_B 396 elif active_root == _KERNEL_B['root']: 397 return _KERNEL_B, _KERNEL_A 398 else: 399 raise RootFSUpdateError( 400 'Encountered unknown root partition: %s' % active_root) 401 402 403 def _cgpt(self, flag, kernel): 404 """Return numeric cgpt value for the specified flag, kernel, device.""" 405 return int(self._run('cgpt show -n -i %d %s $(rootdev -s -d)' % ( 406 kernel['kernel'], flag)).stdout.strip()) 407 408 409 def _get_next_kernel(self): 410 """Return the kernel that has priority for the next boot.""" 411 priority_a = self._cgpt('-P', _KERNEL_A) 412 priority_b = self._cgpt('-P', _KERNEL_B) 413 if priority_a > priority_b: 414 return _KERNEL_A 415 else: 416 return _KERNEL_B 417 418 419 def _get_kernel_success(self, kernel): 420 """Return boolean success flag for the specified kernel. 421 422 @param kernel: information of the given kernel, either _KERNEL_A 423 or _KERNEL_B. 424 """ 425 return self._cgpt('-S', kernel) != 0 426 427 428 def _get_kernel_tries(self, kernel): 429 """Return tries count for the specified kernel. 430 431 @param kernel: information of the given kernel, either _KERNEL_A 432 or _KERNEL_B. 433 """ 434 return self._cgpt('-T', kernel) 435 436 437 def _get_last_update_error(self): 438 """Get the last autoupdate error code.""" 439 command_result = self._run( 440 '%s --last_attempt_error' % _UPDATER_BIN) 441 return command_result.stdout.strip().replace('\n', ', ') 442 443 444 def _base_update_handler_no_retry(self, run_args): 445 """Base function to handle a remote update ssh call. 446 447 @param run_args: Dictionary of args passed to ssh_host.run function. 448 449 @throws: intercepts and re-throws all exceptions 450 """ 451 try: 452 self.host.run(**run_args) 453 except Exception as e: 454 logging.debug('exception in update handler: %s', e) 455 raise e 456 457 458 def _base_update_handler(self, run_args, err_msg_prefix=None): 459 """Handle a remote update ssh call, possibly with retries. 460 461 @param run_args: Dictionary of args passed to ssh_host.run function. 462 @param err_msg_prefix: Prefix of the exception error message. 463 """ 464 def exception_handler(e): 465 """Examines exceptions and returns True if the update handler 466 should be retried. 467 468 @param e: the exception intercepted by the retry util. 469 """ 470 return (isinstance(e, error.AutoservSSHTimeout) or 471 (isinstance(e, error.GenericHostRunError) and 472 hasattr(e, 'description') and 473 (re.search('ERROR_CODE=37', e.description) or 474 re.search('generic error .255.', e.description)))) 475 476 try: 477 # Try the update twice (arg 2 is max_retry, not including the first 478 # call). Some exceptions may be caught by the retry handler. 479 retry_util.GenericRetry(exception_handler, 1, 480 self._base_update_handler_no_retry, 481 run_args) 482 except Exception as e: 483 message = err_msg_prefix + ': ' + str(e) 484 raise RootFSUpdateError(message) 485 486 487 def _wait_for_update_service(self): 488 """Ensure that the update engine daemon is running, possibly 489 by waiting for it a bit in case the DUT just rebooted and the 490 service hasn't started yet. 491 """ 492 def handler(e): 493 """Retry exception handler. 494 495 Assumes that the error is due to the update service not having 496 started yet. 497 498 @param e: the exception intercepted by the retry util. 499 """ 500 if isinstance(e, error.AutoservRunError): 501 logging.debug('update service check exception: %s\n' 502 'retrying...', e) 503 return True 504 else: 505 return False 506 507 # Retry at most three times, every 5s. 508 status = retry_util.GenericRetry(handler, 3, 509 self.check_update_status, 510 sleep=5) 511 512 # Expect the update engine to be idle. 513 if status != UPDATER_IDLE: 514 raise RootFSUpdateError( 515 'Update engine status is %s (%s was expected).' 516 % (status, UPDATER_IDLE)) 517 518 519 def _reset_update_engine(self): 520 """Resets the host to prepare for a clean update regardless of state.""" 521 self._run('stop ui || true') 522 self._run('stop update-engine || true') 523 self._run('start update-engine') 524 self._wait_for_update_service() 525 526 527 def _reset_stateful_partition(self): 528 """Clear any pending stateful update request.""" 529 self._run('%s --stateful_change=reset 2>&1' 530 % self._get_stateful_update_script()) 531 self._run('rm -f %s' % _TARGET_VERSION) 532 533 534 def _set_target_version(self): 535 """Set the "target version" for the update.""" 536 version_number = self.update_version.split('-')[1] 537 self._run('echo %s > %s' % (version_number, _TARGET_VERSION)) 538 539 540 def _revert_boot_partition(self): 541 """Revert the boot partition.""" 542 part = self._rootdev('-s') 543 logging.warning('Reverting update; Boot partition will be %s', part) 544 return self._run('/postinst %s 2>&1' % part) 545 546 547 def _verify_kernel_state(self): 548 """Verify that the next kernel to boot is correct for update. 549 550 This tests that the kernel state is correct for a successfully 551 downloaded and installed update. That is, the next kernel to 552 boot must be the currently inactive kernel. 553 554 @raise RootFSUpdateError if the DUT next kernel isn't the 555 expected next kernel. 556 """ 557 inactive_kernel = self.get_kernel_state()[1] 558 next_kernel = self._get_next_kernel() 559 if next_kernel != inactive_kernel: 560 raise RootFSUpdateError( 561 'Update failed. The kernel for next boot is %s, ' 562 'but %s was expected.' 563 % (next_kernel['name'], inactive_kernel['name'])) 564 return inactive_kernel 565 566 567 def _verify_update_completed(self): 568 """Verifies that an update has completed. 569 570 @raise RootFSUpdateError if the DUT doesn't indicate that 571 download is complete and the DUT is ready for reboot. 572 """ 573 status = self.check_update_status() 574 if status != UPDATER_NEED_REBOOT: 575 error_msg = '' 576 if status == UPDATER_IDLE: 577 error_msg = 'Update error: %s' % self._get_last_update_error() 578 raise RootFSUpdateError( 579 'Update engine status is %s (%s was expected). %s' 580 % (status, UPDATER_NEED_REBOOT, error_msg)) 581 return self._verify_kernel_state() 582 583 584 def trigger_update(self): 585 """Triggers a background update.""" 586 # If this function is called immediately after reboot (which it 587 # can be), there is no guarantee that the update engine is up 588 # and running yet, so wait for it. 589 self._wait_for_update_service() 590 591 autoupdate_cmd = ('%s --check_for_update --omaha_url=%s' % 592 (_UPDATER_BIN, self.update_url)) 593 run_args = {'command': autoupdate_cmd} 594 err_prefix = 'Failed to trigger an update on %s. ' % self.host.hostname 595 logging.info('Triggering update via: %s', autoupdate_cmd) 596 metric_fields = {'success': False} 597 try: 598 self._base_update_handler(run_args, err_prefix) 599 metric_fields['success'] = True 600 finally: 601 c = metrics.Counter('chromeos/autotest/autoupdater/trigger') 602 metric_fields.update(_get_metric_fields(self.update_url)) 603 c.increment(fields=metric_fields) 604 605 606 def update_image(self): 607 """Updates the device root FS and kernel and verifies success.""" 608 autoupdate_cmd = ('%s --update --omaha_url=%s' % 609 (_UPDATER_BIN, self.update_url)) 610 if not self.interactive: 611 autoupdate_cmd = '%s --interactive=false' % autoupdate_cmd 612 run_args = {'command': autoupdate_cmd, 'timeout': 3600} 613 err_prefix = ('Failed to install device image using payload at %s ' 614 'on %s. ' % (self.update_url, self.host.hostname)) 615 logging.info('Updating image via: %s', autoupdate_cmd) 616 metric_fields = {'success': False} 617 try: 618 self._base_update_handler(run_args, err_prefix) 619 metric_fields['success'] = True 620 finally: 621 c = metrics.Counter('chromeos/autotest/autoupdater/update') 622 metric_fields.update(_get_metric_fields(self.update_url)) 623 c.increment(fields=metric_fields) 624 return self._verify_update_completed() 625 626 627 def _get_remote_script(self, script_name): 628 """Ensure that `script_name` is present on the DUT. 629 630 The given script (e.g. `stateful_update`) may be present in the 631 stateful partition under /usr/local/bin, or we may have to 632 download it from the devserver. 633 634 Determine whether the script is present or must be downloaded 635 and download if necessary. Then, return a command fragment 636 sufficient to run the script from whereever it now lives on the 637 DUT. 638 639 @param script_name The name of the script as expected in 640 /usr/local/bin and on the devserver. 641 @return A string with the command (minus arguments) that will 642 run the target script. 643 """ 644 remote_script = '/usr/local/bin/%s' % script_name 645 if self.host.path_exists(remote_script): 646 return remote_script 647 remote_tmp_script = '/tmp/%s' % script_name 648 server_name = urlparse.urlparse(self.update_url)[1] 649 script_url = 'http://%s/static/%s' % (server_name, script_name) 650 fetch_script = 'curl -Ss -o %s %s && head -1 %s' % ( 651 remote_tmp_script, script_url, remote_tmp_script) 652 653 first_line = self._run(fetch_script).stdout.strip() 654 655 if first_line and first_line.startswith('#!'): 656 script_interpreter = first_line.lstrip('#!') 657 if script_interpreter: 658 return '%s %s' % (script_interpreter, remote_tmp_script) 659 return None 660 661 def _get_stateful_update_script(self): 662 """Returns a command to run the stateful update script. 663 664 Find `stateful_update` on the target or install it, as 665 necessary. If installation fails, raise an exception. 666 667 @raise StatefulUpdateError if the script can't be found or 668 installed. 669 @return A string that can be joined with arguments to run the 670 `stateful_update` command on the DUT. 671 """ 672 script_command = self._get_remote_script(_STATEFUL_UPDATE_SCRIPT) 673 if not script_command: 674 raise StatefulUpdateError('Could not install %s on DUT' 675 % _STATEFUL_UPDATE_SCRIPT) 676 return script_command 677 678 679 def rollback_rootfs(self, powerwash): 680 """Triggers rollback and waits for it to complete. 681 682 @param powerwash: If true, powerwash as part of rollback. 683 684 @raise RootFSUpdateError if anything went wrong. 685 """ 686 version = self.host.get_release_version() 687 # Introduced can_rollback in M36 (build 5772). # etc/lsb-release matches 688 # X.Y.Z. This version split just pulls the first part out. 689 try: 690 build_number = int(version.split('.')[0]) 691 except ValueError: 692 logging.error('Could not parse build number.') 693 build_number = 0 694 695 if build_number >= 5772: 696 can_rollback_cmd = '%s --can_rollback' % _UPDATER_BIN 697 logging.info('Checking for rollback.') 698 try: 699 self._run(can_rollback_cmd) 700 except error.AutoservRunError as e: 701 raise RootFSUpdateError("Rollback isn't possible on %s: %s" % 702 (self.host.hostname, str(e))) 703 704 rollback_cmd = '%s --rollback --follow' % _UPDATER_BIN 705 if not powerwash: 706 rollback_cmd += ' --nopowerwash' 707 708 logging.info('Performing rollback.') 709 try: 710 self._run(rollback_cmd) 711 except error.AutoservRunError as e: 712 raise RootFSUpdateError('Rollback failed on %s: %s' % 713 (self.host.hostname, str(e))) 714 715 self._verify_update_completed() 716 717 718 def update_stateful(self, clobber=True): 719 """Updates the stateful partition. 720 721 @param clobber: If True, a clean stateful installation. 722 723 @raise StatefulUpdateError if the update script fails to 724 complete successfully. 725 """ 726 logging.info('Updating stateful partition...') 727 statefuldev_url = self.update_url.replace('update', 'static') 728 729 # Attempt stateful partition update; this must succeed so that the newly 730 # installed host is testable after update. 731 statefuldev_cmd = [self._get_stateful_update_script(), statefuldev_url] 732 if clobber: 733 statefuldev_cmd.append('--stateful_change=clean') 734 735 statefuldev_cmd.append('2>&1') 736 try: 737 self._run(' '.join(statefuldev_cmd), timeout=1200) 738 except error.AutoservRunError: 739 raise StatefulUpdateError( 740 'Failed to perform stateful update on %s' % 741 self.host.hostname) 742 743 744 def verify_boot_expectations(self, expected_kernel, rollback_message): 745 """Verifies that we fully booted given expected kernel state. 746 747 This method both verifies that we booted using the correct kernel 748 state and that the OS has marked the kernel as good. 749 750 @param expected_kernel: kernel that we are verifying with, 751 i.e. I expect to be booted onto partition 4 etc. See output of 752 get_kernel_state. 753 @param rollback_message: string include in except message text 754 if we booted with the wrong partition. 755 756 @raise NewBuildUpdateError if any of the various checks fail. 757 """ 758 # Figure out the newly active kernel. 759 active_kernel = self.get_kernel_state()[0] 760 761 # Check for rollback due to a bad build. 762 if active_kernel != expected_kernel: 763 764 # Kernel crash reports should be wiped between test runs, but 765 # may persist from earlier parts of the test, or from problems 766 # with provisioning. 767 # 768 # Kernel crash reports will NOT be present if the crash happened 769 # before encrypted stateful is mounted. 770 # 771 # TODO(dgarrett): Integrate with server/crashcollect.py at some 772 # point. 773 kernel_crashes = glob.glob('/var/spool/crash/kernel.*.kcrash') 774 if kernel_crashes: 775 rollback_message += ': kernel_crash' 776 logging.debug('Found %d kernel crash reports:', 777 len(kernel_crashes)) 778 # The crash names contain timestamps that may be useful: 779 # kernel.20131207.005945.0.kcrash 780 for crash in kernel_crashes: 781 logging.debug(' %s', os.path.basename(crash)) 782 783 # Print out some information to make it easier to debug 784 # the rollback. 785 logging.debug('Dumping partition table.') 786 self._run('cgpt show $(rootdev -s -d)') 787 logging.debug('Dumping crossystem for firmware debugging.') 788 self._run('crossystem --all') 789 raise NewBuildUpdateError(self.update_version, rollback_message) 790 791 # Make sure chromeos-setgoodkernel runs. 792 try: 793 utils.poll_for_condition( 794 lambda: (self._get_kernel_tries(active_kernel) == 0 795 and self._get_kernel_success(active_kernel)), 796 exception=RootFSUpdateError(), 797 timeout=_KERNEL_UPDATE_TIMEOUT, sleep_interval=5) 798 except RootFSUpdateError: 799 services_status = self._run('status system-services').stdout 800 if services_status != 'system-services start/running\n': 801 event = NewBuildUpdateError.CHROME_FAILURE 802 else: 803 event = NewBuildUpdateError.UPDATE_ENGINE_FAILURE 804 raise NewBuildUpdateError(self.update_version, event) 805 806 807 def _prepare_host(self): 808 """Make sure the target DUT is working and ready for update. 809 810 Initially, the target DUT's state is unknown. The DUT is 811 expected to be online, but we strive to be forgiving if Chrome 812 and/or the update engine aren't fully functional. 813 """ 814 # Summary of work, and the rationale: 815 # 1. Reboot, because it's a good way to clear out problems. 816 # 2. Touch the PROVISION_FAILED file, to allow repair to detect 817 # failure later. 818 # 3. Run the hook for host class specific preparation. 819 # 4. Stop Chrome, because the system is designed to eventually 820 # reboot if Chrome is stuck in a crash loop. 821 # 5. Force `update-engine` to start, because if Chrome failed 822 # to start properly, the status of the `update-engine` job 823 # will be uncertain. 824 if not self.host.is_up(): 825 raise HostUpdateError(self.host.hostname, 826 HostUpdateError.DUT_DOWN) 827 self._reset_stateful_partition() 828 self.host.reboot(timeout=self.host.REBOOT_TIMEOUT) 829 self._run('touch %s' % PROVISION_FAILED) 830 self.host.prepare_for_update() 831 self._reset_update_engine() 832 logging.info('Updating from version %s to %s.', 833 self.host.get_release_version(), 834 self.update_version) 835 836 837 def _install_via_update_engine(self): 838 """Install an updating using the production AU flow. 839 840 This uses the standard AU flow and the `stateful_update` script 841 to download and install a root FS, kernel and stateful 842 filesystem content. 843 844 @return The kernel expected to be booted next. 845 """ 846 logging.info('Installing image using update_engine.') 847 expected_kernel = self.update_image() 848 self.update_stateful() 849 self._set_target_version() 850 return expected_kernel 851 852 853 def _quick_provision_with_gs_cache(self, provision_command, devserver_name, 854 image_name): 855 """Run quick_provision using GsCache server. 856 857 @param provision_command: The path of quick_provision command. 858 @param devserver_name: The devserver name and port (optional). 859 @param image_name: The image to be installed. 860 """ 861 logging.info('Try quick provision with gs_cache.') 862 # If enabled, GsCache server listion on different port on the 863 # devserver. 864 gs_cache_server = devserver_name.replace(DEVSERVER_PORT, GS_CACHE_PORT) 865 gs_cache_url = ('http://%s/download/chromeos-image-archive' 866 % gs_cache_server) 867 868 # Check if GS_Cache server is enabled on the server. 869 self._run('curl -s -o /dev/null %s' % gs_cache_url) 870 871 command = '%s --noreboot %s %s' % (provision_command, image_name, 872 gs_cache_url) 873 self._run(command) 874 metrics.Counter(_metric_name('quick_provision')).increment( 875 fields={'devserver': devserver_name, 'gs_cache': True}) 876 877 878 def _quick_provision_with_devserver(self, provision_command, 879 devserver_name, image_name): 880 """Run quick_provision using legacy devserver. 881 882 @param provision_command: The path of quick_provision command. 883 @param devserver_name: The devserver name and port (optional). 884 @param image_name: The image to be installed. 885 """ 886 logging.info('Try quick provision with devserver.') 887 ds = dev_server.ImageServer('http://%s' % devserver_name) 888 try: 889 ds.stage_artifacts(image_name, ['quick_provision', 'stateful']) 890 except dev_server.DevServerException as e: 891 raise error.TestFail, str(e), sys.exc_info()[2] 892 893 static_url = 'http://%s/static' % devserver_name 894 command = '%s --noreboot %s %s' % (provision_command, image_name, 895 static_url) 896 self._run(command) 897 metrics.Counter(_metric_name('quick_provision')).increment( 898 fields={'devserver': devserver_name, 'gs_cache': False}) 899 900 901 def _install_via_quick_provision(self): 902 """Install an updating using the `quick-provision` script. 903 904 This uses the `quick-provision` script to download and install 905 a root FS, kernel and stateful filesystem content. 906 907 @return The kernel expected to be booted next. 908 """ 909 if not self._use_quick_provision: 910 return None 911 image_name = url_to_image_name(self.update_url) 912 logging.info('Installing image using quick-provision.') 913 provision_command = self._get_remote_script(_QUICK_PROVISION_SCRIPT) 914 server_name = urlparse.urlparse(self.update_url)[1] 915 try: 916 try: 917 self._quick_provision_with_gs_cache(provision_command, 918 server_name, image_name) 919 except Exception: 920 self._quick_provision_with_devserver(provision_command, 921 server_name, image_name) 922 923 self._set_target_version() 924 return self._verify_kernel_state() 925 except Exception: 926 # N.B. We handle only `Exception` here. Non-Exception 927 # classes (such as KeyboardInterrupt) are handled by our 928 # caller. 929 logging.exception('quick-provision script failed; ' 930 'will fall back to update_engine.') 931 self._revert_boot_partition() 932 self._reset_stateful_partition() 933 self._reset_update_engine() 934 return None 935 936 937 def _install_update(self): 938 """Install the requested image on the DUT, but don't start it. 939 940 This downloads and installs a root FS, kernel and stateful 941 filesystem content. This does not reboot the DUT, so the update 942 is merely pending when the method returns. 943 944 @return The kernel expected to be booted next. 945 """ 946 logging.info('Installing image at %s onto %s', 947 self.update_url, self.host.hostname) 948 try: 949 return (self._install_via_quick_provision() 950 or self._install_via_update_engine()) 951 except: 952 # N.B. This handling code includes non-Exception classes such 953 # as KeyboardInterrupt. We need to clean up, but we also must 954 # re-raise. 955 self._revert_boot_partition() 956 self._reset_stateful_partition() 957 self._reset_update_engine() 958 # Collect update engine logs in the event of failure. 959 if self.host.job: 960 logging.info('Collecting update engine logs due to failure...') 961 self.host.get_file( 962 _UPDATER_LOGS, self.host.job.sysinfo.sysinfodir, 963 preserve_perm=False) 964 _list_image_dir_contents(self.update_url) 965 raise 966 967 968 def _complete_update(self, expected_kernel): 969 """Finish the update, and confirm that it succeeded. 970 971 Initial condition is that the target build has been downloaded 972 and installed on the DUT, but has not yet been booted. This 973 function is responsible for rebooting the DUT, and checking that 974 the new build is running successfully. 975 976 @param expected_kernel: kernel expected to be active after reboot. 977 """ 978 # Regarding the 'crossystem' command below: In some cases, 979 # the update flow puts the TPM into a state such that it 980 # fails verification. We don't know why. However, this 981 # call papers over the problem by clearing the TPM during 982 # the reboot. 983 # 984 # We ignore failures from 'crossystem'. Although failure 985 # here is unexpected, and could signal a bug, the point of 986 # the exercise is to paper over problems; allowing this to 987 # fail would defeat the purpose. 988 self._run('crossystem clear_tpm_owner_request=1', 989 ignore_status=True) 990 self.host.reboot(timeout=self.host.REBOOT_TIMEOUT) 991 992 # Touch the lab machine file to leave a marker that 993 # distinguishes this image from other test images. 994 # Afterwards, we must re-run the autoreboot script because 995 # it depends on the _LAB_MACHINE_FILE. 996 autoreboot_cmd = ('FILE="%s" ; [ -f "$FILE" ] || ' 997 '( touch "$FILE" ; start autoreboot )') 998 self._run(autoreboot_cmd % _LAB_MACHINE_FILE) 999 self.verify_boot_expectations( 1000 expected_kernel, NewBuildUpdateError.ROLLBACK_FAILURE) 1001 1002 logging.debug('Cleaning up old autotest directories.') 1003 try: 1004 installed_autodir = autotest.Autotest.get_installed_autodir( 1005 self.host) 1006 self._run('rm -rf ' + installed_autodir) 1007 except autotest.AutodirNotFoundError: 1008 logging.debug('No autotest installed directory found.') 1009 1010 1011 def run_update(self): 1012 """Perform a full update of a DUT in the test lab. 1013 1014 This downloads and installs the root FS and stateful partition 1015 content needed for the update specified in `self.host` and 1016 `self.update_url`. The update is performed according to the 1017 requirements for provisioning a DUT for testing the requested 1018 build. 1019 1020 At the end of the procedure, metrics are reported describing the 1021 outcome of the operation. 1022 1023 @returns A tuple of the form `(image_name, attributes)`, where 1024 `image_name` is the name of the image installed, and 1025 `attributes` is new attributes to be applied to the DUT. 1026 """ 1027 server_name = dev_server.get_resolved_hostname(self.update_url) 1028 metrics.Counter(_metric_name('install')).increment( 1029 fields={'devserver': server_name}) 1030 1031 try: 1032 self._prepare_host() 1033 except _AttributedUpdateError: 1034 raise 1035 except Exception as e: 1036 logging.exception('Failure preparing host prior to update.') 1037 raise HostUpdateError(self.host.hostname, str(e)) 1038 1039 try: 1040 expected_kernel = self._install_update() 1041 except _AttributedUpdateError: 1042 raise 1043 except Exception as e: 1044 logging.exception('Failure during download and install.') 1045 raise ImageInstallError(self.host.hostname, server_name, str(e)) 1046 1047 try: 1048 self._complete_update(expected_kernel) 1049 except _AttributedUpdateError: 1050 raise 1051 except Exception as e: 1052 logging.exception('Failure from build after update.') 1053 raise NewBuildUpdateError(self.update_version, str(e)) 1054 1055 image_name = url_to_image_name(self.update_url) 1056 # update_url is different from devserver url needed to stage autotest 1057 # packages, therefore, resolve a new devserver url here. 1058 devserver_url = dev_server.ImageServer.resolve( 1059 image_name, self.host.hostname).url() 1060 repo_url = tools.get_package_url(devserver_url, image_name) 1061 return image_name, {ds_constants.JOB_REPO_URL: repo_url} 1062