1# Copyright 2016 The Chromium OS Authors. All rights reserved. 2# Use of this source code is governed by a BSD-style license that can be 3# found in the LICENSE file. 4 5import json 6import logging 7import os 8import time 9 10import common 11from autotest_lib.client.common_lib import error 12from autotest_lib.client.common_lib import global_config 13from autotest_lib.client.common_lib import hosts 14from autotest_lib.client.common_lib.cros import retry 15from autotest_lib.server import afe_utils 16from autotest_lib.server import crashcollect 17from autotest_lib.server.hosts import repair 18from autotest_lib.server.hosts import cros_firmware 19 20# _DEV_MODE_ALLOW_POOLS - The set of pools that are allowed to be 21# in dev mode (usually, those should be unmanaged devices) 22# 23_DEV_MODE_ALLOWED_POOLS = set( 24 global_config.global_config.get_config_value( 25 'CROS', 26 'pools_dev_mode_allowed', 27 type=str, 28 default='', 29 allow_blank=True).split(',')) 30 31# Setting to suppress dev mode check; primarily used for moblab where all 32# DUT's are in dev mode. 33_DEV_MODE_ALWAYS_ALLOWED = global_config.global_config.get_config_value( 34 'CROS', 35 'dev_mode_allowed', 36 type=bool, 37 default=False) 38 39# Triggers for the 'au', 'powerwash', and 'usb' repair actions. 40# These are also used as dependencies in the `CrosHost` repair 41# sequence, as follows: 42# 43# au: 44# - triggers: _CROS_AU_TRIGGERS 45# - depends on: _CROS_USB_TRIGGERS + _CROS_POWERWASH_TRIGGERS 46# 47# powerwash: 48# - triggers: _CROS_POWERWASH_TRIGGERS + _CROS_AU_TRIGGERS 49# - depends on: _CROS_USB_TRIGGERS 50# 51# usb: 52# - triggers: _CROS_USB_TRIGGERS + _CROS_POWERWASH_TRIGGERS + 53# _CROS_AU_TRIGGERS 54# - no dependencies 55# 56# N.B. AC power detection depends on software on the DUT, and there 57# have been bugs where detection failed even though the DUT really 58# did have power. So, we make the 'power' verifier a trigger for 59# reinstall repair actions, too. 
#
# TODO(jrbarnette): AU repair can't fix all problems reported by
# the 'cros' verifier; it's listed as an AU trigger as a
# simplification.  The ultimate fix is to split the 'cros' verifier
# into smaller individual verifiers.
_CROS_AU_TRIGGERS = ('power', 'rwfw', 'python', 'cros',)
_CROS_POWERWASH_TRIGGERS = ('tpm', 'good_au', 'ext4',)
_CROS_USB_TRIGGERS = ('ssh', 'writable',)


class ACPowerVerifier(hosts.Verifier):
    """Check for AC power and a reasonable battery charge."""

    def verify(self, host):
        # Temporarily work around a problem caused by some old FSI
        # builds that don't have the power_supply_info command by
        # ignoring failures.  The repair triggers believe that this
        # verifier can't be fixed by re-installing, which means if a DUT
        # gets stuck with one of those old builds, it can't be repaired.
        #
        # TODO(jrbarnette): This is for crbug.com/599158; we need a
        # better solution.
        try:
            info = host.get_power_supply_info()
        except Exception:
            # Deliberately broad (see TODO above), but not a bare
            # `except:` so KeyboardInterrupt/SystemExit still propagate.
            logging.exception('get_power_supply_info() failed')
            return
        try:
            if info['Line Power']['online'] != 'yes':
                raise hosts.AutoservVerifyError(
                        'AC power is not plugged in')
        except KeyError:
            logging.info('Cannot determine AC power status - '
                         'skipping check.')
        try:
            if float(info['Battery']['percentage']) < 50.0:
                raise hosts.AutoservVerifyError(
                        'Battery is less than 50%')
        except KeyError:
            logging.info('Cannot determine battery status - '
                         'skipping check.')

    @property
    def description(self):
        return 'The DUT is plugged in to AC power'


class WritableVerifier(hosts.Verifier):
    """
    Confirm the stateful file systems are writable.

    The standard linux response to certain unexpected file system errors
    (including hardware errors in block devices) is to change the file
    system status to read-only.  This checks that that hasn't happened.

    The test covers the two file systems that need to be writable for
    critical operations like AU:
      * The (unencrypted) stateful system which includes
        /mnt/stateful_partition.
      * The encrypted stateful partition, which includes /var.

    The test doesn't check various bind mounts; those are expected to
    fail the same way as their underlying main mounts.  Whether the
    Linux kernel can guarantee that is untested...
    """

    # N.B. Order matters here:  Encrypted stateful is loop-mounted from
    # a file in unencrypted stateful, so we don't test for errors in
    # encrypted stateful if unencrypted fails.
    _TEST_DIRECTORIES = ['/mnt/stateful_partition', '/var/tmp']

    def verify(self, host):
        # This deliberately stops looking after the first error.
        # See above for the details.
        for testdir in self._TEST_DIRECTORIES:
            filename = os.path.join(testdir, 'writable_test')
            command = 'touch %s && rm %s' % (filename, filename)
            rv = host.run(command=command, ignore_status=True)
            if rv.exit_status != 0:
                msg = 'Can\'t create a file in %s' % testdir
                raise hosts.AutoservVerifyError(msg)

    @property
    def description(self):
        return 'The stateful filesystems are writable'


class EXT4fsErrorVerifier(hosts.Verifier):
    """
    Confirm we have not seen critical file system kernel errors.
    """
    def verify(self, host):
        # grep for stateful FS errors of the type "EXT4-fs error (device sda1):"
        command = ("dmesg | grep -E \"EXT4-fs error \(device "
                   "$(cut -d ' ' -f 5,9 /proc/$$/mountinfo | "
                   "grep -e '^/mnt/stateful_partition ' | "
                   "cut -d ' ' -f 2 | cut -d '/' -f 3)\):\"")
        output = host.run(command=command, ignore_status=True).stdout
        if output:
            sample = output.splitlines()[0]
            message = 'Saw file system error: %s' % sample
            raise hosts.AutoservVerifyError(message)
        # Check for other critical FS errors.
        command = 'dmesg | grep "This should not happen!! Data will be lost"'
        output = host.run(command=command, ignore_status=True).stdout
        if output:
            message = 'Saw file system error: Data will be lost'
            raise hosts.AutoservVerifyError(message)
        # N.B. An earlier revision logged "Could not determine stateful
        # mount." at error level in an `else` clause here; because the
        # matching `if` raises, that message fired on every healthy DUT,
        # so it has been removed.

    @property
    def description(self):
        return 'Did not find critical file system errors'


class UpdateSuccessVerifier(hosts.Verifier):
    """
    Checks that the DUT successfully finished its last provision job.

    At the start of any update (e.g. for a Provision job), the code
    creates a marker file named `host.PROVISION_FAILED`.  The file is
    located in a part of the stateful partition that will be removed if
    an update finishes successfully.  Thus, the presence of the file
    indicates that a prior update failed.

    The verifier tests for the existence of the marker file and fails if
    it still exists.
    """
    def verify(self, host):
        result = host.run('test -f %s' % host.PROVISION_FAILED,
                          ignore_status=True)
        if result.exit_status == 0:
            raise hosts.AutoservVerifyError(
                    'Last AU on this DUT failed')

    @property
    def description(self):
        return 'The most recent AU attempt on this DUT succeeded'


class TPMStatusVerifier(hosts.Verifier):
    """Verify that the host's TPM is in a good state."""

    def verify(self, host):
        # This cryptohome command emits status information in JSON format. It
        # looks something like this:
        # {
        #    "installattrs": {
        #       ...
        #    },
        #    "mounts": [ {
        #       ...
        #    } ],
        #    "tpm": {
        #       "being_owned": false,
        #       "can_connect": true,
        #       "can_decrypt": false,
        #       "can_encrypt": false,
        #       "can_load_srk": true,
        #       "can_load_srk_pubkey": true,
        #       "enabled": true,
        #       "has_context": true,
        #       "has_cryptohome_key": false,
        #       "has_key_handle": false,
        #       "last_error": 0,
        #       "owned": true
        #    }
        # }
        output = host.run('cryptohome --action=status').stdout.strip()
        try:
            status = json.loads(output)
        except ValueError:
            logging.info('Cannot determine the Cryptohome valid status - '
                         'skipping check.')
            return
        try:
            tpm = status['tpm']
            if not tpm['enabled']:
                raise hosts.AutoservVerifyError(
                        'TPM is not enabled -- Hardware is not working.')
            if not tpm['can_connect']:
                raise hosts.AutoservVerifyError(
                        ('TPM connect failed -- '
                         'last_error=%d.' % tpm['last_error']))
            if tpm['owned'] and not tpm['can_load_srk']:
                raise hosts.AutoservVerifyError(
                        'Cannot load the TPM SRK')
            if tpm['can_load_srk'] and not tpm['can_load_srk_pubkey']:
                raise hosts.AutoservVerifyError(
                        'Cannot load the TPM SRK public key')
        except KeyError:
            logging.info('Cannot determine the Cryptohome valid status - '
                         'skipping check.')

    @property
    def description(self):
        return 'The host\'s TPM is available and working'


class PythonVerifier(hosts.Verifier):
    """Confirm the presence of a working Python interpreter."""

    def verify(self, host):
        result = host.run('python -c "import cPickle"',
                          ignore_status=True)
        if result.exit_status != 0:
            message = 'The python interpreter is broken'
            if result.exit_status == 127:
                # Exit status 127 means the command wasn't found at all;
                # distinguish "python is missing" from "python is broken".
                search = host.run('which python', ignore_status=True)
                if search.exit_status != 0 or not search.stdout:
                    message = ('Python is missing; may be caused by '
                               'powerwash')
            raise hosts.AutoservVerifyError(message)

    @property
    def description(self):
        return 'Python on the host is installed and working'


class DevModeVerifier(hosts.Verifier):
    """Verify that the host is not in dev mode."""

    def verify(self, host):
        # Some pools are allowed to be in dev mode
        info = host.host_info_store.get()
        if (_DEV_MODE_ALWAYS_ALLOWED or
                bool(info.pools & _DEV_MODE_ALLOWED_POOLS)):
            return

        result = host.run('crossystem devsw_boot', ignore_status=True).stdout
        if result != '0':
            raise hosts.AutoservVerifyError('The host is in dev mode')

    @property
    def description(self):
        return 'The host should not be in dev mode'


class JetstreamServicesVerifier(hosts.Verifier):
    """Verify that Jetstream services are running."""

    # Retry for b/62576902
    @retry.retry(error.AutoservError, timeout_min=1, delay_sec=10)
    def verify(self, host):
        try:
            if not host.upstart_status('ap-controller'):
                raise hosts.AutoservVerifyError(
                        'ap-controller service is not running')
        except error.AutoservRunError:
            raise hosts.AutoservVerifyError(
                    'ap-controller service not found')

        try:
            host.run('pgrep ap-controller')
        except error.AutoservRunError:
            raise hosts.AutoservVerifyError(
                    'ap-controller process is not running')

    @property
    def description(self):
        return 'Jetstream services must be running'


class ServoSysRqRepair(hosts.RepairAction):
    """
    Repair a Chrome device by sending a system request to the kernel.

    Sending 3 times the Alt+VolUp+x key combination (aka sysrq-x)
    will ask the kernel to panic itself and reboot while conserving
    the kernel logs in console ramoops.
    """

    def repair(self, host):
        if not host.servo:
            raise hosts.AutoservRepairError(
                    '%s has no servo support.' % host.hostname)
        # Press 3 times Alt+VolUp+X
        # no checking DUT health between each press as
        # killing Chrome is not really likely to fix the DUT SSH.
        for _ in range(3):
            try:
                host.servo.sysrq_x()
            except error.TestFail as ex:
                # `as` (not the py2-only comma form) so this module also
                # parses under Python 3.
                raise hosts.AutoservRepairError(
                        'cannot press sysrq-x: %s.' % str(ex))
            # less than 5 seconds between presses.
            time.sleep(2.0)

        if host.wait_up(host.BOOT_TIMEOUT):
            # Collect logs once we regain ssh access before clobbering them.
            local_log_dir = crashcollect.get_crashinfo_dir(host, 'after_sysrq')
            host.collect_logs('/var/log', local_log_dir, ignore_errors=True)
            # Collect crash info.
            crashcollect.get_crashinfo(host, None)
            return
        raise hosts.AutoservRepairError(
                '%s is still offline after sysrq-x.' % host.hostname)

    @property
    def description(self):
        return 'Reset the DUT via keyboard sysrq-x'


class ServoResetRepair(hosts.RepairAction):
    """Repair a Chrome device by resetting it with servo."""

    def repair(self, host):
        if not host.servo:
            raise hosts.AutoservRepairError(
                    '%s has no servo support.' % host.hostname)
        host.servo.get_power_state_controller().reset()
        if host.wait_up(host.BOOT_TIMEOUT):
            # Collect logs once we regain ssh access before clobbering them.
            local_log_dir = crashcollect.get_crashinfo_dir(host, 'after_reset')
            host.collect_logs('/var/log', local_log_dir, ignore_errors=True)
            # Collect crash info.
            crashcollect.get_crashinfo(host, None)
            return
        raise hosts.AutoservRepairError(
                '%s is still offline after servo reset.' % host.hostname)

    @property
    def description(self):
        return 'Reset the DUT via servo'


class CrosRebootRepair(repair.RebootRepair):
    """Repair a CrOS target by clearing dev mode and rebooting it."""

    def repair(self, host):
        # N.B. We need to reboot regardless of whether set_gbb_flags
        # succeeds or fails.
        host.run('/usr/share/vboot/bin/set_gbb_flags.sh 0',
                 ignore_status=True)
        super(CrosRebootRepair, self).repair(host)

    @property
    def description(self):
        return 'Reset GBB flags and Reboot the host'


class AutoUpdateRepair(hosts.RepairAction):
    """
    Repair by re-installing a test image using autoupdate.

    Try to install the DUT's designated "stable test image" using the
    standard procedure for installing a new test image via autoupdate.
    """

    def repair(self, host):
        afe_utils.machine_install_and_update_labels(host, repair=True)

    @property
    def description(self):
        return 'Re-install the stable build via AU'


class PowerWashRepair(AutoUpdateRepair):
    """
    Powerwash the DUT, then re-install using autoupdate.

    Powerwash the DUT, then attempt to re-install a stable test image as
    for `AutoUpdateRepair`.
    """

    def repair(self, host):
        host.run('echo "fast safe" > '
                 '/mnt/stateful_partition/factory_install_reset')
        host.reboot(timeout=host.POWERWASH_BOOT_TIMEOUT, wait=True)
        super(PowerWashRepair, self).repair(host)

    @property
    def description(self):
        return 'Powerwash and then re-install the stable build via AU'


class ServoInstallRepair(hosts.RepairAction):
    """
    Reinstall a test image from USB using servo.

    Use servo to re-install the DUT's designated "stable test image"
    from servo-attached USB storage.
    """

    def repair(self, host):
        if not host.servo:
            raise hosts.AutoservRepairError(
                    '%s has no servo support.' % host.hostname)
        host.servo_install(host.stage_image_for_servo())

    @property
    def description(self):
        return 'Reinstall from USB using servo'


class JetstreamRepair(hosts.RepairAction):
    """Repair by restarting Jetstream services."""

    def repair(self, host):
        host.cleanup_services()

    @property
    def description(self):
        return 'Restart Jetstream services'


def _cros_verify_dag():
    """Return the verification DAG for a `CrosHost`."""
    FirmwareStatusVerifier = cros_firmware.FirmwareStatusVerifier
    FirmwareVersionVerifier = cros_firmware.FirmwareVersionVerifier
    verify_dag = (
        (repair.SshVerifier,         'ssh',      ()),
        (DevModeVerifier,            'devmode',  ('ssh',)),
        (ACPowerVerifier,            'power',    ('ssh',)),
        (EXT4fsErrorVerifier,        'ext4',     ('ssh',)),
        (WritableVerifier,           'writable', ('ssh',)),
        (TPMStatusVerifier,          'tpm',      ('ssh',)),
        (UpdateSuccessVerifier,      'good_au',  ('ssh',)),
        (FirmwareStatusVerifier,     'fwstatus', ('ssh',)),
        (FirmwareVersionVerifier,    'rwfw',     ('ssh',)),
        (PythonVerifier,             'python',   ('ssh',)),
        (repair.LegacyHostVerifier,  'cros',     ('ssh',)),
    )
    return verify_dag


def _cros_basic_repair_actions():
    """Return the basic repair actions for a `CrosHost`"""
    FirmwareRepair = cros_firmware.FirmwareRepair
    repair_actions = (
        # RPM cycling must precede Servo reset:  if the DUT has a dead
        # battery, we need to reattach AC power before we reset via servo.
        (repair.RPMCycleRepair, 'rpm', (), ('ssh', 'power',)),
        (ServoSysRqRepair, 'sysrq', (), ('ssh',)),
        (ServoResetRepair, 'servoreset', (), ('ssh',)),

        # N.B. FirmwareRepair can't fix a 'good_au' failure directly,
        # because it doesn't remove the flag file that triggers the
        # failure.  We include it as a repair trigger because it's
        # possible that the last update failed because of the firmware,
        # and we want the repair steps below to be able to trust the
        # firmware.
        (FirmwareRepair, 'firmware', (), ('ssh', 'fwstatus', 'good_au',)),

        (CrosRebootRepair, 'reboot', ('ssh',), ('devmode', 'writable',)),
    )
    return repair_actions


def _cros_extended_repair_actions(au_triggers=_CROS_AU_TRIGGERS,
                                  powerwash_triggers=_CROS_POWERWASH_TRIGGERS,
                                  usb_triggers=_CROS_USB_TRIGGERS):
    """Return the extended repair actions for a `CrosHost`"""

    # The dependencies and triggers for the 'au', 'powerwash', and 'usb'
    # repair actions stack up:  Each one is able to repair progressively
    # more verifiers than the one before.  The 'triggers' lists specify
    # the progression.

    repair_actions = (
        (AutoUpdateRepair, 'au',
                usb_triggers + powerwash_triggers, au_triggers),
        (PowerWashRepair, 'powerwash',
                usb_triggers, powerwash_triggers + au_triggers),
        (ServoInstallRepair, 'usb',
                (), usb_triggers + powerwash_triggers + au_triggers),
    )
    return repair_actions


def _cros_repair_actions():
    """Return the repair actions for a `CrosHost`."""
    repair_actions = (_cros_basic_repair_actions() +
                      _cros_extended_repair_actions())
    return repair_actions


def create_cros_repair_strategy():
    """Return a `RepairStrategy` for a `CrosHost`."""
    verify_dag = _cros_verify_dag()
    repair_actions = _cros_repair_actions()
    return hosts.RepairStrategy(verify_dag, repair_actions)


def _moblab_verify_dag():
    """Return the verification DAG for a `MoblabHost`."""
    FirmwareVersionVerifier = cros_firmware.FirmwareVersionVerifier
    verify_dag = (
        (repair.SshVerifier,         'ssh',     ()),
        (ACPowerVerifier,            'power',   ('ssh',)),
        (FirmwareVersionVerifier,    'rwfw',    ('ssh',)),
        (PythonVerifier,             'python',  ('ssh',)),
        (repair.LegacyHostVerifier,  'cros',    ('ssh',)),
    )
    return verify_dag


def _moblab_repair_actions():
    """Return the repair actions for a `MoblabHost`."""
    repair_actions = (
        (repair.RPMCycleRepair, 'rpm', (), ('ssh', 'power',)),
        (AutoUpdateRepair, 'au', ('ssh',), _CROS_AU_TRIGGERS),
    )
    return repair_actions


def create_moblab_repair_strategy():
    """
    Return a `RepairStrategy` for a `MoblabHost`.

    Moblab is a subset of the CrOS verify and repair.  Several pieces
    are removed because they're not expected to be meaningful.  Some
    others are removed for more specific reasons:

    'tpm':  Moblab DUTs don't run the tests that matter to this
        verifier.  TODO(jrbarnette)  This assertion is unproven.

    'good_au':  This verifier can't pass, because the Moblab AU
        procedure doesn't properly delete CrosHost.PROVISION_FAILED.
        TODO(jrbarnette) We should refactor _machine_install() so that
        it can be different for Moblab.

    'firmware':  Moblab DUTs shouldn't be in FAFT pools, so we don't try
        this.

    'powerwash':  Powerwash on Moblab causes trouble with deleting the
        DHCP leases file, so we skip it.
    """
    verify_dag = _moblab_verify_dag()
    repair_actions = _moblab_repair_actions()
    return hosts.RepairStrategy(verify_dag, repair_actions)


def _jetstream_repair_actions():
    """Return the repair actions for a `JetstreamHost`."""
    au_triggers = _CROS_AU_TRIGGERS + ('jetstream_services',)
    repair_actions = (
        _cros_basic_repair_actions() +
        (
            (JetstreamRepair, 'jetstream_repair',
             _CROS_USB_TRIGGERS + _CROS_POWERWASH_TRIGGERS, au_triggers),
        ) +
        _cros_extended_repair_actions(au_triggers=au_triggers))
    return repair_actions


def _jetstream_verify_dag():
    """Return the verification DAG for a `JetstreamHost`."""
    verify_dag = _cros_verify_dag() + (
        (JetstreamServicesVerifier, 'jetstream_services', ('ssh',)),
    )
    return verify_dag


def create_jetstream_repair_strategy():
    """
    Return a `RepairStrategy` for a `JetstreamHost`.

    The Jetstream repair strategy is based on the CrOS verify and repair,
    but adds the JetstreamServicesVerifier.
    """
    verify_dag = _jetstream_verify_dag()
    repair_actions = _jetstream_repair_actions()
    return hosts.RepairStrategy(verify_dag, repair_actions)