1# Lint as: python2, python3 2# Copyright (c) 2013 The Chromium OS Authors. All rights reserved. 3# Use of this source code is governed by a BSD-style license that can be 4# found in the LICENSE file. 5# 6# Expects to be run in an environment with sudo and no interactive password 7# prompt, such as within the Chromium OS development chroot. 8 9 10"""This file provides core logic for servo verify/repair process.""" 11 12 13from __future__ import absolute_import 14from __future__ import division 15from __future__ import print_function 16 17import logging 18import os 19import re 20import tarfile 21import threading 22import json 23import time 24import six 25import six.moves.xmlrpc_client 26import calendar 27 28from autotest_lib.client.bin import utils 29from autotest_lib.client.common_lib import error 30from autotest_lib.client.common_lib import hosts 31from autotest_lib.client.common_lib import lsbrelease_utils 32from autotest_lib.client.common_lib.cros import retry 33from autotest_lib.client.common_lib.cros.network import ping_runner 34from autotest_lib.server.cros.servo import servo 35from autotest_lib.server.hosts import servo_repair 36from autotest_lib.server.hosts import base_servohost 37from autotest_lib.server.hosts import servo_constants 38from autotest_lib.server.cros.faft.utils import config 39from autotest_lib.client.common_lib import global_config 40from autotest_lib.site_utils.admin_audit import servo_updater 41from autotest_lib.server.cros.servo.topology import servo_topology 42 43try: 44 from chromite.lib import metrics 45except ImportError: 46 metrics = utils.metrics_mock 47 48_CONFIG = global_config.global_config 49 50 51class ServoHost(base_servohost.BaseServoHost): 52 """Host class for a servo host(e.g. beaglebone, labstation) 53 that with a servo instance for a specific port. 54 55 @type _servo: servo.Servo | None 56 """ 57 58 DEFAULT_PORT = int(os.getenv('SERVOD_PORT', '9999')) 59 60 # Timeout for initializing servo signals. 61 INITIALIZE_SERVO_TIMEOUT_SECS = 60 62 63 # Default timeout for run terminal command. 64 DEFAULT_TERMINAL_TIMEOUT = 30 65 66 # Ready test function 67 SERVO_READY_METHOD = 'get_version' 68 69 # Directory prefix on the servo host where the servod logs are stored. 70 SERVOD_LOG_PREFIX = '/var/log/servod' 71 72 # Exit code to use when symlinks for servod logs are not found. 73 NO_SYMLINKS_CODE = 9 74 75 # Directory in the job's results directory to dump the logs into. 76 LOG_DIR = 'servod' 77 78 # Prefix for joint loglevel files in the logs. 79 JOINT_LOG_PREFIX = 'log' 80 81 # Regex group to extract timestamp from logfile name. 82 TS_GROUP = 'ts' 83 84 # This regex is used to extract the timestamp from servod logs. 85 # files always start with log. 86 TS_RE = (r'log.' 87 # The timestamp is of format %Y-%m-%d--%H-%M-%S.MS 88 r'(?P<%s>\d{4}(\-\d{2}){2}\-(-\d{2}){3}.\d{3})' 89 # The loglevel is optional depending on labstation version. 90 r'(.(INFO|DEBUG|WARNING))?' % TS_GROUP) 91 TS_EXTRACTOR = re.compile(TS_RE) 92 93 # Regex group to extract MCU name from logline in servod logs. 94 MCU_GROUP = 'mcu' 95 96 # Regex group to extract logline from MCU logline in servod logs. 97 LINE_GROUP = 'line' 98 99 # This regex is used to extract the mcu and the line content from an 100 # MCU logline in servod logs. e.g. EC or servo_v4 console logs. 101 # Here is an example log-line: 102 # 103 # 2020-01-23 13:15:12,223 - servo_v4 - EC3PO.Console - DEBUG - 104 # console.py:219:LogConsoleOutput - /dev/pts/9 - cc polarity: cc1 105 # 106 # Here is conceptually how they are formatted: 107 # 108 # <time> - <MCU> - EC3PO.Console - <LVL> - <file:line:func> - <pts> - 109 # <output> 110 # 111 # The log format starts with a timestamp 112 MCU_RE = (r'[\d\-]+ [\d:,]+ ' 113 # The mcu that is logging this is next. 114 r'- (?P<%s>\w+) - ' 115 # Next, we have more log outputs before the actual line. 116 # Information about the file line, logging function etc. 117 # Anchor on EC3PO Console, LogConsoleOutput and dev/pts. 118 # NOTE: if the log format changes, this regex needs to be 119 # adjusted. 120 r'EC3PO\.Console[\s\-\w\d:.]+LogConsoleOutput - /dev/pts/\d+ - ' 121 # Lastly, we get the MCU's console line. 122 r'(?P<%s>.+$)' % (MCU_GROUP, LINE_GROUP)) 123 MCU_EXTRACTOR = re.compile(MCU_RE) 124 125 # Regex to detect timeout messages when USBC pigtail has timeout issue. 126 # e.g.: [475635.427072 PD TMOUT RX 1/1] 127 USBC_PIGTAIL_TIMEOUT_RE = r'\[[\d \.]{1,20}(PD TMOUT RX 1\/1)\]' 128 129 # Suffix to identify compressed logfiles. 130 COMPRESSION_SUFFIX = '.tbz2' 131 132 # A suffix to mark servod log directories that came from instance that 133 # ran during this servo_host, but are not the last one running e.g. when 134 # an instance (on purpose, or due to a bug) restarted in the middle of the 135 # run. 136 OLD_LOG_SUFFIX = 'old' 137 138 # Mapping servo board with their vid-pid 139 SERVO_VID_PID = { 140 'servo_v4': '18d1:501b', 141 'ccd_cr50': '18d1:5014', 142 'servo_micro': '18d1:501a', 143 'servo_v3': ['18d1:5004', '0403:6014'], 144 'c2d2': '1d81:5041', 145 } 146 147 def _init_attributes(self): 148 self._servo_state = None 149 self.servo_port = None 150 self.servo_board = None 151 self.servo_model = None 152 self.servo_serial = None 153 self.servo_setup = None 154 self.additional_servod_args = None 155 self._dut_health_profile = None 156 # The flag that indicate if a servo is connected to a smart usbhub. 157 # TODO(xianuowang@) remove this flag once all usbhubs in the lab 158 # get replaced. 159 self.smart_usbhub = None 160 self._servo = None 161 self._topology = None 162 self._tunnel_proxy = None 163 self._tunnel_proxy_lock = threading.Lock() 164 self._initial_instance_ts = None 165 # Flag to make sure that multiple calls to close do not result in the 166 # logic executing multiple times. 167 self._closed = False 168 # Per-thread local data 169 self._local = threading.local() 170 171 def _initialize(self, 172 servo_host='localhost', 173 servo_port=DEFAULT_PORT, 174 servo_board=None, 175 servo_model=None, 176 servo_serial=None, 177 servo_setup=None, 178 additional_servod_args=None, 179 is_in_lab=None, 180 *args, 181 **dargs): 182 """Initialize a ServoHost instance. 183 184 A ServoHost instance represents a host that controls a servo. 185 186 @param servo_host: Name of the host where the servod process 187 is running. 188 @param servo_port: Port the servod process is listening on. Defaults 189 to the SERVOD_PORT environment variable if set, 190 otherwise 9999. 191 @param servo_board: Board that the servo is connected to. 192 @param servo_model: Model that the servo is connected to. 193 @param servo_serial: Serial number of the servo device. 194 @param servo_setup: Type of servo setup, e.g. REGULAR or DUAL_V4. 195 @param additional_servod_args: Additional args that will append to 196 servod start command. 197 @param is_in_lab: True if the servo host is in Cros Lab. Default is set 198 to None, for which utils.host_is_in_lab_zone will be 199 called to check if the servo host is in Cros lab. 200 201 """ 202 super(ServoHost, self)._initialize(hostname=servo_host, 203 is_in_lab=is_in_lab, *args, **dargs) 204 self._init_attributes() 205 self.servo_port = int(servo_port) 206 self.servo_board = servo_board 207 self.servo_model = servo_model 208 self.servo_serial = servo_serial 209 self.servo_setup = servo_setup 210 self.additional_servod_args = additional_servod_args 211 212 # The location of the log files on the servo host for this instance. 213 self.remote_log_dir = '%s_%s' % (self.SERVOD_LOG_PREFIX, 214 self.servo_port) 215 # Path of the servo host lock file. 216 self._lock_file = (self.TEMP_FILE_DIR + str(self.servo_port) + 217 self.LOCK_FILE_POSTFIX) 218 # File path to declare a reboot request. 219 self._reboot_file = (self.TEMP_FILE_DIR + str(self.servo_port) + 220 self.REBOOT_FILE_POSTFIX) 221 222 # Lock the servo host if it's an in-lab labstation to prevent other 223 # task to reboot it until current task completes. We also wait and 224 # make sure the labstation is up here, in the case of the labstation is 225 # in the middle of reboot. 226 self._is_locked = False 227 if (self.wait_up(self.REBOOT_TIMEOUT) and self.is_in_lab() 228 and self.is_labstation()): 229 self._lock() 230 try: 231 self.wait_ready() 232 except Exception as e: 233 logging.info( 234 'Unexpected error while ensure labstation' 235 ' readiness; %s', str(e)) 236 237 self._repair_strategy = ( 238 servo_repair.create_servo_repair_strategy()) 239 240 def __str__(self): 241 return "<%s '%s:%s'>" % ( 242 type(self).__name__, self.hostname, self.servo_port) 243 244 def connect_servo(self): 245 """ Initialize and setup servo for later use. 246 """ 247 self.initilize_servo() 248 self.initialize_dut_for_servo() 249 250 def initilize_servo(self): 251 """Establish a connection to the servod server on this host. 252 253 Initializes `self._servo` and then verifies that all network 254 connections are working. This will create an ssh tunnel if 255 it's required. 256 """ 257 self._servo = servo.Servo(servo_host=self, 258 servo_serial=self.servo_serial) 259 260 def initialize_dut_for_servo(self): 261 """This method will do some setup for dut control, e.g. setup 262 main servo_v4 device, and also testing the connection between servo 263 and DUT. As a side effect of testing the connection, all signals on 264 the target servo are reset to default values, and the USB stick is 265 set to the neutral (off) position. 266 """ 267 if not self._servo: 268 raise hosts.AutoservVerifyError('Servo object needs to be' 269 ' initialized before initialize' 270 ' DUT.') 271 timeout, _ = retry.timeout( 272 self._servo.initialize_dut, 273 timeout_sec=self.INITIALIZE_SERVO_TIMEOUT_SECS) 274 if timeout: 275 raise hosts.AutoservVerifyError('Initialize dut for servo timed' 276 ' out.') 277 278 def disconnect_servo(self): 279 """Disconnect our servo if it exists. 280 281 If we've previously successfully connected to our servo, 282 disconnect any established ssh tunnel, and set `self._servo` 283 back to `None`. 284 """ 285 if self._servo: 286 # N.B. This call is safe even without a tunnel: 287 # rpc_server_tracker.disconnect() silently ignores 288 # unknown ports. 289 self.rpc_server_tracker.disconnect(self.servo_port) 290 self._servo = None 291 292 def _maybe_create_servod_ssh_tunnel_proxy(self): 293 """Create a xmlrpc proxy for use with a ssh tunnel. 294 A lock is used to safely create a singleton proxy. 295 """ 296 with self._tunnel_proxy_lock: 297 if self._tunnel_proxy is None: 298 self._tunnel_proxy = self.rpc_server_tracker.xmlrpc_connect( 299 None, 300 self.servo_port, 301 ready_test_name=self.SERVO_READY_METHOD, 302 timeout_seconds=60, 303 request_timeout_seconds=3600, 304 server_desc=str(self)) 305 306 def get_servod_server_proxy(self): 307 """Return a proxy if it exists; otherwise, create a new one. 308 A proxy can either be a ssh tunnel based proxy, or a httplib 309 based proxy. 310 311 @returns: An xmlrpclib.ServerProxy that is connected to the servod 312 server on the host. 313 """ 314 if (servo_constants.ENABLE_SSH_TUNNEL_FOR_SERVO 315 and not self.is_localhost()): 316 # Check for existing ssh tunnel proxy. 317 if self._tunnel_proxy is None: 318 self._maybe_create_servod_ssh_tunnel_proxy() 319 return self._tunnel_proxy 320 else: 321 # xmlrpc/httplib is not thread-safe, so each thread must have its 322 # own separate proxy connection. 323 if not hasattr(self._local, "_per_thread_proxy"): 324 remote = 'http://%s:%s' % (self.hostname, self.servo_port) 325 self._local._per_thread_proxy = six.moves.xmlrpc_client.ServerProxy(remote) 326 return self._local._per_thread_proxy 327 328 def verify(self, silent=False): 329 """Update the servo host and verify it's in a good state. 330 331 @param silent If true, suppress logging in `status.log`. 332 """ 333 message = 'Beginning verify for servo host %s port %s serial %s' 334 message %= (self.hostname, self.servo_port, self.servo_serial) 335 self.record('INFO', None, None, message) 336 try: 337 self._repair_strategy.verify(self, silent) 338 self._servo_state = servo_constants.SERVO_STATE_WORKING 339 self.record('INFO', None, None, 340 'ServoHost verify set servo_state as WORKING') 341 if self.is_servo_topology_supported(): 342 self._topology = servo_topology.ServoTopology(self) 343 self._topology.generate() 344 except Exception as e: 345 if not self.is_localhost(): 346 self._servo_state = self.determine_servo_state() 347 self.record('INFO', None, None, 348 'ServoHost verify set servo_state as %s' 349 % self._servo_state) 350 if self._is_critical_error(e): 351 raise 352 353 def _get_default_usbkey_mount_path(self): 354 return '/media/servo_usb/%s' % self.servo_port 355 356 def get_image_name_from_usbkey(self, usbkey_dev): 357 """Mount usb drive and check ChromeOS image name on it if there is 358 one. This method assumes the image_usbkey_direction is already set 359 to servo side. 360 361 @param usbkey_dev: usbkey dev path(e.g. /dev/sdb). 362 363 @returns: image_name on the usbkey, e.g. nami-release/R82.10138.0.0, 364 or empty string if no test image detected, or unexpected 365 error occurred. 366 """ 367 logging.info('Checking ChromeOS image name on usbkey.') 368 mount_dst = self._get_default_usbkey_mount_path() 369 # Unmount if there is an existing stale mount. 370 self._unmount_drive(mount_dst) 371 # ChromeOS root fs is in /dev/sdx3 372 mount_src = usbkey_dev + '3' 373 try: 374 if not self._mount_drive(mount_src, mount_dst): 375 logging.debug('Unexpected error occurred on mount usb drive.') 376 return '' 377 378 release_content = self.run( 379 'cat %s/etc/lsb-release' % mount_dst, 380 ignore_status=True).stdout.strip() 381 382 if not re.search(r'RELEASE_TRACK=.*test', release_content): 383 logging.info('The image on usbkey is not a test image') 384 return '' 385 386 return lsbrelease_utils.get_chromeos_release_builder_path( 387 lsb_release_content=release_content) 388 finally: 389 logging.debug('Image check compeleted, unmounting the usb drive.') 390 self._unmount_drive(mount_dst) 391 392 def _extract_firmware_image_from_usbkey(self, fw_dst): 393 """Extract firmware images from the usbkey on servo, this method 394 assumes there is already a ChromeOS test image staged on servo. 395 396 @param fw_dst: the path that we'll copy firmware images to. 397 398 @returns: a json format string of firmware manifest data. 399 """ 400 usbkey_dev = self._probe_and_validate_usb_dev() 401 if not usbkey_dev: 402 raise hosts.AutoservRepairError('Unexpected error occurred when' 403 ' probe usbkey dev path, please check logs for detail.') 404 405 mount_dst = self._get_default_usbkey_mount_path() 406 # Unmount if there is an existing stale mount. 407 self._unmount_drive(mount_dst) 408 # ChromeOS root fs is in /dev/sdx3 409 mount_src = usbkey_dev + '3' 410 try: 411 if not self._mount_drive(mount_src, mount_dst): 412 raise hosts.AutoservRepairError('Failed to extract firmware' 413 ' image; Unable to mount %s.' % usbkey_dev, 414 'unable to mount usbkey') 415 updater_bin = os.path.join(mount_dst, 416 'usr/sbin/chromeos-firmwareupdate') 417 self.run('%s --unpack %s' % (updater_bin, fw_dst)) 418 return self.run('%s --manifest' % updater_bin).stdout 419 finally: 420 self._unmount_drive(mount_dst) 421 422 def prepare_repair_firmware_image(self, fw_dst=None): 423 """Prepare firmware image on the servohost for auto repair process 424 to consume. 425 426 @param fw_dst: the path that we want to store firmware image on 427 the servohost. 428 429 @returns: A tuple that containes ec firmware image path and bios 430 firmware image path on the servohost, or None if type of 431 image is not available based on manifest and dut's model. 432 """ 433 model = self.servo_model or self._dut_host_info.model 434 if not model: 435 raise hosts.AutoservRepairError( 436 'Could not determine DUT\'s model.', 437 'model infomation unknown') 438 439 if not fw_dst: 440 fw_dst = '/tmp/firmware_image/%s' % self.servo_port 441 # Cleanup and re-create dst path to have a fresh start. 442 self.run('rm -rf %s' % fw_dst) 443 self.run('mkdir -p %s' % fw_dst) 444 445 manifest = json.loads(self._extract_firmware_image_from_usbkey(fw_dst)) 446 # For models that have packed $MODEL_signed variant, we want use the 447 # 'signed' variant once we get DVT devices, so try to read manifest 448 # from $MODEL_signed first. 449 build = manifest.get('%s_signed' % model) or manifest.get(model) 450 if not build: 451 raise hosts.AutoservRepairError('Could not find firmware manifest' 452 ' for model:%s' % model, 'model manifest not found') 453 try: 454 ec_image = os.path.join(fw_dst, build['ec']['image']) 455 except KeyError: 456 ec_image = None 457 try: 458 bios_image = os.path.join(fw_dst, build['host']['image']) 459 except KeyError: 460 bios_image = None 461 if not ec_image and not bios_image: 462 raise hosts.AutoservRepairError('Could not find any firmware image' 463 ' for model:%s' % model, 'cannot find firmware image') 464 return ec_image, bios_image 465 466 def flash_ap_firmware_via_servo(self, image): 467 """Flash AP firmware by use a provided image. 468 469 This is will be a short term enhanment for infra repair use, it use 470 'futility update' which will automatically determine various parameters 471 needed for flashrom, and will preserve the GBB, VPD, and HWID for 472 AP firmware update. 473 @TODO(xianuowang@) Remove this method once b/148403277 implemented. 474 475 @param image: the firmware image path on servohost. 476 """ 477 cmd = 'futility update -i %s --servo_port=%s' 478 self.run(cmd % (image, self.servo_port), timeout=900) 479 480 def _probe_and_validate_usb_dev(self): 481 """This method probe the usb dev path by talking to servo, and then 482 validate the dev path is valid block device to servohost. 483 Possible output: 484 1. Encounter error during probe usb dev, returns empty string. 485 2. probe usb dev completed without error but cannot find usb dev, 486 raise AutoservRepairError. 487 3. probe usb dev find a usb dev path, but failed validation in this 488 method, raise AutoservRepairError. 489 490 @returns: A string of usb dev path(e.g. '/dev/sdb'), or empty string 491 if unexpected error occurred during probe. 492 @raises: AutoservRepairError if servo couldn't probe the usb dev path 493 (servo.probe_host_usb_dev() returns empty string), or the dev path is 494 not valid block device to servohost. 495 """ 496 logging.info('Validating image usbkey on servo.') 497 try: 498 usb_dev = self._servo.probe_host_usb_dev() 499 except Exception as e: 500 # We don't want any unexpected or transient servo communicating 501 # failure block usb repair, so capture all errors here. 502 logging.error(e, exc_info=True) 503 logging.error('Unexpected error occurred on get usbkey dev path,' 504 ' skipping usbkey validation.') 505 return '' 506 507 if usb_dev: 508 # probe_host_usb_dev() sometimes return stale record, 509 # so we need to make sure the path exists in fdisk. 510 validate_cmd = 'fdisk -l %s' % usb_dev 511 try: 512 resp = self.run(validate_cmd, ignore_status=True, timeout=30) 513 if resp.exit_status == 0: 514 return usb_dev 515 logging.error('%s is reported from "image_usbkey_dev" control' 516 ' but not detected by fdisk!', usb_dev) 517 except error.AutoservRunError as e: 518 if 'Timeout encountered' in str(e): 519 logging.warning('Timeout encountered during fdisk run,' 520 ' skipping usbkey validation.') 521 return '' 522 raise 523 524 raise hosts.AutoservRepairError( 525 'No usbkey detected on servo, the usbkey may be either missing' 526 ' or broken. Please replace usbkey on the servo and retry.', 527 'missing usbkey') 528 529 def is_ec_supported(self): 530 """Check if ec is supported on the servo_board""" 531 if self.servo_board: 532 try: 533 frm_config = config.Config(self.servo_board, self.servo_model) 534 return frm_config.chrome_ec 535 except Exception as e: 536 logging.error('Unexpected error when read from firmware' 537 ' configs; %s', str(e)) 538 return False 539 540 def validate_image_usbkey(self): 541 """This method first validate if there is a recover usbkey on servo 542 that accessible to servohost, and second check if a ChromeOS image is 543 already on the usb drive and return the image_name so we can avoid 544 unnecessary download and flash to the recover usbkey on servo. 545 546 Please note that, there is special error handling logic here: 547 1. If unexpected error happens, we return empty string. So repair 548 actions will not get blocked. 549 2. If no working usbkey present on servo, but no errors, we'll raise 550 AutoservRepairError here. 551 552 @returns: image_name on the usbkey, e.g. nami-release/R82.10138.0.0, 553 or empty string if no test image detected, or unexpected 554 error occurred. 555 @raises: AutoservRepairError if the usbkey is not detected on servo. 556 """ 557 usb_dev = self._probe_and_validate_usb_dev() 558 if usb_dev: 559 return self.get_image_name_from_usbkey(usb_dev) 560 else: 561 return '' 562 563 def repair(self, silent=False): 564 """Attempt to repair servo host. 565 566 @param silent If true, suppress logging in `status.log`. 567 """ 568 message = 'Beginning repair for servo host %s port %s serial %s' 569 message %= (self.hostname, self.servo_port, self.servo_serial) 570 self.record('INFO', None, None, message) 571 try: 572 self._repair_strategy.repair(self, silent) 573 self._servo_state = servo_constants.SERVO_STATE_WORKING 574 self.record('INFO', None, None, 575 'ServoHost repair set servo_state as WORKING') 576 # If target is a labstation then try to withdraw any existing 577 # reboot request created by this servo because it passed repair. 578 if self.is_labstation(): 579 self.withdraw_reboot_request() 580 if self.is_servo_topology_supported(): 581 self._topology = servo_topology.ServoTopology(self) 582 self._topology.generate() 583 except Exception as e: 584 if not self.is_localhost(): 585 self._servo_state = self.determine_servo_state() 586 self.record('INFO', None, None, 587 'ServoHost repair set servo_state as %s' 588 % self._servo_state) 589 if self._is_critical_error(e): 590 self.disconnect_servo() 591 self.stop_servod() 592 raise 593 594 def _is_critical_error(self, error): 595 if (isinstance(error, hosts.AutoservVerifyDependencyError) 596 and not error.is_critical()): 597 logging.warning('Non-critical verify failure(s) detected during' 598 ' verify/repair servo, servo connection will' 599 ' still be up but may not be fully functional.' 600 ' Some repair actions and servo dependent' 601 ' tests may not run.') 602 return False 603 logging.info( 604 'Critical verify failure(s) detected during repair/verify ' 605 'servo. Disconnecting servo and running `stop servod`, all' 606 ' repair actions and tests that depends on servo will not ' 607 'run.') 608 return True 609 610 def get_servo(self): 611 """Get the cached servo.Servo object. 612 613 @return: a servo.Servo object. 614 @rtype: autotest_lib.server.cros.servo.servo.Servo 615 """ 616 return self._servo 617 618 def request_reboot(self): 619 """Request servohost to be rebooted when it's safe to by touch a file. 620 """ 621 logging.debug('Request to reboot servohost %s has been created by ' 622 'servo with port # %s', self.hostname, self.servo_port) 623 self.run('touch %s' % self._reboot_file, ignore_status=True) 624 625 def withdraw_reboot_request(self): 626 """Withdraw a servohost reboot request if exists by remove the flag 627 file. 628 """ 629 logging.debug('Withdrawing request to reboot servohost %s that created' 630 ' by servo with port # %s if exists.', 631 self.hostname, self.servo_port) 632 self.run('rm -f %s' % self._reboot_file, ignore_status=True) 633 634 def start_servod(self, quick_startup=False): 635 """Start the servod process on servohost. 636 """ 637 # Skip if running on the localhost.(crbug.com/1038168) 638 if self.is_localhost(): 639 logging.debug("Servohost is a localhost, skipping start servod.") 640 return 641 642 cmd = 'start servod' 643 if self.servo_board: 644 cmd += ' BOARD=%s' % self.servo_board 645 if self.servo_model: 646 cmd += ' MODEL=%s' % self.servo_model 647 else: 648 logging.warning('Board for DUT is unknown; starting servod' 649 ' assuming a pre-configured board.') 650 651 cmd += ' PORT=%d' % self.servo_port 652 if self.servo_serial: 653 cmd += ' SERIAL=%s' % self.servo_serial 654 655 # Start servod with dual_v4 based on servo_setup. 656 if self.is_dual_setup(): 657 cmd += ' DUAL_V4=1' 658 659 # Start servod with CONFIG=cr50.xml which required for some pools. 660 if self._require_cr50_servod_config(): 661 cmd += ' CONFIG=cr50.xml' 662 663 # Adding customized args if any. 664 if self.additional_servod_args: 665 cmd += ' ' + self.additional_servod_args 666 667 # Remove the symbolic links from the logs. This helps ensure that 668 # a failed servod instantiation does not cause us to grab old logs 669 # by mistake. 670 self.remove_latest_log_symlinks() 671 self.run(cmd, timeout=60) 672 673 # There's a lag between when `start servod` completes and when 674 # the _ServodConnectionVerifier trigger can actually succeed. 675 # The call to time.sleep() below gives time to make sure that 676 # the trigger won't fail after we return. 677 678 # Normally servod on servo_v3 and labstation take ~10 seconds to ready, 679 # But in the rare case all servo on a labstation are in heavy use they 680 # may take ~30 seconds. So the timeout value will double these value, 681 # and we'll try quick start up when first time initialize servohost, 682 # and use standard start up timeout in repair. 683 if quick_startup: 684 timeout = servo_constants.SERVOD_QUICK_STARTUP_TIMEOUT 685 else: 686 timeout = servo_constants.SERVOD_STARTUP_TIMEOUT 687 logging.debug('Wait %s seconds for servod process fully up.', timeout) 688 time.sleep(timeout) 689 # Cache the initial instance timestamp to check against servod restarts 690 self._initial_instance_ts = self.get_instance_logs_ts() 691 692 def stop_servod(self): 693 """Stop the servod process on servohost. 694 """ 695 # Skip if running on the localhost.(crbug.com/1038168) 696 if self.is_localhost(): 697 logging.debug("Servohost is a localhost, skipping stop servod.") 698 return 699 700 logging.debug('Stopping servod on port %s', self.servo_port) 701 self.run('stop servod PORT=%d' % self.servo_port, 702 timeout=60, ignore_status=True) 703 logging.debug('Wait %s seconds for servod process fully teardown.', 704 servo_constants.SERVOD_TEARDOWN_TIMEOUT) 705 time.sleep(servo_constants.SERVOD_TEARDOWN_TIMEOUT) 706 707 def restart_servod(self, quick_startup=False): 708 """Restart the servod process on servohost. 709 """ 710 self.stop_servod() 711 self.start_servod(quick_startup) 712 713 def _process_servodtool_error(self, response): 714 """Helper function to handle non-zero servodtool response. 715 """ 716 if re.search(servo_constants.ERROR_MESSAGE_USB_HUB_NOT_COMPATIBLE, 717 response.stdout): 718 logging.error('The servo is not plugged on a usb hub that supports' 719 ' power-cycle!') 720 # change the flag so we can update this label in later process. 721 self.smart_usbhub = False 722 return 723 724 if re.search(servo_constants.ERROR_MESSAGE_DEVICE_NOT_FOUND % 725 self.servo_serial, response.stdout): 726 logging.error('No servo with serial %s found!', self.servo_serial) 727 return 728 729 logging.error('Unexpected error occurred from usbhub control, please' 730 ' file a bug and inform chrome-fleet-software@ team!') 731 732 def get_main_servo_usb_path(self): 733 """Helper function to collect current usb-path to main servo. 734 735 The usb-path is path to the folder where usb-device was enumerated. 736 If fail then will return an empty string (''). 737 738 @returns: string, usb-path to the main servo device. 739 e.g.: '/sys/bus/usb/devices/1-6.1.3.1' 740 """ 741 # TODO remove try-except when fix crbug.com/1087964 742 try: 743 cmd = 'servodtool device -s %s usb-path' % self.servo_serial 744 resp = self.run(cmd, ignore_status=True, timeout=30) 745 except Exception as e: 746 # Here we catch only timeout errors. 747 # Other errors is filtered by ignore_status=True 748 logging.debug('Attempt to get servo usb-path failed due to ' 749 'timeout; %s', e) 750 return '' 751 if resp.exit_status != 0: 752 self._process_servodtool_error(resp) 753 return '' 754 usb_path = resp.stdout.strip() 755 logging.info('Usb path of servo %s is %s', self.servo_serial, usb_path) 756 return usb_path 757 758 def _get_servo_usb_devnum(self): 759 """Helper function to collect current usb devnum of servo.""" 760 usb_path = self.get_main_servo_usb_path() 761 if not usb_path: 762 return '' 763 resp = self.run('cat %s/devnum' % usb_path, ignore_status=True) 764 if resp.exit_status != 0: 765 self._process_servodtool_error(resp) 766 return '' 767 return resp.stdout.strip() 768 769 def reboot_servo_v3_on_need(self): 770 """Check and reboot servo_v3 based on below conditions. 771 1. If there is an update pending on reboot. 772 2. Servo_v3 has been up for more than 96 hours. 773 """ 774 if self.get_board() != 'beaglebone_servo': 775 logging.info('Servo reboot is only applicable for servo V3.') 776 return 777 778 update_pending_reboot = (self._check_update_status() == 779 self.UPDATE_STATE.PENDING_REBOOT) 780 uptime_hours = float(self.check_uptime())/3600 781 logging.info('Uptime of servo_v3: %s hour(s)', uptime_hours) 782 long_up_time = uptime_hours > 96 783 784 # Skip reboot if neither condition are met. 785 if not (update_pending_reboot or long_up_time): 786 return 787 788 if update_pending_reboot: 789 message = 'Starting reboot servo_v3 because an update is pending.' 790 reboot_method = self._post_update_reboot 791 elif long_up_time: 792 message = 'Starting reboot servo_v3 because uptime > 96 hours.' 793 reboot_method = self._servo_host_reboot 794 self.record('INFO', None, None, message) 795 logging.info(message) 796 try: 797 reboot_method() 798 message = 'Servo_v3 reboot completed successfully.' 799 except Exception as e: 800 logging.debug("Fail to reboot servo_v3; %s", e) 801 message = ('Servo_v3 reboot failed, please check debug log ' 802 'for details.') 803 logging.info(message) 804 self.record('INFO', None, None, message) 805 806 def _reset_servo(self): 807 logging.info('Resetting servo through smart usbhub.') 808 # TODO remove try-except when fix crbug.com/1087964 809 try: 810 resp = self.run('servodtool device -s %s power-cycle' % 811 self.servo_serial, ignore_status=True, 812 timeout=30) 813 if resp.exit_status != 0: 814 self._process_servodtool_error(resp) 815 return False 816 except Exception as e: 817 # Here we catch only timeout errors. 818 # Other errors is filtered by ignore_status=True 819 logging.debug('Attempt to reset servo failed due to timeout;' 820 ' %s', e) 821 return False 822 823 logging.debug('Wait %s seconds for servo to come back from reset.', 824 servo_constants.SERVO_RESET_TIMEOUT_SECONDS) 825 time.sleep(servo_constants.SERVO_RESET_TIMEOUT_SECONDS) 826 # change the flag so we can update this label in later process. 827 self.smart_usbhub = True 828 return True 829 830 def reset_servo(self): 831 """Reset(power-cycle) the servo via smart usbhub. 832 """ 833 if not self.is_labstation(): 834 logging.info('Servo reset is not applicable to servo_v3.') 835 return 836 837 pre_reset_devnum = self._get_servo_usb_devnum() 838 logging.info('Servo usb devnum before reset: %s', pre_reset_devnum) 839 result = self._reset_servo() 840 if not result: 841 message = ('Failed to reset servo with serial: %s. (Please ignore' 842 ' this error if the DUT is not connected to a smart' 843 ' usbhub).' % self.servo_serial) 844 logging.warning(message) 845 self.record('INFO', None, None, message) 846 return 847 848 post_reset_devnum = self._get_servo_usb_devnum() 849 logging.info('Servo usb devnum after reset: %s', post_reset_devnum) 850 if not (pre_reset_devnum and post_reset_devnum): 851 message = ('Servo reset completed but unable to verify' 852 ' devnum change!') 853 elif pre_reset_devnum != post_reset_devnum: 854 message = ('Reset servo with serial %s completed successfully!' 855 % self.servo_serial) 856 else: 857 message = 'Servo reset completed but devnum is still not changed!' 858 logging.info(message) 859 self.record('INFO', None, None, message) 860 861 def _extract_compressed_logs(self, logdir, relevant_files): 862 """Decompress servod logs in |logdir|. 863 864 @param logdir: directory containing compressed servod logs. 865 @param relevant_files: list of files in |logdir| to consider. 866 867 @returns: tuple, (tarfiles, files) where 868 tarfiles: list of the compressed filenames that have been 869 extracted and deleted 870 files: list of the uncompressed files that were generated 871 """ 872 # For all tar-files, first extract them to the directory, and 873 # then let the common flow handle them. 874 tarfiles = [cf for cf in relevant_files if 875 cf.endswith(self.COMPRESSION_SUFFIX)] 876 files = [] 877 for f in tarfiles: 878 norm_name = os.path.basename(f)[:-len(self.COMPRESSION_SUFFIX)] 879 with tarfile.open(f) as tf: 880 # Each tarfile has only one member, as 881 # that's the compressed log. 882 member = tf.members[0] 883 # Manipulate so that it only extracts the basename, and not 884 # the directories etc. 885 member.name = norm_name 886 files.append(os.path.join(logdir, member.name)) 887 tf.extract(member, logdir) 888 # File has been extracted: remove the compressed file. 889 os.remove(f) 890 return tarfiles, files 891 892 def _extract_mcu_logs(self, log_subdir): 893 """Extract MCU (EC, Cr50, etc) console output from servod debug logs. 894 895 Using the MCU_EXTRACTOR regex (above) extract and split out MCU console 896 lines from the logs to generate invidiual console logs e.g. after 897 this method, you can find an ec.txt and servo_v4.txt in |log_dir| if 898 those MCUs had any console input/output. 899 900 @param log_subdir: directory with log.DEBUG.txt main servod debug logs. 901 """ 902 # Extract the MCU for each one. The MCU logs are only in the .DEBUG 903 # files 904 mcu_lines_file = os.path.join(log_subdir, 'log.DEBUG.txt') 905 if not os.path.exists(mcu_lines_file): 906 logging.info('No DEBUG logs found to extract MCU logs from.') 907 return 908 mcu_files = {} 909 mcu_file_template = '%s.txt' 910 with open(mcu_lines_file, 'r') as f: 911 for line in f: 912 match = self.MCU_EXTRACTOR.match(line) 913 if match: 914 mcu = match.group(self.MCU_GROUP).lower() 915 line = match.group(self.LINE_GROUP) 916 if mcu not in mcu_files: 917 mcu_file = os.path.join(log_subdir, 918 mcu_file_template % mcu) 919 mcu_files[mcu] = open(mcu_file, 'a') 920 fd = mcu_files[mcu] 921 fd.write(line + '\n') 922 for f in mcu_files: 923 mcu_files[f].close() 924 925 def remove_latest_log_symlinks(self): 926 """Remove the conveninence symlinks 'latest' servod logs.""" 927 symlink_wildcard = '%s/latest*' % self.remote_log_dir 928 cmd = 'rm ' + symlink_wildcard 929 self.run(cmd, stderr_tee=None, ignore_status=True) 930 931 def probe_servod_restart(self, instance_ts, outdir): 932 """Grab servod logs from previous instances if part of this session. 933 934 If since the last time this host called start_servod() servod crashed 935 and restarted, this helper finds those logs as well, and stores them 936 with the |OLD_LOG_SUFFIX| to investigate if necessary. 937 938 It also issues a panicinfo command to servo devices after the restart 939 to try and collect reboot information for debugging. 940 941 @param instance_ts: the log timestamp that the current instance uses 942 @param outdir: directory to create a subdirectory into to place the 943 servod logs into. 944 """ 945 if self._initial_instance_ts is None: 946 logging.info('No log timestamp grabbed successfully on servod ' 947 'startup. Cannot check device restarts. Ignoring.') 948 return 949 if instance_ts == self._initial_instance_ts: 950 logging.debug('Servod appears to have run without restarting') 951 return 952 # Servod seems to have restarted (at least once). |_initial_instance_ts| 953 # is the first timestamp, and instance_ts is the current timestamp. Find 954 # all timestamps in between them, and grab the logs for each. 955 tss = self._find_instance_timestamps_between(self._initial_instance_ts, 956 instance_ts) 957 logging.info('Servod has restarted %d times between the start and the ' 958 'end of this servo_host.', len(tss)) 959 logging.info('This might be an issue. Will extract all logs from each ' 960 'instance.') 961 logging.info('Logs that are not the currently running (about to turn ' 962 'down) instance are maked with a .%s in their folder.', 963 self.OLD_LOG_SUFFIX) 964 for ts in tss: 965 self.get_instance_logs(ts, outdir, old=True) 966 # Lastly, servod has restarted due to a potential issue. Try to get 967 # panic information from servo micro and servo v4 for the current logs. 968 # This can only happen if the |_servo| attribute is initialized. 969 if self._servo: 970 for mcu in ['servo_micro', 'servo_v4']: 971 ctrl = '%s_uart_cmd' % mcu 972 if self._servo.has_control(ctrl): 973 logging.info('Trying to retrieve %r panicinfo into logs', 974 mcu) 975 try: 976 self._servo.set_nocheck(ctrl, 'panicinfo') 977 except error.TestFail as e: 978 logging.error('Failed to generate panicinfo for %r ' 979 'logs. %s', mcu, str(e)) 980 981 def _find_instance_timestamps_between(self, start_ts, end_ts): 982 """Find all log timestamps between [start_ts, end_ts). 983 984 @param start_ts: str, earliest log timestamp of interest 985 @param end_ts: str, latest log timestamp of interest 986 987 @returns: list, all timestamps between start_ts and end_ts, end_ts 988 exclusive, on the servo_host. An empty list on errors 989 """ 990 # Simply get all timestamp, and then sort and remove 991 cmd = 'ls %s' % self.remote_log_dir 992 res = self.run(cmd, stderr_tee=None, ignore_status=True) 993 if res.exit_status != 0: 994 # Here we failed to find anything. 995 logging.info('Failed to find remote servod logs. Ignoring.') 996 return [] 997 logfiles = res.stdout.strip().split() 998 timestamps = set() 999 for logfile in logfiles: 1000 ts_match = self.TS_EXTRACTOR.match(logfile) 1001 if not ts_match: 1002 # Simply ignore files that fail the check. It might be the 1003 # 'latest' symlinks or random files. 1004 continue 1005 timestamps.add(ts_match.group(self.TS_GROUP)) 1006 # At this point we have all unique timestamps. 1007 timestamps = sorted(timestamps) 1008 for ts in [start_ts, end_ts]: 1009 if ts not in timestamps: 1010 logging.error('Timestamp %r not in servod logs. Cannot query ' 1011 'for timestamps in between %r and %r', ts, 1012 start_ts, end_ts) 1013 return [] 1014 return timestamps[timestamps.index(start_ts):timestamps.index(end_ts)] 1015 1016 def get_instance_logs_ts(self): 1017 """Retrieve the currently running servod instance's log timestamp 1018 1019 @returns: str, timestamp for current instance, or None on failure 1020 """ 1021 # First, extract the timestamp. This cmd gives the real filename of 1022 # the latest aka current log file. 1023 cmd = ('if [ -f %(dir)s/latest.DEBUG ];' 1024 'then realpath %(dir)s/latest.DEBUG;' 1025 'elif [ -f %(dir)s/latest ];' 1026 'then realpath %(dir)s/latest;' 1027 'else exit %(code)d;' 1028 'fi' % {'dir': self.remote_log_dir, 1029 'code': self.NO_SYMLINKS_CODE}) 1030 res = self.run(cmd, stderr_tee=None, ignore_status=True) 1031 if res.exit_status != 0: 1032 if res.exit_status == self.NO_SYMLINKS_CODE: 1033 logging.warning('servod log latest symlinks not found. ' 1034 'This is likely due to an error starting up ' 1035 'servod. Ignoring..') 1036 else: 1037 logging.warning('Failed to find servod logs on servo host.') 1038 logging.warning(res.stderr.strip()) 1039 return None 1040 fname = os.path.basename(res.stdout.strip()) 1041 # From the fname, ought to extract the timestamp using the TS_EXTRACTOR 1042 ts_match = self.TS_EXTRACTOR.match(fname) 1043 if not ts_match: 1044 logging.warning('Failed to extract timestamp from servod log file ' 1045 '%r. Skipping. The servo host is using outdated ' 1046 'servod logging and needs to be updated.', fname) 1047 return None 1048 return ts_match.group(self.TS_GROUP) 1049 1050 def get_instance_logs(self, instance_ts, outdir, old=False): 1051 """Collect all logs with |instance_ts| and dump into a dir in |outdir| 1052 1053 This method first collects all logs on the servo_host side pertaining 1054 to this servod instance (port, instatiation). It glues them together 1055 into combined log.[level].txt files and extracts all available MCU 1056 console I/O from the logs into individual files e.g. servo_v4.txt 1057 1058 All the output can be found in a directory inside |outdir| that 1059 this generates based on |LOG_DIR|, the servod port, and the instance 1060 timestamp on the servo_host side. 1061 1062 @param instance_ts: log timestamp to grab logfiles for 1063 @param outdir: directory to create a subdirectory into to place the 1064 servod logs into. 1065 @param old: bool, whether to append |OLD_LOG_SUFFIX| to output dir 1066 """ 1067 # Create the local results log dir. 1068 log_dir = os.path.join(outdir, '%s_%s.%s' % (self.LOG_DIR, 1069 str(self.servo_port), 1070 instance_ts)) 1071 if old: 1072 log_dir = '%s.%s' % (log_dir, self.OLD_LOG_SUFFIX) 1073 logging.info('Saving servod logs to %r.', log_dir) 1074 os.mkdir(log_dir) 1075 # Now, get all files with that timestamp. 1076 cmd = 'find %s -maxdepth 1 -name "log.%s*"' % (self.remote_log_dir, 1077 instance_ts) 1078 res = self.run(cmd, stderr_tee=None, ignore_status=True) 1079 files = res.stdout.strip().split() 1080 try: 1081 self.get_file(files, log_dir, try_rsync=False) 1082 if not os.listdir(log_dir): 1083 logging.info('No servod logs retrieved. Ignoring, and removing ' 1084 '%r again.', log_dir) 1085 os.rmdir(log_dir) 1086 return 1087 except error.AutoservRunError as e: 1088 result = e.result_obj 1089 if result.exit_status != 0: 1090 stderr = result.stderr.strip() 1091 logging.warning("Couldn't retrieve servod logs. Ignoring: %s", 1092 stderr or '\n%s' % result) 1093 # Remove the log_dir as nothing was added to it. 1094 os.rmdir(log_dir) 1095 return 1096 local_files = [os.path.join(log_dir, f) for f in os.listdir(log_dir)] 1097 # TODO(crrev.com/c/1793030): remove no-level case once CL is pushed 1098 for level_name in ('DEBUG', 'INFO', 'WARNING', ''): 1099 # Create the joint files for each loglevel. i.e log.DEBUG 1100 joint_file = self.JOINT_LOG_PREFIX 1101 if level_name: 1102 joint_file = '%s.%s' % (self.JOINT_LOG_PREFIX, level_name) 1103 # This helps with some online tools to avoid complaints about an 1104 # unknown filetype. 1105 joint_file = joint_file + '.txt' 1106 joint_path = os.path.join(log_dir, joint_file) 1107 files = [f for f in local_files if level_name in f] 1108 if not files: 1109 # TODO(crrev.com/c/1793030): remove no-level case once CL 1110 # is pushed 1111 continue 1112 # Extract compressed logs if any. 1113 compressed, extracted = self._extract_compressed_logs(log_dir, 1114 files) 1115 files = list(set(files) - set(compressed)) 1116 files.extend(extracted) 1117 # Need to sort. As they all share the same timestamp, and 1118 # loglevel, the index itself is sufficient. The highest index 1119 # is the oldest file, therefore we need a descending sort. 1120 def sortkey(f, level=level_name): 1121 """Custom sortkey to sort based on rotation number int.""" 1122 if f.endswith(level_name): return 0 1123 return int(f.split('.')[-1]) 1124 1125 files.sort(reverse=True, key=sortkey) 1126 # Just rename the first file rather than building from scratch. 1127 os.rename(files[0], joint_path) 1128 with open(joint_path, 'a') as joint_f: 1129 for logfile in files[1:]: 1130 # Transfer the file to the joint file line by line. 1131 with open(logfile, 'r') as log_f: 1132 for line in log_f: 1133 joint_f.write(line) 1134 # File has been written over. Delete safely. 1135 os.remove(logfile) 1136 # Need to remove all files form |local_files| so we don't 1137 # analyze them again. 1138 local_files = list(set(local_files) - set(files) - set(compressed)) 1139 # Lastly, extract MCU logs from the joint logs. 1140 self._extract_mcu_logs(log_dir) 1141 1142 def _lock(self): 1143 """lock servohost by touching a file. 1144 """ 1145 logging.debug('Locking servohost %s by touching %s file', 1146 self.hostname, self._lock_file) 1147 self.run('touch %s' % self._lock_file, ignore_status=True) 1148 self._is_locked = True 1149 1150 def _unlock(self): 1151 """Unlock servohost by removing the lock file. 1152 """ 1153 logging.debug('Unlocking servohost by removing %s file', 1154 self._lock_file) 1155 self.run('rm %s' % self._lock_file, ignore_status=True) 1156 self._is_locked = False 1157 1158 def close(self): 1159 """Close the associated servo and the host object.""" 1160 # NOTE: throughout this method there are multiple attempts to catch 1161 # all errors. This is WAI as log grabbing should not fail tests. 1162 # However, the goal is to catch and handle/process all errors, thus 1163 # we print the traceback and ask for a bug. 1164 if self._closed: 1165 logging.debug('ServoHost is already closed.') 1166 return 1167 1168 # Only attempt ssh related actions if servohost is sshable. We call 1169 # check_cached_up_status() first because it's lightweighted and return 1170 # much faster in the case servohost is down, however, we still want 1171 # to call is_up() later since check_cached_up_status() is ping based check 1172 # and not guarantee the servohost is sshable. 1173 servo_host_ready = self.check_cached_up_status() and self.is_up() 1174 1175 if servo_host_ready: 1176 instance_ts = self.get_instance_logs_ts() 1177 else: 1178 logging.info('Servohost is down, will skip servod log collecting.') 1179 instance_ts = None 1180 # TODO(crbug.com/1011516): once enabled, remove the check against 1181 # localhost and instead check against log-rotiation enablement. 1182 logs_available = (instance_ts is not None and 1183 self.job and 1184 not self.is_localhost()) 1185 if logs_available: 1186 # Probe whether there was a servod restart, and grab those old 1187 # logs as well. 1188 try: 1189 self.probe_servod_restart(instance_ts, self.job.resultdir) 1190 except (error.AutoservRunError, error.TestFail) as e: 1191 logging.info('Failed to grab servo logs due to: %s. ' 1192 'This error is forgiven.', str(e)) 1193 except Exception as e: 1194 logging.error('Unexpected error probing for old logs. %s. ' 1195 'Forgiven. Please file a bug and fix or catch ' 1196 'in log probing function', str(e), 1197 exc_info=True) 1198 if self._servo: 1199 outdir = None if not self.job else self.job.resultdir 1200 # In some cases when we run as lab-tools, the job object is None. 1201 self._servo.close(outdir) 1202 1203 if logs_available: 1204 # Grab current (not old like above) logs after the servo instance 1205 # was closed out. 1206 try: 1207 self.get_instance_logs(instance_ts, self.job.resultdir) 1208 except error.AutoservRunError as e: 1209 logging.info('Failed to grab servo logs due to: %s. ' 1210 'This error is forgiven.', str(e)) 1211 except Exception as e: 1212 logging.error('Unexpected error grabbing servod logs. %s. ' 1213 'Forgiven. Please file a bug and fix or catch ' 1214 'in log grabbing function', str(e), exc_info=True) 1215 1216 if self._is_locked and servo_host_ready: 1217 # Remove the lock if the servohost has been locked. 1218 try: 1219 self._unlock() 1220 except error.AutoservSSHTimeout: 1221 logging.error('Unlock servohost failed due to ssh timeout.' 1222 ' It may caused by servohost went down during' 1223 ' the task.') 1224 # We want always stop servod after task to minimum the impact of bad 1225 # servod process interfere other servods.(see crbug.com/1028665) 1226 if servo_host_ready: 1227 try: 1228 self.stop_servod() 1229 except error.AutoservRunError as e: 1230 logging.info( 1231 "Failed to stop servod due to:\n%s\n" 1232 "This error is forgiven.", str(e)) 1233 1234 super(ServoHost, self).close() 1235 # Mark closed. 1236 self._closed = True 1237 1238 def get_servo_state(self): 1239 return self._servo_state 1240 1241 def _get_host_metrics_data(self): 1242 return {'port': self.servo_port, 1243 'host': self.get_dut_hostname() or self.hostname, 1244 'board': self.servo_board or ''} 1245 1246 def _is_servo_device_connected(self, servo_type, serial): 1247 """Check if device is connected to the labstation. 1248 1249 Works for all servo devices connected to the labstation. 1250 For servo_v3 please use 'self._is_servo_board_present_on_servo_v3' 1251 1252 @param servo_type: The type of servo device. Expecting value can be 1253 servo_v4 or servo_micro. 1254 @param serial: The serial number of the device to detect it. 1255 """ 1256 vid_pid = self.SERVO_VID_PID.get(servo_type) 1257 if not vid_pid or not serial: 1258 # device cannot detected without VID/PID or serial number 1259 return False 1260 logging.debug('Started to detect %s', servo_type) 1261 try: 1262 cmd = 'lsusb -v -d %s |grep iSerial |grep %s' % (vid_pid, serial) 1263 result = self.run(cmd, ignore_status=True, timeout=30) 1264 if result.exit_status == 0 and result.stdout.strip(): 1265 logging.debug('The %s is plugged in to the host.', servo_type) 1266 return True 1267 logging.debug('%s device is not detected; %s', servo_type, result) 1268 return False 1269 except Exception as e: 1270 # can be triggered by timeout issue due running the script 1271 metrics.Counter( 1272 'chromeos/autotest/repair/servo_detection/timeout' 1273 ).increment(fields=self._get_host_metrics_data()) 1274 logging.error('%s device is not detected; %s', servo_type, str(e)) 1275 return None 1276 1277 def _is_servo_board_present_on_servo_v3(self): 1278 """Check if servo board is detected on servo_v3""" 1279 vid_pids = self.SERVO_VID_PID['servo_v3'] 1280 if not vid_pids or len(vid_pids) == 0: 1281 # device cannot detected without VID/PID 1282 return False 1283 logging.debug('Started to detect servo board on servo_v3') 1284 not_detected = 'The servo board is not detected on servo_v3' 1285 try: 1286 cmd = 'lsusb | grep "%s"' % "\|".join(vid_pids) 1287 result = self.run(cmd, ignore_status=True, timeout=30) 1288 if result.exit_status == 0 and result.stdout.strip(): 1289 logging.debug('The servo board is detected on servo_v3') 1290 return True 1291 logging.debug('%s; %s', not_detected, result) 1292 return False 1293 except Exception as e: 1294 # can be triggered by timeout issue due running the script 1295 metrics.Counter( 1296 'chromeos/autotest/repair/servo_detection/timeout' 1297 ).increment(fields=self._get_host_metrics_data()) 1298 logging.error('%s; %s', not_detected, str(e)) 1299 return None 1300 1301 def _is_main_device_not_detected_on_servo_v4(self): 1302 """Check if servod cannot find main device on servo. 1303 1304 The check based on reading servod logs for servo_v4. 1305 """ 1306 if not self._initial_instance_ts: 1307 # latest log not found 1308 return False 1309 logging.debug('latest log for servod created at %s', 1310 self._initial_instance_ts) 1311 try: 1312 log_created = calendar.timegm(time.strptime( 1313 self._initial_instance_ts, 1314 "%Y-%m-%d--%H-%M-%S.%f")) 1315 except ValueError as e: 1316 logging.debug('Cannot read time from log file name: %s', 1317 self._initial_instance_ts) 1318 return False 1319 min_time_created = calendar.timegm(time.gmtime()) 1320 if min_time_created > log_created + 3600: 1321 # the log file is old we cannot use it 1322 logging.debug('log file was created more than hour ago, too old') 1323 return False 1324 logging.debug('latest log was created not longer then 1 hour ago') 1325 1326 # check if servod can detect main device by servo_v4 1327 message = 'ERROR - No servo micro or CCD detected for board' 1328 cmd = ('cat /var/log/servod_%s/log.%s.INFO |grep "%s"' 1329 % (self.servo_port, self._initial_instance_ts, message)) 1330 result = self.run(cmd, ignore_status=True) 1331 if result.stdout.strip(): 1332 logging.info('Servod cannot detect main device on the servo; ' 1333 'Can be caused by bad hardware of servo or ' 1334 'issue on the DUT side.') 1335 return True 1336 logging.debug('The main device is detected') 1337 return False 1338 1339 def _require_cr50_servod_config(self): 1340 """Check whether we need start servod with CONFIG=cr50.xml""" 1341 dut_host_info = self.get_dut_host_info() 1342 if not dut_host_info: 1343 return False 1344 for pool in dut_host_info.pools: 1345 if pool.startswith(servo_constants.CR50_CONFIG_POOL_PREFIX): 1346 return True 1347 return False 1348 1349 def get_verifier_state(self, tag): 1350 """Return the state of servo verifier. 1351 1352 @returns: bool or None 1353 """ 1354 return self._repair_strategy.verifier_is_good(tag) 1355 1356 def determine_servo_state(self): 1357 """Determine servo state based on the failed verifier. 1358 1359 @returns: servo state value 1360 The state detecting based on first fail verifier or collecting of 1361 them. 1362 """ 1363 ssh = self.get_verifier_state('servo_ssh') 1364 disk_space = self.get_verifier_state('disk_space') 1365 start_servod = self.get_verifier_state('servod_job') 1366 create_servo = self.get_verifier_state('servod_connection') 1367 init_servo = self.get_verifier_state('servod_control') 1368 servo_topology = self.get_verifier_state('servo_topology') 1369 dut_connected = self.get_verifier_state('dut_connected') 1370 hub_connected = self.get_verifier_state('hub_connected') 1371 pwr_button = self.get_verifier_state('pwr_button') 1372 lid_open = self.get_verifier_state('lid_open') 1373 ec_board = self.get_verifier_state('ec_board') 1374 cr50_console = self.get_verifier_state('cr50_console') 1375 ccd_testlab = self.get_verifier_state('ccd_testlab') 1376 1377 if not ssh: 1378 return servo_constants.SERVO_STATE_NO_SSH 1379 1380 if (start_servod == hosts.VERIFY_FAILED 1381 or create_servo == hosts.VERIFY_FAILED): 1382 # sometimes servo can start with out present servo 1383 if self.is_labstation(): 1384 if not self.servo_serial: 1385 return servo_constants.SERVO_STATE_WRONG_CONFIG 1386 if self._is_servo_device_connected( 1387 'servo_v4', 1388 self.servo_serial) == False: 1389 return servo_constants.SERVO_STATE_NOT_CONNECTED 1390 elif self._is_servo_board_present_on_servo_v3() == False: 1391 return servo_constants.SERVO_STATE_NOT_CONNECTED 1392 1393 if servo_topology == hosts.VERIFY_FAILED: 1394 return servo_constants.SERVO_STATE_TOPOLOGY_ISSUE 1395 1396 if (dut_connected == hosts.VERIFY_FAILED 1397 or hub_connected == hosts.VERIFY_FAILED): 1398 return servo_constants.SERVO_STATE_DUT_NOT_CONNECTED 1399 # TODO(otabek@): detect special cases detected by pwr_button 1400 if dut_connected == hosts.VERIFY_SUCCESS: 1401 if pwr_button == hosts.VERIFY_FAILED: 1402 metrics.Counter( 1403 'chromeos/autotest/repair/servo_unexpected/pwr_button2' 1404 ).increment(fields=self._get_host_metrics_data()) 1405 1406 if start_servod == hosts.VERIFY_FAILED: 1407 return servo_constants.SERVO_STATE_SERVOD_ISSUE 1408 1409 if create_servo == hosts.VERIFY_FAILED: 1410 if (self.is_labstation() 1411 and self._is_main_device_not_detected_on_servo_v4()): 1412 servo_type = None 1413 if self.get_dut_host_info(): 1414 servo_type = self.get_dut_host_info().get_label_value( 1415 servo_constants.SERVO_TYPE_LABEL_PREFIX) 1416 if servo_type and 'servo_micro' in servo_type: 1417 serial = self.get_servo_micro_serial_number() 1418 logging.debug('servo_micro serial: %s', serial) 1419 if self._is_servo_device_detected('servo_micro', 1420 serial): 1421 return servo_constants.SERVO_STATE_BAD_RIBBON_CABLE 1422 # Device can be not detected because of DUT 1423 # TODO (otabek) update after b/159755652 and b/159754985 1424 metrics.Counter( 1425 'chromeos/autotest/repair/servo_state/needs_replacement' 1426 ).increment(fields=self._get_host_metrics_data()) 1427 elif not self.is_labstation(): 1428 # Here need logic to check if flex cable is connected 1429 pass 1430 1431 # one of the reason why servo can not initialized 1432 if cr50_console == hosts.VERIFY_FAILED: 1433 return servo_constants.SERVO_STATE_CR50_CONSOLE_MISSING 1434 if ccd_testlab == hosts.VERIFY_FAILED: 1435 return servo_constants.SERVO_STATE_CCD_TESTLAB_ISSUE 1436 1437 if (create_servo == hosts.VERIFY_FAILED 1438 or init_servo == hosts.VERIFY_FAILED): 1439 return servo_constants.SERVO_STATE_SERVOD_ISSUE 1440 1441 if ec_board == hosts.VERIFY_FAILED: 1442 return servo_constants.SERVO_STATE_EC_BROKEN 1443 if pwr_button == hosts.VERIFY_FAILED: 1444 return servo_constants.SERVO_STATE_BAD_RIBBON_CABLE 1445 if lid_open == hosts.VERIFY_FAILED: 1446 return servo_constants.SERVO_STATE_LID_OPEN_FAILED 1447 1448 metrics.Counter( 1449 'chromeos/autotest/repair/unknown_servo_state' 1450 ).increment(fields=self._get_host_metrics_data()) 1451 logging.info('We do not have special state for this failure yet :)') 1452 return servo_constants.SERVO_STATE_BROKEN 1453 1454 def is_servo_topology_supported(self): 1455 """Check if servo_topology is supported.""" 1456 if not self.is_up_fast(): 1457 logging.info('Servo-Host is not reachable.') 1458 return False 1459 if not self.is_labstation(): 1460 logging.info('Servo-topology supported only for labstation.') 1461 return False 1462 if not self.servo_serial: 1463 logging.info('Servo-topology required a servo serial.') 1464 return False 1465 return True 1466 1467 def get_topology(self): 1468 """Get servo topology.""" 1469 return self._topology 1470 1471 def is_dual_setup(self): 1472 """Check is servo will run in dual setup. 1473 1474 Dual setup used only for servo_v4 when used ccd_cr50 and servo_micro 1475 at the same time. 1476 """ 1477 return self.servo_setup == servo_constants.SERVO_SETUP_VALUE_DUAL_V4 1478 1479 def set_dut_health_profile(self, dut_health_profile): 1480 """ 1481 @param dut_health_profile: A DeviceHealthProfile object. 1482 """ 1483 logging.debug('setting dut_health_profile field to (%s)', 1484 dut_health_profile) 1485 self._dut_health_profile = dut_health_profile 1486 1487 def get_dut_health_profile(self): 1488 """ 1489 @return A DeviceHealthProfile object. 1490 """ 1491 return self._dut_health_profile 1492 1493 1494def make_servo_hostname(dut_hostname): 1495 """Given a DUT's hostname, return the hostname of its servo. 1496 1497 @param dut_hostname: hostname of a DUT. 1498 1499 @return hostname of the DUT's servo. 1500 1501 """ 1502 host_parts = dut_hostname.split('.') 1503 host_parts[0] = host_parts[0] + '-servo' 1504 return '.'.join(host_parts) 1505 1506 1507def _map_afe_board_to_servo_board(afe_board): 1508 """Map a board we get from the AFE to a servo appropriate value. 1509 1510 Many boards are identical to other boards for servo's purposes. 1511 This function makes that mapping. 1512 1513 @param afe_board string board name received from AFE. 1514 @return board we expect servo to have. 1515 1516 """ 1517 KNOWN_SUFFIXES = ['-freon', '_freon', '_moblab', '-cheets'] 1518 BOARD_MAP = {'gizmo': 'panther'} 1519 mapped_board = afe_board 1520 if afe_board in BOARD_MAP: 1521 mapped_board = BOARD_MAP[afe_board] 1522 else: 1523 for suffix in KNOWN_SUFFIXES: 1524 if afe_board.endswith(suffix): 1525 mapped_board = afe_board[0:-len(suffix)] 1526 break 1527 if mapped_board != afe_board: 1528 logging.info('Mapping AFE board=%s to %s', afe_board, mapped_board) 1529 return mapped_board 1530 1531 1532def get_servo_args_for_host(dut_host): 1533 """Return servo data associated with a given DUT. 1534 1535 @param dut_host Instance of `Host` on which to find the servo 1536 attributes. 1537 @return `servo_args` dict with host and an optional port. 1538 """ 1539 info = dut_host.host_info_store.get() 1540 servo_args = {k: v for k, v in six.iteritems(info.attributes) 1541 if k in servo_constants.SERVO_ATTR_KEYS} 1542 1543 if servo_constants.SERVO_PORT_ATTR in servo_args: 1544 try: 1545 servo_args[servo_constants.SERVO_PORT_ATTR] = int( 1546 servo_args[servo_constants.SERVO_PORT_ATTR]) 1547 except ValueError: 1548 logging.error('servo port is not an int: %s', 1549 servo_args[servo_constants.SERVO_PORT_ATTR]) 1550 # Reset servo_args because we don't want to use an invalid port. 1551 servo_args.pop(servo_constants.SERVO_HOST_ATTR, None) 1552 1553 if info.board: 1554 servo_board = _map_afe_board_to_servo_board(info.board) 1555 servo_args[servo_constants.SERVO_BOARD_ATTR] = servo_board 1556 if info.model: 1557 servo_args[servo_constants.SERVO_MODEL_ATTR] = info.model 1558 return servo_args if servo_constants.SERVO_HOST_ATTR in servo_args else None 1559 1560 1561def _tweak_args_for_ssp_moblab(servo_args): 1562 if (servo_args[servo_constants.SERVO_HOST_ATTR] 1563 in ['localhost', '127.0.0.1']): 1564 servo_args[servo_constants.SERVO_HOST_ATTR] = _CONFIG.get_config_value( 1565 'SSP', 'host_container_ip', type=str, default=None) 1566 1567 1568def create_servo_host(dut, 1569 servo_args, 1570 try_lab_servo=False, 1571 try_servo_repair=False, 1572 dut_host_info=None, 1573 dut_health_profile=None): 1574 """Create a ServoHost object for a given DUT, if appropriate. 1575 1576 This function attempts to create and verify or repair a `ServoHost` 1577 object for a servo connected to the given `dut`, subject to various 1578 constraints imposed by the parameters: 1579 * When the `servo_args` parameter is not `None`, a servo 1580 host must be created, and must be checked with `repair()`. 1581 * Otherwise, if a servo exists in the lab and `try_lab_servo` is 1582 true: 1583 * If `try_servo_repair` is true, then create a servo host and 1584 check it with `repair()`. 1585 * Otherwise, if the servo responds to `ping` then create a 1586 servo host and check it with `verify()`. 1587 1588 In cases where `servo_args` was not `None`, repair failure 1589 exceptions are passed back to the caller; otherwise, exceptions 1590 are logged and then discarded. Note that this only happens in cases 1591 where we're called from a test (not special task) control file that 1592 has an explicit dependency on servo. In that case, we require that 1593 repair not write to `status.log`, so as to avoid polluting test 1594 results. 1595 1596 TODO(jrbarnette): The special handling for servo in test control 1597 files is a thorn in my flesh; I dearly hope to see it cut out before 1598 my retirement. 1599 1600 Parameters for a servo host consist of a host name, port number, and 1601 DUT board, and are determined from one of these sources, in order of 1602 priority: 1603 * Servo attributes from the `dut` parameter take precedence over 1604 all other sources of information. 1605 * If a DNS entry for the servo based on the DUT hostname exists in 1606 the CrOS lab network, that hostname is used with the default 1607 port and the DUT's board. 1608 * If no other options are found, the parameters will be taken 1609 from the `servo_args` dict passed in from the caller. 1610 1611 @param dut An instance of `Host` from which to take 1612 servo parameters (if available). 1613 @param servo_args A dictionary with servo parameters to use if 1614 they can't be found from `dut`. If this 1615 argument is supplied, unrepaired exceptions 1616 from `verify()` will be passed back to the 1617 caller. 1618 @param try_lab_servo If not true, servo host creation will be 1619 skipped unless otherwise required by the 1620 caller. 1621 @param try_servo_repair If true, check a servo host with 1622 `repair()` instead of `verify()`. 1623 @param dut_host_info: A HostInfo object of the DUT that connected 1624 to this servo. 1625 @param dut_health_profile: DUT repair info with history. 1626 1627 @returns: A ServoHost object or None. See comments above. 1628 1629 """ 1630 servo_dependency = servo_args is not None 1631 if dut is not None and (try_lab_servo or servo_dependency): 1632 servo_args_override = get_servo_args_for_host(dut) 1633 if servo_args_override is not None: 1634 if utils.in_moblab_ssp(): 1635 _tweak_args_for_ssp_moblab(servo_args_override) 1636 logging.debug( 1637 'Overriding provided servo_args (%s) with arguments' 1638 ' determined from the host (%s)', 1639 servo_args, 1640 servo_args_override, 1641 ) 1642 servo_args = servo_args_override 1643 1644 if servo_args is None: 1645 logging.debug('No servo_args provided, and failed to find overrides.') 1646 if try_lab_servo or servo_dependency: 1647 return None, servo_constants.SERVO_STATE_MISSING_CONFIG 1648 else: 1649 # For regular test case which not required the servo 1650 return None, None 1651 1652 servo_hostname = servo_args.get(servo_constants.SERVO_HOST_ATTR) 1653 servo_port = servo_args.get(servo_constants.SERVO_PORT_ATTR) 1654 if not _is_servo_host_information_exist(servo_hostname, servo_port): 1655 logging.debug( 1656 'Servo connection info missed hostname: %s , port: %s', 1657 servo_hostname, servo_port) 1658 return None, servo_constants.SERVO_STATE_MISSING_CONFIG 1659 if not is_servo_host_information_valid(servo_hostname, servo_port): 1660 logging.debug( 1661 'Servo connection info is incorrect hostname: %s , port: %s', 1662 servo_hostname, servo_port) 1663 return None, servo_constants.SERVO_STATE_WRONG_CONFIG 1664 1665 newhost = ServoHost(**servo_args) 1666 if not newhost.is_up_fast(count=3): 1667 # ServoHost has internal check to wait if servo-host is in reboot 1668 # process. If servo-host still is not available this check will stop 1669 # further attempts as we do not have any option to recover servo_host. 1670 return None, servo_constants.SERVO_STATE_NO_SSH 1671 1672 # Reset or reboot servo device only during AdminRepair tasks. 1673 if try_servo_repair: 1674 if newhost._is_locked: 1675 # Reset servo if the servo is locked, as we check if the servohost 1676 # is up, if the servohost is labstation and if the servohost is in 1677 # lab inside the locking logic. 1678 newhost.reset_servo() 1679 else: 1680 try: 1681 newhost.reboot_servo_v3_on_need() 1682 except Exception as e: 1683 logging.info('[Non-critical] Unexpected error while trying to' 1684 ' reboot servo_v3, skipping the reboot; %s', e) 1685 1686 if dut: 1687 newhost.set_dut_hostname(dut.hostname) 1688 if dut_host_info: 1689 newhost.set_dut_host_info(dut_host_info) 1690 if dut_health_profile and (try_lab_servo or try_servo_repair): 1691 try: 1692 if newhost.is_localhost(): 1693 logging.info('Servohost is a localhost, skip device' 1694 ' health profile setup...') 1695 else: 1696 dut_health_profile.init_profile(newhost) 1697 newhost.set_dut_health_profile(dut_health_profile) 1698 except Exception as e: 1699 logging.info( 1700 '[Non-critical] Unexpected error while trying to' 1701 ' load device health profile; %s', e) 1702 1703 if try_lab_servo or try_servo_repair: 1704 try: 1705 logging.info("Check and update servo firmware.") 1706 servo_updater.update_servo_firmware( 1707 newhost, 1708 force_update=False) 1709 except Exception as e: 1710 logging.error("Servo device update error: %s", e) 1711 1712 try: 1713 newhost.restart_servod(quick_startup=True) 1714 except error.AutoservSSHTimeout: 1715 logging.warning("Restart servod failed due ssh connection " 1716 "to servohost timed out. This error is forgiven" 1717 " here, we will retry in servo repair process.") 1718 except error.AutoservRunError as e: 1719 logging.warning("Restart servod failed due to:\n%s\n" 1720 "This error is forgiven here, we will retry" 1721 " in servo repair process.", str(e)) 1722 1723 # Note that the logic of repair() includes everything done 1724 # by verify(). It's sufficient to call one or the other; 1725 # we don't need both. 1726 if servo_dependency: 1727 newhost.repair(silent=True) 1728 return newhost, newhost.get_servo_state() 1729 1730 if try_servo_repair: 1731 try: 1732 newhost.repair() 1733 except Exception: 1734 logging.exception('servo repair failed for %s', newhost.hostname) 1735 else: 1736 try: 1737 newhost.verify() 1738 except Exception: 1739 logging.exception('servo verify failed for %s', newhost.hostname) 1740 return newhost, newhost.get_servo_state() 1741 1742 1743def _is_servo_host_information_exist(hostname, port): 1744 if hostname is None or len(hostname.strip()) == 0: 1745 return False 1746 if port is None: 1747 return False 1748 if not type(port) is int: 1749 try: 1750 int(port) 1751 except ValueError: 1752 return False 1753 1754 return True 1755 1756 1757def is_servo_host_information_valid(hostname, port): 1758 """Check if provided servo attributes are valid. 1759 1760 @param hostname Hostname of the servohost. 1761 @param port servo port number. 1762 1763 @returns: A bool value to indicate if provided servo attribute valid. 1764 """ 1765 if not _is_servo_host_information_exist(hostname, port): 1766 return False 1767 # checking range and correct of the port 1768 port_int = int(port) 1769 if port_int < 1 or port_int > 65000: 1770 return False 1771 # we expecting host contain only latters, digits and '-' or '_' 1772 if not re.match('[a-zA-Z0-9-_\.]*$', hostname) or len(hostname) < 5: 1773 return False 1774 return True 1775