1# Lint as: python2, python3 2# Copyright (c) 2013 The Chromium OS Authors. All rights reserved. 3# Use of this source code is governed by a BSD-style license that can be 4# found in the LICENSE file. 5# 6# Expects to be run in an environment with sudo and no interactive password 7# prompt, such as within the Chromium OS development chroot. 8 9 10"""This file provides core logic for servo verify/repair process.""" 11 12 13from __future__ import absolute_import 14from __future__ import division 15from __future__ import print_function 16 17import logging 18import os 19import re 20import tarfile 21import threading 22import json 23import tempfile 24import time 25import six 26import six.moves.xmlrpc_client 27 28try: 29 import docker 30 from autotest_lib.site_utils.docker import utils as docker_utils 31except ImportError: 32 logging.info("Docker API is not installed in this environment") 33 34from autotest_lib.client.bin import utils 35from autotest_lib.client.common_lib import error 36from autotest_lib.client.common_lib import hosts 37from autotest_lib.client.common_lib import lsbrelease_utils 38from autotest_lib.client.common_lib.cros import retry 39from autotest_lib.server import crashcollect 40from autotest_lib.server.cros.servo import servo 41from autotest_lib.server.hosts import servo_repair 42from autotest_lib.server.hosts import base_servohost 43from autotest_lib.server.hosts import servo_constants 44from autotest_lib.server.cros.faft.utils import config 45from autotest_lib.client.common_lib import global_config 46from autotest_lib.server.cros.servo.topology import servo_topology 47 48try: 49 from autotest_lib.utils.frozen_chromite.lib import metrics 50except ImportError: 51 metrics = utils.metrics_mock 52 53_CONFIG = global_config.global_config 54 55SERVOD_CONTAINER_IMAGE_PATH = "us-docker.pkg.dev/chromeos-partner-moblab/common-core" 56 57# Create the flag on drone or inside the ssp container to enable servod debug mode. 58# The servod container will not be deleted after being stopped. 59SERVOD_DEBUG_FLAG = '/servod_debug' 60 61DOCKER_SERVOD_DEBUG_MODE = os.environ.get('DOCKER_SERVOD_DEBUG_MODE', '0') 62 63 64class ServoHost(base_servohost.BaseServoHost): 65 """Host class for a servo host(e.g. beaglebone, labstation) 66 that with a servo instance for a specific port. 67 68 @type _servo: servo.Servo | None 69 """ 70 71 DEFAULT_PORT = int(os.getenv('SERVOD_PORT', '9999')) 72 73 # Timeout for initializing servo signals. 74 INITIALIZE_SERVO_TIMEOUT_SECS = 60 75 76 # Default timeout for run terminal command. 77 DEFAULT_TERMINAL_TIMEOUT = 30 78 79 # Ready test function 80 SERVO_READY_METHOD = 'get_version' 81 82 # Directory prefix on the servo host where the servod logs are stored. 83 SERVOD_LOG_PREFIX = '/var/log/servod' 84 85 # Exit code to use when symlinks for servod logs are not found. 86 NO_SYMLINKS_CODE = 9 87 88 # Directory in the job's results directory to dump the logs into. 89 LOG_DIR = 'servod' 90 91 # Prefix for joint loglevel files in the logs. 92 JOINT_LOG_PREFIX = 'log' 93 94 # Regex group to extract timestamp from logfile name. 95 TS_GROUP = 'ts' 96 97 # This regex is used to extract the timestamp from servod logs. 98 # files always start with log. 99 TS_RE = (r'log.' 100 # The timestamp is of format %Y-%m-%d--%H-%M-%S.MS 101 r'(?P<%s>\d{4}(\-\d{2}){2}\-(-\d{2}){3}.\d{3})' 102 # The loglevel is optional depending on labstation version. 103 r'(.(INFO|DEBUG|WARNING))?' % TS_GROUP) 104 TS_EXTRACTOR = re.compile(TS_RE) 105 106 # Regex group to extract MCU name from logline in servod logs. 107 MCU_GROUP = 'mcu' 108 109 # Regex group to extract logline from MCU logline in servod logs. 110 LINE_GROUP = 'line' 111 112 # This regex is used to extract the mcu and the line content from an 113 # MCU logline in servod logs. e.g. EC or servo_v4 console logs. 114 # Here is an example log-line: 115 # 116 # 2020-01-23 13:15:12,223 - servo_v4 - EC3PO.Console - DEBUG - 117 # console.py:219:LogConsoleOutput - /dev/pts/9 - cc polarity: cc1 118 # 119 # Here is conceptually how they are formatted: 120 # 121 # <time> - <MCU> - EC3PO.Console - <LVL> - <file:line:func> - <pts> - 122 # <output> 123 # 124 # The log format starts with a timestamp 125 MCU_RE = (r'[\d\-]+ [\d:,]+ ' 126 # The mcu that is logging this is next. 127 r'- (?P<%s>[\w/]+) - ' 128 # Next, we have more log outputs before the actual line. 129 # Information about the file line, logging function etc. 130 # Anchor on EC3PO Console, LogConsoleOutput and dev/pts. 131 # NOTE: if the log format changes, this regex needs to be 132 # adjusted. 133 r'EC3PO\.Console[\s\-\w\d:.]+LogConsoleOutput - /dev/pts/\d+ - ' 134 # Lastly, we get the MCU's console line. 135 r'(?P<%s>.+$)' % (MCU_GROUP, LINE_GROUP)) 136 MCU_EXTRACTOR = re.compile(MCU_RE) 137 138 # Regex to detect timeout messages when USBC pigtail has timeout issue. 139 # e.g.: [475635.427072 PD TMOUT RX 1/1] 140 USBC_PIGTAIL_TIMEOUT_RE = r'\[[\d \.]{1,20}(PD TMOUT RX 1\/1)\]' 141 142 # Suffix to identify compressed logfiles. 143 COMPRESSION_SUFFIX = '.tbz2' 144 145 # A suffix to mark servod log directories that came from instance that 146 # ran during this servo_host, but are not the last one running e.g. when 147 # an instance (on purpose, or due to a bug) restarted in the middle of the 148 # run. 149 OLD_LOG_SUFFIX = 'old' 150 151 def _init_attributes(self): 152 self._servo_state = None 153 self.servo_port = None 154 self.servo_board = None 155 self._ec_supported = None 156 self.servo_model = None 157 self.servo_serial = None 158 self.servo_setup = None 159 self.servo_recovery = None 160 self.servo_fw_channel = None 161 self.additional_servod_args = None 162 self._dut_health_profile = None 163 # The flag that indicate if a servo is connected to a smart usbhub. 164 # TODO(xianuowang@) remove this flag once all usbhubs in the lab 165 # get replaced. 166 self.smart_usbhub = None 167 self._servo = None 168 self._topology = None 169 self._tunnel_proxy = None 170 self._tunnel_proxy_lock = threading.Lock() 171 self._initial_instance_ts = None 172 # Flag to make sure that multiple calls to close do not result in the 173 # logic executing multiple times. 174 self._closed = False 175 # Per-thread local data 176 self._local = threading.local() 177 178 def _initialize(self, 179 servo_host='localhost', 180 servo_port=DEFAULT_PORT, 181 servo_board=None, 182 servo_model=None, 183 servo_serial=None, 184 servo_setup=None, 185 servo_recovery=None, 186 servo_fw_channel=None, 187 additional_servod_args=None, 188 is_in_lab=None, 189 *args, 190 **dargs): 191 """Initialize a ServoHost instance. 192 193 A ServoHost instance represents a host that controls a servo. 194 195 @param servo_host: Name of the host where the servod process 196 is running. 197 @param servo_port: Port the servod process is listening on. Defaults 198 to the SERVOD_PORT environment variable if set, 199 otherwise 9999. 200 @param servo_board: Board that the servo is connected to. 201 @param servo_model: Model that the servo is connected to. 202 @param servo_serial: Serial number of the servo device. 203 @param servo_setup: Type of servo setup, e.g. REGULAR or DUAL_V4. 204 @param additional_servod_args: Additional args that will append to 205 servod start command. 206 @param is_in_lab: True if the servo host is in Cros Lab. Default is set 207 to None, for which utils.host_is_in_lab_zone will be 208 called to check if the servo host is in Cros lab. 209 210 """ 211 super(ServoHost, self)._initialize(hostname=servo_host, 212 is_in_lab=is_in_lab, *args, **dargs) 213 self._init_attributes() 214 self.servo_port = int(servo_port) 215 self.servo_board = servo_board 216 self.servo_model = servo_model 217 self.servo_serial = servo_serial 218 self.servo_setup = servo_setup 219 self.servo_recovery = servo_recovery 220 self.servo_fw_channel = servo_fw_channel 221 self.additional_servod_args = additional_servod_args 222 223 # The location of the log files on the servo host for this instance. 224 self.remote_log_dir = '%s_%s' % (self.SERVOD_LOG_PREFIX, 225 self.servo_port) 226 # Path of the servo host lock file. 227 self._lock_file = (self.TEMP_FILE_DIR + str(self.servo_port) + 228 self.LOCK_FILE_POSTFIX) 229 # File path to declare a reboot request. 230 self._reboot_file = (self.TEMP_FILE_DIR + str(self.servo_port) + 231 self.REBOOT_FILE_POSTFIX) 232 233 # Lock the servo host if it's an in-lab labstation to prevent other 234 # task to reboot it until current task completes. We also wait and 235 # make sure the labstation is up here, in the case of the labstation is 236 # in the middle of reboot. 237 self._is_locked = False 238 if (not self.is_containerized_servod() 239 and self.wait_up(self.REBOOT_TIMEOUT) and self.is_in_lab() 240 and self.is_labstation()): 241 self._lock() 242 try: 243 self.wait_ready() 244 except Exception as e: 245 logging.info( 246 'Unexpected error while ensure labstation' 247 ' readiness; %s', str(e)) 248 249 self._repair_strategy = ( 250 servo_repair.create_servo_repair_strategy()) 251 252 def __str__(self): 253 return "<%s '%s:%s'>" % ( 254 type(self).__name__, self.hostname, self.servo_port) 255 256 def connect_servo(self): 257 """ Initialize and setup servo for later use. 258 """ 259 self.initialize_servo() 260 self.initialize_dut_for_servo() 261 262 def initialize_servo(self): 263 """Establish a connection to the servod server on this host. 264 265 Initializes `self._servo` and then verifies that all network 266 connections are working. This will create an ssh tunnel if 267 it's required. 268 """ 269 self._servo = servo.Servo(servo_host=self, 270 servo_serial=self.servo_serial) 271 272 def initialize_dut_for_servo(self): 273 """This method will do some setup for dut control, e.g. setup 274 main servo_v4 device, and also testing the connection between servo 275 and DUT. As a side effect of testing the connection, all signals on 276 the target servo are reset to default values, and the USB stick is 277 set to the neutral (off) position. 278 """ 279 if not self._servo: 280 raise hosts.AutoservVerifyError('Servo object needs to be' 281 ' initialized before initialize' 282 ' DUT.') 283 timeout, _ = retry.timeout( 284 self._servo.initialize_dut, 285 timeout_sec=self.INITIALIZE_SERVO_TIMEOUT_SECS) 286 if timeout: 287 raise hosts.AutoservVerifyError('Initialize dut for servo timed' 288 ' out.') 289 290 def disconnect_servo(self): 291 """Disconnect our servo if it exists. 292 293 If we've previously successfully connected to our servo, 294 disconnect any established ssh tunnel, and set `self._servo` 295 back to `None`. 296 """ 297 if self._servo: 298 # N.B. This call is safe even without a tunnel: 299 # rpc_server_tracker.disconnect() silently ignores 300 # unknown ports. 301 self.rpc_server_tracker.disconnect(self.servo_port) 302 self._servo = None 303 304 def _to_str(self, maybe_str): 305 """If the param is bytes, convert it to a str.""" 306 # b/217780680, Make this compatible with python3, 307 if isinstance(maybe_str, bytes): 308 return maybe_str.decode('utf-8', errors='replace') 309 return maybe_str 310 311 def _maybe_create_servod_ssh_tunnel_proxy(self): 312 """Create a xmlrpc proxy for use with a ssh tunnel. 313 A lock is used to safely create a singleton proxy. 314 """ 315 with self._tunnel_proxy_lock: 316 if self._tunnel_proxy is None: 317 self._tunnel_proxy = self.rpc_server_tracker.xmlrpc_connect( 318 None, 319 self.servo_port, 320 ready_test_name=self.SERVO_READY_METHOD, 321 timeout_seconds=60, 322 request_timeout_seconds=3600, 323 server_desc=str(self)) 324 325 def get_servod_server_proxy(self): 326 """Return a proxy if it exists; otherwise, create a new one. 327 A proxy can either be a ssh tunnel based proxy, or a httplib 328 based proxy. 329 330 @returns: An xmlrpclib.ServerProxy that is connected to the servod 331 server on the host. 332 """ 333 if (servo_constants.ENABLE_SSH_TUNNEL_FOR_SERVO 334 and not self.is_localhost() 335 and not self.is_containerized_servod()): 336 # Check for existing ssh tunnel proxy. 337 if self._tunnel_proxy is None: 338 self._maybe_create_servod_ssh_tunnel_proxy() 339 return self._tunnel_proxy 340 else: 341 # xmlrpc/httplib is not thread-safe, so each thread must have its 342 # own separate proxy connection. 343 if not hasattr(self._local, "_per_thread_proxy"): 344 if self.is_containerized_servod(): 345 ip_addr = docker_utils.get_container_ip(self.hostname) 346 remote = 'http://%s:%s' % (ip_addr, self.servo_port) 347 else: 348 remote = 'http://%s:%s' % (self.hostname, self.servo_port) 349 logging.debug('Servo Proxy RPC URL: %s', remote) 350 self._local._per_thread_proxy = six.moves.xmlrpc_client.ServerProxy(remote) 351 return self._local._per_thread_proxy 352 353 def verify(self, silent=False): 354 """Update the servo host and verify it's in a good state. 355 356 @param silent If true, suppress logging in `status.log`. 357 """ 358 message = 'Beginning verify for servo host %s port %s serial %s' 359 message %= (self.hostname, self.servo_port, self.servo_serial) 360 self.record('INFO', None, None, message) 361 try: 362 self._repair_strategy.verify(self, silent) 363 self._servo_state = servo_constants.SERVO_STATE_WORKING 364 self.record('INFO', None, None, 365 'ServoHost verify set servo_state as WORKING') 366 except Exception as e: 367 if not self.is_localhost(): 368 self._servo_state = self.determine_servo_state() 369 self.record('INFO', None, None, 370 'ServoHost verify set servo_state as %s' 371 % self._servo_state) 372 if self._is_critical_error(e): 373 raise 374 375 def _get_default_usbkey_mount_path(self): 376 return '/media/servo_usb/%s' % self.servo_port 377 378 def get_image_name_from_usbkey(self, usbkey_dev): 379 """Mount usb drive and check ChromeOS image name on it if there is 380 one. This method assumes the image_usbkey_direction is already set 381 to servo side. 382 383 @param usbkey_dev: usbkey dev path(e.g. /dev/sdb). 384 385 @returns: image_name on the usbkey, e.g. nami-release/R82.10138.0.0, 386 or empty string if no test image detected, or unexpected 387 error occurred. 388 """ 389 logging.info('Checking ChromeOS image name on usbkey.') 390 mount_dst = self._get_default_usbkey_mount_path() 391 # Unmount if there is an existing stale mount. 392 self._unmount_drive(mount_dst) 393 # ChromeOS root fs is in /dev/sdx3 394 mount_src = usbkey_dev + '3' 395 try: 396 if not self._mount_drive(mount_src, mount_dst): 397 logging.debug('Unexpected error occurred on mount usb drive.') 398 return '' 399 400 release_content = self._to_str( 401 self.run('cat %s/etc/lsb-release' % mount_dst, 402 ignore_status=True).stdout.strip()) 403 404 if not re.search(r'RELEASE_TRACK=.*test', release_content): 405 logging.info('The image on usbkey is not a test image') 406 return '' 407 408 return lsbrelease_utils.get_chromeos_release_builder_path( 409 lsb_release_content=release_content) 410 finally: 411 logging.debug('Image check compeleted, unmounting the usb drive.') 412 self._unmount_drive(mount_dst) 413 414 def _extract_firmware_image_from_usbkey(self, fw_dst): 415 """Extract firmware images from the usbkey on servo, this method 416 assumes there is already a ChromeOS test image staged on servo. 417 418 @param fw_dst: the path that we'll copy firmware images to. 419 420 @returns: a json format string of firmware manifest data. 421 """ 422 usbkey_dev = self._probe_and_validate_usb_dev() 423 if not usbkey_dev: 424 raise hosts.AutoservRepairError('Unexpected error occurred when' 425 ' probe usbkey dev path, please check logs for detail.') 426 427 mount_dst = self._get_default_usbkey_mount_path() 428 # Unmount if there is an existing stale mount. 429 self._unmount_drive(mount_dst) 430 # ChromeOS root fs is in /dev/sdx3 431 mount_src = usbkey_dev + '3' 432 try: 433 if not self._mount_drive(mount_src, mount_dst): 434 raise hosts.AutoservRepairError('Failed to extract firmware' 435 ' image; Unable to mount %s.' % usbkey_dev, 436 'unable to mount usbkey') 437 updater_bin = os.path.join(mount_dst, 438 'usr/sbin/chromeos-firmwareupdate') 439 self.run('%s --unpack %s' % (updater_bin, fw_dst)) 440 return self._to_str(self.run('%s --manifest' % updater_bin).stdout) 441 442 finally: 443 self._unmount_drive(mount_dst) 444 445 def prepare_repair_firmware_image(self, fw_dst=None): 446 """Prepare firmware image on the servohost for auto repair process 447 to consume. 448 449 @param fw_dst: the path that we want to store firmware image on 450 the servohost. 451 452 @returns: A tuple that containes ec firmware image path and bios 453 firmware image path on the servohost, or None if type of 454 image is not available based on manifest and dut's model. 455 """ 456 model = self.servo_model or self._dut_host_info.model 457 if not model: 458 raise hosts.AutoservRepairError( 459 'Could not determine DUT\'s model.', 460 'model infomation unknown') 461 462 if not fw_dst: 463 fw_dst = '/tmp/firmware_image/%s' % self.servo_port 464 # Cleanup and re-create dst path to have a fresh start. 465 self.run('rm -rf %s' % fw_dst) 466 self.run('mkdir -p %s' % fw_dst) 467 468 manifest = json.loads(self._extract_firmware_image_from_usbkey(fw_dst)) 469 # For models that have packed $MODEL_signed variant, we want use the 470 # 'signed' variant once we get DVT devices, so try to read manifest 471 # from $MODEL_signed first. 472 build = manifest.get('%s_signed' % model) or manifest.get(model) 473 if not build: 474 raise hosts.AutoservRepairError('Could not find firmware manifest' 475 ' for model:%s' % model, 'model manifest not found') 476 try: 477 ec_image = os.path.join(fw_dst, build['ec']['image']) 478 except KeyError: 479 ec_image = None 480 try: 481 bios_image = os.path.join(fw_dst, build['host']['image']) 482 except KeyError: 483 bios_image = None 484 if not ec_image and not bios_image: 485 raise hosts.AutoservRepairError('Could not find any firmware image' 486 ' for model:%s' % model, 'cannot find firmware image') 487 return ec_image, bios_image 488 489 def flash_ap_firmware_via_servo(self, image): 490 """Flash AP firmware by use a provided image. 491 492 This is will be a short term enhanment for infra repair use, it use 493 'futility update' which will automatically determine various parameters 494 needed for flashrom, and will preserve the GBB, VPD, and HWID for 495 AP firmware update. 496 @TODO(xianuowang@) Remove this method once b/148403277 implemented. 497 498 @param image: the firmware image path on servohost. 499 """ 500 cmd = 'futility update -i %s --servo_port=%s' 501 self.run(cmd % (image, self.servo_port), timeout=900) 502 503 def _probe_and_validate_usb_dev(self): 504 """This method probe the usb dev path by talking to servo, and then 505 validate the dev path is valid block device to servohost. 506 Possible output: 507 1. Encounter error during probe usb dev, returns empty string. 508 2. probe usb dev completed without error but cannot find usb dev, 509 raise AutoservRepairError. 510 3. probe usb dev find a usb dev path, but failed validation in this 511 method, raise AutoservRepairError. 512 513 @returns: A string of usb dev path(e.g. '/dev/sdb'), or empty string 514 if unexpected error occurred during probe. 515 @raises: AutoservRepairError if servo couldn't probe the usb dev path 516 (servo.probe_host_usb_dev() returns empty string), or the dev path is 517 not valid block device to servohost. 518 """ 519 logging.info('Validating image usbkey on servo.') 520 try: 521 usb_dev = self._servo.probe_host_usb_dev() 522 except Exception as e: 523 # We don't want any unexpected or transient servo communicating 524 # failure block usb repair, so capture all errors here. 525 logging.error(e, exc_info=True) 526 logging.error('Unexpected error occurred on get usbkey dev path,' 527 ' skipping usbkey validation.') 528 return '' 529 530 if usb_dev: 531 # probe_host_usb_dev() sometimes return stale record, 532 # so we need to make sure the path exists in fdisk. 533 validate_cmd = 'fdisk -l %s' % usb_dev 534 try: 535 resp = self.run(validate_cmd, ignore_status=True, timeout=30) 536 if resp.exit_status == 0: 537 return usb_dev 538 logging.error('%s is reported from "image_usbkey_dev" control' 539 ' but not detected by fdisk!', usb_dev) 540 except error.AutoservRunError as e: 541 if 'Timeout encountered' in str(e): 542 logging.warning('Timeout encountered during fdisk run,' 543 ' skipping usbkey validation.') 544 return '' 545 raise 546 547 raise hosts.AutoservRepairError( 548 'No usbkey detected on servo, the usbkey may be either missing' 549 ' or broken. Please replace usbkey on the servo and retry.', 550 'missing usbkey') 551 552 def is_ec_supported(self): 553 """Check if ec is supported on the servo_board""" 554 if self.servo_board: 555 if self._ec_supported is not None: 556 return self._ec_supported 557 try: 558 frm_config = config.Config(self.servo_board, self.servo_model) 559 self._ec_supported = getattr(frm_config, 'chrome_ec', False) 560 return self._ec_supported 561 except Exception as e: 562 logging.error( 563 'Unexpected error when read from firmware' 564 ' configs; %s', e) 565 else: 566 logging.debug('Cannot detect if DUT has EC as board unknown.') 567 return False 568 569 def validate_image_usbkey(self): 570 """This method first validate if there is a recover usbkey on servo 571 that accessible to servohost, and second check if a ChromeOS image is 572 already on the usb drive and return the image_name so we can avoid 573 unnecessary download and flash to the recover usbkey on servo. 574 575 Please note that, there is special error handling logic here: 576 1. If unexpected error happens, we return empty string. So repair 577 actions will not get blocked. 578 2. If no working usbkey present on servo, but no errors, we'll raise 579 AutoservRepairError here. 580 581 @returns: image_name on the usbkey, e.g. nami-release/R82.10138.0.0, 582 or empty string if no test image detected, or unexpected 583 error occurred. 584 @raises: AutoservRepairError if the usbkey is not detected on servo. 585 """ 586 usb_dev = self._probe_and_validate_usb_dev() 587 if usb_dev: 588 return self.get_image_name_from_usbkey(usb_dev) 589 else: 590 return '' 591 592 def repair(self, silent=False): 593 """Attempt to repair servo host. 594 595 @param silent If true, suppress logging in `status.log`. 596 """ 597 message = 'Beginning repair for servo host %s port %s serial %s' 598 message %= (self.hostname, self.servo_port, self.servo_serial) 599 self.record('INFO', None, None, message) 600 try: 601 self._repair_strategy.repair(self, silent) 602 self._servo_state = servo_constants.SERVO_STATE_WORKING 603 self.record('INFO', None, None, 604 'ServoHost repair set servo_state as WORKING') 605 # If target is a labstation then try to withdraw any existing 606 # reboot request created by this servo because it passed repair. 607 if self.is_labstation(): 608 self.withdraw_reboot_request() 609 except Exception as e: 610 if not self.is_localhost(): 611 self._servo_state = self.determine_servo_state() 612 self.record('INFO', None, None, 613 'ServoHost repair set servo_state as %s' 614 % self._servo_state) 615 if self._is_critical_error(e): 616 self.disconnect_servo() 617 self.stop_servod() 618 raise 619 620 def _is_critical_error(self, error): 621 if (isinstance(error, hosts.AutoservVerifyDependencyError) 622 and not error.is_critical()): 623 logging.warning('Non-critical verify failure(s) detected during' 624 ' verify/repair servo, servo connection will' 625 ' still be up but may not be fully functional.' 626 ' Some repair actions and servo dependent' 627 ' tests may not run.') 628 return False 629 logging.info( 630 'Critical verify failure(s) detected during repair/verify ' 631 'servo. Disconnecting servo and running `stop servod`, all' 632 ' repair actions and tests that depends on servo will not ' 633 'run.') 634 return True 635 636 def get_servo(self): 637 """Get the cached servo.Servo object. 638 639 @return: a servo.Servo object. 640 @rtype: autotest_lib.server.cros.servo.servo.Servo 641 """ 642 return self._servo 643 644 def request_reboot(self): 645 """Request servohost to be rebooted when it's safe to by touch a file. 646 """ 647 logging.debug('Request to reboot servohost %s has been created by ' 648 'servo with port # %s', self.hostname, self.servo_port) 649 self.run('touch %s' % self._reboot_file, ignore_status=True) 650 651 def withdraw_reboot_request(self): 652 """Withdraw a servohost reboot request if exists by remove the flag 653 file. 654 """ 655 logging.debug('Withdrawing request to reboot servohost %s that created' 656 ' by servo with port # %s if exists.', 657 self.hostname, self.servo_port) 658 self.run('rm -f %s' % self._reboot_file, ignore_status=True) 659 660 def start_servod(self, quick_startup=False): 661 """Start the servod process on servohost. 662 """ 663 # Skip if running on the localhost.(crbug.com/1038168) 664 if self.is_localhost(): 665 logging.debug("Servohost is a localhost, skipping start servod.") 666 return 667 668 if self.is_containerized_servod(): 669 return self.start_containerized_servod() 670 671 cmd = 'start servod' 672 if self.servo_board: 673 cmd += ' BOARD=%s' % self.servo_board 674 if self.servo_model: 675 cmd += ' MODEL=%s' % self.servo_model 676 else: 677 logging.warning('Board for DUT is unknown; starting servod' 678 ' assuming a pre-configured board.') 679 680 cmd += ' PORT=%d' % self.servo_port 681 if self.servo_serial: 682 cmd += ' SERIAL=%s' % self.servo_serial 683 684 # Start servod with dual_v4 based on servo_setup. 685 if self.is_dual_setup(): 686 cmd += ' DUAL_V4=1' 687 688 # Start servod with CONFIG=cr50.xml which required for some pools. 689 if self._require_cr50_servod_config(): 690 cmd += ' CONFIG=cr50.xml' 691 692 if self.servo_recovery == True: 693 cmd += ' REC_MODE=1' 694 695 # Adding customized args if any. 696 if self.additional_servod_args: 697 cmd += ' ' + self.additional_servod_args 698 699 # Remove the symbolic links from the logs. This helps ensure that 700 # a failed servod instantiation does not cause us to grab old logs 701 # by mistake. 702 self.remove_latest_log_symlinks() 703 self.run(cmd, timeout=60) 704 705 # There's a lag between when `start servod` completes and when 706 # the _ServodConnectionVerifier trigger can actually succeed. 707 # The call to time.sleep() below gives time to make sure that 708 # the trigger won't fail after we return. 709 710 # Normally servod on servo_v3 and labstation take ~10 seconds to ready, 711 # But in the rare case all servo on a labstation are in heavy use they 712 # may take ~30 seconds. So the timeout value will double these value, 713 # and we'll try quick start up when first time initialize servohost, 714 # and use standard start up timeout in repair. 715 if quick_startup: 716 timeout = servo_constants.SERVOD_QUICK_STARTUP_TIMEOUT 717 else: 718 timeout = servo_constants.SERVOD_STARTUP_TIMEOUT 719 logging.debug('Wait %s seconds for servod process fully up.', timeout) 720 time.sleep(timeout) 721 # Cache the initial instance timestamp to check against servod restarts 722 self._initial_instance_ts = self.get_instance_logs_ts() 723 724 def stop_servod(self): 725 """Stop the servod process on servohost. 726 """ 727 # Skip if running on the localhost.(crbug.com/1038168) 728 if self.is_localhost(): 729 logging.debug("Servohost is a localhost, skipping stop servod.") 730 return 731 732 if self.is_containerized_servod(): 733 # TODO(gregorynisbet): Remove this message in 2022Q2. 734 logging.debug("ServoHost: Detected containerized servod.") 735 remove_container = True 736 if DOCKER_SERVOD_DEBUG_MODE == '1' or os.path.exists( 737 SERVOD_DEBUG_FLAG): 738 remove_container = False 739 self.stop_containerized_servod(remove_container=remove_container) 740 return 741 742 logging.debug('Stopping servod on port %s', self.servo_port) 743 self.run('stop servod PORT=%d' % self.servo_port, 744 timeout=60, ignore_status=True) 745 logging.debug('Wait %s seconds for servod process fully teardown.', 746 servo_constants.SERVOD_TEARDOWN_TIMEOUT) 747 time.sleep(servo_constants.SERVOD_TEARDOWN_TIMEOUT) 748 749 def wait_for_init_servod_in_container(self, container): 750 """Waits for servod process to be ready to listen inside container.""" 751 ready_output = "Instance associated with id %s ready" % self.servo_port 752 if not container: 753 logging.debug("Container object is None.") 754 return False 755 try: 756 # Executes servodtool command to wait for servod to be active. 757 exit_code, output = container.exec_run( 758 cmd="servodtool instance wait-for-active -p %s" % 759 self.servo_port, 760 stdout=True) 761 # b/217780680, Make this compatible with python3, 762 if isinstance(output, bytes): 763 output = output.decode(errors='replace') 764 if exit_code != 0 or ready_output not in output: 765 logging.debug( 766 'Failed to start servod process inside container,' 767 'exit_code=%s, output=%s.', exit_code, output) 768 return False 769 except docker.errors.APIError as e: 770 logging.error('%s', e) 771 return True 772 773 def start_containerized_servod(self, with_servod=True): 774 """Start the servod process on servohost.""" 775 logging.info("Starting servod container %s.", 776 self.servod_container_name) 777 client = docker_utils.get_docker_client() 778 logging.debug("Docker deamon ping %s", client.ping()) 779 labels = {'WITH_SERVOD': str(with_servod)} 780 try: 781 if self.is_up(with_servod=with_servod): 782 logging.warning("Container already exists - not starting") 783 return 784 logging.info( 785 'Servod container either does not exist or is not running.' 786 ) 787 self.stop_containerized_servod() 788 except docker.errors.APIError: 789 # Container exists but is not running 790 logging.info("Cleanup of non functional container.") 791 self.stop_containerized_servod() 792 793 label = os.environ.get("SERVOD_CONTAINER_LABEL", "release") 794 registry = os.environ.get("REGISTRY_URI", SERVOD_CONTAINER_IMAGE_PATH) 795 image = "%s/servod:%s" % (registry, label) 796 logging.info("Servod container image: %s", image) 797 798 try: 799 client.images.pull(image) 800 except docker.errors.APIError: 801 logging.exception("Failed to pull servod container image.") 802 803 environment = [ 804 "BOARD=%s" % self.servo_board, 805 "MODEL=%s" % self.servo_model, 806 "SERIAL=%s" % self.servo_serial, 807 "PORT=%s" % self.servo_port, 808 ] 809 # Start servod with dual_v4 based on servo_setup. 810 if self.is_dual_setup(): 811 environment.append("DUAL_V4=1") 812 # Start servod with CONFIG=cr50.xml which required for some pools. 813 if self._require_cr50_servod_config(): 814 environment.append("CONFIG=cr50.xml") 815 if self.servo_recovery == True: 816 environment.append("REC_MODE=1") 817 818 container_network = os.environ.get("DOCKER_DEFAULT_NETWORK", None) 819 # In case the network environment is not set, fallback to default network 820 # for moblab or satlab based on the TLE. 821 if not container_network: 822 container_network = "default_moblab" 823 if 'drone' in docker_utils.get_running_containers(client=client): 824 container_network = "default_satlab" 825 826 logging.info('Servod container will use %s network', container_network) 827 logging.info('Servod container environment: %s', environment) 828 try: 829 start_cmds = ["bash", "/start_servod.sh"] 830 if not with_servod: 831 # In some cases we do not need container without running servod. 832 start_cmds = ["tail", "-f", "/dev/null"] 833 logging.debug( 834 'Attempting to start Servod container without servod') 835 container = client.containers.run( 836 image, 837 remove=False, 838 privileged=True, 839 labels=labels, 840 name=self.servod_container_name, 841 hostname=self.servod_container_name, 842 network=container_network, 843 cap_add=["NET_ADMIN"], 844 detach=True, 845 volumes=[ 846 "/dev:/dev", 847 "%s_log:/var/log/servod_%s/" % 848 (self.servod_container_name, self.servo_port) 849 ], 850 environment=environment, 851 command=start_cmds, 852 ) 853 # Probing servod ready state fails. 854 if with_servod: 855 current_time = 0 856 while not self.wait_for_init_servod_in_container( 857 container 858 ) and current_time <= servo_constants.SERVOD_STARTUP_TIMEOUT: 859 time.sleep(10) 860 current_time += 10 861 862 if not self.wait_for_init_servod_in_container(container): 863 logging.info( 864 'Servod process is not up within the servod container after %s seconds.' 865 % servo_constants.SERVOD_STARTUP_TIMEOUT) 866 else: 867 logging.info( 868 'Servod process is up within the servod container after %s seconds.' 869 % current_time) 870 else: 871 logging.info( 872 "Servod container %s up and running without servod process.", 873 self.servod_container_name) 874 875 except docker.errors.ContainerError as e: 876 logging.exception("Failed to start servod container. %s", e) 877 raise 878 except docker.errors.ImageNotFound: 879 logging.exception("Servod container image %s not found.", image) 880 881 def stop_containerized_servod(self, remove_container=True): 882 """Stop the container running servod.""" 883 logging.info("Stopping servod container %s.", 884 self.servod_container_name) 885 client = docker_utils.get_docker_client() 886 try: 887 cont = client.containers.get(self.servod_container_name) 888 except docker.errors.NotFound: 889 logging.info("Servod container %s not found no need to stop it.", 890 self.servod_container_name) 891 except docker.errors.APIError: 892 logging.exception( 893 "Stopping servod container %s caused a docker error.", 894 self.servod_container_name) 895 else: 896 if remove_container == True: 897 cont.remove(force=True) 898 logging.debug('Servod container instance removed') 899 900 def restart_servod(self, quick_startup=False): 901 """Restart the servod process on servohost. 902 """ 903 self.stop_servod() 904 self.start_servod(quick_startup) 905 906 def _process_servodtool_error(self, response): 907 """Helper function to handle non-zero servodtool response. 908 """ 909 if re.search(servo_constants.ERROR_MESSAGE_USB_HUB_NOT_COMPATIBLE, 910 self._to_str(response.stdout)): 911 logging.error('The servo is not plugged on a usb hub that supports' 912 ' power-cycle!') 913 # change the flag so we can update this label in later process. 914 self.smart_usbhub = False 915 return 916 917 if re.search( 918 servo_constants.ERROR_MESSAGE_DEVICE_NOT_FOUND % 919 self.servo_serial, self._to_str(response.stdout)): 920 logging.error('No servo with serial %s found!', self.servo_serial) 921 return 922 923 logging.error('Unexpected error occurred from usbhub control, please' 924 ' file a bug and inform chrome-fleet-software@ team!') 925 926 def get_main_servo_usb_path(self): 927 """Helper function to collect current usb-path to main servo. 928 929 The usb-path is path to the folder where usb-device was enumerated. 930 If fail then will return an empty string (''). 931 932 @returns: string, usb-path to the main servo device. 933 e.g.: '/sys/bus/usb/devices/1-6.1.3.1' 934 """ 935 # TODO remove try-except when fix crbug.com/1087964 936 try: 937 cmd = 'servodtool device -s %s usb-path' % self.servo_serial 938 resp = self.run(cmd, ignore_status=True, timeout=30) 939 except Exception as e: 940 # Here we catch only timeout errors. 941 # Other errors is filtered by ignore_status=True 942 logging.debug('Attempt to get servo usb-path failed due to ' 943 'timeout; %s', e) 944 return '' 945 if resp.exit_status != 0: 946 self._process_servodtool_error(resp) 947 return '' 948 usb_path = self._to_str(resp.stdout.strip()) 949 950 logging.info('Usb path of servo %s is %s', self.servo_serial, usb_path) 951 return usb_path 952 953 def _get_servo_usb_devnum(self): 954 """Helper function to collect current usb devnum of servo.""" 955 usb_path = self.get_main_servo_usb_path() 956 if not usb_path: 957 return '' 958 resp = self.run('cat %s/devnum' % usb_path, ignore_status=True) 959 if resp.exit_status != 0: 960 self._process_servodtool_error(resp) 961 return '' 962 return self._to_str(resp.stdout.strip()) 963 964 def reboot_servo_v3_on_need(self): 965 """Check and reboot servo_v3 based on below conditions. 966 1. If there is an update pending on reboot. 967 2. Servo_v3 has been up for more than 96 hours. 968 """ 969 if self.get_board() != 'beaglebone_servo': 970 logging.info('Servo reboot is only applicable for servo V3.') 971 return 972 973 update_pending_reboot = (self._check_update_status() == 974 self.UPDATE_STATE.PENDING_REBOOT) 975 uptime_hours = float(self.check_uptime())/3600 976 logging.info('Uptime of servo_v3: %s hour(s)', uptime_hours) 977 long_up_time = uptime_hours > 96 978 979 # Skip reboot if neither condition are met. 980 if not (update_pending_reboot or long_up_time): 981 return 982 983 if update_pending_reboot: 984 message = 'Starting reboot servo_v3 because an update is pending.' 985 reboot_method = self._post_update_reboot 986 elif long_up_time: 987 message = 'Starting reboot servo_v3 because uptime > 96 hours.' 988 reboot_method = self._servo_host_reboot 989 self.record('INFO', None, None, message) 990 logging.info(message) 991 try: 992 reboot_method() 993 message = 'Servo_v3 reboot completed successfully.' 994 except Exception as e: 995 logging.debug("Fail to reboot servo_v3; %s", e) 996 message = ('Servo_v3 reboot failed, please check debug log ' 997 'for details.') 998 logging.info(message) 999 self.record('INFO', None, None, message) 1000 1001 def _reset_servo(self): 1002 logging.info('Resetting servo through smart usbhub.') 1003 # TODO remove try-except when fix crbug.com/1087964 1004 try: 1005 resp = self.run('servodtool device -s %s power-cycle' % 1006 self.servo_serial, ignore_status=True, 1007 timeout=30) 1008 if resp.exit_status != 0: 1009 self._process_servodtool_error(resp) 1010 return False 1011 except Exception as e: 1012 # Here we catch only timeout errors. 1013 # Other errors is filtered by ignore_status=True 1014 logging.debug('Attempt to reset servo failed due to timeout;' 1015 ' %s', e) 1016 return False 1017 1018 logging.debug('Wait %s seconds for servo to come back from reset.', 1019 servo_constants.SERVO_RESET_TIMEOUT_SECONDS) 1020 time.sleep(servo_constants.SERVO_RESET_TIMEOUT_SECONDS) 1021 # change the flag so we can update this label in later process. 1022 self.smart_usbhub = True 1023 return True 1024 1025 def reset_servo(self): 1026 """Reset(power-cycle) the servo via smart usbhub. 1027 """ 1028 if not self.is_labstation(): 1029 logging.info('Servo reset is not applicable to servo_v3.') 1030 return 1031 1032 pre_reset_devnum = self._get_servo_usb_devnum() 1033 logging.info('Servo usb devnum before reset: %s', pre_reset_devnum) 1034 result = self._reset_servo() 1035 if not result: 1036 message = ('Failed to reset servo with serial: %s. (Please ignore' 1037 ' this error if the DUT is not connected to a smart' 1038 ' usbhub).' % self.servo_serial) 1039 logging.warning(message) 1040 self.record('INFO', None, None, message) 1041 return 1042 1043 post_reset_devnum = self._get_servo_usb_devnum() 1044 logging.info('Servo usb devnum after reset: %s', post_reset_devnum) 1045 if not (pre_reset_devnum and post_reset_devnum): 1046 message = ('Servo reset completed but unable to verify' 1047 ' devnum change!') 1048 elif pre_reset_devnum != post_reset_devnum: 1049 message = ('Reset servo with serial %s completed successfully!' 1050 % self.servo_serial) 1051 else: 1052 message = 'Servo reset completed but devnum is still not changed!' 1053 logging.info(message) 1054 self.record('INFO', None, None, message) 1055 1056 def _extract_compressed_logs(self, logdir, relevant_files): 1057 """Decompress servod logs in |logdir|. 1058 1059 @param logdir: directory containing compressed servod logs. 1060 @param relevant_files: list of files in |logdir| to consider. 1061 1062 @returns: tuple, (tarfiles, files) where 1063 tarfiles: list of the compressed filenames that have been 1064 extracted and deleted 1065 files: list of the uncompressed files that were generated 1066 """ 1067 # For all tar-files, first extract them to the directory, and 1068 # then let the common flow handle them. 1069 tarfiles = [cf for cf in relevant_files if 1070 cf.endswith(self.COMPRESSION_SUFFIX)] 1071 files = [] 1072 for f in tarfiles: 1073 norm_name = os.path.basename(f)[:-len(self.COMPRESSION_SUFFIX)] 1074 with tarfile.open(f) as tf: 1075 # Each tarfile has only one member, as 1076 # that's the compressed log. 1077 member = tf.members[0] 1078 # Manipulate so that it only extracts the basename, and not 1079 # the directories etc. 1080 member.name = norm_name 1081 files.append(os.path.join(logdir, member.name)) 1082 tf.extract(member, logdir) 1083 # File has been extracted: remove the compressed file. 1084 os.remove(f) 1085 return tarfiles, files 1086 1087 def _extract_mcu_logs(self, log_subdir): 1088 """Extract MCU (EC, Cr50, etc) console output from servod debug logs. 1089 1090 Using the MCU_EXTRACTOR regex (above) extract and split out MCU console 1091 lines from the logs to generate invidiual console logs e.g. after 1092 this method, you can find an ec.txt and servo_v4.txt in |log_dir| if 1093 those MCUs had any console input/output. 1094 1095 @param log_subdir: directory with log.DEBUG.txt main servod debug logs. 1096 """ 1097 # Extract the MCU for each one. The MCU logs are only in the .DEBUG 1098 # files 1099 mcu_lines_file = os.path.join(log_subdir, 'log.DEBUG.txt') 1100 if not os.path.exists(mcu_lines_file): 1101 logging.info('No DEBUG logs found to extract MCU logs from.') 1102 return 1103 mcu_files = {} 1104 mcu_file_template = '%s.txt' 1105 with open(mcu_lines_file, 'r') as f: 1106 for line in f: 1107 match = self.MCU_EXTRACTOR.match(line) 1108 if match: 1109 mcu = match.group(self.MCU_GROUP).lower() 1110 # The |mcu| might contain a '/' in it. Replace it with '.' 1111 # to avoid generating bad filepaths. 1112 mcu = mcu.replace('/', '.') 1113 line = match.group(self.LINE_GROUP) 1114 if mcu not in mcu_files: 1115 mcu_file = os.path.join(log_subdir, 1116 mcu_file_template % mcu) 1117 mcu_files[mcu] = open(mcu_file, 'a') 1118 fd = mcu_files[mcu] 1119 fd.write(line + '\n') 1120 for f in mcu_files: 1121 mcu_files[f].close() 1122 1123 def remove_latest_log_symlinks(self): 1124 """Remove the conveninence symlinks 'latest' servod logs.""" 1125 symlink_wildcard = '%s/latest*' % self.remote_log_dir 1126 cmd = 'rm ' + symlink_wildcard 1127 self.run(cmd, stderr_tee=None, ignore_status=True) 1128 1129 def probe_servod_restart(self, instance_ts, outdir): 1130 """Grab servod logs from previous instances if part of this session. 1131 1132 If since the last time this host called start_servod() servod crashed 1133 and restarted, this helper finds those logs as well, and stores them 1134 with the |OLD_LOG_SUFFIX| to investigate if necessary. 1135 1136 It also issues a panicinfo command to servo devices after the restart 1137 to try and collect reboot information for debugging. 1138 1139 @param instance_ts: the log timestamp that the current instance uses 1140 @param outdir: directory to create a subdirectory into to place the 1141 servod logs into. 1142 """ 1143 if self._initial_instance_ts is None: 1144 logging.info('No log timestamp grabbed successfully on servod ' 1145 'startup. Cannot check device restarts. Ignoring.') 1146 return 1147 if instance_ts == self._initial_instance_ts: 1148 logging.debug('Servod appears to have run without restarting') 1149 return 1150 # Servod seems to have restarted (at least once). |_initial_instance_ts| 1151 # is the first timestamp, and instance_ts is the current timestamp. Find 1152 # all timestamps in between them, and grab the logs for each. 1153 tss = self._find_instance_timestamps_between(self._initial_instance_ts, 1154 instance_ts) 1155 logging.info('Servod has restarted %d times between the start and the ' 1156 'end of this servo_host.', len(tss)) 1157 logging.info('This might be an issue. Will extract all logs from each ' 1158 'instance.') 1159 logging.info('Logs that are not the currently running (about to turn ' 1160 'down) instance are maked with a .%s in their folder.', 1161 self.OLD_LOG_SUFFIX) 1162 for ts in tss: 1163 self.get_instance_logs(ts, outdir, old=True) 1164 # Lastly, servod has restarted due to a potential issue. Try to get 1165 # panic information from servo micro and servo v4 for the current logs. 1166 # This can only happen if the |_servo| attribute is initialized. 1167 if self._servo: 1168 for mcu in ['servo_micro', 'servo_v4', 'servo_v4p1']: 1169 ctrl = '%s_uart_cmd' % mcu 1170 if self._servo.has_control(ctrl): 1171 logging.info('Trying to retrieve %r panicinfo into logs', 1172 mcu) 1173 try: 1174 self._servo.set_nocheck(ctrl, 'panicinfo') 1175 except error.TestFail as e: 1176 logging.error('Failed to generate panicinfo for %r ' 1177 'logs. %s', mcu, str(e)) 1178 1179 def _find_instance_timestamps_between(self, start_ts, end_ts): 1180 """Find all log timestamps between [start_ts, end_ts). 1181 1182 @param start_ts: str, earliest log timestamp of interest 1183 @param end_ts: str, latest log timestamp of interest 1184 1185 @returns: list, all timestamps between start_ts and end_ts, end_ts 1186 exclusive, on the servo_host. An empty list on errors 1187 """ 1188 # Simply get all timestamp, and then sort and remove 1189 cmd = 'ls %s' % self.remote_log_dir 1190 res = self.run(cmd, stderr_tee=None, ignore_status=True) 1191 if res.exit_status != 0: 1192 # Here we failed to find anything. 1193 logging.info('Failed to find remote servod logs. Ignoring.') 1194 return [] 1195 logfiles = self._to_str(res.stdout.strip().split()) 1196 timestamps = set() 1197 for logfile in logfiles: 1198 ts_match = self.TS_EXTRACTOR.match(logfile) 1199 if not ts_match: 1200 # Simply ignore files that fail the check. It might be the 1201 # 'latest' symlinks or random files. 1202 continue 1203 timestamps.add(ts_match.group(self.TS_GROUP)) 1204 # At this point we have all unique timestamps. 1205 timestamps = sorted(timestamps) 1206 for ts in [start_ts, end_ts]: 1207 if ts not in timestamps: 1208 logging.error('Timestamp %r not in servod logs. Cannot query ' 1209 'for timestamps in between %r and %r', ts, 1210 start_ts, end_ts) 1211 return [] 1212 return timestamps[timestamps.index(start_ts):timestamps.index(end_ts)] 1213 1214 def get_instance_logs_ts(self): 1215 """Retrieve the currently running servod instance's log timestamp 1216 1217 @returns: str, timestamp for current instance, or None on failure 1218 """ 1219 # First, extract the timestamp. This cmd gives the real filename of 1220 # the latest aka current log file. 1221 cmd = ('if [ -f %(dir)s/latest.DEBUG ];' 1222 'then realpath %(dir)s/latest.DEBUG;' 1223 'elif [ -f %(dir)s/latest ];' 1224 'then realpath %(dir)s/latest;' 1225 'else exit %(code)d;' 1226 'fi' % {'dir': self.remote_log_dir, 1227 'code': self.NO_SYMLINKS_CODE}) 1228 res = self.run(cmd, stderr_tee=None, ignore_status=True) 1229 if res.exit_status != 0: 1230 if res.exit_status == self.NO_SYMLINKS_CODE: 1231 logging.warning('servod log latest symlinks not found. ' 1232 'This is likely due to an error starting up ' 1233 'servod. Ignoring..') 1234 else: 1235 logging.warning('Failed to find servod logs on servo host.') 1236 logging.warning(res.stderr.strip()) 1237 return None 1238 fname = os.path.basename(self._to_str(res.stdout.strip())) 1239 # From the fname, ought to extract the timestamp using the TS_EXTRACTOR 1240 if type(fname) == type(b' '): 1241 fname = fname.decode("utf-8") 1242 ts_match = self.TS_EXTRACTOR.match(fname) 1243 if not ts_match: 1244 logging.warning('Failed to extract timestamp from servod log file ' 1245 '%r. Skipping. The servo host is using outdated ' 1246 'servod logging and needs to be updated.', fname) 1247 return None 1248 return ts_match.group(self.TS_GROUP) 1249 1250 def get_servohost_logs(self, outdir): 1251 """Get logs that can help debugging servo/servod problem from 1252 the servohost 1253 1254 @param outdir: directory to create a subdirectory into to place the 1255 servod logs into. 1256 """ 1257 log_dir = os.path.join(outdir, 'servohost_%s' % self.hostname) 1258 if os.path.isdir(log_dir): 1259 # In multi-DUTs testing, each DUTs will may their own servohost 1260 # instance, where could cause duplicate efforts if they share a 1261 # same servohost, so we can just skip the collect if the log 1262 # dir already exists. 1263 logging.info( 1264 'Skip dmesg and messages logs collecting as %s' 1265 ' already exists.', log_dir) 1266 return 1267 logging.info('Collecting dmesg and messages from servohost %s', 1268 self.hostname) 1269 os.mkdir(log_dir) 1270 logging.info('Saving servohost logs to %s.', log_dir) 1271 # First collect dmesg from the servohost. 1272 crashcollect.collect_command(self, 'dmesg -H', 1273 os.path.join(log_dir, 'dmesg')) 1274 # Collect messages log from the servohost. 1275 if not self.is_containerized_servod(): 1276 try: 1277 self.get_file('/var/log/messages', log_dir, try_rsync=False) 1278 except error.AutoservRunError: 1279 logging.warning( 1280 'Failed to collect messages log from servohost.') 1281 1282 def get_servod_startup_log(self, outdir): 1283 """Get servod start_up log, this log is available even servod was 1284 not started successfully. 1285 1286 @param outdir: directory to create a subdirectory into to place the 1287 servod logs into. 1288 """ 1289 if self.is_containerized_servod(): 1290 return 1291 log_dir = os.path.join(outdir, 'servod_startup_%s' % self.servo_port) 1292 os.mkdir(log_dir) 1293 start_up_log = '/var/log/servod_%s.STARTUP.log' % self.servo_port 1294 try: 1295 self.get_file(start_up_log, log_dir, try_rsync=False) 1296 except error.AutoservRunError: 1297 logging.warning('Failed to collect servod start up log' 1298 ' from servohost.') 1299 1300 def get_instance_logs(self, instance_ts, outdir, old=False): 1301 """Collect all logs with |instance_ts| and dump into a dir in |outdir| 1302 1303 This method first collects all logs on the servo_host side pertaining 1304 to this servod instance (port, instatiation). It glues them together 1305 into combined log.[level].txt files and extracts all available MCU 1306 console I/O from the logs into individual files e.g. servo_v4.txt 1307 1308 All the output can be found in a directory inside |outdir| that 1309 this generates based on |LOG_DIR|, the servod port, and the instance 1310 timestamp on the servo_host side. 1311 1312 @param instance_ts: log timestamp to grab logfiles for 1313 @param outdir: directory to create a subdirectory into to place the 1314 servod logs into. 1315 @param old: bool, whether to append |OLD_LOG_SUFFIX| to output dir 1316 """ 1317 # Create the local results log dir. 1318 log_dir = os.path.join(outdir, '%s_%s.%s' % (self.LOG_DIR, 1319 str(self.servo_port), 1320 instance_ts)) 1321 if old: 1322 log_dir = '%s.%s' % (log_dir, self.OLD_LOG_SUFFIX) 1323 logging.info('Saving servod logs to %r.', log_dir) 1324 os.mkdir(log_dir) 1325 # Now, get all files with that timestamp. 1326 cmd = 'find %s -maxdepth 1 -name "log.%s*"' % (self.remote_log_dir, 1327 instance_ts) 1328 res = self.run(cmd, stderr_tee=None, ignore_status=True) 1329 files = self._to_str(res.stdout.strip()).split() 1330 try: 1331 if self.is_containerized_servod(): 1332 client = docker_utils.get_docker_client() 1333 container = client.containers.get(self.servod_container_name) 1334 for f in files: 1335 file_stream, stat = container.get_archive(f) 1336 tf = tempfile.NamedTemporaryFile(delete=False) 1337 for block in file_stream: 1338 tf.write(block) 1339 tf.close() 1340 pw_tar = tarfile.TarFile(tf.name) 1341 pw_tar.extractall(log_dir) 1342 os.remove(tf.name) 1343 else: 1344 self.get_file(files, log_dir, try_rsync=False) 1345 1346 if not os.listdir(log_dir): 1347 logging.info('No servod logs retrieved. Ignoring, and removing ' 1348 '%r again.', log_dir) 1349 os.rmdir(log_dir) 1350 return 1351 except docker.errors.NotFound: 1352 logging.info("Servod container %s not found no need to stop it.", 1353 self.hostname) 1354 except error.AutoservRunError as e: 1355 result = e.result_obj 1356 if result.exit_status != 0: 1357 stderr = result.stderr.strip() 1358 logging.warning("Couldn't retrieve servod logs. Ignoring: %s", 1359 stderr or '\n%s' % result) 1360 # Remove the log_dir as nothing was added to it. 1361 os.rmdir(log_dir) 1362 return 1363 local_files = [os.path.join(log_dir, f) for f in os.listdir(log_dir)] 1364 # TODO(crrev.com/c/1793030): remove no-level case once CL is pushed 1365 for level_name in ('DEBUG', 'INFO', 'WARNING', ''): 1366 # Create the joint files for each loglevel. i.e log.DEBUG 1367 joint_file = self.JOINT_LOG_PREFIX 1368 if level_name: 1369 joint_file = '%s.%s' % (self.JOINT_LOG_PREFIX, level_name) 1370 # This helps with some online tools to avoid complaints about an 1371 # unknown filetype. 1372 joint_file = joint_file + '.txt' 1373 joint_path = os.path.join(log_dir, joint_file) 1374 files = [f for f in local_files if level_name in f] 1375 if not files: 1376 # TODO(crrev.com/c/1793030): remove no-level case once CL 1377 # is pushed 1378 continue 1379 # Extract compressed logs if any. 1380 compressed, extracted = self._extract_compressed_logs(log_dir, 1381 files) 1382 files = list(set(files) - set(compressed)) 1383 files.extend(extracted) 1384 # Need to sort. As they all share the same timestamp, and 1385 # loglevel, the index itself is sufficient. The highest index 1386 # is the oldest file, therefore we need a descending sort. 1387 def sortkey(f, level=level_name): 1388 """Custom sortkey to sort based on rotation number int.""" 1389 if f.endswith(level_name): return 0 1390 return int(f.split('.')[-1]) 1391 1392 files.sort(reverse=True, key=sortkey) 1393 # Just rename the first file rather than building from scratch. 1394 os.rename(files[0], joint_path) 1395 with open(joint_path, 'a') as joint_f: 1396 for logfile in files[1:]: 1397 # Transfer the file to the joint file line by line. 1398 with open(logfile, 'r') as log_f: 1399 for line in log_f: 1400 joint_f.write(line) 1401 # File has been written over. Delete safely. 1402 os.remove(logfile) 1403 # Need to remove all files form |local_files| so we don't 1404 # analyze them again. 1405 local_files = list(set(local_files) - set(files) - set(compressed)) 1406 # Lastly, extract MCU logs from the joint logs. 1407 self._extract_mcu_logs(log_dir) 1408 1409 def _lock(self): 1410 """lock servohost by touching a file. 1411 """ 1412 logging.debug('Locking servohost %s by touching %s file', 1413 self.hostname, self._lock_file) 1414 self.run('touch %s' % self._lock_file, ignore_status=True) 1415 self._is_locked = True 1416 1417 def _unlock(self): 1418 """Unlock servohost by removing the lock file. 1419 """ 1420 logging.debug('Unlocking servohost by removing %s file', 1421 self._lock_file) 1422 self.run('rm %s' % self._lock_file, ignore_status=True) 1423 self._is_locked = False 1424 1425 def close(self): 1426 """Close the associated servo and the host object.""" 1427 # NOTE: throughout this method there are multiple attempts to catch 1428 # all errors. This is WAI as log grabbing should not fail tests. 1429 # However, the goal is to catch and handle/process all errors, thus 1430 # we print the traceback and ask for a bug. 1431 if self._closed: 1432 logging.debug('ServoHost is already closed.') 1433 return 1434 1435 # Only attempt ssh related actions if servohost is sshable. We call 1436 # check_cached_up_status() first because it's lightweighted and return 1437 # much faster in the case servohost is down, however, we still want 1438 # to call is_up() later since check_cached_up_status() is ping based check 1439 # and not guarantee the servohost is sshable. 1440 servo_host_ready = self.check_cached_up_status() and self.is_up() 1441 1442 # If the dockerized servod is in used, we can start a new servod container 1443 # with out the servod process for log collection. 1444 if (not self.is_localhost() and not servo_host_ready 1445 and self.is_containerized_servod()): 1446 logging.info( 1447 'Start servod container without servod for log collection.' 1448 ) 1449 self.start_containerized_servod(with_servod=False) 1450 servo_host_ready = True 1451 1452 # TODO(crbug.com/1011516): once enabled, remove the check against 1453 # localhost and instead check against log-rotiation enablement. 1454 should_collect_log = (servo_host_ready and self.job 1455 and not self.is_localhost()) 1456 if should_collect_log: 1457 instance_ts = self.get_instance_logs_ts() 1458 else: 1459 logging.info('Servohost is down, will skip servod log collecting.') 1460 instance_ts = None 1461 servod_logs_available = instance_ts is not None 1462 if servod_logs_available: 1463 # Probe whether there was a servod restart, and grab those old 1464 # logs as well. 1465 try: 1466 self.probe_servod_restart(instance_ts, self.job.resultdir) 1467 except (error.AutoservRunError, error.TestFail) as e: 1468 logging.info('Failed to grab servo logs due to: %s. ' 1469 'This error is forgiven.', str(e)) 1470 except Exception as e: 1471 logging.error('Unexpected error probing for old logs. %s. ' 1472 'Forgiven. Please file a bug and fix or catch ' 1473 'in log probing function', str(e), 1474 exc_info=True) 1475 if self._servo: 1476 outdir = None if not self.job else self.job.resultdir 1477 # In some cases when we run as lab-tools, the job object is None. 1478 self._servo.close(outdir) 1479 try: 1480 if should_collect_log: 1481 self.get_servod_startup_log(self.job.resultdir) 1482 self.get_servohost_logs(self.job.resultdir) 1483 # Grab current (not old like above) logs after the servo instance 1484 # was closed out. 1485 if servod_logs_available: 1486 self.get_instance_logs(instance_ts, self.job.resultdir) 1487 except error.AutoservRunError as e: 1488 logging.info( 1489 'Failed to grab servo logs due to: %s. ' 1490 'This error is forgiven.', str(e)) 1491 except Exception as e: 1492 logging.error( 1493 'Unexpected error grabbing servod logs. %s. ' 1494 'Forgiven. Please file a bug and fix or catch ' 1495 'in log grabbing function', 1496 str(e), 1497 exc_info=True) 1498 1499 if self._is_locked and servo_host_ready: 1500 # Remove the lock if the servohost has been locked. 1501 try: 1502 self._unlock() 1503 except error.AutoservSSHTimeout: 1504 logging.error('Unlock servohost failed due to ssh timeout.' 1505 ' It may caused by servohost went down during' 1506 ' the task.') 1507 # We want always stop servod after task to minimum the impact of bad 1508 # servod process interfere other servods.(see crbug.com/1028665) 1509 if servo_host_ready: 1510 try: 1511 self.stop_servod() 1512 except error.AutoservRunError as e: 1513 logging.info( 1514 "Failed to stop servod due to:\n%s\n" 1515 "This error is forgiven.", str(e)) 1516 1517 super(ServoHost, self).close() 1518 # Mark closed. 1519 self._closed = True 1520 1521 def get_servo_state(self): 1522 return self._servo_state 1523 1524 def _get_host_metrics_data(self): 1525 return {'port': self.servo_port, 1526 'host': self.get_dut_hostname() or self.hostname, 1527 'board': self.servo_board or ''} 1528 1529 def is_servo_board_present_on_servo_v3(self): 1530 """Check if servo board is detected on servo_v3""" 1531 logging.debug('Started to detect servo board on servo_v3') 1532 vid_pids = ['18d1:5004', '0403:6014'] 1533 not_detected = 'The servo board is not detected on servo_v3' 1534 try: 1535 cmd = 'lsusb | grep "%s"' % "\|".join(vid_pids) 1536 result = self.run(cmd, ignore_status=True, timeout=30) 1537 if result.exit_status == 0 and self._to_str(result.stdout.strip()): 1538 logging.debug('The servo board is detected on servo_v3') 1539 return True 1540 logging.debug('%s; %s', not_detected, result) 1541 return False 1542 except Exception as e: 1543 # can be triggered by timeout issue due running the script 1544 metrics.Counter( 1545 'chromeos/autotest/repair/servo_detection/timeout' 1546 ).increment(fields=self._get_host_metrics_data()) 1547 logging.error('%s; %s', not_detected, str(e)) 1548 return None 1549 1550 def _require_cr50_servod_config(self): 1551 """Check whether we need start servod with CONFIG=cr50.xml""" 1552 dut_host_info = self.get_dut_host_info() 1553 if not dut_host_info: 1554 return False 1555 for pool in dut_host_info.pools: 1556 if pool.startswith(servo_constants.CR50_CONFIG_POOL_PREFIX): 1557 return True 1558 return False 1559 1560 def get_verifier_state(self, tag): 1561 """Return the state of servo verifier. 1562 1563 @returns: bool or None 1564 """ 1565 return self._repair_strategy.verifier_is_good(tag) 1566 1567 def get_repair_strategy_node(self, tag): 1568 """Return the instance of verifier/repair node for host by tag. 1569 1570 @returns: _DependencyNode or None 1571 """ 1572 return self._repair_strategy.node_by_tag(tag) 1573 1574 def determine_servo_state(self): 1575 """Determine servo state based on the failed verifier. 1576 1577 @returns: servo state value 1578 The state detecting based on first fail verifier or collecting of 1579 them. 1580 """ 1581 ssh = self.get_verifier_state('connection') 1582 servo_root_present = self.get_verifier_state('servo_root_present') 1583 servo_root_present_node = self.get_repair_strategy_node( 1584 'servo_root_present') 1585 servo_v3_present = self.get_verifier_state('servo_v3_root_present') 1586 servo_fw = self.get_verifier_state('servo_fw') 1587 servo_fw_update = self.get_repair_strategy_node('servo_fw_update') 1588 servod_dut_controller_missing = self.get_repair_strategy_node( 1589 'servod_dut_controller_missing') 1590 disk_space = self.get_verifier_state('servo_disk_space') 1591 start_servod = self.get_verifier_state('start_servod') 1592 servod_started = self.get_verifier_state('servod_started') 1593 servod_echo = self.get_verifier_state('servod_echo') 1594 create_servo = self.get_verifier_state('servod_connection') 1595 init_servo = self.get_verifier_state('servod_control') 1596 cr50_low_sbu = self.get_verifier_state('servo_cr50_low_sbu') 1597 cr50_off = self.get_verifier_state('servo_cr50_off') 1598 servo_topology = self.get_verifier_state('servo_topology') 1599 dut_connected = self.get_verifier_state('servo_dut_connected') 1600 hub_connected = self.get_verifier_state('servo_hub_connected') 1601 pwr_button = self.get_verifier_state('servo_pwr_button') 1602 lid_open = self.get_verifier_state('servo_lid_open') 1603 ec_board = self.get_verifier_state('servo_ec_console') 1604 cr50_console = self.get_verifier_state('servo_cr50_console') 1605 ccd_testlab = self.get_verifier_state('servo_ccd_testlab') 1606 1607 if not ssh: 1608 return servo_constants.SERVO_STATE_NO_SSH 1609 if start_servod == hosts.VERIFY_FAILED: 1610 return servo_constants.SERVO_STATE_SERVO_HOST_ISSUE 1611 if servo_root_present == hosts.VERIFY_FAILED: 1612 if not self.servo_serial: 1613 return servo_constants.SERVO_STATE_WRONG_CONFIG 1614 if hasattr(servo_root_present_node, 'serial_mismatch'): 1615 return servo_constants.SERVO_STATE_SERIAL_MISMATCH 1616 return servo_constants.SERVO_STATE_NOT_CONNECTED 1617 if servo_v3_present == hosts.VERIFY_FAILED: 1618 # if we cannot find required board on servo_v3 1619 return servo_constants.SERVO_STATE_NEED_REPLACEMENT 1620 if servo_fw == hosts.VERIFY_FAILED: 1621 logging.info(servo_fw_update) 1622 if hasattr(servo_fw_update, 'servo_updater_issue_detected'): 1623 return servo_constants.SERVO_STATE_SERVO_UPDATER_ISSUE 1624 return servo_constants.SERVO_STATE_NEED_REPLACEMENT 1625 1626 if dut_connected == hosts.VERIFY_FAILED: 1627 return servo_constants.SERVO_STATE_DUT_NOT_CONNECTED 1628 if hub_connected == hosts.VERIFY_FAILED: 1629 logging.info('Servo HUB not connected') 1630 return servo_constants.SERVO_STATE_DUT_NOT_CONNECTED 1631 1632 if cr50_low_sbu == hosts.VERIFY_FAILED: 1633 return servo_constants.SERVO_STATE_SBU_LOW_VOLTAGE 1634 if cr50_off == hosts.VERIFY_FAILED: 1635 return servo_constants.SERVO_STATE_CR50_NOT_ENUMERATED 1636 1637 if servod_dut_controller_missing == hosts.VERIFY_FAILED: 1638 return servo_constants.SERVO_STATE_SERVOD_DUT_CONTROLLER_MISSING 1639 if servo_topology == hosts.VERIFY_FAILED: 1640 return servo_constants.SERVO_STATE_TOPOLOGY_ISSUE 1641 1642 # TODO(otabek@): detect special cases detected by pwr_button 1643 if dut_connected == hosts.VERIFY_SUCCESS: 1644 if pwr_button == hosts.VERIFY_FAILED: 1645 metrics.Counter( 1646 'chromeos/autotest/repair/servo_unexpected/pwr_button2' 1647 ).increment(fields=self._get_host_metrics_data()) 1648 1649 if (servod_started == hosts.VERIFY_FAILED 1650 or servod_echo == hosts.VERIFY_FAILED): 1651 return servo_constants.SERVO_STATE_SERVOD_ISSUE 1652 1653 # one of the reason why servo can not initialized 1654 if cr50_console == hosts.VERIFY_FAILED: 1655 return servo_constants.SERVO_STATE_CR50_CONSOLE_MISSING 1656 if ccd_testlab == hosts.VERIFY_FAILED: 1657 return servo_constants.SERVO_STATE_CCD_TESTLAB_ISSUE 1658 1659 if (create_servo == hosts.VERIFY_FAILED 1660 or init_servo == hosts.VERIFY_FAILED): 1661 return servo_constants.SERVO_STATE_SERVOD_PROXY_ISSUE 1662 1663 if ec_board == hosts.VERIFY_FAILED: 1664 return servo_constants.SERVO_STATE_EC_BROKEN 1665 if pwr_button == hosts.VERIFY_FAILED: 1666 return servo_constants.SERVO_STATE_BAD_RIBBON_CABLE 1667 if lid_open == hosts.VERIFY_FAILED: 1668 return servo_constants.SERVO_STATE_LID_OPEN_FAILED 1669 1670 metrics.Counter( 1671 'chromeos/autotest/repair/unknown_servo_state' 1672 ).increment(fields=self._get_host_metrics_data()) 1673 logging.info('We do not have special state for this failure yet :)') 1674 return servo_constants.SERVO_STATE_BROKEN 1675 1676 def is_servo_topology_supported(self): 1677 """Check if servo_topology is supported.""" 1678 if self.is_containerized_servod(): 1679 logging.info('Servod is running within a container.') 1680 return True 1681 if not self.is_up_fast(): 1682 logging.info('Servo-Host is not reachable.') 1683 return False 1684 if not self.servo_serial: 1685 logging.info('Servo-topology required a servo serial.') 1686 return False 1687 if self.is_labstation(): 1688 logging.info('Servod is running within labstation.') 1689 return True 1690 return False 1691 1692 def get_topology(self): 1693 """Get servo topology.""" 1694 if not self._topology: 1695 self._topology = servo_topology.ServoTopology(self) 1696 return self._topology 1697 1698 def is_dual_setup(self): 1699 """Check is servo will run in dual setup. 1700 1701 Dual setup used only for servo_v4 when used ccd_cr50 and servo_micro 1702 at the same time. 1703 """ 1704 return self.servo_setup == servo_constants.SERVO_SETUP_VALUE_DUAL_V4 1705 1706 def set_dut_health_profile(self, dut_health_profile): 1707 """ 1708 @param dut_health_profile: A DeviceHealthProfile object. 1709 """ 1710 logging.debug('setting dut_health_profile field to (%s)', 1711 dut_health_profile) 1712 self._dut_health_profile = dut_health_profile 1713 1714 def get_dut_health_profile(self): 1715 """ 1716 @return A DeviceHealthProfile object. 1717 """ 1718 return self._dut_health_profile 1719 1720 def print_all_servo_of_host(self): 1721 """Print all servos detected on the host.""" 1722 try: 1723 logging.info('\tDevices detected on the host:') 1724 devices = self.get_topology().get_list_available_servos() 1725 for device in devices: 1726 logging.info('\t%s', device) 1727 except Exception as e: 1728 logging.debug('(Not critical) Fail list all servos: %s', e) 1729 1730 1731def make_servo_hostname(dut_hostname): 1732 """Given a DUT's hostname, return the hostname of its servo. 1733 1734 @param dut_hostname: hostname of a DUT. 1735 1736 @return hostname of the DUT's servo. 1737 1738 """ 1739 host_parts = dut_hostname.split('.') 1740 host_parts[0] = host_parts[0] + '-servo' 1741 return '.'.join(host_parts) 1742 1743 1744def _map_afe_board_to_servo_board(afe_board): 1745 """Map a board we get from the AFE to a servo appropriate value. 1746 1747 Many boards are identical to other boards for servo's purposes. 1748 This function makes that mapping. 1749 1750 @param afe_board string board name received from AFE. 1751 @return board we expect servo to have. 1752 1753 """ 1754 KNOWN_SUFFIXES = ['-freon', '_freon', '_moblab', '-cheets'] 1755 BOARD_MAP = {'gizmo': 'panther'} 1756 mapped_board = afe_board 1757 if afe_board in BOARD_MAP: 1758 mapped_board = BOARD_MAP[afe_board] 1759 else: 1760 for suffix in KNOWN_SUFFIXES: 1761 if afe_board.endswith(suffix): 1762 mapped_board = afe_board[0:-len(suffix)] 1763 break 1764 if mapped_board != afe_board: 1765 logging.info('Mapping AFE board=%s to %s', afe_board, mapped_board) 1766 return mapped_board 1767 1768 1769def get_servo_args_for_host(dut_host): 1770 """Return servo data associated with a given DUT. 1771 1772 @param dut_host Instance of `Host` on which to find the servo 1773 attributes. 1774 @return `servo_args` dict with host and an optional port. 1775 """ 1776 info = dut_host.host_info_store.get() 1777 servo_args = {k: v for k, v in six.iteritems(info.attributes) 1778 if k in servo_constants.SERVO_ATTR_KEYS} 1779 1780 if servo_constants.SERVO_HOST_SSH_PORT_ATTR in servo_args: 1781 try: 1782 servo_args[servo_constants.SERVO_HOST_SSH_PORT_ATTR] = int( 1783 servo_args[servo_constants.SERVO_HOST_SSH_PORT_ATTR]) 1784 except ValueError: 1785 logging.error('servo host port is not an int: %s', 1786 servo_args[servo_constants.SERVO_HOST_SSH_PORT_ATTR]) 1787 # Reset servo_args because we don't want to use an invalid port. 1788 servo_args.pop(servo_constants.SERVO_HOST_SSH_PORT_ATTR, None) 1789 1790 if servo_constants.SERVO_PORT_ATTR in servo_args: 1791 try: 1792 servo_args[servo_constants.SERVO_PORT_ATTR] = int( 1793 servo_args[servo_constants.SERVO_PORT_ATTR]) 1794 except ValueError: 1795 logging.error('servo port is not an int: %s', 1796 servo_args[servo_constants.SERVO_PORT_ATTR]) 1797 # Reset servo_args because we don't want to use an invalid port. 1798 servo_args.pop(servo_constants.SERVO_HOST_ATTR, None) 1799 1800 if info.board: 1801 servo_board = _map_afe_board_to_servo_board(info.board) 1802 servo_args[servo_constants.SERVO_BOARD_ATTR] = servo_board 1803 if info.model: 1804 servo_args[servo_constants.SERVO_MODEL_ATTR] = info.model 1805 return servo_args if servo_constants.SERVO_HOST_ATTR in servo_args else None 1806 1807 1808def _tweak_args_for_ssp_moblab(servo_args): 1809 if (servo_args[servo_constants.SERVO_HOST_ATTR] 1810 in ['localhost', '127.0.0.1']): 1811 servo_args[servo_constants.SERVO_HOST_ATTR] = _CONFIG.get_config_value( 1812 'SSP', 'host_container_ip', type=str, default=None) 1813 1814 1815def create_servo_host(dut, 1816 servo_args, 1817 try_lab_servo=False, 1818 try_servo_repair=False, 1819 try_servo_recovery=True, 1820 dut_host_info=None, 1821 dut_health_profile=None): 1822 """Create a ServoHost object for a given DUT, if appropriate. 1823 1824 This function attempts to create and verify or repair a `ServoHost` 1825 object for a servo connected to the given `dut`, subject to various 1826 constraints imposed by the parameters: 1827 * When the `servo_args` parameter is not `None`, a servo 1828 host must be created, and must be checked with `repair()`. 1829 * Otherwise, if a servo exists in the lab and `try_lab_servo` is 1830 true: 1831 * If `try_servo_repair` is true, then create a servo host and 1832 check it with `repair()`. 1833 * Otherwise, if the servo responds to `ping` then create a 1834 servo host and check it with `verify()`. 1835 1836 In cases where `servo_args` was not `None`, repair failure 1837 exceptions are passed back to the caller; otherwise, exceptions 1838 are logged and then discarded. Note that this only happens in cases 1839 where we're called from a test (not special task) control file that 1840 has an explicit dependency on servo. In that case, we require that 1841 repair not write to `status.log`, so as to avoid polluting test 1842 results. 1843 1844 TODO(jrbarnette): The special handling for servo in test control 1845 files is a thorn in my flesh; I dearly hope to see it cut out before 1846 my retirement. 1847 1848 Parameters for a servo host consist of a host name, port number, and 1849 DUT board, and are determined from one of these sources, in order of 1850 priority: 1851 * Servo attributes from the `dut` parameter take precedence over 1852 all other sources of information. 1853 * If a DNS entry for the servo based on the DUT hostname exists in 1854 the CrOS lab network, that hostname is used with the default 1855 port and the DUT's board. 1856 * If no other options are found, the parameters will be taken 1857 from the `servo_args` dict passed in from the caller. 1858 1859 @param dut: An instance of `Host` from which to take 1860 servo parameters (if available). 1861 @param servo_args: A dictionary with servo parameters to use if 1862 they can't be found from `dut`. If this 1863 argument is supplied, unrepaired exceptions 1864 from `verify()` will be passed back to the 1865 caller. 1866 @param try_lab_servo: If not true, servo host creation will be 1867 skipped unless otherwise required by the 1868 caller. 1869 @param try_servo_repair: If true, check a servo host with 1870 `repair()` instead of `verify()`. 1871 @param try_servo_recovery: If true, start servod in recovery mode. 1872 Default value is True. 1873 @param dut_host_info: A HostInfo object of the DUT that connected 1874 to this servo. 1875 @param dut_health_profile: DUT repair info with history. 1876 1877 @returns: A ServoHost object or None. See comments above. 1878 1879 """ 1880 # We are explicitly looking for if servo_args is None here(which means 1881 # servo not needed), as servo_args == {} means servo is needed and 1882 # we expect load servo_args from host_info_store. 1883 if servo_args is None: 1884 servo_dependency = False 1885 local_run = False 1886 else: 1887 servo_dependency = True 1888 # If servo_args pass in directly, then this is a local test run. 1889 local_run = servo_constants.SERVO_HOST_ATTR in servo_args 1890 1891 if local_run: 1892 logging.warning('User input servo_args detected, will attempt' 1893 ' to start servod and initialize servo conncetion' 1894 ' directly. All servo/servohost verify and repair' 1895 ' steps will be skipped.') 1896 1897 # Loading servo args from host_info_store. 1898 if dut is not None and (try_lab_servo or servo_dependency): 1899 servo_args_override = get_servo_args_for_host(dut) 1900 if servo_args_override is not None: 1901 if utils.in_moblab_ssp(): 1902 _tweak_args_for_ssp_moblab(servo_args_override) 1903 logging.debug( 1904 'Overriding provided servo_args (%s) with arguments' 1905 ' determined from the host (%s)', 1906 servo_args, 1907 servo_args_override, 1908 ) 1909 servo_args = servo_args_override 1910 1911 if not servo_args: 1912 logging.debug('No servo_args provided, and failed to find overrides.') 1913 if try_lab_servo or servo_dependency: 1914 return None, servo_constants.SERVO_STATE_MISSING_CONFIG 1915 else: 1916 # For regular test case which not required the servo 1917 return None, None 1918 1919 servo_hostname = servo_args.get(servo_constants.SERVO_HOST_ATTR) 1920 servo_port = servo_args.get(servo_constants.SERVO_PORT_ATTR) 1921 if not local_run: 1922 if not _is_servo_host_information_exist(servo_hostname, servo_port): 1923 logging.debug( 1924 'Servo connection info missed hostname: %s , port: %s', 1925 servo_hostname, servo_port) 1926 return None, servo_constants.SERVO_STATE_MISSING_CONFIG 1927 if not is_servo_host_information_valid(servo_hostname, servo_port): 1928 logging.debug( 1929 'Servo connection info is incorrect hostname: %s , port: %s', 1930 servo_hostname, servo_port) 1931 return None, servo_constants.SERVO_STATE_WRONG_CONFIG 1932 1933 if try_servo_recovery == True: 1934 servo_args[servo_constants.SERVO_RECOVERY_MODE] = True 1935 1936 newhost = ServoHost(**servo_args) 1937 if local_run: 1938 try: 1939 newhost.start_servod() 1940 except: 1941 # If we failed to start servod here, we can assume the servod 1942 # either already started or the test is running against a 1943 # non-standard servohost so the user will resiponsble for ensure 1944 # servod is running. 1945 pass 1946 try: 1947 newhost.initialize_servo() 1948 newhost.initialize_dut_for_servo() 1949 newhost._servo_state = servo_constants.SERVO_STATE_WORKING 1950 return newhost, newhost.get_servo_state() 1951 except Exception as e: 1952 logging.error('Failed to initialize servo. %s', e) 1953 return None, servo_constants.SERVO_STATE_BROKEN 1954 1955 if newhost.is_containerized_servod(): 1956 # TODO(otabek@): Update for servod-manager. 1957 # Servod docker is not available for access. 1958 newhost.start_containerized_servod() 1959 # use_icmp is always set to true in autotest, but in labpack 1960 # this is determined by autoserv args. We need to make this consistent 1961 # across labpack and autotest b/205855910 1962 elif newhost.use_icmp and not newhost.is_up_fast(count=3): 1963 # ServoHost has internal check to wait if servo-host is in reboot 1964 # process. If servo-host still is not available this check will stop 1965 # further attempts as we do not have any option to recover servo_host. 1966 return None, servo_constants.SERVO_STATE_NO_SSH 1967 1968 # Reset or reboot servo device only during AdminRepair tasks. 1969 if try_servo_repair: 1970 if newhost._is_locked: 1971 # Print available servos on the host for debugging. 1972 newhost.print_all_servo_of_host() 1973 # Reset servo if the servo is locked, as we check if the servohost 1974 # is up, if the servohost is labstation and if the servohost is in 1975 # lab inside the locking logic. 1976 newhost.reset_servo() 1977 else: 1978 try: 1979 newhost.reboot_servo_v3_on_need() 1980 except Exception as e: 1981 logging.info('[Non-critical] Unexpected error while trying to' 1982 ' reboot servo_v3, skipping the reboot; %s', e) 1983 1984 if dut: 1985 newhost.set_dut_hostname(dut.hostname) 1986 if dut_host_info: 1987 newhost.set_dut_host_info(dut_host_info) 1988 if (dut_health_profile and (try_lab_servo or try_servo_repair)): 1989 try: 1990 if newhost.is_localhost(): 1991 logging.info('Servohost is a localhost, skip device' 1992 ' health profile setup...') 1993 else: 1994 dut_health_profile.init_profile(newhost) 1995 newhost.set_dut_health_profile(dut_health_profile) 1996 except Exception as e: 1997 logging.info( 1998 '[Non-critical] Unexpected error while trying to' 1999 ' load device health profile; %s', e) 2000 2001 # Note that the logic of repair() includes everything done 2002 # by verify(). It's sufficient to call one or the other; 2003 # we don't need both. 2004 if servo_dependency: 2005 newhost.repair(silent=True) 2006 return newhost, newhost.get_servo_state() 2007 2008 if try_servo_repair: 2009 try: 2010 newhost.repair() 2011 except Exception: 2012 logging.exception('servo repair failed for %s', newhost.hostname) 2013 else: 2014 try: 2015 newhost.verify() 2016 except Exception: 2017 logging.exception('servo verify failed for %s', newhost.hostname) 2018 return newhost, newhost.get_servo_state() 2019 2020 2021def _is_servo_host_information_exist(hostname, port): 2022 if hostname is None or len(hostname.strip()) == 0: 2023 return False 2024 if port is None: 2025 return False 2026 if not type(port) is int: 2027 try: 2028 int(port) 2029 except ValueError: 2030 return False 2031 return True 2032 2033 2034def is_servo_host_information_valid(hostname, port): 2035 """Check if provided servo attributes are valid. 2036 2037 @param hostname Hostname of the servohost. 2038 @param port servo port number. 2039 2040 @returns: A bool value to indicate if provided servo attribute valid. 2041 """ 2042 if not _is_servo_host_information_exist(hostname, port): 2043 return False 2044 # checking range and correct of the port 2045 port_int = int(port) 2046 if port_int < 1 or port_int > 65000: 2047 return False 2048 # we expecting host contain only latters, digits and '-' or '_' 2049 if not re.match('[a-zA-Z0-9-_\.:]*$', hostname) or len(hostname) < 5: 2050 return False 2051 return True 2052