# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
#
# Expects to be run in an environment with sudo and no interactive password
# prompt, such as within the Chromium OS development chroot.


"""This file provides core logic for servo verify/repair process."""


import logging
import os
import time
import traceback
import xmlrpclib

from autotest_lib.client.bin import utils
from autotest_lib.client.common_lib import error
from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib import hosts
from autotest_lib.client.common_lib.cros import retry
from autotest_lib.client.common_lib.cros.network import ping_runner
from autotest_lib.server.cros.servo import servo
from autotest_lib.server.hosts import servo_repair
from autotest_lib.server.hosts import base_servohost


# Names of the host attributes in the database that represent the values for
# the servo_host and servo_port for a servo connected to the DUT.
SERVO_HOST_ATTR = 'servo_host'
SERVO_PORT_ATTR = 'servo_port'
SERVO_BOARD_ATTR = 'servo_board'
# Model is inferred from host labels.
SERVO_MODEL_ATTR = 'servo_model'
SERVO_SERIAL_ATTR = 'servo_serial'
SERVO_ATTR_KEYS = (
        SERVO_BOARD_ATTR,
        SERVO_HOST_ATTR,
        SERVO_PORT_ATTR,
        SERVO_SERIAL_ATTR,
)

# Timeout value for stop/start servod process.
SERVOD_TEARDOWN_TIMEOUT = 3
SERVOD_QUICK_STARTUP_TIMEOUT = 20
SERVOD_STARTUP_TIMEOUT = 60

_CONFIG = global_config.global_config
ENABLE_SSH_TUNNEL_FOR_SERVO = _CONFIG.get_config_value(
        'CROS', 'enable_ssh_tunnel_for_servo', type=bool, default=False)

AUTOTEST_BASE = _CONFIG.get_config_value(
        'SCHEDULER', 'drone_installation_directory',
        default='/usr/local/autotest')

SERVO_STATE_LABEL_PREFIX = 'servo_state'
SERVO_STATE_WORKING = 'WORKING'
SERVO_STATE_BROKEN = 'BROKEN'


class ServoHost(base_servohost.BaseServoHost):
    """Host class for a servo host (e.g. beaglebone, labstation) together
    with a servo instance for a specific port.

    @type _servo: servo.Servo | None
    """

    DEFAULT_PORT = int(os.getenv('SERVOD_PORT', '9999'))

    # Timeout for initializing servo signals.
    INITIALIZE_SERVO_TIMEOUT_SECS = 60

    # Ready test function
    SERVO_READY_METHOD = 'get_version'

    def _init_attributes(self):
        """Reset every servo-related instance attribute to None."""
        self._servo_state = None
        self.servo_port = None
        self.servo_board = None
        self.servo_model = None
        self.servo_serial = None
        self._servo = None
        self._servod_server_proxy = None


    def _initialize(self, servo_host='localhost', servo_port=DEFAULT_PORT,
                    servo_board=None, servo_model=None, servo_serial=None,
                    is_in_lab=None, *args, **dargs):
        """Initialize a ServoHost instance.

        A ServoHost instance represents a host that controls a servo.

        @param servo_host: Name of the host where the servod process
                           is running.
        @param servo_port: Port the servod process is listening on. Defaults
                           to the SERVOD_PORT environment variable if set,
                           otherwise 9999.
        @param servo_board: Board that the servo is connected to.
        @param servo_model: Model that the servo is connected to.
        @param servo_serial: Serial of the servo; when set it is passed to
                             `start servod` as SERIAL and to servo.Servo.
        @param is_in_lab: True if the servo host is in Cros Lab. Default is set
                          to None, for which utils.host_is_in_lab_zone will be
                          called to check if the servo host is in Cros lab.

        """
        super(ServoHost, self)._initialize(hostname=servo_host,
                                           is_in_lab=is_in_lab, *args, **dargs)
        self._init_attributes()
        self.servo_port = int(servo_port)
        self.servo_board = servo_board
        self.servo_model = servo_model
        self.servo_serial = servo_serial

        # Path of the servo host lock file.
        self._lock_file = (self.TEMP_FILE_DIR + str(self.servo_port)
                           + self.LOCK_FILE_POSTFIX)
        # File path to declare a reboot request.
        self._reboot_file = (self.TEMP_FILE_DIR + str(self.servo_port)
                             + self.REBOOT_FILE_POSTFIX)

        # Lock the servo host if it's an in-lab labstation to prevent other
        # tasks from rebooting it until the current task completes. We also
        # wait and make sure the labstation is up here, in case the
        # labstation is in the middle of a reboot.
        self._is_locked = False
        if (self.wait_up(self.REBOOT_TIMEOUT) and self.is_in_lab()
                and self.is_labstation()):
            self._lock()

        self._repair_strategy = (
                servo_repair.create_servo_repair_strategy())


    def connect_servo(self):
        """Establish a connection to the servod server on this host.

        Initializes `self._servo` and then verifies that all network
        connections are working.  This will create an ssh tunnel if
        it's required.

        As a side effect of testing the connection, all signals on the
        target servo are reset to default values, and the USB stick is
        set to the neutral (off) position.

        @raises hosts.AutoservVerifyError if initializing the servo does not
                finish within INITIALIZE_SERVO_TIMEOUT_SECS.
        """
        servo_obj = servo.Servo(servo_host=self,
                                servo_serial=self.servo_serial)
        self._servo = servo_obj
        timeout, _ = retry.timeout(
                servo_obj.initialize_dut,
                timeout_sec=self.INITIALIZE_SERVO_TIMEOUT_SECS)
        if timeout:
            raise hosts.AutoservVerifyError(
                    'Servo initialize timed out.')


    def disconnect_servo(self):
        """Disconnect our servo if it exists.

        If we've previously successfully connected to our servo,
        disconnect any established ssh tunnel, and set `self._servo`
        back to `None`.
        """
        if self._servo:
            # N.B. This call is safe even without a tunnel:
            # rpc_server_tracker.disconnect() silently ignores
            # unknown ports.
            self.rpc_server_tracker.disconnect(self.servo_port)
            self._servo = None


    def _create_servod_server_proxy(self):
        """Create a proxy that can be used to communicate with servod server.

        An ssh tunnel is used when ENABLE_SSH_TUNNEL_FOR_SERVO is set and
        this host is not the localhost; otherwise the proxy talks directly
        to http://<hostname>:<servo_port>.

        @returns: An xmlrpclib.ServerProxy that is connected to the servod
                  server on the host.
        """
        if ENABLE_SSH_TUNNEL_FOR_SERVO and not self.is_localhost():
            return self.rpc_server_tracker.xmlrpc_connect(
                    None, self.servo_port,
                    ready_test_name=self.SERVO_READY_METHOD,
                    timeout_seconds=60,
                    request_timeout_seconds=3600)
        else:
            remote = 'http://%s:%s' % (self.hostname, self.servo_port)
            return xmlrpclib.ServerProxy(remote)


    def get_servod_server_proxy(self):
        """Return a cached proxy if exists; otherwise, create a new one.

        @returns: An xmlrpclib.ServerProxy that is connected to the servod
                  server on the host.
        """
        # Single-threaded execution, no race
        if self._servod_server_proxy is None:
            self._servod_server_proxy = self._create_servod_server_proxy()
        return self._servod_server_proxy


    def _cleanup_after_failure(self):
        """Tear down the servo connection and servod after a failed check.

        Every step is best-effort: a failure in one step is logged and
        neither prevents the following step from running nor replaces the
        verify/repair exception the caller is about to re-raise.

        This deliberately lives in its own method.  Handling the cleanup
        exceptions in a separate frame leaves the caller's active exception
        untouched, so the caller's bare `raise` still re-raises the original
        failure.
        """
        for step in (self.disconnect_servo, self.stop_servod):
            try:
                step()
            except Exception:
                logging.exception('Ignoring failure of %s during cleanup of'
                                  ' servo host %s.',
                                  step.__name__, self.hostname)


    def verify(self, silent=False):
        """Update the servo host and verify it's in a good state.

        On success the servo state is set to WORKING.  On any failure the
        servo state is set to BROKEN, the servo is disconnected, servod is
        stopped, and the original exception is re-raised.

        @param silent   If true, suppress logging in `status.log`.
        """
        message = 'Beginning verify for servo host %s port %s serial %s'
        message %= (self.hostname, self.servo_port, self.servo_serial)
        self.record('INFO', None, None, message)
        try:
            self._repair_strategy.verify(self, silent)
            self._servo_state = SERVO_STATE_WORKING
            self.record('INFO', None, None,
                        'ServoHost verify set servo_state as WORKING')
        except:
            # Bare except is intentional: the exception is always re-raised
            # and the cleanup must happen no matter what was raised.
            self._servo_state = SERVO_STATE_BROKEN
            self.record('INFO', None, None,
                        'ServoHost verify set servo_state as BROKEN')
            self._cleanup_after_failure()
            raise


    def repair(self, silent=False):
        """Attempt to repair servo host.

        On success the servo state is set to WORKING and, for a labstation,
        any reboot request made by this servo is withdrawn.  On any failure
        the servo state is set to BROKEN, the servo is disconnected, servod
        is stopped, and the original exception is re-raised.

        @param silent   If true, suppress logging in `status.log`.
        """
        message = 'Beginning repair for servo host %s port %s serial %s'
        message %= (self.hostname, self.servo_port, self.servo_serial)
        self.record('INFO', None, None, message)
        try:
            self._repair_strategy.repair(self, silent)
            self._servo_state = SERVO_STATE_WORKING
            self.record('INFO', None, None,
                        'ServoHost repair set servo_state as WORKING')
            # If target is a labstation then try to withdraw any existing
            # reboot request created by this servo because it passed repair.
            if self.is_labstation():
                self.withdraw_reboot_request()
        except:
            # Bare except is intentional: the exception is always re-raised
            # and the cleanup must happen no matter what was raised.
            self._servo_state = SERVO_STATE_BROKEN
            self.record('INFO', None, None,
                        'ServoHost repair set servo_state as BROKEN')
            self._cleanup_after_failure()
            raise


    def get_servo(self):
        """Get the cached servo.Servo object.

        @return: a servo.Servo object, or None if no servo is connected.
        @rtype: autotest_lib.server.cros.servo.servo.Servo
        """
        return self._servo


    def request_reboot(self):
        """Request that the servohost be rebooted when it is safe to do so,
        by touching the reboot flag file.
        """
        logging.debug('Request to reboot servohost %s has been created by '
                      'servo with port # %s', self.hostname, self.servo_port)
        self.run('touch %s' % self._reboot_file, ignore_status=True)


    def withdraw_reboot_request(self):
        """Withdraw a servohost reboot request, if one exists, by removing
        the reboot flag file.
        """
        logging.debug('Withdrawing request to reboot servohost %s that created'
                      ' by servo with port # %s if exists.',
                      self.hostname, self.servo_port)
        self.run('rm -f %s' % self._reboot_file, ignore_status=True)


    def start_servod(self, quick_startup=False):
        """Start the servod process on servohost.

        @param quick_startup: If true, wait SERVOD_QUICK_STARTUP_TIMEOUT
                              seconds for servod to come up instead of
                              SERVOD_STARTUP_TIMEOUT seconds.
        """
        # Skip if running on the localhost.(crbug.com/1038168)
        if self.is_localhost():
            logging.debug("Servohost is a localhost, skipping start servod.")
            return

        cmd = 'start servod'
        if self.servo_board:
            cmd += ' BOARD=%s' % self.servo_board
            # MODEL is only passed along together with a known BOARD.
            if self.servo_model:
                cmd += ' MODEL=%s' % self.servo_model
        else:
            logging.warning('Board for DUT is unknown; starting servod'
                            ' assuming a pre-configured board.')

        cmd += ' PORT=%d' % self.servo_port
        if self.servo_serial:
            cmd += ' SERIAL=%s' % self.servo_serial
        self.run(cmd, timeout=60)

        # There's a lag between when `start servod` completes and when
        # the _ServodConnectionVerifier trigger can actually succeed.
        # The call to time.sleep() below gives time to make sure that
        # the trigger won't fail after we return.

        # Normally servod on servo_v3 and labstation takes ~10 seconds to be
        # ready, but in the rare case that all servos on a labstation are in
        # heavy use it may take ~30 seconds. The timeout values are double
        # these numbers: we use the quick start up the first time a
        # servohost is initialized, and the standard start up timeout in
        # repair.
        if quick_startup:
            timeout = SERVOD_QUICK_STARTUP_TIMEOUT
        else:
            timeout = SERVOD_STARTUP_TIMEOUT
        logging.debug('Wait %s seconds for servod process fully up.', timeout)
        time.sleep(timeout)


    def stop_servod(self):
        """Stop the servod process on servohost.

        Does nothing on the localhost.  Otherwise issues `stop servod` for
        this servo's port (a non-zero exit status is ignored) and then
        sleeps SERVOD_TEARDOWN_TIMEOUT seconds.
        """
        # Skip if running on the localhost.(crbug.com/1038168)
        if self.is_localhost():
            logging.debug("Servohost is a localhost, skipping stop servod.")
            return

        logging.debug('Stopping servod on port %s', self.servo_port)
        self.run('stop servod PORT=%d' % self.servo_port,
                 timeout=60, ignore_status=True)
        logging.debug('Wait %s seconds for servod process fully teardown.',
                      SERVOD_TEARDOWN_TIMEOUT)
        time.sleep(SERVOD_TEARDOWN_TIMEOUT)


    def restart_servod(self, quick_startup=False):
        """Restart the servod process on servohost.

        @param quick_startup: Passed through to start_servod().
        """
        self.stop_servod()
        self.start_servod(quick_startup)


    def _lock(self):
        """Lock servohost by touching the lock file."""
        logging.debug('Locking servohost %s by touching %s file',
                      self.hostname, self._lock_file)
        self.run('touch %s' % self._lock_file, ignore_status=True)
        self._is_locked = True


    def _unlock(self):
        """Unlock servohost by removing the lock file."""
        logging.debug('Unlocking servohost by removing %s file',
                      self._lock_file)
        self.run('rm %s' % self._lock_file, ignore_status=True)
        self._is_locked = False


    def close(self):
        """Close the associated servo and the host object."""
        if self._servo:
            # In some cases when we run as lab-tools, the job object is None.
            if self.job and not self._servo.uart_logs_dir:
                self._servo.uart_logs_dir = self.job.resultdir
            self._servo.close()

        if self._is_locked:
            # Remove the lock if the servohost has been locked.
            try:
                self._unlock()
            except error.AutoservSSHTimeout:
                logging.error('Unlock servohost failed due to ssh timeout.'
                              ' It may caused by servohost went down during'
                              ' the task.')

        # We always want to stop servod after a task, to minimize the chance
        # of a bad servod process interfering with other servods.
        # (see crbug.com/1028665)
        try:
            self.stop_servod()
        except error.AutoservRunError as e:
            logging.info("Failed to stop servod due to:\n%s\n"
                         "This error is forgived.", str(e))

        super(ServoHost, self).close()


    def get_servo_state(self):
        """Return the recorded servo state.

        @return SERVO_STATE_BROKEN if no state has been recorded yet
                (neither verify() nor repair() has completed), otherwise the
                state set by the last verify()/repair().
        """
        if self._servo_state is None:
            return SERVO_STATE_BROKEN
        return self._servo_state


def make_servo_hostname(dut_hostname):
    """Given a DUT's hostname, return the hostname of its servo.

    The servo hostname is the DUT hostname with '-servo' appended to its
    first dot-separated component.

    @param dut_hostname: hostname of a DUT.

    @return hostname of the DUT's servo.

    """
    host_parts = dut_hostname.split('.')
    host_parts[0] = host_parts[0] + '-servo'
    return '.'.join(host_parts)


def servo_host_is_up(servo_hostname):
    """Given a servo host name, return if it's up or not.

    @param servo_hostname: hostname of the servo host.

    @return True if it's up, False otherwise
    """
    # Technically, this duplicates the SSH ping done early in the servo
    # proxy initialization code.  However, this ping ends in a couple
    # seconds when it fails, rather than the 60 seconds it takes to decide
    # that an SSH ping has timed out.  Specifically, that timeout happens
    # when our servo DNS name resolves, but there is no host at that IP.
    logging.info('Pinging servo host at %s', servo_hostname)
    ping_config = ping_runner.PingConfig(
            servo_hostname, count=3,
            ignore_result=True, ignore_status=True)
    return ping_runner.PingRunner().ping(ping_config).received > 0


def _map_afe_board_to_servo_board(afe_board):
    """Map a board we get from the AFE to a servo appropriate value.

    Many boards are identical to other boards for servo's purposes.
    This function makes that mapping: an explicit entry in the board map
    wins; otherwise the first matching known suffix is stripped.

    @param afe_board string board name received from AFE.
    @return board we expect servo to have.

    """
    KNOWN_SUFFIXES = ['-freon', '_freon', '_moblab', '-cheets']
    BOARD_MAP = {'gizmo': 'panther'}
    mapped_board = afe_board
    if afe_board in BOARD_MAP:
        mapped_board = BOARD_MAP[afe_board]
    else:
        for suffix in KNOWN_SUFFIXES:
            if afe_board.endswith(suffix):
                mapped_board = afe_board[0:-len(suffix)]
                break
    if mapped_board != afe_board:
        logging.info('Mapping AFE board=%s to %s', afe_board, mapped_board)
    return mapped_board


def get_servo_args_for_host(dut_host):
    """Return servo data associated with a given DUT.

    @param dut_host   Instance of `Host` on which to find the servo
                      attributes.
    @return `servo_args` dict with host and an optional port, or None when
            the host info carries no usable servo host (including the case
            where the servo port attribute is not an integer).
    """
    info = dut_host.host_info_store.get()
    servo_args = {k: v for k, v in info.attributes.iteritems()
                  if k in SERVO_ATTR_KEYS}

    if SERVO_PORT_ATTR in servo_args:
        try:
            servo_args[SERVO_PORT_ATTR] = int(servo_args[SERVO_PORT_ATTR])
        except ValueError:
            logging.error('servo port is not an int: %s',
                          servo_args[SERVO_PORT_ATTR])
            # Reset servo_args because we don't want to use an invalid port.
            # Dropping the host makes the final check below return None.
            servo_args.pop(SERVO_HOST_ATTR, None)

    if info.board:
        servo_args[SERVO_BOARD_ATTR] = _map_afe_board_to_servo_board(info.board)
    if info.model:
        servo_args[SERVO_MODEL_ATTR] = info.model
    return servo_args if SERVO_HOST_ATTR in servo_args else None


def _tweak_args_for_ssp_moblab(servo_args):
    """Replace a loopback servo host with the configured SSP container IP.

    When the servo host in `servo_args` is 'localhost' or '127.0.0.1', it is
    overwritten in place with the 'host_container_ip' value of the 'SSP'
    config section (None if that value is not configured).

    @param servo_args  dict of servo arguments; must contain the servo host
                       attribute.  Modified in place.
    """
    if servo_args[SERVO_HOST_ATTR] in ['localhost', '127.0.0.1']:
        servo_args[SERVO_HOST_ATTR] = _CONFIG.get_config_value(
                'SSP', 'host_container_ip', type=str, default=None)


def create_servo_host(dut, servo_args, try_lab_servo=False,
                      try_servo_repair=False, dut_host_info=None):
    """Create a ServoHost object for a given DUT, if appropriate.

    This function attempts to create and verify or repair a `ServoHost`
    object for a servo connected to the given `dut`, subject to various
    constraints imposed by the parameters:
      * When the `servo_args` parameter is not `None`, a servo
        host must be created, and must be checked with `repair()`.
      * Otherwise, if a servo exists in the lab and `try_lab_servo` is
        true:
          * If `try_servo_repair` is true, then create a servo host and
            check it with `repair()`.
          * Otherwise, if the servo responds to `ping` then create a
            servo host and check it with `verify()`.

    In cases where `servo_args` was not `None`, repair failure
    exceptions are passed back to the caller; otherwise, exceptions
    are logged and then discarded.  Note that this only happens in cases
    where we're called from a test (not special task) control file that
    has an explicit dependency on servo.  In that case, we require that
    repair not write to `status.log`, so as to avoid polluting test
    results.

    TODO(jrbarnette):  The special handling for servo in test control
    files is a thorn in my flesh; I dearly hope to see it cut out before
    my retirement.

    Parameters for a servo host consist of a host name, port number, and
    DUT board, and are determined from one of these sources, in order of
    priority:
      * Servo attributes from the `dut` parameter take precedence over
        all other sources of information.
      * If a DNS entry for the servo based on the DUT hostname exists in
        the CrOS lab network, that hostname is used with the default
        port and the DUT's board.
      * If no other options are found, the parameters will be taken
        from the `servo_args` dict passed in from the caller.

    @param dut            An instance of `Host` from which to take
                          servo parameters (if available).
    @param servo_args     A dictionary with servo parameters to use if
                          they can't be found from `dut`.  If this
                          argument is supplied, unrepaired exceptions
                          from `verify()` will be passed back to the
                          caller.
    @param try_lab_servo  If not true, servo host creation will be
                          skipped unless otherwise required by the
                          caller.
    @param try_servo_repair  If true, check a servo host with
                          `repair()` instead of `verify()`.
    @param dut_host_info  If not None, passed to the new servo host's
                          `set_dut_host_info()`; failures doing so are
                          logged and ignored.

    @returns: A ServoHost object or None.  See comments above.

    """
    servo_dependency = servo_args is not None
    if dut is not None and (try_lab_servo or servo_dependency):
        servo_args_override = get_servo_args_for_host(dut)
        if servo_args_override is not None:
            if utils.in_moblab_ssp():
                _tweak_args_for_ssp_moblab(servo_args_override)
            logging.debug(
                    'Overriding provided servo_args (%s) with arguments'
                    ' determined from the host (%s)',
                    servo_args,
                    servo_args_override,
            )
            servo_args = servo_args_override

    if servo_args is None:
        logging.debug('No servo_args provided, and failed to find overrides.')
        return None
    if SERVO_HOST_ATTR not in servo_args:
        logging.debug('%s attribute missing from servo_args: %s',
                      SERVO_HOST_ATTR, servo_args)
        return None
    # Only bother pinging when nothing obliges us to create the host.
    if (not servo_dependency and not try_servo_repair and
            not servo_host_is_up(servo_args[SERVO_HOST_ATTR])):
        logging.debug('ServoHost is not up.')
        return None

    newhost = ServoHost(**servo_args)
    try:
        newhost.restart_servod(quick_startup=True)
    except error.AutoservSSHTimeout:
        logging.warning("Restart servod failed due ssh connection "
                        "to servohost timed out. This error is forgiven"
                        " here, we will retry in servo repair process.")
    except error.AutoservRunError as e:
        logging.warning("Restart servod failed due to:\n%s\n"
                        "This error is forgiven here, we will retry"
                        " in servo repair process.", str(e))

    # TODO(gregorynisbet): Clean all of this up.
    logging.debug('create_servo_host: attempt to set info store on '
                  'servo host')
    try:
        if dut_host_info is None:
            logging.debug('create_servo_host: dut_host_info is '
                          'None, skipping')
        else:
            newhost.set_dut_host_info(dut_host_info)
            logging.debug('create_servo_host: successfully set info '
                          'store')
    except Exception:
        logging.error("create_servo_host: (%s)", traceback.format_exc())

    # Note that the logic of repair() includes everything done
    # by verify().  It's sufficient to call one or the other;
    # we don't need both.
    if servo_dependency:
        # Failures here are deliberately propagated to the caller.
        newhost.repair(silent=True)
        return newhost

    if try_servo_repair:
        try:
            newhost.repair()
        except Exception:
            logging.exception('servo repair failed for %s', newhost.hostname)
    else:
        try:
            newhost.verify()
        except Exception:
            logging.exception('servo verify failed for %s', newhost.hostname)
    return newhost