1# Lint as: python2, python3 2# 3# Copyright 2007 Google Inc. Released under the GPL v2 4 5""" 6This module defines the SSHHost class. 7 8Implementation details: 9You should import the "hosts" package instead of importing each type of host. 10 11 SSHHost: a remote machine with a ssh access 12""" 13 14from __future__ import absolute_import 15from __future__ import division 16from __future__ import print_function 17 18import inspect 19import logging 20import re 21import time 22 23import common 24from autotest_lib.client.common_lib import error 25from autotest_lib.client.common_lib import pxssh 26from autotest_lib.server import utils 27from autotest_lib.server.hosts import abstract_ssh 28import six 29 30# In case cros_host is being ran via SSP on an older Moblab version with an 31# older chromite version. 32try: 33 from chromite.lib import metrics 34except ImportError: 35 metrics = utils.metrics_mock 36 37 38def THIS_IS_SLOW(func): 39 """Mark the given function as slow, when looking at calls to it""" 40 func.__name__ = '%s__SLOW__' % func.__name__ 41 return func 42 43 44class SSHHost(abstract_ssh.AbstractSSHHost): 45 """ 46 This class represents a remote machine controlled through an ssh 47 session on which you can run programs. 48 49 It is not the machine autoserv is running on. The machine must be 50 configured for password-less login, for example through public key 51 authentication. 52 53 It includes support for controlling the machine through a serial 54 console on which you can run programs. If such a serial console is 55 set up on the machine then capabilities such as hard reset and 56 boot strap monitoring are available. If the machine does not have a 57 serial console available then ordinary SSH-based commands will 58 still be available, but attempts to use extensions such as 59 console logging or hard reset will fail silently. 60 61 Implementation details: 62 This is a leaf class in an abstract class hierarchy, it must 63 implement the unimplemented methods in parent classes. 64 """ 65 RUN_TIMEOUT = 3600 66 67 def _initialize(self, hostname, *args, **dargs): 68 """ 69 Construct a SSHHost object 70 71 Args: 72 hostname: network hostname or address of remote machine 73 """ 74 super(SSHHost, self)._initialize(hostname=hostname, *args, **dargs) 75 self._default_run_timeout = self.RUN_TIMEOUT 76 self.setup_ssh() 77 78 79 def ssh_command(self, connect_timeout=30, options='', alive_interval=300, 80 alive_count_max=3, connection_attempts=1): 81 """ 82 Construct an ssh command with proper args for this host. 83 84 @param connect_timeout: connection timeout (in seconds) 85 @param options: SSH options 86 @param alive_interval: SSH Alive interval. 87 @param alive_count_max: SSH AliveCountMax. 88 @param connection_attempts: SSH ConnectionAttempts 89 """ 90 options = " ".join([options, self._master_ssh.ssh_option]) 91 base_cmd = self.make_ssh_command(user=self.user, port=self.port, 92 opts=options, 93 hosts_file=self.known_hosts_file, 94 connect_timeout=connect_timeout, 95 alive_interval=alive_interval, 96 alive_count_max=alive_count_max, 97 connection_attempts=connection_attempts) 98 return "%s %s" % (base_cmd, self.hostname) 99 100 def _get_server_stack_state(self, lowest_frames=0, highest_frames=None): 101 """ Get the server stack frame status. 102 @param lowest_frames: the lowest frames to start printing. 103 @param highest_frames: the highest frames to print. 104 (None means no restriction) 105 """ 106 stack_frames = inspect.stack() 107 stack = '' 108 for frame in stack_frames[lowest_frames:highest_frames]: 109 function_name = inspect.getframeinfo(frame[0]).function 110 stack = '%s|%s' % (function_name, stack) 111 del stack_frames 112 return stack[:-1] # Delete the last '|' character 113 114 def _verbose_logger_command(self, command): 115 """ 116 Prepend the command for the client with information about the ssh 117 command to be executed and the server stack state. 118 119 @param command: the ssh command to be executed. 120 """ 121 # The last few frames on the stack are not useful, so skip them. 122 stack = self._get_server_stack_state(lowest_frames=3, highest_frames=6) 123 # If logger executable exists on the DUT, use it to report the command. 124 # Then regardless of logger, run the command as usual. 125 command = ('test -x /usr/bin/logger && /usr/bin/logger' 126 ' -t autotest "from [%s] ssh_run: %s"; %s' 127 % (stack, utils.sh_escape(command), command)) 128 return command 129 130 131 def _run(self, command, timeout, ignore_status, 132 stdout, stderr, connect_timeout, env, options, stdin, args, 133 ignore_timeout, ssh_failure_retry_ok): 134 """Helper function for run().""" 135 if connect_timeout > timeout: 136 # timeout passed from run() may be smaller than 1, because we 137 # subtract the elapsed time from the original timeout supplied. 138 connect_timeout = max(int(timeout), 1) 139 original_cmd = command 140 141 ssh_cmd = self.ssh_command(connect_timeout, options) 142 if not env.strip(): 143 env = "" 144 else: 145 env = "export %s;" % env 146 for arg in args: 147 command += ' "%s"' % utils.sh_escape(arg) 148 full_cmd = '%s "%s %s"' % (ssh_cmd, env, utils.sh_escape(command)) 149 150 # TODO(jrbarnette): crbug.com/484726 - When we're in an SSP 151 # container, sometimes shortly after reboot we will see DNS 152 # resolution errors on ssh commands; the problem never 153 # occurs more than once in a row. This especially affects 154 # the autoupdate_Rollback test, but other cases have been 155 # affected, too. 156 # 157 # We work around it by detecting the first DNS resolution error 158 # and retrying exactly one time. 159 dns_error_retry_count = 1 160 161 def counters_inc(counter_name, failure_name): 162 """Helper function to increment metrics counters. 163 @param counter_name: string indicating which counter to use 164 @param failure_name: string indentifying an error, or 'success' 165 """ 166 if counter_name == 'call': 167 # ssh_counter records the outcome of each ssh invocation 168 # inside _run(), including exceptions. 169 ssh_counter = metrics.Counter('chromeos/autotest/ssh/calls') 170 fields = {'error' : failure_name or 'success', 171 'attempt' : ssh_call_count} 172 ssh_counter.increment(fields=fields) 173 174 if counter_name == 'run': 175 # run_counter records each call to _run() with its result 176 # and how many tries were made. Calls are recorded when 177 # _run() exits (including exiting with an exception) 178 run_counter = metrics.Counter('chromeos/autotest/ssh/runs') 179 fields = {'error' : failure_name or 'success', 180 'attempt' : ssh_call_count} 181 run_counter.increment(fields=fields) 182 183 # If ssh_failure_retry_ok is True, retry twice on timeouts and generic 184 # error 255: if a simple retry doesn't work, kill the ssh master 185 # connection and try again. (Note that either error could come from 186 # the command running in the DUT, in which case the retry may be 187 # useless but, in theory, also harmless.) 188 if ssh_failure_retry_ok: 189 # Ignore ssh command timeout, even though it could be a timeout due 190 # to the command executing in the remote host. Note that passing 191 # ignore_timeout = True makes utils.run() return None on timeouts 192 # (and only on timeouts). 193 original_ignore_timeout = ignore_timeout 194 ignore_timeout = True 195 ssh_failure_retry_count = 2 196 else: 197 ssh_failure_retry_count = 0 198 199 ssh_call_count = 0 200 201 while True: 202 try: 203 # Increment call count first, in case utils.run() throws an 204 # exception. 205 ssh_call_count += 1 206 result = utils.run(full_cmd, timeout, True, stdout, stderr, 207 verbose=False, stdin=stdin, 208 stderr_is_expected=ignore_status, 209 ignore_timeout=ignore_timeout) 210 except Exception as e: 211 # No retries on exception. 212 counters_inc('call', 'exception') 213 counters_inc('run', 'exception') 214 raise e 215 216 failure_name = None 217 218 if result: 219 if result.exit_status == 255: 220 if re.search(r'^ssh: .*: Name or service not known', 221 result.stderr): 222 failure_name = 'dns_failure' 223 else: 224 failure_name = 'error_255' 225 elif result.exit_status > 0: 226 failure_name = 'nonzero_status' 227 else: 228 # result == None 229 failure_name = 'timeout' 230 231 # Record the outcome of the ssh invocation. 232 counters_inc('call', failure_name) 233 234 if failure_name: 235 # There was a failure: decide whether to retry. 236 if failure_name == 'dns_failure': 237 if dns_error_retry_count > 0: 238 logging.debug('retrying ssh because of DNS failure') 239 dns_error_retry_count -= 1 240 continue 241 else: 242 if ssh_failure_retry_count == 2: 243 logging.debug('retrying ssh command after %s', 244 failure_name) 245 ssh_failure_retry_count -= 1 246 continue 247 elif ssh_failure_retry_count == 1: 248 # After two failures, restart the master connection 249 # before the final try. 250 logging.debug('retry 2: restarting master connection') 251 self.restart_master_ssh() 252 # Last retry: reinstate timeout behavior. 253 ignore_timeout = original_ignore_timeout 254 ssh_failure_retry_count -= 1 255 continue 256 257 # No retry conditions occurred. Exit the loop. 258 break 259 260 # The outcomes of ssh invocations have been recorded. Now record 261 # the outcome of this function. 262 263 if ignore_timeout and not result: 264 counters_inc('run', 'ignored_timeout') 265 return None 266 267 # The error messages will show up in band (indistinguishable 268 # from stuff sent through the SSH connection), so we have the 269 # remote computer echo the message "Connected." before running 270 # any command. Since the following 2 errors have to do with 271 # connecting, it's safe to do these checks. 272 if result.exit_status == 255: 273 if re.search(r'^ssh: connect to host .* port .*: ' 274 r'Connection timed out\r$', result.stderr): 275 counters_inc('run', 'final_timeout') 276 raise error.AutoservSSHTimeout( 277 "ssh timed out: %r" % original_cmd.strip(), result) 278 if "Permission denied." in result.stderr: 279 msg = "ssh permission denied" 280 counters_inc('run', 'final_eperm') 281 raise error.AutoservSshPermissionDeniedError(msg, result) 282 283 if not ignore_status and result.exit_status > 0: 284 counters_inc('run', 'final_run_error') 285 msg = result.stderr.strip() 286 if not msg: 287 msg = result.stdout.strip() 288 if msg: 289 msg = msg.splitlines()[-1] 290 raise error.AutoservRunError("command execution error (%d): %r" % 291 (result.exit_status, msg), result) 292 293 counters_inc('run', failure_name) 294 return result 295 296 def set_default_run_timeout(self, timeout): 297 """Set the default timeout for run.""" 298 if timeout < 0: 299 raise error.TestError('Invalid timeout %d', timeout) 300 self._default_run_timeout = timeout 301 302 @THIS_IS_SLOW 303 def run(self, command, timeout=None, ignore_status=False, 304 stdout_tee=utils.TEE_TO_LOGS, stderr_tee=utils.TEE_TO_LOGS, 305 connect_timeout=30, options='', stdin=None, verbose=True, args=(), 306 ignore_timeout=False, ssh_failure_retry_ok=False): 307 """ 308 Run a command on the remote host. 309 @note: This RPC call has an overhead of minimum 40ms and up to 400ms on 310 servers (crbug.com/734887). Each time a call is added for 311 every job, a server core dies in the lab. 312 @see: common_lib.hosts.host.run() 313 314 @param timeout: command execution timeout in seconds. Default is 315 _default_run_timeout (1 hour). 316 @param connect_timeout: ssh connection timeout (in seconds) 317 @param options: string with additional ssh command options 318 @param verbose: log the commands 319 @param ignore_timeout: bool True if SSH command timeouts should be 320 ignored. Will return None on command timeout. 321 @param ssh_failure_retry_ok: True if the command may be retried on 322 probable ssh failure (error 255 or timeout). When true, 323 the command may be executed up to three times, the second 324 time after restarting the ssh master connection. Use only for 325 commands that are idempotent, because when a "probable 326 ssh failure" occurs, we cannot tell if the command executed 327 or not. 328 329 @raises AutoservRunError: if the command failed 330 @raises AutoservSSHTimeout: ssh connection has timed out 331 """ 332 # For example if the command is a list, we need to convert it to a 333 # string first. 334 if not isinstance(command, six.string_types): 335 command = ' '.join(command) 336 337 if timeout is None: 338 timeout = self._default_run_timeout 339 start_time = time.time() 340 with metrics.SecondsTimer('chromeos/autotest/ssh/master_ssh_time', 341 scale=0.001): 342 if verbose: 343 stack = self._get_server_stack_state(lowest_frames=1, 344 highest_frames=7) 345 logging.debug("Running (ssh) '%s' from '%s'", command, stack) 346 command = self._verbose_logger_command(command) 347 348 self.start_master_ssh(min( 349 timeout, 350 self.DEFAULT_START_MASTER_SSH_TIMEOUT_S, 351 )) 352 353 env = " ".join("=".join(pair) for pair in six.iteritems(self.env)) 354 elapsed = time.time() - start_time 355 try: 356 return self._run(command, timeout - elapsed, ignore_status, 357 stdout_tee, stderr_tee, connect_timeout, env, 358 options, stdin, args, ignore_timeout, 359 ssh_failure_retry_ok) 360 except error.CmdError as cmderr: 361 # We get a CmdError here only if there is timeout of that 362 # command. Catch that and stuff it into AutoservRunError and 363 # raise it. 364 timeout_message = str('Timeout encountered: %s' % 365 cmderr.args[0]) 366 raise error.AutoservRunError(timeout_message, cmderr.args[1]) 367 368 369 def run_background(self, command, verbose=True): 370 """Start a command on the host in the background. 371 372 The command is started on the host in the background, and 373 this method call returns immediately without waiting for the 374 command's completion. The PID of the process on the host is 375 returned as a string. 376 377 The command may redirect its stdin, stdout, or stderr as 378 necessary. Without redirection, all input and output will 379 use /dev/null. 380 381 @param command The command to run in the background 382 @param verbose As for `self.run()` 383 384 @return Returns the PID of the remote background process 385 as a string. 386 """ 387 # Redirection here isn't merely hygienic; it's a functional 388 # requirement. sshd won't terminate until stdin, stdout, 389 # and stderr are all closed. 390 # 391 # The subshell is needed to do the right thing in case the 392 # passed in command has its own I/O redirections. 393 cmd_fmt = '( %s ) </dev/null >/dev/null 2>&1 & echo -n $!' 394 return self.run(cmd_fmt % command, verbose=verbose).stdout 395 396 397 def run_short(self, command, **kwargs): 398 """ 399 Calls the run() command with a short default timeout. 400 401 Takes the same arguments as does run(), 402 with the exception of the timeout argument which 403 here is fixed at 60 seconds. 404 It returns the result of run. 405 406 @param command: the command line string 407 408 """ 409 return self.run(command, timeout=60, **kwargs) 410 411 412 def run_grep(self, command, timeout=30, ignore_status=False, 413 stdout_ok_regexp=None, stdout_err_regexp=None, 414 stderr_ok_regexp=None, stderr_err_regexp=None, 415 connect_timeout=30): 416 """ 417 Run a command on the remote host and look for regexp 418 in stdout or stderr to determine if the command was 419 successul or not. 420 421 422 @param command: the command line string 423 @param timeout: time limit in seconds before attempting to 424 kill the running process. The run() function 425 will take a few seconds longer than 'timeout' 426 to complete if it has to kill the process. 427 @param ignore_status: do not raise an exception, no matter 428 what the exit code of the command is. 429 @param stdout_ok_regexp: regexp that should be in stdout 430 if the command was successul. 431 @param stdout_err_regexp: regexp that should be in stdout 432 if the command failed. 433 @param stderr_ok_regexp: regexp that should be in stderr 434 if the command was successul. 435 @param stderr_err_regexp: regexp that should be in stderr 436 if the command failed. 437 @param connect_timeout: connection timeout (in seconds) 438 439 Returns: 440 if the command was successul, raises an exception 441 otherwise. 442 443 Raises: 444 AutoservRunError: 445 - the exit code of the command execution was not 0. 446 - If stderr_err_regexp is found in stderr, 447 - If stdout_err_regexp is found in stdout, 448 - If stderr_ok_regexp is not found in stderr. 449 - If stdout_ok_regexp is not found in stdout, 450 """ 451 452 # We ignore the status, because we will handle it at the end. 453 result = self.run(command, timeout, ignore_status=True, 454 connect_timeout=connect_timeout) 455 456 # Look for the patterns, in order 457 for (regexp, stream) in ((stderr_err_regexp, result.stderr), 458 (stdout_err_regexp, result.stdout)): 459 if regexp and stream: 460 err_re = re.compile (regexp) 461 if err_re.search(stream): 462 raise error.AutoservRunError( 463 '%r failed, found error pattern: %r' % (command, 464 regexp), result) 465 466 for (regexp, stream) in ((stderr_ok_regexp, result.stderr), 467 (stdout_ok_regexp, result.stdout)): 468 if regexp and stream: 469 ok_re = re.compile (regexp) 470 if ok_re.search(stream): 471 if ok_re.search(stream): 472 return 473 474 if not ignore_status and result.exit_status > 0: 475 msg = result.stderr.strip() 476 if not msg: 477 msg = result.stdout.strip() 478 if msg: 479 msg = msg.splitlines()[-1] 480 raise error.AutoservRunError("command execution error (%d): %r" % 481 (result.exit_status, msg), result) 482 483 484 def setup_ssh_key(self): 485 """Setup SSH Key""" 486 logging.debug('Performing SSH key setup on %s as %s.', 487 self.host_port, self.user) 488 489 try: 490 host = pxssh.pxssh() 491 host.login(self.hostname, self.user, self.password, 492 port=self.port) 493 public_key = utils.get_public_key() 494 495 host.sendline('mkdir -p ~/.ssh') 496 host.prompt() 497 host.sendline('chmod 700 ~/.ssh') 498 host.prompt() 499 host.sendline("echo '%s' >> ~/.ssh/authorized_keys; " % 500 public_key) 501 host.prompt() 502 host.sendline('chmod 600 ~/.ssh/authorized_keys') 503 host.prompt() 504 host.logout() 505 506 logging.debug('SSH key setup complete.') 507 508 except: 509 logging.debug('SSH key setup has failed.') 510 try: 511 host.logout() 512 except: 513 pass 514 515 516 def setup_ssh(self): 517 """Setup SSH""" 518 if self.password: 519 try: 520 self.ssh_ping() 521 except error.AutoservSshPingHostError: 522 self.setup_ssh_key() 523