# Copyright 2009 Google Inc. Released under the GPL v2

"""
This module defines the base classes for the Host hierarchy.

Implementation details:
You should import the "hosts" package instead of importing each type of host.

        Host: a machine on which you can run programs
"""

__author__ = """
mbligh@google.com (Martin J. Bligh),
poirier@google.com (Benjamin Poirier),
stutsman@google.com (Ryan Stutsman)
"""

import cPickle
import logging
import os
import re
import time

from autotest_lib.client.common_lib import global_config, error, utils
from autotest_lib.client.common_lib.cros import path_utils


class Host(object):
    """
    This class represents a machine on which you can run programs.

    It may be a local machine, the one autoserv is running on, a remote
    machine or a virtual machine.

    Implementation details:
    This is an abstract class, leaf subclasses must implement the methods
    listed here. You must not instantiate this class but should
    instantiate one of those leaf subclasses.

    When overriding methods that raise NotImplementedError, the leaf class
    is fully responsible for the implementation and should not chain calls
    to super. When overriding methods that are a NOP in Host, the subclass
    should chain calls to super(). The criteria for fitting a new method into
    one category or the other should be:
        1. If two separate generic implementations could reasonably be
           concatenated, then the abstract implementation should pass and
           subclasses should chain calls to super.
        2. If only one class could reasonably perform the stated function
           (e.g. two separate run() implementations cannot both be executed)
           then the method should raise NotImplementedError in Host, and
           the implementor should NOT chain calls to super, to ensure that
           only one implementation ever gets executed.

    Several generic helpers below read attributes that this base class does
    not define itself (`self.hostname`, `self.bootloader`); subclasses that
    use those helpers are expected to provide them.
    """

    # Job object used by record()/log_kernel()/log_op(); those helpers
    # degrade to a NOP (or a plain call) while this is falsy.
    job = None
    # Default timeouts (seconds) for wait_for_restart(), read once from the
    # HOSTS section of the global config when the class is defined.
    DEFAULT_REBOOT_TIMEOUT = global_config.global_config.get_config_value(
        "HOSTS", "default_reboot_timeout", type=int, default=1800)
    WAIT_DOWN_REBOOT_TIMEOUT = global_config.global_config.get_config_value(
        "HOSTS", "wait_down_reboot_timeout", type=int, default=840)
    WAIT_DOWN_REBOOT_WARNING = global_config.global_config.get_config_value(
        "HOSTS", "wait_down_reboot_warning", type=int, default=540)
    HOURS_TO_WAIT_FOR_RECOVERY = global_config.global_config.get_config_value(
        "HOSTS", "hours_to_wait_for_recovery", type=float, default=2.5)
    # the number of hardware repair requests that need to happen before we
    # actually send machines to hardware repair
    HARDWARE_REPAIR_REQUEST_THRESHOLD = 4
    OP_REBOOT = 'reboot'
    OP_SUSPEND = 'suspend'
    PWR_OPERATION = [OP_REBOOT, OP_SUSPEND]


    def __init__(self, *args, **dargs):
        """Forward all constructor arguments to _initialize().

        @param *args: Positional arguments passed on to _initialize.
        @param **dargs: Keyword arguments passed on to _initialize.
        """
        self._initialize(*args, **dargs)


    def _initialize(self, *args, **dargs):
        """Initialization hook called by __init__; a NOP in Host.

        @param *args: Positional arguments given to the constructor.
        @param **dargs: Keyword arguments given to the constructor.
        """
        pass


    @property
    def job_repo_url_attribute(self):
        """Get the host attribute name for job_repo_url.
        """
        return 'job_repo_url'


    def close(self):
        """Close the connection to the host.
        """
        pass


    def setup(self):
        """Setup the host object.
        """
        pass


    def run(self, command, timeout=3600, ignore_status=False,
            stdout_tee=utils.TEE_TO_LOGS, stderr_tee=utils.TEE_TO_LOGS,
            stdin=None, args=()):
        """
        Run a command on this host.

        @param command: the command line string
        @param timeout: time limit in seconds before attempting to
                kill the running process. The run() function
                will take a few seconds longer than 'timeout'
                to complete if it has to kill the process.
        @param ignore_status: do not raise an exception, no matter
                what the exit code of the command is.
        @param stdout_tee: where to tee the stdout
        @param stderr_tee: where to tee the stderr
        @param stdin: stdin to pass (a string) to the executed command
        @param args: sequence of strings to pass as arguments to command by
                quoting them in " and escaping their contents if necessary

        @return a utils.CmdResult object

        @raises AutotestHostRunError: the exit code of the command execution
                was not 0 and ignore_status was not enabled
        """
        raise NotImplementedError('Run not implemented!')


    def run_output(self, command, *args, **dargs):
        """Run and retrieve the value of stdout with trailing whitespace
        removed.

        Only trailing whitespace is stripped (rstrip); leading whitespace in
        the command output is preserved.

        @param command: Command to execute.
        @param *args: Extra arguments to run.
        @param **dargs: Extra keyword arguments to run.

        @return: String value of stdout.
        """
        return self.run(command, *args, **dargs).stdout.rstrip()


    def reboot(self):
        """Reboot the host.
        """
        raise NotImplementedError('Reboot not implemented!')


    def suspend(self):
        """Suspend the host.
        """
        raise NotImplementedError('Suspend not implemented!')


    def sysrq_reboot(self):
        """Execute host reboot via SysRq key.
        """
        raise NotImplementedError('Sysrq reboot not implemented!')


    def reboot_setup(self, *args, **dargs):
        """Prepare for reboot.

        This doesn't appear to be implemented by any current hosts.

        @param *args: Extra arguments to ?.
        @param **dargs: Extra keyword arguments to ?.
        """
        pass


    def reboot_followup(self, *args, **dargs):
        """Post reboot work.

        This doesn't appear to be implemented by any current hosts.

        @param *args: Extra arguments to ?.
        @param **dargs: Extra keyword arguments to ?.
        """
        pass


    def get_file(self, source, dest, delete_dest=False):
        """Retrieve a file from the host.

        @param source: Remote file path (directory, file or list).
        @param dest: Local file path (directory, file or list).
        @param delete_dest: Delete files in the local destination that are
                            not in the remote source.
        """
        raise NotImplementedError('Get file not implemented!')


    def send_file(self, source, dest, delete_dest=False, excludes=None):
        """Send a file to the host.

        @param source: Local file path (directory, file or list).
        @param dest: Remote file path (directory, file or list).
        @param delete_dest: Delete files in remote path that are not in local
                            path.
        @param excludes: A list of file pattern that matches files not to be
                         sent. `send_file` will fail if exclude is not
                         supported.
        """
        raise NotImplementedError('Send file not implemented!')


    def get_tmp_dir(self):
        """Create a temporary directory on the host.
        """
        raise NotImplementedError('Get temp dir not implemented!')


    def is_up(self):
        """Confirm the host is online.
        """
        raise NotImplementedError('Is up not implemented!')


    def is_shutting_down(self):
        """ Indicates if a machine is currently shutting down.

        The base implementation always returns False.
        """
        return False


    def get_wait_up_processes(self):
        """ Gets the list of local processes to wait for in wait_up.

        Reads the comma-separated "wait_up_processes" value from the HOSTS
        section of the global config.

        @return: A set of non-empty, whitespace-stripped process names.
        """
        get_config = global_config.global_config.get_config_value
        proc_list = get_config("HOSTS", "wait_up_processes",
                               default="").strip()
        processes = set(p.strip() for p in proc_list.split(","))
        # An empty config value (or stray commas) yields "" entries.
        processes.discard("")
        return processes


    def get_boot_id(self, timeout=60):
        """ Get a unique ID associated with the current boot.

        Should return a string with the semantics such that two separate
        calls to Host.get_boot_id() return the same string if the host did
        not reboot between the two calls, and two different strings if it
        has rebooted at least once between the two calls.

        @param timeout The number of seconds to wait before timing out.

        @return A string unique to this boot or None if not available."""
        BOOT_ID_FILE = '/proc/sys/kernel/random/boot_id'
        NO_ID_MSG = 'no boot_id available'
        # %r wraps each plain string in single quotes, which doubles as
        # shell quoting for these fixed, quote-free values.
        cmd = 'if [ -f %r ]; then cat %r; else echo %r; fi' % (
                BOOT_ID_FILE, BOOT_ID_FILE, NO_ID_MSG)
        boot_id = self.run(cmd, timeout=timeout).stdout.strip()
        if boot_id == NO_ID_MSG:
            return None
        return boot_id


    def wait_up(self, timeout=None):
        """Wait for the host to come up.

        wait_for_restart() treats a true return value as "host is up".

        @param timeout: Max seconds to wait.
        """
        raise NotImplementedError('Wait up not implemented!')


    def wait_down(self, timeout=None, warning_timer=None, old_boot_id=None):
        """Wait for the host to go down.

        wait_for_restart() treats a true return value as "host went down".

        @param timeout: Max seconds to wait before returning.
        @param warning_timer: Seconds before warning host is not down.
        @param old_boot_id: Result of self.get_boot_id() before shutdown.
        """
        raise NotImplementedError('Wait down not implemented!')


    def _construct_host_metadata(self, type_str):
        """Returns dict of metadata with type_str, hostname, time_recorded.

        Reads self.hostname, which must be provided by the subclass.

        @param type_str: String representing _type field in es db.
                         For example: type_str='reboot_total'.
        """
        metadata = {
            'hostname': self.hostname,
            'time_recorded': time.time(),
            '_type': type_str,
        }
        return metadata


    def wait_for_restart(self, timeout=DEFAULT_REBOOT_TIMEOUT,
                         down_timeout=WAIT_DOWN_REBOOT_TIMEOUT,
                         down_warning=WAIT_DOWN_REBOOT_WARNING,
                         log_failure=True, old_boot_id=None, **dargs):
        """Wait for the host to come back from a reboot.

        This is a generic implementation based entirely on wait_up and
        wait_down.

        @param timeout: Max seconds to wait for the host to come back up
                        (passed to wait_up).
        @param down_timeout: Max seconds to wait for host to go down.
        @param down_warning: Seconds to wait before warning host hasn't gone
                             down.
        @param log_failure: bool(Log when host does not go down.)
        @param old_boot_id: Result of self.get_boot_id() before restart.
        @param **dargs: Extra arguments to reboot_followup.

        @raises AutoservShutdownError if host does not go down.
        @raises AutoservRebootError if host does not come back up.
        """
        if not self.wait_down(timeout=down_timeout,
                              warning_timer=down_warning,
                              old_boot_id=old_boot_id):
            # log_failure only gates the status record; the error is raised
            # either way.
            if log_failure:
                self.record("ABORT", None, "reboot.verify", "shut down failed")
            raise error.AutoservShutdownError("Host did not shut down")
        if self.wait_up(timeout):
            self.record("GOOD", None, "reboot.verify")
            self.reboot_followup(**dargs)
        else:
            self.record("ABORT", None, "reboot.verify",
                        "Host did not return from reboot")
            raise error.AutoservRebootError("Host did not return from reboot")


    def verify(self):
        """Check if host is in good state.

        Runs the hardware, connectivity and software checks, in that order.
        """
        self.verify_hardware()
        self.verify_connectivity()
        self.verify_software()


    def verify_hardware(self):
        """Check host hardware.
        """
        pass


    def verify_connectivity(self):
        """Check host network connectivity.
        """
        pass


    def verify_software(self):
        """Check host software.
        """
        pass


    def check_diskspace(self, path, gb):
        """Raises an error if path does not have at least gb GB free.

        @param path The path to check for free disk space.
        @param gb A floating point number to compare with a granularity
            of 1 MB.

        1000 based SI units are used.

        @raises AutoservDiskFullHostError if path has less than gb GB free.
        @raises AutoservDirectoryNotFoundError if path is not a valid directory.
        @raises AutoservDiskSizeUnknownError the return from df is not parsed
            correctly.
        """
        one_mb = 10 ** 6  # Bytes (SI unit).
        mb_per_gb = 1000.0
        logging.info('Checking for >= %s GB of space under %s on machine %s',
                     gb, path, self.hostname)

        if not self.path_exists(path):
            msg = 'Path does not exist on host: %s' % path
            logging.warning(msg)
            raise error.AutoservDirectoryNotFoundError(msg)

        # -P gives one line per filesystem; the last line is the data row
        # and its 4th field is the available space in one_mb-sized blocks.
        cmd = 'df -PB %d %s | tail -1' % (one_mb, path)
        df = self.run(cmd).stdout.split()
        try:
            free_space_gb = int(df[3]) / mb_per_gb
        except (IndexError, ValueError):
            msg = ('Could not determine the size of %s. '
                   'Output from df: %s') % (path, df)
            logging.error(msg)
            raise error.AutoservDiskSizeUnknownError(msg)

        if free_space_gb < gb:
            raise error.AutoservDiskFullHostError(path, gb, free_space_gb)
        else:
            logging.info('Found %s GB >= %s GB of space under %s on machine %s',
                         free_space_gb, gb, path, self.hostname)


    def check_inodes(self, path, min_kilo_inodes):
        """Raises an error if a file system is short on i-nodes.

        @param path The path to check for free i-nodes.
        @param min_kilo_inodes Minimum number of i-nodes required,
            in units of 1000 i-nodes.

        @raises AutoservNoFreeInodesError If the minimum required
          i-node count isn't available.
        """
        min_inodes = 1000 * min_kilo_inodes
        logging.info('Checking for >= %d i-nodes under %s '
                     'on machine %s', min_inodes, path, self.hostname)
        # 4th field of the data row is the free i-node count.
        df = self.run('df -Pi %s | tail -1' % path).stdout.split()
        free_inodes = int(df[3])
        if free_inodes < min_inodes:
            raise error.AutoservNoFreeInodesError(path, min_inodes,
                                                  free_inodes)
        else:
            logging.info('Found %d >= %d i-nodes under %s on '
                         'machine %s', free_inodes, min_inodes,
                         path, self.hostname)


    def erase_dir_contents(self, path, ignore_status=True, timeout=3600):
        """Empty a given directory path contents.

        @param path: Path to empty.
        @param ignore_status: Ignore the exit status from run.
        @param timeout: Max seconds to allow command to complete.
        """
        rm_cmd = 'find "%s" -mindepth 1 -maxdepth 1 -print0 | xargs -0 rm -rf'
        # The path lands inside double quotes in a destructive command, so
        # escape it the same way path_exists() does.
        self.run(rm_cmd % utils.sh_escape(path), ignore_status=ignore_status,
                 timeout=timeout)


    def repair(self):
        """Try and get the host to pass `self.verify()`."""
        self.verify()


    def disable_ipfilters(self):
        """Allow all network packets in and out of the host.

        The current rules are saved to /tmp/iptable-rules first so that
        enable_ipfilters() can restore them.
        """
        self.run('iptables-save > /tmp/iptable-rules')
        self.run('iptables -P INPUT ACCEPT')
        self.run('iptables -P FORWARD ACCEPT')
        self.run('iptables -P OUTPUT ACCEPT')


    def enable_ipfilters(self):
        """Re-enable the IP filters disabled from disable_ipfilters()

        Does nothing if the saved rules file does not exist.
        """
        if self.path_exists('/tmp/iptable-rules'):
            self.run('iptables-restore < /tmp/iptable-rules')


    def cleanup(self):
        """Restore host to clean state.
        """
        pass


    def install(self, installableObject):
        """Call install on a thing.

        @param installableObject: Thing with install method that will accept our
                                  self.
        """
        installableObject.install(self)


    def get_autodir(self):
        """Get the autodir of the host; must be implemented by subclasses.
        """
        raise NotImplementedError('Get autodir not implemented!')


    def set_autodir(self):
        """Set the autodir of the host; must be implemented by subclasses.
        """
        raise NotImplementedError('Set autodir not implemented!')


    def start_loggers(self):
        """ Called to start continuous host logging. """
        pass


    def stop_loggers(self):
        """ Called to stop continuous host logging. """
        pass


    # some extra methods simplify the retrieval of information about the
    # Host machine, with generic implementations based on run(). subclasses
    # should feel free to override these if they can provide better
    # implementations for their specific Host types

    def get_num_cpu(self):
        """ Get the number of CPUs in the host according to /proc/cpuinfo.

        @return: Number of lines in /proc/cpuinfo starting with 'processor'.
        """
        # Tee stdout to the null device; the context manager makes sure the
        # handle is closed instead of being leaked on every call.
        with open(os.devnull, 'w') as devnull:
            proc_cpuinfo = self.run('cat /proc/cpuinfo',
                                    stdout_tee=devnull).stdout
        cpus = 0
        for line in proc_cpuinfo.splitlines():
            if line.startswith('processor'):
                cpus += 1
        return cpus


    def get_arch(self):
        """ Get the hardware architecture of the remote machine.

        Any iN86 value reported by `uname -m` is normalized to 'i386'.
        """
        cmd_uname = path_utils.must_be_installed('/bin/uname', host=self)
        arch = self.run('%s -m' % cmd_uname).stdout.rstrip()
        if re.match(r'i\d86$', arch):
            arch = 'i386'
        return arch


    def get_kernel_ver(self):
        """ Get the kernel version of the remote machine. """
        cmd_uname = path_utils.must_be_installed('/bin/uname', host=self)
        return self.run('%s -r' % cmd_uname).stdout.rstrip()


    def get_cmdline(self):
        """ Get the kernel command line of the remote machine. """
        return self.run('cat /proc/cmdline').stdout.rstrip()


    def get_meminfo(self):
        """ Get the kernel memory info (/proc/meminfo) of the remote machine
        and return a dictionary mapping the various statistics.

        Lines that do not contain a ':' separator (e.g. blank lines) are
        skipped.

        @return: dict mapping each stripped key to its stripped value string.
        """
        meminfo_dict = {}
        for line in self.run('cat /proc/meminfo').stdout.splitlines():
            key, sep, val = line.partition(':')
            if not sep:
                # Not a "key: value" line; unpacking a split() of it would
                # otherwise raise ValueError.
                continue
            meminfo_dict[key.strip()] = val.strip()
        return meminfo_dict


    def path_exists(self, path):
        """Determine if path exists on the remote machine.

        @param path: path to check

        @return: bool(path exists)"""
        result = self.run('test -e "%s"' % utils.sh_escape(path),
                          ignore_status=True)
        return result.exit_status == 0


    # some extra helpers for doing job-related operations

    def record(self, *args, **dargs):
        """ Helper method for recording status logs against Host.job that
        silently becomes a NOP if Host.job is not available. The args and
        dargs are passed on to Host.job.record unchanged. """
        if self.job:
            self.job.record(*args, **dargs)


    def log_kernel(self):
        """ Helper method for logging kernel information into the status logs.
        Intended for cases where the "current" kernel is not really defined
        and we want to explicitly log it. Does nothing if this host isn't
        actually associated with a job. """
        if self.job:
            kernel = self.get_kernel_ver()
            self.job.record("INFO", None, None,
                            optional_fields={"kernel": kernel})


    def log_op(self, op, op_func):
        """ Wrap a management operation in a group for status logging
        purposes.

        If the host has a job and no log_op call is already in progress on
        this host, the operation is run through job.run_op; otherwise
        op_func is simply called directly.

        @param op: name of the operation.
        @param op_func: a function that carries out the operation
                        (reboot, suspend)
        """
        if self.job and not hasattr(self, "RUNNING_LOG_OP"):
            # The attribute's mere presence marks a call in progress, so a
            # nested log_op on the same host takes the plain-call branch.
            self.RUNNING_LOG_OP = True
            try:
                self.job.run_op(op, op_func, self.get_kernel_ver)
            finally:
                del self.RUNNING_LOG_OP
        else:
            op_func()


    def list_files_glob(self, glob):
        """Get a list of files on a remote host given a glob pattern path.

        @param glob: pattern

        @return: list of files
        """
        SCRIPT = ("python -c 'import cPickle, glob, sys;"
                  "cPickle.dump(glob.glob(sys.argv[1]), sys.stdout, 0)'")
        output = self.run(SCRIPT, args=(glob,), stdout_tee=None,
                          timeout=60).stdout
        # NOTE(review): this unpickles data produced on the remote host;
        # unpickling is unsafe if that host cannot be trusted.
        return cPickle.loads(output)


    def symlink_closure(self, paths):
        """
        Given a sequence of path strings, return the set of all paths that
        can be reached from the initial set by following symlinks.

        Paths that do not exist on the host are dropped from the result.

        @param paths: sequence of path strings.
        @return: a sequence of path strings that are all the unique paths that
                can be reached from the given ones after following symlinks.
        """
        # The script runs on the host: it reads a pickled dict of pending
        # paths from stdin and writes the pickled list of reachable paths.
        SCRIPT = ("python -c 'import cPickle, os, sys\n"
                  "paths = cPickle.load(sys.stdin)\n"
                  "closure = {}\n"
                  "while paths:\n"
                  "    path = paths.keys()[0]\n"
                  "    del paths[path]\n"
                  "    if not os.path.exists(path):\n"
                  "        continue\n"
                  "    closure[path] = None\n"
                  "    if os.path.islink(path):\n"
                  "        link_to = os.path.join(os.path.dirname(path),\n"
                  "                               os.readlink(path))\n"
                  "        if link_to not in closure:\n"
                  "            paths[link_to] = None\n"
                  "cPickle.dump(closure.keys(), sys.stdout, 0)'")
        input_data = cPickle.dumps(dict((path, None) for path in paths), 0)
        output = self.run(SCRIPT, stdout_tee=None, stdin=input_data,
                          timeout=60).stdout
        # NOTE(review): this unpickles data produced on the remote host;
        # unpickling is unsafe if that host cannot be trusted.
        return cPickle.loads(output)


    def cleanup_kernels(self, boot_dir='/boot'):
        """
        Remove any kernel image and associated files (vmlinux, system.map,
        modules) for any image found in the boot directory that is not
        referenced by entries in the bootloader configuration.

        Reads self.bootloader, which must be provided by the subclass.

        @param boot_dir: boot directory path string, default '/boot'
        """
        # find all the vmlinuz images referenced by the bootloader
        vmlinuz_prefix = os.path.join(boot_dir, 'vmlinuz-')
        boot_info = self.bootloader.get_entries()
        # strip the prefix length off each kernel path to get its version
        used_kernver = [boot['kernel'][len(vmlinuz_prefix):]
                        for boot in boot_info.itervalues()]

        # find all the unused vmlinuz images in /boot
        all_vmlinuz = self.list_files_glob(vmlinuz_prefix + '*')
        used_vmlinuz = self.symlink_closure(vmlinuz_prefix + kernver
                                            for kernver in used_kernver)
        unused_vmlinuz = set(all_vmlinuz) - set(used_vmlinuz)

        # find all the unused vmlinux images in /boot
        vmlinux_prefix = os.path.join(boot_dir, 'vmlinux-')
        all_vmlinux = self.list_files_glob(vmlinux_prefix + '*')
        used_vmlinux = self.symlink_closure(vmlinux_prefix + kernver
                                            for kernver in used_kernver)
        unused_vmlinux = set(all_vmlinux) - set(used_vmlinux)

        # find all the unused System.map files in /boot
        systemmap_prefix = os.path.join(boot_dir, 'System.map-')
        all_system_map = self.list_files_glob(systemmap_prefix + '*')
        used_system_map = self.symlink_closure(
            systemmap_prefix + kernver for kernver in used_kernver)
        unused_system_map = set(all_system_map) - set(used_system_map)

        # find all the module directories associated with unused kernels
        modules_prefix = '/lib/modules/'
        all_moddirs = [mod_path
                       for mod_path in self.list_files_glob(modules_prefix + '*')
                       if re.match(modules_prefix + r'\d+\.\d+\.\d+.*',
                                   mod_path)]
        used_moddirs = self.symlink_closure(modules_prefix + kernver
                                            for kernver in used_kernver)
        unused_moddirs = set(all_moddirs) - set(used_moddirs)

        # remove all the vmlinuz files we don't use
        # TODO: if needed this should become package manager agnostic
        for vmlinuz in unused_vmlinuz:
            # try and get an rpm package name
            rpm = self.run('rpm -qf', args=(vmlinuz,),
                           ignore_status=True, timeout=120)
            if rpm.exit_status == 0:
                packages = set(line.strip() for line in
                               rpm.stdout.splitlines())
                # if we found some package names, try to remove them
                for package in packages:
                    self.run('rpm -e', args=(package,),
                             ignore_status=True, timeout=120)
            # remove the image files anyway, even if rpm didn't
            self.run('rm -f', args=(vmlinuz,),
                     ignore_status=True, timeout=120)

        # remove all the vmlinux and System.map files left over
        for f in (unused_vmlinux | unused_system_map):
            self.run('rm -f', args=(f,),
                     ignore_status=True, timeout=120)

        # remove all unused module directories
        # the regex match should keep us safe from removing the wrong files
        for moddir in unused_moddirs:
            self.run('rm -fr', args=(moddir,), ignore_status=True)


    def get_attributes_to_clear_before_provision(self):
        """Get a list of attributes to be cleared before machine_install starts.

        If provision runs in a lab environment, it is necessary to clear certain
        host attributes for the host in afe_host_attributes table. For example,
        `job_repo_url` is a devserver url pointed to autotest packages for
        CrosHost, it needs to be removed before provision starts for tests to
        run reliably.
        For ADBHost, the job repo url has a different format, i.e., appended by
        adb_serial, so this method should be overridden in ADBHost.
        """
        return ['job_repo_url']


    def get_platform(self):
        """Determine the correct platform label for this host.

        @return: A string representing this host's platform.
        """
        raise NotImplementedError("Get platform not implemented!")


    def get_labels(self):
        """Return a list of the labels gathered from the devices connected.

        @return: A list of strings that denote the labels from all the devices
                 connected.
        """
        raise NotImplementedError("Get labels not implemented!")


    def check_cached_up_status(self, expiration_seconds):
        """Check if the DUT responded to ping in the past `expiration_seconds`.

        @param expiration_seconds: The number of seconds to keep the cached
                                   status of whether the DUT responded to ping.
        @return: True if the DUT has responded to ping during the past
                 `expiration_seconds`.
        """
        raise NotImplementedError("check_cached_up_status not implemented!")