# Copyright 2009 Google Inc. Released under the GPL v2

"""
This module defines the base classes for the Host hierarchy.

Implementation details:
You should import the "hosts" package instead of importing each type of host.

        Host: a machine on which you can run programs
"""

__author__ = """
mbligh@google.com (Martin J. Bligh),
poirier@google.com (Benjamin Poirier),
stutsman@google.com (Ryan Stutsman)
"""

import cPickle, logging, os, re, time

from autotest_lib.client.common_lib import global_config, error, utils
from autotest_lib.client.common_lib.cros import path_utils


class Host(object):
    """
    This class represents a machine on which you can run programs.

    It may be a local machine, the one autoserv is running on, a remote
    machine or a virtual machine.

    Implementation details:
    This is an abstract class, leaf subclasses must implement the methods
    listed here. You must not instantiate this class but should
    instantiate one of those leaf subclasses.

    When overriding methods that raise NotImplementedError, the leaf class
    is fully responsible for the implementation and should not chain calls
    to super. When overriding methods that are a NOP in Host, the subclass
    should chain calls to super(). The criteria for fitting a new method into
    one category or the other should be:
        1. If two separate generic implementations could reasonably be
           concatenated, then the abstract implementation should pass and
           subclasses should chain calls to super.
        2. If only one class could reasonably perform the stated function
           (e.g. two separate run() implementations cannot both be executed)
           then the method should raise NotImplementedError in Host, and
           the implementor should NOT chain calls to super, to ensure that
           only one implementation ever gets executed.
    """

    job = None
    DEFAULT_REBOOT_TIMEOUT = global_config.global_config.get_config_value(
        "HOSTS", "default_reboot_timeout", type=int, default=1800)
    WAIT_DOWN_REBOOT_TIMEOUT = global_config.global_config.get_config_value(
        "HOSTS", "wait_down_reboot_timeout", type=int, default=840)
    WAIT_DOWN_REBOOT_WARNING = global_config.global_config.get_config_value(
        "HOSTS", "wait_down_reboot_warning", type=int, default=540)
    HOURS_TO_WAIT_FOR_RECOVERY = global_config.global_config.get_config_value(
        "HOSTS", "hours_to_wait_for_recovery", type=float, default=2.5)
    # the number of hardware repair requests that need to happen before we
    # actually send machines to hardware repair
    HARDWARE_REPAIR_REQUEST_THRESHOLD = 4
    OP_REBOOT = 'reboot'
    OP_SUSPEND = 'suspend'
    PWR_OPERATION = [OP_REBOOT, OP_SUSPEND]


    def __init__(self, *args, **dargs):
        """Construct the host by forwarding all arguments to _initialize().

        @param *args: Positional arguments passed on to _initialize.
        @param **dargs: Keyword arguments passed on to _initialize.
        """
        self._initialize(*args, **dargs)


    def _initialize(self, *args, **dargs):
        """Initialization hook called from __init__; a NOP in Host.

        @param *args: Unused here.
        @param **dargs: Unused here.
        """
        pass


    @property
    def job_repo_url_attribute(self):
        """Get the host attribute name for job_repo_url.
        """
        return 'job_repo_url'


    def close(self):
        """Close the connection to the host.
        """
        pass


    def setup(self):
        """Setup the host object.
        """
        pass


    def run(self, command, timeout=3600, ignore_status=False,
            stdout_tee=utils.TEE_TO_LOGS, stderr_tee=utils.TEE_TO_LOGS,
            stdin=None, args=()):
        """
        Run a command on this host.

        @param command: the command line string
        @param timeout: time limit in seconds before attempting to
                kill the running process. The run() function
                will take a few seconds longer than 'timeout'
                to complete if it has to kill the process.
        @param ignore_status: do not raise an exception, no matter
                what the exit code of the command is.
        @param stdout_tee: where to tee the stdout
        @param stderr_tee: where to tee the stderr
        @param stdin: stdin to pass (a string) to the executed command
        @param args: sequence of strings to pass as arguments to command by
                quoting them in " and escaping their contents if necessary

        @return a utils.CmdResult object

        @raises AutotestHostRunError: the exit code of the command execution
                was not 0 and ignore_status was not enabled
        """
        raise NotImplementedError('Run not implemented!')


    def run_output(self, command, *args, **dargs):
        """Run and retrieve stdout with trailing whitespace stripped.

        @param command: Command to execute.
        @param *args: Extra arguments to run.
        @param **dargs: Extra keyword arguments to run.

        @return: String value of stdout, right-stripped.
        """
        return self.run(command, *args, **dargs).stdout.rstrip()


    def reboot(self):
        """Reboot the host.
        """
        raise NotImplementedError('Reboot not implemented!')


    def suspend(self):
        """Suspend the host.
        """
        raise NotImplementedError('Suspend not implemented!')


    def sysrq_reboot(self):
        """Execute host reboot via SysRq key.
        """
        raise NotImplementedError('Sysrq reboot not implemented!')


    def reboot_setup(self, *args, **dargs):
        """Prepare for reboot.

        This doesn't appear to be implemented by any current hosts.

        @param *args: Extra arguments to ?.
        @param **dargs: Extra keyword arguments to ?.
        """
        pass


    def reboot_followup(self, *args, **dargs):
        """Post reboot work.

        This doesn't appear to be implemented by any current hosts.

        @param *args: Extra arguments to ?.
        @param **dargs: Extra keyword arguments to ?.
        """
        pass


    def get_file(self, source, dest, delete_dest=False):
        """Retrieve a file from the host.

        @param source: Remote file path (directory, file or list).
        @param dest: Local file path (directory, file or list).
        @param delete_dest: Delete files in remote path that are not in local
            path.
        """
        raise NotImplementedError('Get file not implemented!')


    def send_file(self, source, dest, delete_dest=False):
        """Send a file to the host.

        @param source: Local file path (directory, file or list).
        @param dest: Remote file path (directory, file or list).
        @param delete_dest: Delete files in remote path that are not in local
            path.
        """
        raise NotImplementedError('Send file not implemented!')


    def get_tmp_dir(self):
        """Create a temporary directory on the host.
        """
        raise NotImplementedError('Get temp dir not implemented!')


    def is_up(self):
        """Confirm the host is online.
        """
        raise NotImplementedError('Is up not implemented!')


    def is_shutting_down(self):
        """ Indicates if a machine is currently shutting down. """
        return False


    def get_wait_up_processes(self):
        """ Gets the list of local processes to wait for in wait_up.

        Reads the comma-separated "wait_up_processes" value from the HOSTS
        section of the global config.

        @return: A set of non-empty, whitespace-stripped process names.
        """
        get_config = global_config.global_config.get_config_value
        proc_list = get_config("HOSTS", "wait_up_processes",
                               default="").strip()
        processes = set(p.strip() for p in proc_list.split(","))
        # an empty config value (or stray commas) yields "" entries
        processes.discard("")
        return processes


    def get_boot_id(self, timeout=60):
        """ Get a unique ID associated with the current boot.

        Should return a string with the semantics such that two separate
        calls to Host.get_boot_id() return the same string if the host did
        not reboot between the two calls, and two different strings if it
        has rebooted at least once between the two calls.

        @param timeout The number of seconds to wait before timing out.

        @return A string unique to this boot or None if not available.
        """
        BOOT_ID_FILE = '/proc/sys/kernel/random/boot_id'
        NO_ID_MSG = 'no boot_id available'
        # echo a sentinel message when the boot_id file does not exist so
        # the "not available" case can be told apart from a real id
        cmd = 'if [ -f %r ]; then cat %r; else echo %r; fi' % (
                BOOT_ID_FILE, BOOT_ID_FILE, NO_ID_MSG)
        boot_id = self.run(cmd, timeout=timeout).stdout.strip()
        if boot_id == NO_ID_MSG:
            return None
        return boot_id


    def wait_up(self, timeout=None):
        """Wait for the host to come up.

        @param timeout: Max seconds to wait.
        """
        raise NotImplementedError('Wait up not implemented!')


    def wait_down(self, timeout=None, warning_timer=None, old_boot_id=None):
        """Wait for the host to go down.

        @param timeout: Max seconds to wait before returning.
        @param warning_timer: Seconds before warning host is not down.
        @param old_boot_id: Result of self.get_boot_id() before shutdown.
        """
        raise NotImplementedError('Wait down not implemented!')


    def _construct_host_metadata(self, type_str):
        """Returns dict of metadata with type_str, hostname, time_recorded.

        @param type_str: String representing _type field in es db.
            For example: type_str='reboot_total'.

        @return: dict with keys 'hostname', 'time_recorded' and '_type'.
        """
        metadata = {
            'hostname': self.hostname,
            'time_recorded': time.time(),
            '_type': type_str,
        }
        return metadata


    def wait_for_restart(self, timeout=DEFAULT_REBOOT_TIMEOUT,
                         down_timeout=WAIT_DOWN_REBOOT_TIMEOUT,
                         down_warning=WAIT_DOWN_REBOOT_WARNING,
                         log_failure=True, old_boot_id=None, **dargs):
        """Wait for the host to come back from a reboot.

        This is a generic implementation based entirely on wait_up and
        wait_down.

        @param timeout: Max seconds to wait for the host to come back up
                (passed to wait_up).
        @param down_timeout: Max seconds to wait for host to go down.
        @param down_warning: Seconds to wait before warning host hasn't gone
                down.
        @param log_failure: bool(Log when host does not go down.)
        @param old_boot_id: Result of self.get_boot_id() before restart.
        @param **dargs: Extra arguments to reboot_followup.

        @raises AutoservShutdownError if host does not go down.
        @raises AutoservRebootError if host does not come back up.
        """
        if not self.wait_down(timeout=down_timeout,
                              warning_timer=down_warning,
                              old_boot_id=old_boot_id):
            if log_failure:
                self.record("ABORT", None, "reboot.verify", "shut down failed")
            raise error.AutoservShutdownError("Host did not shut down")

        if not self.wait_up(timeout):
            self.record("ABORT", None, "reboot.verify",
                        "Host did not return from reboot")
            raise error.AutoservRebootError("Host did not return from reboot")

        self.record("GOOD", None, "reboot.verify")
        self.reboot_followup(**dargs)


    def verify(self):
        """Check if host is in good state.

        Runs the hardware, connectivity and software checks, in that order.
        """
        self.verify_hardware()
        self.verify_connectivity()
        self.verify_software()


    def verify_hardware(self):
        """Check host hardware.
        """
        pass


    def verify_connectivity(self):
        """Check host network connectivity.
        """
        pass


    def verify_software(self):
        """Check host software.
        """
        pass


    def check_diskspace(self, path, gb):
        """Raises an error if path does not have at least gb GB free.

        @param path The path to check for free disk space.
        @param gb A floating point number to compare with a granularity
            of 1 MB.

        1000 based SI units are used.

        @raises AutoservDiskFullHostError if path has less than gb GB free.
        """
        one_mb = 10 ** 6  # Bytes (SI unit).
        mb_per_gb = 1000.0
        logging.info('Checking for >= %s GB of space under %s on machine %s',
                     gb, path, self.hostname)
        # df reports in blocks of one_mb bytes; the fourth column of the
        # last output line is taken as the free block count
        df = self.run('df -PB %d %s | tail -1' % (one_mb, path)).stdout.split()
        free_space_gb = int(df[3]) / mb_per_gb
        if free_space_gb < gb:
            raise error.AutoservDiskFullHostError(path, gb, free_space_gb)
        logging.info('Found %s GB >= %s GB of space under %s on machine %s',
                     free_space_gb, gb, path, self.hostname)


    def check_inodes(self, path, min_kilo_inodes):
        """Raises an error if a file system is short on i-nodes.

        @param path The path to check for free i-nodes.
        @param min_kilo_inodes Minimum number of i-nodes required,
            in units of 1000 i-nodes.

        @raises AutoservNoFreeInodesError If the minimum required
            i-node count isn't available.
        """
        min_inodes = 1000 * min_kilo_inodes
        logging.info('Checking for >= %d i-nodes under %s '
                     'on machine %s', min_inodes, path, self.hostname)
        # the fourth column of the last `df -Pi` line is taken as the
        # free i-node count
        df = self.run('df -Pi %s | tail -1' % path).stdout.split()
        free_inodes = int(df[3])
        if free_inodes < min_inodes:
            raise error.AutoservNoFreeInodesError(path, min_inodes,
                                                  free_inodes)
        logging.info('Found %d >= %d i-nodes under %s on '
                     'machine %s', free_inodes, min_inodes,
                     path, self.hostname)


    def erase_dir_contents(self, path, ignore_status=True, timeout=3600):
        """Empty a given directory path contents.

        @param path: Path to empty.
        @param ignore_status: Ignore the exit status from run.
        @param timeout: Max seconds to allow command to complete.
        """
        # NOTE(review): path is interpolated inside double quotes without
        # escaping (path_exists() uses utils.sh_escape) -- confirm callers
        # never pass paths containing shell metacharacters.
        rm_cmd = 'find "%s" -mindepth 1 -maxdepth 1 -print0 | xargs -0 rm -rf'
        self.run(rm_cmd % path, ignore_status=ignore_status, timeout=timeout)


    def repair(self):
        """Try and get the host to pass `self.verify()`."""
        self.verify()


    def disable_ipfilters(self):
        """Allow all network packets in and out of the host.

        The current rules are first saved to /tmp/iptable-rules so that
        enable_ipfilters() can restore them.
        """
        self.run('iptables-save > /tmp/iptable-rules')
        self.run('iptables -P INPUT ACCEPT')
        self.run('iptables -P FORWARD ACCEPT')
        self.run('iptables -P OUTPUT ACCEPT')


    def enable_ipfilters(self):
        """Re-enable the IP filters disabled from disable_ipfilters()"""
        # nothing to restore unless disable_ipfilters() saved the rules
        if self.path_exists('/tmp/iptable-rules'):
            self.run('iptables-restore < /tmp/iptable-rules')


    def cleanup(self):
        """Restore host to clean state.
        """
        pass


    def machine_install(self):
        """Install on the host.
        """
        raise NotImplementedError('Machine install not implemented!')


    def install(self, installableObject):
        """Call install on a thing.

        @param installableObject: Thing with install method that will accept our
                self.
        """
        installableObject.install(self)


    def get_autodir(self):
        """Get the host's autodir; must be implemented by subclasses."""
        raise NotImplementedError('Get autodir not implemented!')


    def set_autodir(self):
        """Set the host's autodir; must be implemented by subclasses."""
        raise NotImplementedError('Set autodir not implemented!')


    def start_loggers(self):
        """ Called to start continuous host logging. """
        pass


    def stop_loggers(self):
        """ Called to stop continuous host logging. """
        pass


    # some extra methods simplify the retrieval of information about the
    # Host machine, with generic implementations based on run(). subclasses
    # should feel free to override these if they can provide better
    # implementations for their specific Host types

    def get_num_cpu(self):
        """ Get the number of CPUs in the host according to /proc/cpuinfo.

        @return: Number of /proc/cpuinfo lines starting with 'processor'.
        """
        # Tee stdout to the null device rather than the default
        # utils.TEE_TO_LOGS, and make sure that handle is closed again once
        # the command has returned.
        with open(os.devnull, 'w') as devnull:
            proc_cpuinfo = self.run('cat /proc/cpuinfo',
                                    stdout_tee=devnull).stdout
        return sum(1 for line in proc_cpuinfo.splitlines()
                   if line.startswith('processor'))


    def get_arch(self):
        """ Get the hardware architecture of the remote machine.

        @return: Output of `uname -m`, with any iN86 value (N a digit)
                normalized to 'i386'.
        """
        cmd_uname = path_utils.must_be_installed('/bin/uname', host=self)
        arch = self.run('%s -m' % cmd_uname).stdout.rstrip()
        if re.match(r'i\d86$', arch):
            arch = 'i386'
        return arch


    def get_kernel_ver(self):
        """ Get the kernel version of the remote machine. """
        cmd_uname = path_utils.must_be_installed('/bin/uname', host=self)
        return self.run('%s -r' % cmd_uname).stdout.rstrip()


    def get_cmdline(self):
        """ Get the kernel command line of the remote machine. """
        return self.run('cat /proc/cmdline').stdout.rstrip()


    def get_meminfo(self):
        """ Get the kernel memory info (/proc/meminfo) of the remote machine
        and return a dictionary mapping the various statistics.

        @return: dict mapping each statistic name to its value string (both
                whitespace-stripped). Lines without a ':' are skipped.
        """
        meminfo_dict = {}
        meminfo = self.run('cat /proc/meminfo').stdout.splitlines()
        for line in meminfo:
            key, sep, val = line.partition(':')
            if not sep:
                # blank or malformed line: nothing to record
                continue
            meminfo_dict[key.strip()] = val.strip()
        return meminfo_dict


    def path_exists(self, path):
        """Determine if path exists on the remote machine.

        @param path: path to check

        @return: bool(path exists)
        """
        result = self.run('test -e "%s"' % utils.sh_escape(path),
                          ignore_status=True)
        return result.exit_status == 0


    # some extra helpers for doing job-related operations

    def record(self, *args, **dargs):
        """ Helper method for recording status logs against Host.job that
        silently becomes a NOP if Host.job is not available. The args and
        dargs are passed on to Host.job.record unchanged. """
        if self.job:
            self.job.record(*args, **dargs)


    def log_kernel(self):
        """ Helper method for logging kernel information into the status logs.
        Intended for cases where the "current" kernel is not really defined
        and we want to explicitly log it. Does nothing if this host isn't
        actually associated with a job. """
        if self.job:
            kernel = self.get_kernel_ver()
            self.job.record("INFO", None, None,
                            optional_fields={"kernel": kernel})


    def log_op(self, op, op_func):
        """ Wrap a management operation in a group for status logging
        purposes.

        If the host has no job, or a logged operation is already running on
        this host, op_func is simply called directly.

        @param op: name of the operation.
        @param op_func: a function that carries out the operation
                        (reboot, suspend)
        """
        if self.job and not hasattr(self, "RUNNING_LOG_OP"):
            # the attribute marks a logged operation as in progress, so a
            # nested log_op() call takes the plain op_func() branch below
            self.RUNNING_LOG_OP = True
            try:
                self.job.run_op(op, op_func, self.get_kernel_ver)
            finally:
                del self.RUNNING_LOG_OP
        else:
            op_func()


    def list_files_glob(self, glob):
        """Get a list of files on a remote host given a glob pattern path.

        @param glob: pattern

        @return: list of files
        """
        SCRIPT = ("python -c 'import cPickle, glob, sys;"
                  "cPickle.dump(glob.glob(sys.argv[1]), sys.stdout, 0)'")
        output = self.run(SCRIPT, args=(glob,), stdout_tee=None,
                          timeout=60).stdout
        # NOTE(review): this unpickles data produced by the remote host;
        # unpickling can execute arbitrary code, so the host is trusted here.
        return cPickle.loads(output)


    def symlink_closure(self, paths):
        """
        Given a sequence of path strings, return the set of all paths that
        can be reached from the initial set by following symlinks.

        @param paths: sequence of path strings.
        @return: a sequence of path strings that are all the unique paths that
                can be reached from the given ones after following symlinks.
        """
        # The script runs on the host: it reads a pickled dict of paths from
        # stdin, repeatedly resolves symlinks among existing paths, and
        # writes the pickled list of reachable paths to stdout.
        SCRIPT = ("python -c 'import cPickle, os, sys\n"
                  "paths = cPickle.load(sys.stdin)\n"
                  "closure = {}\n"
                  "while paths:\n"
                  "    path = paths.keys()[0]\n"
                  "    del paths[path]\n"
                  "    if not os.path.exists(path):\n"
                  "        continue\n"
                  "    closure[path] = None\n"
                  "    if os.path.islink(path):\n"
                  "        link_to = os.path.join(os.path.dirname(path),\n"
                  "                               os.readlink(path))\n"
                  "        if link_to not in closure.keys():\n"
                  "            paths[link_to] = None\n"
                  "cPickle.dump(closure.keys(), sys.stdout, 0)'")
        input_data = cPickle.dumps(dict((path, None) for path in paths), 0)
        output = self.run(SCRIPT, stdout_tee=None, stdin=input_data,
                          timeout=60).stdout
        # NOTE(review): this unpickles data produced by the remote host;
        # unpickling can execute arbitrary code, so the host is trusted here.
        return cPickle.loads(output)


    def cleanup_kernels(self, boot_dir='/boot'):
        """
        Remove any kernel image and associated files (vmlinux, system.map,
        modules) for any image found in the boot directory that is not
        referenced by entries in the bootloader configuration.

        @param boot_dir: boot directory path string, default '/boot'
        """
        # find all the vmlinuz images referenced by the bootloader
        vmlinuz_prefix = os.path.join(boot_dir, 'vmlinuz-')
        boot_info = self.bootloader.get_entries()
        used_kernver = [boot['kernel'][len(vmlinuz_prefix):]
                        for boot in boot_info.itervalues()]

        # find all the unused vmlinuz images in /boot
        all_vmlinuz = self.list_files_glob(vmlinuz_prefix + '*')
        used_vmlinuz = self.symlink_closure(vmlinuz_prefix + kernver
                                            for kernver in used_kernver)
        unused_vmlinuz = set(all_vmlinuz) - set(used_vmlinuz)

        # find all the unused vmlinux images in /boot
        vmlinux_prefix = os.path.join(boot_dir, 'vmlinux-')
        all_vmlinux = self.list_files_glob(vmlinux_prefix + '*')
        used_vmlinux = self.symlink_closure(vmlinux_prefix + kernver
                                            for kernver in used_kernver)
        unused_vmlinux = set(all_vmlinux) - set(used_vmlinux)

        # find all the unused System.map files in /boot
        systemmap_prefix = os.path.join(boot_dir, 'System.map-')
        all_system_map = self.list_files_glob(systemmap_prefix + '*')
        used_system_map = self.symlink_closure(
            systemmap_prefix + kernver for kernver in used_kernver)
        unused_system_map = set(all_system_map) - set(used_system_map)

        # find all the module directories associated with unused kernels
        modules_prefix = '/lib/modules/'
        all_moddirs = [path
                       for path in self.list_files_glob(modules_prefix + '*')
                       if re.match(modules_prefix + r'\d+\.\d+\.\d+.*', path)]
        used_moddirs = self.symlink_closure(modules_prefix + kernver
                                            for kernver in used_kernver)
        unused_moddirs = set(all_moddirs) - set(used_moddirs)

        # remove all the vmlinuz files we don't use
        # TODO: if needed this should become package manager agnostic
        for vmlinuz in unused_vmlinuz:
            # try and get an rpm package name
            rpm = self.run('rpm -qf', args=(vmlinuz,),
                           ignore_status=True, timeout=120)
            if rpm.exit_status == 0:
                packages = set(line.strip() for line in
                               rpm.stdout.splitlines())
                # if we found some package names, try to remove them
                for package in packages:
                    self.run('rpm -e', args=(package,),
                             ignore_status=True, timeout=120)
            # remove the image files anyway, even if rpm didn't
            self.run('rm -f', args=(vmlinuz,),
                     ignore_status=True, timeout=120)

        # remove all the vmlinux and System.map files left over
        for f in (unused_vmlinux | unused_system_map):
            self.run('rm -f', args=(f,),
                     ignore_status=True, timeout=120)

        # remove all unused module directories
        # the regex match should keep us safe from removing the wrong files
        for moddir in unused_moddirs:
            self.run('rm -fr', args=(moddir,), ignore_status=True)


    def get_attributes_to_clear_before_provision(self):
        """Get a list of attributes to be cleared before machine_install starts.

        If provision runs in a lab environment, it is necessary to clear certain
        host attributes for the host in afe_host_attributes table. For example,
        `job_repo_url` is a devserver url pointed to autotest packages for
        CrosHost, it needs to be removed before provision starts for tests to
        run reliably.
        For ADBHost, the job repo url has a different format, i.e., appended by
        adb_serial, so this method should be overridden in ADBHost.
        """
        return ['job_repo_url']


    def get_platform(self):
        """Determine the correct platform label for this host.

        @return: A string representing this host's platform.
        """
        raise NotImplementedError("Get platform not implemented!")


    def get_labels(self):
        """Return a list of the labels gathered from the devices connected.

        @return: A list of strings that denote the labels from all the devices
                 connected.
        """
        raise NotImplementedError("Get labels not implemented!")