• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright 2009 Google Inc. Released under the GPL v2
2
3"""
4This module defines the base classes for the Host hierarchy.
5
6Implementation details:
7You should import the "hosts" package instead of importing each type of host.
8
9        Host: a machine on which you can run programs
10"""
11
12__author__ = """
13mbligh@google.com (Martin J. Bligh),
14poirier@google.com (Benjamin Poirier),
15stutsman@google.com (Ryan Stutsman)
16"""
17
18import cPickle, logging, os, re, time
19
20from autotest_lib.client.common_lib import global_config, error, utils
21from autotest_lib.client.common_lib.cros import path_utils
22
23
24class Host(object):
25    """
26    This class represents a machine on which you can run programs.
27
28    It may be a local machine, the one autoserv is running on, a remote
29    machine or a virtual machine.
30
31    Implementation details:
32    This is an abstract class, leaf subclasses must implement the methods
33    listed here. You must not instantiate this class but should
34    instantiate one of those leaf subclasses.
35
36    When overriding methods that raise NotImplementedError, the leaf class
37    is fully responsible for the implementation and should not chain calls
38    to super. When overriding methods that are a NOP in Host, the subclass
39    should chain calls to super(). The criteria for fitting a new method into
40    one category or the other should be:
41        1. If two separate generic implementations could reasonably be
42           concatenated, then the abstract implementation should pass and
43           subclasses should chain calls to super.
44        2. If only one class could reasonably perform the stated function
45           (e.g. two separate run() implementations cannot both be executed)
46           then the method should raise NotImplementedError in Host, and
47           the implementor should NOT chain calls to super, to ensure that
48           only one implementation ever gets executed.
49    """
50
51    job = None
52    DEFAULT_REBOOT_TIMEOUT = global_config.global_config.get_config_value(
53        "HOSTS", "default_reboot_timeout", type=int, default=1800)
54    WAIT_DOWN_REBOOT_TIMEOUT = global_config.global_config.get_config_value(
55        "HOSTS", "wait_down_reboot_timeout", type=int, default=840)
56    WAIT_DOWN_REBOOT_WARNING = global_config.global_config.get_config_value(
57        "HOSTS", "wait_down_reboot_warning", type=int, default=540)
58    HOURS_TO_WAIT_FOR_RECOVERY = global_config.global_config.get_config_value(
59        "HOSTS", "hours_to_wait_for_recovery", type=float, default=2.5)
60    # the number of hardware repair requests that need to happen before we
61    # actually send machines to hardware repair
62    HARDWARE_REPAIR_REQUEST_THRESHOLD = 4
63    OP_REBOOT = 'reboot'
64    OP_SUSPEND = 'suspend'
65    PWR_OPERATION = [OP_REBOOT, OP_SUSPEND]
66
67
    def __init__(self, *args, **dargs):
        """Construct the host by forwarding every argument to _initialize().

        @param *args: Positional arguments passed through to _initialize().
        @param **dargs: Keyword arguments passed through to _initialize().
        """
        self._initialize(*args, **dargs)
70
71
72    def _initialize(self, *args, **dargs):
73        pass
74
75
76    @property
77    def job_repo_url_attribute(self):
78        """Get the host attribute name for job_repo_url.
79        """
80        return 'job_repo_url'
81
82
83    def close(self):
84        """Close the connection to the host.
85        """
86        pass
87
88
89    def setup(self):
90        """Setup the host object.
91        """
92        pass
93
94
    def run(self, command, timeout=3600, ignore_status=False,
            stdout_tee=utils.TEE_TO_LOGS, stderr_tee=utils.TEE_TO_LOGS,
            stdin=None, args=()):
        """
        Run a command on this host.

        Abstract in Host: this base implementation always raises
        NotImplementedError, so the description below is the contract that
        implementations are expected to honour.

        @param command: the command line string
        @param timeout: time limit in seconds before attempting to
                kill the running process. The run() function
                will take a few seconds longer than 'timeout'
                to complete if it has to kill the process.
        @param ignore_status: do not raise an exception, no matter
                what the exit code of the command is.
        @param stdout_tee: where to tee the stdout
        @param stderr_tee: where to tee the stderr
        @param stdin: stdin to pass (a string) to the executed command
        @param args: sequence of strings to pass as arguments to command by
                quoting them in " and escaping their contents if necessary

        @return a utils.CmdResult object

        @raises AutotestHostRunError: the exit code of the command execution
                was not 0 and ignore_status was not enabled
        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Run not implemented!')
120
121
122    def run_output(self, command, *args, **dargs):
123        """Run and retrieve the value of stdout stripped of whitespace.
124
125        @param command: Command to execute.
126        @param *args: Extra arguments to run.
127        @param **dargs: Extra keyword arguments to run.
128
129        @return: String value of stdout.
130        """
131        return self.run(command, *args, **dargs).stdout.rstrip()
132
133
    def reboot(self):
        """Reboot the host.

        Abstract in Host.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Reboot not implemented!')
138
139
    def suspend(self):
        """Suspend the host.

        Abstract in Host.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Suspend not implemented!')
144
145
    def sysrq_reboot(self):
        """Execute host reboot via SysRq key.

        Abstract in Host.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Sysrq reboot not implemented!')
150
151
152    def reboot_setup(self, *args, **dargs):
153        """Prepare for reboot.
154
155        This doesn't appear to be implemented by any current hosts.
156
157        @param *args: Extra arguments to ?.
158        @param **dargs: Extra keyword arguments to ?.
159        """
160        pass
161
162
163    def reboot_followup(self, *args, **dargs):
164        """Post reboot work.
165
166        This doesn't appear to be implemented by any current hosts.
167
168        @param *args: Extra arguments to ?.
169        @param **dargs: Extra keyword arguments to ?.
170        """
171        pass
172
173
    def get_file(self, source, dest, delete_dest=False):
        """Retrieve a file from the host.

        Abstract in Host.

        @param source: Remote file path (directory, file or list).
        @param dest: Local file path (directory, file or list).
        @param delete_dest: Presumably whether to delete files under dest
            that are not present in source; the exact semantics are defined
            by subclasses -- TODO confirm.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Get file not implemented!')
183
184
    def send_file(self, source, dest, delete_dest=False):
        """Send a file to the host.

        Abstract in Host.

        @param source: Local file path (directory, file or list).
        @param dest: Remote file path (directory, file or list).
        @param delete_dest: Delete files in remote path that are not in local
            path.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Send file not implemented!')
194
195
    def get_tmp_dir(self):
        """Create a temporary directory on the host.

        Abstract in Host.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Get temp dir not implemented!')
200
201
    def is_up(self):
        """Confirm the host is online.

        Abstract in Host.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Is up not implemented!')
206
207
    def is_shutting_down(self):
        """Indicate whether the machine is currently shutting down.

        @return: Always False in this base implementation.
        """
        return False
211
212
213    def get_wait_up_processes(self):
214        """ Gets the list of local processes to wait for in wait_up. """
215        get_config = global_config.global_config.get_config_value
216        proc_list = get_config("HOSTS", "wait_up_processes",
217                               default="").strip()
218        processes = set(p.strip() for p in proc_list.split(","))
219        processes.discard("")
220        return processes
221
222
223    def get_boot_id(self, timeout=60):
224        """ Get a unique ID associated with the current boot.
225
226        Should return a string with the semantics such that two separate
227        calls to Host.get_boot_id() return the same string if the host did
228        not reboot between the two calls, and two different strings if it
229        has rebooted at least once between the two calls.
230
231        @param timeout The number of seconds to wait before timing out.
232
233        @return A string unique to this boot or None if not available."""
234        BOOT_ID_FILE = '/proc/sys/kernel/random/boot_id'
235        NO_ID_MSG = 'no boot_id available'
236        cmd = 'if [ -f %r ]; then cat %r; else echo %r; fi' % (
237                BOOT_ID_FILE, BOOT_ID_FILE, NO_ID_MSG)
238        boot_id = self.run(cmd, timeout=timeout).stdout.strip()
239        if boot_id == NO_ID_MSG:
240            return None
241        return boot_id
242
243
    def wait_up(self, timeout=None):
        """Wait for the host to come up.

        Abstract in Host. wait_for_restart() treats the return value as a
        boolean: true when the host came up.

        @param timeout: Max seconds to wait.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Wait up not implemented!')
250
251
    def wait_down(self, timeout=None, warning_timer=None, old_boot_id=None):
        """Wait for the host to go down.

        Abstract in Host. wait_for_restart() treats the return value as a
        boolean: true when the host went down.

        @param timeout: Max seconds to wait before returning.
        @param warning_timer: Seconds before warning host is not down.
        @param old_boot_id: Result of self.get_boot_id() before shutdown.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Wait down not implemented!')
260
261
262    def _construct_host_metadata(self, type_str):
263        """Returns dict of metadata with type_str, hostname, time_recorded.
264
265        @param type_str: String representing _type field in es db.
266            For example: type_str='reboot_total'.
267        """
268        metadata = {
269            'hostname': self.hostname,
270            'time_recorded': time.time(),
271            '_type': type_str,
272        }
273        return metadata
274
275
276    def wait_for_restart(self, timeout=DEFAULT_REBOOT_TIMEOUT,
277                         down_timeout=WAIT_DOWN_REBOOT_TIMEOUT,
278                         down_warning=WAIT_DOWN_REBOOT_WARNING,
279                         log_failure=True, old_boot_id=None, **dargs):
280        """Wait for the host to come back from a reboot.
281
282        This is a generic implementation based entirely on wait_up and
283        wait_down.
284
285        @param timeout: Max seconds to wait for reboot to start.
286        @param down_timeout: Max seconds to wait for host to go down.
287        @param down_warning: Seconds to wait before warning host hasn't gone
288            down.
289        @param log_failure: bool(Log when host does not go down.)
290        @param old_boot_id: Result of self.get_boot_id() before restart.
291        @param **dargs: Extra arguments to reboot_followup.
292
293        @raises AutoservRebootError if host does not come back up.
294        """
295        key_string = 'Reboot.%s' % dargs.get('board')
296
297        if not self.wait_down(timeout=down_timeout,
298                              warning_timer=down_warning,
299                              old_boot_id=old_boot_id):
300            if log_failure:
301                self.record("ABORT", None, "reboot.verify", "shut down failed")
302            raise error.AutoservShutdownError("Host did not shut down")
303        if self.wait_up(timeout):
304            self.record("GOOD", None, "reboot.verify")
305            self.reboot_followup(**dargs)
306        else:
307            self.record("ABORT", None, "reboot.verify",
308                        "Host did not return from reboot")
309            raise error.AutoservRebootError("Host did not return from reboot")
310
311
312    def verify(self):
313        """Check if host is in good state.
314        """
315        self.verify_hardware()
316        self.verify_connectivity()
317        self.verify_software()
318
319
320    def verify_hardware(self):
321        """Check host hardware.
322        """
323        pass
324
325
326    def verify_connectivity(self):
327        """Check host network connectivity.
328        """
329        pass
330
331
332    def verify_software(self):
333        """Check host software.
334        """
335        pass
336
337
338    def check_diskspace(self, path, gb):
339        """Raises an error if path does not have at least gb GB free.
340
341        @param path The path to check for free disk space.
342        @param gb A floating point number to compare with a granularity
343            of 1 MB.
344
345        1000 based SI units are used.
346
347        @raises AutoservDiskFullHostError if path has less than gb GB free.
348        """
349        one_mb = 10 ** 6  # Bytes (SI unit).
350        mb_per_gb = 1000.0
351        logging.info('Checking for >= %s GB of space under %s on machine %s',
352                     gb, path, self.hostname)
353        df = self.run('df -PB %d %s | tail -1' % (one_mb, path)).stdout.split()
354        free_space_gb = int(df[3]) / mb_per_gb
355        if free_space_gb < gb:
356            raise error.AutoservDiskFullHostError(path, gb, free_space_gb)
357        else:
358            logging.info('Found %s GB >= %s GB of space under %s on machine %s',
359                free_space_gb, gb, path, self.hostname)
360
361
362    def check_inodes(self, path, min_kilo_inodes):
363        """Raises an error if a file system is short on i-nodes.
364
365        @param path The path to check for free i-nodes.
366        @param min_kilo_inodes Minimum number of i-nodes required,
367                               in units of 1000 i-nodes.
368
369        @raises AutoservNoFreeInodesError If the minimum required
370                                  i-node count isn't available.
371        """
372        min_inodes = 1000 * min_kilo_inodes
373        logging.info('Checking for >= %d i-nodes under %s '
374                     'on machine %s', min_inodes, path, self.hostname)
375        df = self.run('df -Pi %s | tail -1' % path).stdout.split()
376        free_inodes = int(df[3])
377        if free_inodes < min_inodes:
378            raise error.AutoservNoFreeInodesError(path, min_inodes,
379                                                  free_inodes)
380        else:
381            logging.info('Found %d >= %d i-nodes under %s on '
382                         'machine %s', free_inodes, min_inodes,
383                         path, self.hostname)
384
385
386    def erase_dir_contents(self, path, ignore_status=True, timeout=3600):
387        """Empty a given directory path contents.
388
389        @param path: Path to empty.
390        @param ignore_status: Ignore the exit status from run.
391        @param timeout: Max seconds to allow command to complete.
392        """
393        rm_cmd = 'find "%s" -mindepth 1 -maxdepth 1 -print0 | xargs -0 rm -rf'
394        self.run(rm_cmd % path, ignore_status=ignore_status, timeout=timeout)
395
396
    def repair(self):
        """Try and get the host to pass `self.verify()`.

        This base implementation attempts no repair: it only calls
        self.verify().
        """
        self.verify()
400
401
402    def disable_ipfilters(self):
403        """Allow all network packets in and out of the host."""
404        self.run('iptables-save > /tmp/iptable-rules')
405        self.run('iptables -P INPUT ACCEPT')
406        self.run('iptables -P FORWARD ACCEPT')
407        self.run('iptables -P OUTPUT ACCEPT')
408
409
410    def enable_ipfilters(self):
411        """Re-enable the IP filters disabled from disable_ipfilters()"""
412        if self.path_exists('/tmp/iptable-rules'):
413            self.run('iptables-restore < /tmp/iptable-rules')
414
415
416    def cleanup(self):
417        """Restore host to clean state.
418        """
419        pass
420
421
    def machine_install(self):
        """Install on the host.

        Abstract in Host.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Machine install not implemented!')
426
427
    def install(self, installableObject):
        """Install something onto this host by calling its install method.

        The return value of installableObject.install() is discarded.

        @param installableObject: Thing with install method that will accept our
            self.
        """
        installableObject.install(self)
435
436
    def get_autodir(self):
        """Get the host's autodir.

        Abstract in Host.
        NOTE(review): presumably the autotest installation directory on the
        host -- confirm against subclasses.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Get autodir not implemented!')
439
440
    def set_autodir(self):
        """Set the host's autodir.

        Abstract in Host.
        NOTE(review): the base signature takes no value to set; subclasses
        presumably accept the directory as a parameter -- confirm.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Set autodir not implemented!')
443
444
445    def start_loggers(self):
446        """ Called to start continuous host logging. """
447        pass
448
449
450    def stop_loggers(self):
451        """ Called to stop continuous host logging. """
452        pass
453
454
455    # some extra methods simplify the retrieval of information about the
456    # Host machine, with generic implementations based on run(). subclasses
457    # should feel free to override these if they can provide better
458    # implementations for their specific Host types
459
460    def get_num_cpu(self):
461        """ Get the number of CPUs in the host according to /proc/cpuinfo. """
462        proc_cpuinfo = self.run('cat /proc/cpuinfo',
463                                stdout_tee=open(os.devnull, 'w')).stdout
464        cpus = 0
465        for line in proc_cpuinfo.splitlines():
466            if line.startswith('processor'):
467                cpus += 1
468        return cpus
469
470
471    def get_arch(self):
472        """ Get the hardware architecture of the remote machine. """
473        cmd_uname = path_utils.must_be_installed('/bin/uname', host=self)
474        arch = self.run('%s -m' % cmd_uname).stdout.rstrip()
475        if re.match(r'i\d86$', arch):
476            arch = 'i386'
477        return arch
478
479
480    def get_kernel_ver(self):
481        """ Get the kernel version of the remote machine. """
482        cmd_uname = path_utils.must_be_installed('/bin/uname', host=self)
483        return self.run('%s -r' % cmd_uname).stdout.rstrip()
484
485
486    def get_cmdline(self):
487        """ Get the kernel command line of the remote machine. """
488        return self.run('cat /proc/cmdline').stdout.rstrip()
489
490
491    def get_meminfo(self):
492        """ Get the kernel memory info (/proc/meminfo) of the remote machine
493        and return a dictionary mapping the various statistics. """
494        meminfo_dict = {}
495        meminfo = self.run('cat /proc/meminfo').stdout.splitlines()
496        for key, val in (line.split(':', 1) for line in meminfo):
497            meminfo_dict[key.strip()] = val.strip()
498        return meminfo_dict
499
500
501    def path_exists(self, path):
502        """Determine if path exists on the remote machine.
503
504        @param path: path to check
505
506        @return: bool(path exists)"""
507        result = self.run('test -e "%s"' % utils.sh_escape(path),
508                          ignore_status=True)
509        return result.exit_status == 0
510
511
512    # some extra helpers for doing job-related operations
513
514    def record(self, *args, **dargs):
515        """ Helper method for recording status logs against Host.job that
516        silently becomes a NOP if Host.job is not available. The args and
517        dargs are passed on to Host.job.record unchanged. """
518        if self.job:
519            self.job.record(*args, **dargs)
520
521
522    def log_kernel(self):
523        """ Helper method for logging kernel information into the status logs.
524        Intended for cases where the "current" kernel is not really defined
525        and we want to explicitly log it. Does nothing if this host isn't
526        actually associated with a job. """
527        if self.job:
528            kernel = self.get_kernel_ver()
529            self.job.record("INFO", None, None,
530                            optional_fields={"kernel": kernel})
531
532
533    def log_op(self, op, op_func):
534        """ Decorator for wrapping a management operaiton in a group for status
535        logging purposes.
536
537        @param op: name of the operation.
538        @param op_func: a function that carries out the operation
539                        (reboot, suspend)
540        """
541        if self.job and not hasattr(self, "RUNNING_LOG_OP"):
542            self.RUNNING_LOG_OP = True
543            try:
544                self.job.run_op(op, op_func, self.get_kernel_ver)
545            finally:
546                del self.RUNNING_LOG_OP
547        else:
548            op_func()
549
550
551    def list_files_glob(self, glob):
552        """Get a list of files on a remote host given a glob pattern path.
553
554        @param glob: pattern
555
556        @return: list of files
557        """
558        SCRIPT = ("python -c 'import cPickle, glob, sys;"
559                  "cPickle.dump(glob.glob(sys.argv[1]), sys.stdout, 0)'")
560        output = self.run(SCRIPT, args=(glob,), stdout_tee=None,
561                          timeout=60).stdout
562        return cPickle.loads(output)
563
564
565    def symlink_closure(self, paths):
566        """
567        Given a sequence of path strings, return the set of all paths that
568        can be reached from the initial set by following symlinks.
569
570        @param paths: sequence of path strings.
571        @return: a sequence of path strings that are all the unique paths that
572                can be reached from the given ones after following symlinks.
573        """
574        SCRIPT = ("python -c 'import cPickle, os, sys\n"
575                  "paths = cPickle.load(sys.stdin)\n"
576                  "closure = {}\n"
577                  "while paths:\n"
578                  "    path = paths.keys()[0]\n"
579                  "    del paths[path]\n"
580                  "    if not os.path.exists(path):\n"
581                  "        continue\n"
582                  "    closure[path] = None\n"
583                  "    if os.path.islink(path):\n"
584                  "        link_to = os.path.join(os.path.dirname(path),\n"
585                  "                               os.readlink(path))\n"
586                  "        if link_to not in closure.keys():\n"
587                  "            paths[link_to] = None\n"
588                  "cPickle.dump(closure.keys(), sys.stdout, 0)'")
589        input_data = cPickle.dumps(dict((path, None) for path in paths), 0)
590        output = self.run(SCRIPT, stdout_tee=None, stdin=input_data,
591                          timeout=60).stdout
592        return cPickle.loads(output)
593
594
595    def cleanup_kernels(self, boot_dir='/boot'):
596        """
597        Remove any kernel image and associated files (vmlinux, system.map,
598        modules) for any image found in the boot directory that is not
599        referenced by entries in the bootloader configuration.
600
601        @param boot_dir: boot directory path string, default '/boot'
602        """
603        # find all the vmlinuz images referenced by the bootloader
604        vmlinuz_prefix = os.path.join(boot_dir, 'vmlinuz-')
605        boot_info = self.bootloader.get_entries()
606        used_kernver = [boot['kernel'][len(vmlinuz_prefix):]
607                        for boot in boot_info.itervalues()]
608
609        # find all the unused vmlinuz images in /boot
610        all_vmlinuz = self.list_files_glob(vmlinuz_prefix + '*')
611        used_vmlinuz = self.symlink_closure(vmlinuz_prefix + kernver
612                                            for kernver in used_kernver)
613        unused_vmlinuz = set(all_vmlinuz) - set(used_vmlinuz)
614
615        # find all the unused vmlinux images in /boot
616        vmlinux_prefix = os.path.join(boot_dir, 'vmlinux-')
617        all_vmlinux = self.list_files_glob(vmlinux_prefix + '*')
618        used_vmlinux = self.symlink_closure(vmlinux_prefix + kernver
619                                            for kernver in used_kernver)
620        unused_vmlinux = set(all_vmlinux) - set(used_vmlinux)
621
622        # find all the unused System.map files in /boot
623        systemmap_prefix = os.path.join(boot_dir, 'System.map-')
624        all_system_map = self.list_files_glob(systemmap_prefix + '*')
625        used_system_map = self.symlink_closure(
626            systemmap_prefix + kernver for kernver in used_kernver)
627        unused_system_map = set(all_system_map) - set(used_system_map)
628
629        # find all the module directories associated with unused kernels
630        modules_prefix = '/lib/modules/'
631        all_moddirs = [dir for dir in self.list_files_glob(modules_prefix + '*')
632                       if re.match(modules_prefix + r'\d+\.\d+\.\d+.*', dir)]
633        used_moddirs = self.symlink_closure(modules_prefix + kernver
634                                            for kernver in used_kernver)
635        unused_moddirs = set(all_moddirs) - set(used_moddirs)
636
637        # remove all the vmlinuz files we don't use
638        # TODO: if needed this should become package manager agnostic
639        for vmlinuz in unused_vmlinuz:
640            # try and get an rpm package name
641            rpm = self.run('rpm -qf', args=(vmlinuz,),
642                           ignore_status=True, timeout=120)
643            if rpm.exit_status == 0:
644                packages = set(line.strip() for line in
645                               rpm.stdout.splitlines())
646                # if we found some package names, try to remove them
647                for package in packages:
648                    self.run('rpm -e', args=(package,),
649                             ignore_status=True, timeout=120)
650            # remove the image files anyway, even if rpm didn't
651            self.run('rm -f', args=(vmlinuz,),
652                     ignore_status=True, timeout=120)
653
654        # remove all the vmlinux and System.map files left over
655        for f in (unused_vmlinux | unused_system_map):
656            self.run('rm -f', args=(f,),
657                     ignore_status=True, timeout=120)
658
659        # remove all unused module directories
660        # the regex match should keep us safe from removing the wrong files
661        for moddir in unused_moddirs:
662            self.run('rm -fr', args=(moddir,), ignore_status=True)
663
664
665    def get_attributes_to_clear_before_provision(self):
666        """Get a list of attributes to be cleared before machine_install starts.
667
668        If provision runs in a lab environment, it is necessary to clear certain
669        host attributes for the host in afe_host_attributes table. For example,
670        `job_repo_url` is a devserver url pointed to autotest packages for
671        CrosHost, it needs to be removed before provision starts for tests to
672        run reliably.
673        For ADBHost, the job repo url has a different format, i.e., appended by
674        adb_serial, so this method should be overriden in ADBHost.
675        """
676        return ['job_repo_url']
677
678
    def get_platform(self):
        """Determine the correct platform label for this host.

        Abstract in Host.

        @return: A string representing this host's platform.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("Get platform not implemented!")
685
686
    def get_labels(self):
        """Return a list of the labels gathered from the devices connected.

        Abstract in Host.

        @return: A list of strings that denote the labels from all the devices
        connected.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("Get labels not implemented!")
694
695