#!/usr/bin/env python
# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

6"""Cleanup orphaned containers.
7
8If an autoserv process dies without being able to call handler of SIGTERM, the
9container used to run the test will be orphaned. This adds overhead to the
10drone. This script is used to clean up such containers.
11
12This module also checks if the test job associated with a container has
13finished. If so, kill the autoserv process for the test job and destroy the
14container. To avoid racing condition, this only applies to job finished at least
151 hour ago.
16
17"""

import argparse
import datetime
import logging
import os
import re
import signal
import socket

import common
from autotest_lib.client.common_lib import logging_config
from autotest_lib.client.common_lib import time_utils
from autotest_lib.client.common_lib import utils
from autotest_lib.client.common_lib.cros.graphite import autotest_stats
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.site_utils import lxc


AFE = frontend_wrappers.RetryingAFE(timeout_min=0.1, delay_sec=10)
# The cutoff time used to decide that a test job is completed and its container
# is orphaned. This avoids a race condition in which the scheduler has aborted
# a job while autoserv is still destroying the container it used.
# Note that the cutoff is computed once, when the script starts.
FINISHED_JOB_CUTOFF_TIME = datetime.datetime.now() - datetime.timedelta(hours=1)

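# Names of containers created for tests are expected to match the pattern
# test_<job id>_<middle field>_<autoserv pid> (see the regex in get_info()
# below); the middle field is not used by this script.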
def get_info(container_name):
    """Get the job id and autoserv process id from a container name.

    @param container_name: Name of the container.

    @return: The job id and autoserv process id for the given container name.

    """
    match = re.match(r'test_(\d+)_(\d+)_(\d+)', container_name)
    if not match:
        # Container was not created for a test, e.g., the base container.
        return None, None
    job_id = int(match.groups()[0])
    pid = match.groups()[2]
    return job_id, pid


def is_container_orphaned(container):
    """Check if a container is orphaned.

    A container is orphaned if either of these conditions is True:
    1. The autoserv process that created the container is no longer running.
    2. The test job finished at least 1 hour ago.

    @param container: A Container object.

    @return: True if the container is orphaned.

    """
    logging.debug('Checking if container is orphaned: %s', container.name)
    job_id, pid = get_info(container.name)
    if not job_id:
        logging.debug('Container %s is not created for test.', container.name)
        return False

    if pid and not utils.pid_is_alive(pid):
        logging.debug('Process with PID %s is not alive, container %s is '
                      'orphaned.', pid, container.name)
        return True

    try:
        hqes = AFE.get_host_queue_entries(job_id=job_id)
    except Exception as e:
        logging.error('Failed to get hqe for job %s. Error: %s.', job_id, e)
        return False

    if not hqes:
        # The job has not run yet.
        return False
    for hqe in hqes:
        if hqe.active or not hqe.complete:
            logging.debug('Test job %s is not completed yet, container %s is '
                          'not orphaned.', job_id, container.name)
            return False
        if (hqe.finished_on and
            (time_utils.time_string_to_datetime(hqe.finished_on) >
             FINISHED_JOB_CUTOFF_TIME)):
            logging.debug('Test job %s was completed less than an hour ago.',
                          job_id)
            return False

    logging.debug('Test job %s was completed, container %s is orphaned.',
                  job_id, container.name)
    return True


def cleanup(container, options):
    """Clean up an orphaned container.

    @param container: A Container object to be cleaned up.
    @param options: Options for doing cleanup.

    @return: True if cleanup is successful. False otherwise.

    """
    if not options.execute:
        logging.info('dryrun: Cleanup container %s', container.name)
        return False

    try:
        _, pid = get_info(container.name)
        # Kill the autoserv process.
        if pid and utils.pid_is_alive(pid):
            logging.info('Stopping process %s...', pid)
            utils.nuke_pid(int(pid), (signal.SIGKILL,))

        # Destroy the container.
        logging.info('Destroying container %s...', container.name)
        container.destroy()
        return True
    except Exception as e:
        logging.error('Failed to cleanup container %s. Error: %s',
                      container.name, e)
        return False


def parse_options():
    """Parse command line inputs.

    @return: Options to run the script.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--verbose', action='store_true',
                        default=False,
                        help='Print out ALL entries.')
    parser.add_argument('-x', '--execute', action='store_true',
                        default=False,
                        help=('Execute the actions to kill autoserv processes '
                              'and destroy containers. Default is False to do '
                              'a dry run.'))
    # TODO(dshi): Consider adopting the scheduler log model:
    # 1. Create one log per run.
    # 2. Create a symlink to the latest log.
    parser.add_argument('-l', '--logfile', type=str,
                        default=None,
                        help='Path to the log file to save logs.')
    return parser.parse_args()


def main(options):
    """Main script.

    @param options: Options to run the script.
    """
    config = logging_config.LoggingConfig()
    if options.logfile:
        config.add_file_handler(
                file_path=os.path.abspath(options.logfile),
                level=logging.DEBUG if options.verbose else logging.INFO)

    bucket = lxc.ContainerBucket()
    logging.info('')
    logging.info('Cleaning container bucket %s', bucket.container_path)
    success_count = 0
    failure_count = 0
    for container in bucket.get_all().values():
        if is_container_orphaned(container):
            if cleanup(container, options):
                success_count += 1
            else:
                failure_count += 1
    if options.execute:
        key = 'container_cleanup.%s' % socket.gethostname().replace('.', '_')
        autotest_stats.Gauge(key).send('success', success_count)
        autotest_stats.Gauge(key).send('failure', failure_count)
    logging.info('Cleanup finished.')


if __name__ == '__main__':
    options = parse_options()
    main(options)