• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python
2# Copyright 2015 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
"""Cleanup orphaned containers.

If an autoserv process dies without being able to run its SIGTERM handler,
the container used to run the test will be orphaned. This adds overhead to the
drone. This script is used to clean up such containers.

This module also checks if the test job associated with a container has
finished. If so, it kills the autoserv process for the test job and destroys
the container. To avoid a race condition, this only applies to jobs that
finished at least 1 hour ago.

"""
18
19import argparse
20import datetime
21import logging
22import os
23import re
24import signal
25
26import common
27from autotest_lib.client.common_lib import logging_config
28from autotest_lib.client.common_lib import time_utils
29from autotest_lib.client.common_lib import utils
30from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
31from autotest_lib.site_utils import lxc
32
33
# Shared AFE client used to look up the host queue entries of test jobs.
AFE = frontend_wrappers.RetryingAFE(delay_sec=10, timeout_min=0.1)
# Jobs that finished after this point in time are still considered "recent"
# and their containers are left alone. This avoids a race condition where the
# scheduler aborts a job while autoserv is still in the process of destroying
# the container it used. Note that the cutoff is computed once, when this
# module is loaded.
FINISHED_JOB_CUTOFF_TIME = (
        datetime.datetime.now() - datetime.timedelta(minutes=60))
39
def get_info(container_name):
    """Get job id and autoserv process id from container name.

    Test containers are recognized by a name starting with
    `test_<number>_<number>_<number>`. The first number is taken as the job id
    and the third as the id of the autoserv process; the second number is
    ignored here.

    @param container_name: Name of the container.

    @return: A tuple (job_id, pid). job_id is an int and pid is the string of
             digits found in the name. (None, None) is returned if the name
             does not follow the test container naming pattern, e.g., for the
             base container.

    """
    # Use a raw string so that `\d` reaches the regex engine untouched instead
    # of relying on Python passing unknown string escapes through.
    match = re.match(r'test_(\d+)_(\d+)_(\d+)', container_name)
    if not match:
        # Container is not created for test, e.g., the base container.
        return None, None
    job_id, _, pid = match.groups()
    return int(job_id), pid
55
56
def is_container_orphaned(container):
    """Check if a container is orphaned.

    A container is orphaned if any of these conditions is True:
    1. The autoserv process that created the container is no longer running.
    2. The test job finished at least 1 hour ago.

    The check is conservative: a container is reported as not orphaned when
    its name does not identify a test job, when the host queue entries of the
    job cannot be retrieved, when the job has no host queue entry yet, or when
    any entry is still active, incomplete, or finished after
    FINISHED_JOB_CUTOFF_TIME.

    @param container: A Container object.

    @return: True if the container is orphaned.

    """
    logging.debug('Checking if container is orphaned: %s', container.name)
    job_id, pid = get_info(container.name)
    if not job_id:
        logging.debug('Container %s is not created for test.', container.name)
        return False

    if pid and not utils.pid_is_alive(pid):
        logging.debug('Process with PID %s is not alive, container %s is '
                      'orphaned.', pid, container.name)
        return True

    try:
        hqes = AFE.get_host_queue_entries(job_id=job_id)
    except Exception as e:
        # Without the job status it is not safe to declare the container
        # orphaned, so leave it alone for this run.
        logging.error('Failed to get hqe for job %s. Error: %s.', job_id, e)
        return False

    if not hqes:
        # The job has not run yet.
        return False
    for hqe in hqes:
        if hqe.active or not hqe.complete:
            logging.debug('Test job %s is not completed yet, container %s is '
                          'not orphaned.', job_id, container.name)
            return False
        # Compare the finish time of this entry (not of the whole list of
        # entries) against the cutoff.
        if (hqe.finished_on and
            (time_utils.time_string_to_datetime(hqe.finished_on) >
             FINISHED_JOB_CUTOFF_TIME)):
            logging.debug('Test job %s was completed less than an hour ago.',
                          job_id)
            return False

    logging.debug('Test job %s was completed, container %s is orphaned.',
                  job_id, container.name)
    return True
104
105
def cleanup(container, options):
    """Kill the autoserv process of a container and destroy the container.

    Nothing is changed unless `options.execute` is set; in that dry-run mode
    the intended cleanup is only logged.

    @param container: A Container object to be cleaned up.
    @param options: Options to do cleanup.

    @return: True if the container was destroyed. False for a dry run or if
             any step of the cleanup raised an exception.

    """
    if options.execute:
        try:
            autoserv_pid = get_info(container.name)[1]
            # Stop the autoserv process first, if it is still around.
            if autoserv_pid and utils.pid_is_alive(autoserv_pid):
                logging.info('Stopping process %s...', autoserv_pid)
                utils.nuke_pid(int(autoserv_pid), (signal.SIGKILL,))

            # Then get rid of the container itself.
            logging.info('Destroying container %s...', container.name)
            container.destroy()
        except Exception as e:
            logging.error('Failed to cleanup container %s. Error: %s',
                          container.name, e)
            return False
        return True

    logging.info('dryrun: Cleanup container %s', container.name)
    return False
134
135
def parse_options():
    """Parse the command line arguments of the script.

    @return: Parsed options with the attributes `verbose`, `execute` and
             `logfile`.
    """
    execute_help = ('Execute the actions to kill autoserv processes '
                    'and destroy containers. Default is False to do '
                    'dry run')

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
            '-v', '--verbose',
            action='store_true', default=False,
            help='Print out ALL entries.')
    arg_parser.add_argument(
            '-x', '--execute',
            action='store_true', default=False,
            help=execute_help)
    # TODO(dshi): Consider to adopt the scheduler log model:
    # 1. Create one log per run.
    # 2. Create a symlink to the latest log.
    arg_parser.add_argument(
            '-l', '--logfile',
            type=str, default=None,
            help='Path to the log file to save logs.')
    parsed = arg_parser.parse_args()
    return parsed
157
158
def main(options):
    """Main script.

    Goes through all containers in the container bucket, cleans up the ones
    that are orphaned and logs how many containers were cleaned up.

    @param options: Options to run the script.
    """
    config = logging_config.LoggingConfig()
    if options.logfile:
        config.add_file_handler(
                file_path=os.path.abspath(options.logfile),
                level=logging.DEBUG if options.verbose else logging.INFO)

    bucket = lxc.ContainerBucket()
    logging.info('')
    logging.info('Cleaning container bucket %s', bucket.container_path)
    success_count = 0
    failure_count = 0
    for container in bucket.get_all().values():
        if not is_container_orphaned(container):
            continue
        # cleanup() returns False both when the cleanup failed and when the
        # script is doing a dry run, so the second counter covers every
        # orphaned container that was left in place.
        if cleanup(container, options):
            success_count += 1
        else:
            failure_count += 1
    logging.info('Cleanup finished. Orphaned containers cleaned up: %d, '
                 'not cleaned up: %d.', success_count, failure_count)
182
183
if __name__ == '__main__':
    # Only parse the command line and run the cleanup when executed as a
    # script, not when imported.
    main(parse_options())
187