#!/usr/bin/env python
# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Cleanup orphaned containers.

If an autoserv process dies without being able to call handler of SIGTERM, the
container used to run the test will be orphaned. This adds overhead to the
drone. This script is used to clean up such containers.

This module also checks if the test job associated with a container has
finished. If so, kill the autoserv process for the test job and destroy the
container. To avoid a race condition, this only applies to jobs finished at
least 1 hour ago.

"""

import argparse
import datetime
import logging
import os
import re
import signal
import socket

import common
from autotest_lib.client.common_lib import logging_config
from autotest_lib.client.common_lib import time_utils
from autotest_lib.client.common_lib import utils
from autotest_lib.client.common_lib.cros.graphite import autotest_stats
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.site_utils import lxc


AFE = frontend_wrappers.RetryingAFE(timeout_min=0.1, delay_sec=10)
# The cutoff time to declare a test job is completed and container is orphaned.
# This is to avoid a race condition that scheduler aborts a job and autoserv
# is still in the process of destroying the container it used.
# Note: this is evaluated once, when the module is loaded.
FINISHED_JOB_CUTOFF_TIME = datetime.datetime.now() - datetime.timedelta(hours=1)


def get_info(container_name):
    """Get job id and autoserv process id from container name.

    The name is expected to start with `test_<digits>_<digits>_<digits>`; the
    first number is used as the job id and the third as the autoserv pid.

    @param container_name: Name of the container.

    @return: A tuple (job_id, pid) where job_id is an int and pid is the
             string of digits taken from the name, or (None, None) if the
             name does not match the test container pattern.

    """
    match = re.match(r'test_(\d+)_(\d+)_(\d+)', container_name)
    if not match:
        # Container is not created for test, e.g., the base container.
        return None, None
    job_id = int(match.group(1))
    pid = match.group(3)
    return job_id, pid


def is_container_orphaned(container):
    """Check if a container is orphaned.

    A container is orphaned if any of these condition is True:
    1. The autoserv process created the container is no longer running.
    2. The test job is finished at least 1 hour ago.

    @param container: A Container object.

    @return: True if the container is orphaned.

    """
    logging.debug('Checking if container is orphaned: %s', container.name)
    job_id, pid = get_info(container.name)
    if not job_id:
        logging.debug('Container %s is not created for test.', container.name)
        return False

    if pid and not utils.pid_is_alive(pid):
        logging.debug('Process with PID %s is not alive, container %s is '
                      'orphaned.', pid, container.name)
        return True

    try:
        hqes = AFE.get_host_queue_entries(job_id=job_id)
    except Exception as e:
        # Be conservative: if the job state cannot be determined, do not
        # treat the container as orphaned.
        logging.error('Failed to get hqe for job %s. Error: %s.', job_id, e)
        return False

    if not hqes:
        # The job has not run yet.
        return False
    for hqe in hqes:
        if hqe.active or not hqe.complete:
            logging.debug('Test job %s is not completed yet, container %s is '
                          'not orphaned.', job_id, container.name)
            return False
        # Compare the finish time of this particular entry (hqe), not of the
        # hqes collection, against the cutoff.
        if (hqe.finished_on and
            (time_utils.time_string_to_datetime(hqe.finished_on) >
             FINISHED_JOB_CUTOFF_TIME)):
            logging.debug('Test job %s was completed less than an hour ago.',
                          job_id)
            return False

    logging.debug('Test job %s was completed, container %s is orphaned.',
                  job_id, container.name)
    return True


def cleanup(container, options):
    """Cleanup orphaned container.

    In dry-run mode (options.execute is False) nothing is killed or
    destroyed; the intended action is only logged and False is returned.

    @param container: A Container object to be cleaned up.
    @param options: Options to do cleanup.

    @return: True if cleanup is successful. False otherwise.

    """
    if not options.execute:
        logging.info('dryrun: Cleanup container %s', container.name)
        return False

    try:
        _, pid = get_info(container.name)
        # Kill autoserv process
        if pid and utils.pid_is_alive(pid):
            logging.info('Stopping process %s...', pid)
            # pid is the digit string parsed from the container name.
            utils.nuke_pid(int(pid), (signal.SIGKILL,))

        # Destroy container
        logging.info('Destroying container %s...', container.name)
        container.destroy()
        return True
    except Exception as e:
        # Best effort: report the failure and let the caller count it rather
        # than aborting the whole cleanup run.
        logging.error('Failed to cleanup container %s. Error: %s',
                      container.name, e)
        return False


def parse_options():
    """Parse command line inputs.

    @return: Options to run the script.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--verbose', action='store_true',
                        default=False,
                        help='Print out ALL entries.')
    parser.add_argument('-x', '--execute', action='store_true',
                        default=False,
                        help=('Execute the actions to kill autoserv processes '
                              'and destroy containers. Default is False to do '
                              'dry run'))
    # TODO(dshi): Consider to adopt the scheduler log model:
    # 1. Create one log per run.
    # 2. Create a symlink to the latest log.
    parser.add_argument('-l', '--logfile', type=str,
                        default=None,
                        help='Path to the log file to save logs.')
    return parser.parse_args()


def main(options):
    """Main script.

    Walks every container in the container bucket, cleans up the orphaned
    ones and, when running with --execute, reports success/failure counts
    to the stats backend.

    @param options: Options to run the script.
    """
    config = logging_config.LoggingConfig()
    if options.logfile:
        config.add_file_handler(
                file_path=os.path.abspath(options.logfile),
                level=logging.DEBUG if options.verbose else logging.INFO)

    bucket = lxc.ContainerBucket()
    logging.info('')
    logging.info('Cleaning container bucket %s', bucket.container_path)
    success_count = 0
    failure_count = 0
    for container in bucket.get_all().values():
        if is_container_orphaned(container):
            if cleanup(container, options):
                success_count += 1
            else:
                failure_count += 1
    # Counts are only meaningful (and only reported) for real runs; in
    # dry-run mode cleanup() always returns False.
    if options.execute:
        key = 'container_cleanup.%s' % socket.gethostname().replace('.', '_')
        autotest_stats.Gauge(key).send('success', success_count)
        autotest_stats.Gauge(key).send('failure', failure_count)
    logging.info('Cleanup finished.')


if __name__ == '__main__':
    options = parse_options()
    main(options)