#!/usr/bin/env python
# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Cleanup orphaned containers.

If an autoserv process dies without being able to call its SIGTERM handler,
the container used to run the test will be orphaned. This adds overhead to the
drone. This script is used to clean up such containers.

This module also checks if the test job associated with a container has
finished. If so, kill the autoserv process for the test job and destroy the
container. To avoid a race condition, this only applies to jobs that finished
at least 1 hour ago.

"""

import argparse
import datetime
import logging
import os
import signal

# NOTE(review): `common` is never referenced by name; it is presumably
# imported for its side effects (making `autotest_lib` importable) - confirm
# before removing or reordering.
import common
from autotest_lib.client.common_lib import logging_config
from autotest_lib.client.common_lib import time_utils
from autotest_lib.client.common_lib import utils
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.site_utils import lxc


AFE = frontend_wrappers.RetryingAFE(timeout_min=0.1, delay_sec=10)
# The cutoff time to declare a test job is completed and container is orphaned.
# This is to avoid a race condition that scheduler aborts a job and autoserv
# is still in the process of destroying the container it used.
# Evaluated once, when the module is loaded.
FINISHED_JOB_CUTOFF_TIME = (datetime.datetime.now() -
                            datetime.timedelta(hours=1))


def is_container_orphaned(container):
    """Check if a container is orphaned.

    A container is orphaned if any of these conditions is True:
    1. The autoserv process that created the container is no longer running.
    2. The test job finished at least 1 hour ago.

    A container without an ID, a job with no host queue entries, or a failed
    AFE lookup all result in the container being treated as NOT orphaned, so
    that nothing is destroyed on incomplete information.

    @param container: A Container object.

    @return: True if the container is orphaned.

    """
    logging.debug('Checking if container is orphaned: %s', container.name)
    if container.id is None:
        logging.debug('Container %s is not created for test.', container.name)
        return False

    job_id = container.id.job_id
    pid = container.id.pid

    if pid and not utils.pid_is_alive(pid):
        logging.debug('Process with PID %s is not alive, container %s is '
                      'orphaned.', pid, container.name)
        return True

    try:
        hqes = AFE.get_host_queue_entries(job_id=job_id)
    except Exception as e:
        # Best effort: a failed lookup must not lead to destroying a
        # container that may still be in use.
        logging.error('Failed to get hqe for job %s. Error: %s.', job_id, e)
        return False

    if not hqes:
        # The job has not run yet.
        return False

    for hqe in hqes:
        if hqe.active or not hqe.complete:
            logging.debug('Test job %s is not completed yet, container %s is '
                          'not orphaned.', job_id, container.name)
            return False
        if (hqe.finished_on and
            (time_utils.time_string_to_datetime(hqe.finished_on) >
             FINISHED_JOB_CUTOFF_TIME)):
            logging.debug('Test job %s was completed less than an hour ago.',
                          job_id)
            return False

    logging.debug('Test job %s was completed, container %s is orphaned.',
                  job_id, container.name)
    return True


def cleanup(container, options):
    """Cleanup orphaned container.

    Kills the process recorded in the container ID if it is still alive, then
    destroys the container. In dry-run mode (options.execute is False) nothing
    is touched and only a log line is written.

    @param container: A Container object to be cleaned up.
    @param options: Options to do cleanup.

    @return: True if cleanup is successful. False otherwise, including when
             running in dry-run mode.

    """
    if not options.execute:
        logging.info('dryrun: Cleanup container %s', container.name)
        return False

    try:
        # cleanup is protected by is_container_orphaned. At this point the
        # container may be assumed to have a valid ID.
        pid = container.id.pid
        # Kill autoserv process
        if pid and utils.pid_is_alive(pid):
            logging.info('Stopping process %s...', pid)
            utils.nuke_pid(int(pid), (signal.SIGKILL,))

        # Destroy container
        logging.info('Destroying container %s...', container.name)
        container.destroy()
        return True
    except Exception as e:
        # Report and carry on so that one bad container does not abort the
        # whole run.
        logging.error('Failed to cleanup container %s. Error: %s',
                      container.name, e)
        return False


def parse_options():
    """Parse command line inputs.

    @return: Options to run the script.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--verbose', action='store_true',
                        default=False,
                        help='Print out ALL entries.')
    parser.add_argument('-x', '--execute', action='store_true',
                        default=False,
                        help=('Execute the actions to kill autoserv processes '
                              'and destroy containers. Default is False to do '
                              'dry run'))
    # TODO(dshi): Consider to adopt the scheduler log model:
    # 1. Create one log per run.
    # 2. Create a symlink to the latest log.
    parser.add_argument('-l', '--logfile', type=str,
                        default=None,
                        help='Path to the log file to save logs.')
    return parser.parse_args()


def main(options):
    """Main script.

    Walks every container in the container bucket, cleans up the ones that
    are orphaned and logs a summary of what was found and done.

    @param options: Options to run the script.
    """
    config = logging_config.LoggingConfig()
    if options.logfile:
        config.add_file_handler(
                file_path=os.path.abspath(options.logfile),
                level=logging.DEBUG if options.verbose else logging.INFO)

    bucket = lxc.ContainerBucket()
    logging.info('')
    logging.info('Cleaning container bucket %s', bucket.container_path)
    orphaned_count = 0
    success_count = 0
    for container in bucket.get_all().values():
        if not is_container_orphaned(container):
            continue
        orphaned_count += 1
        if cleanup(container, options):
            success_count += 1

    if options.execute:
        failure_count = orphaned_count - success_count
        logging.info('Found %d orphaned container(s): %d cleaned up, '
                     '%d failed.', orphaned_count, success_count,
                     failure_count)
    else:
        # cleanup() returns False in dry-run mode; do not report those
        # containers as failures.
        logging.info('dryrun: Found %d orphaned container(s), nothing was '
                     'cleaned up.', orphaned_count)
    logging.info('Cleanup finished.')


if __name__ == '__main__':
    options = parse_options()
    main(options)