#pylint: disable-msg=C0111

"""
Pidfile monitor.
"""

import logging
import time
import traceback

import common

from autotest_lib.client.common_lib import utils
from autotest_lib.client.common_lib import global_config
from autotest_lib.scheduler import drone_manager
from autotest_lib.scheduler import scheduler_config

try:
    from chromite.lib import metrics
except ImportError:
    metrics = utils.metrics_mock


def _get_pidfile_timeout_secs():
    """@returns How long to wait for autoserv to write pidfile."""
    pidfile_timeout_mins = global_config.global_config.get_config_value(
            scheduler_config.CONFIG_SECTION, 'pidfile_timeout_mins', type=int)
    return pidfile_timeout_mins * 60


class PidfileRunMonitor(object):
    """
    Client must call either run() to start a new process or
    attach_to_existing_process().
    """

    class _PidfileException(Exception):
        """
        Raised when there's some unexpected behavior with the pid file, but
        only used internally (never allowed to escape this class).
        """


    def __init__(self):
        self._drone_manager = drone_manager.instance()
        self.lost_process = False
        self._start_time = None
        self.pidfile_id = None
        self._killed = False
        self._state = drone_manager.PidfileContents()


    def _add_nice_command(self, command, nice_level):
        if not nice_level:
            return command
        return ['nice', '-n', str(nice_level)] + command


    def _set_start_time(self):
        self._start_time = time.time()


    def run(self, command, working_directory, num_processes, nice_level=None,
            log_file=None, pidfile_name=None, paired_with_pidfile=None,
            username=None, drone_hostnames_allowed=None):
        assert command is not None
        if nice_level is not None:
            command = ['nice', '-n', str(nice_level)] + command
        self._set_start_time()
        self.pidfile_id = self._drone_manager.execute_command(
                command, working_directory, pidfile_name=pidfile_name,
                num_processes=num_processes, log_file=log_file,
                paired_with_pidfile=paired_with_pidfile, username=username,
                drone_hostnames_allowed=drone_hostnames_allowed)


    def attach_to_existing_process(self, execution_path,
                                   pidfile_name=drone_manager.AUTOSERV_PID_FILE,
                                   num_processes=None):
        self._set_start_time()
        self.pidfile_id = self._drone_manager.get_pidfile_id_from(
                execution_path, pidfile_name=pidfile_name)
        if num_processes is not None:
            self._drone_manager.declare_process_count(self.pidfile_id,
                                                      num_processes)


    def kill(self):
        if self.has_process():
            self._drone_manager.kill_process(self.get_process())
            self._killed = True


    def has_process(self):
        self._get_pidfile_info()
        return self._state.process is not None


    def get_process(self):
        self._get_pidfile_info()
        assert self._state.process is not None
        return self._state.process


    def _read_pidfile(self, use_second_read=False):
        assert self.pidfile_id is not None, (
                'You must call run() or attach_to_existing_process()')
        contents = self._drone_manager.get_pidfile_contents(
                self.pidfile_id, use_second_read=use_second_read)
        if contents.is_invalid():
            self._state = drone_manager.PidfileContents()
            raise self._PidfileException(contents)
        self._state = contents


    def _handle_pidfile_error(self, error, message=''):
        self.on_lost_process(self._state.process)


    def _get_pidfile_info_helper(self):
        if self.lost_process:
            return

        self._read_pidfile()

        if self._state.process is None:
            self._handle_no_process()
            return

        if self._state.exit_status is None:
            # double check whether or not autoserv is running
            if self._drone_manager.is_process_running(self._state.process):
                return

            # pid but no running process - maybe process *just* exited
            self._read_pidfile(use_second_read=True)
            if self._state.exit_status is None:
                # autoserv exited without writing an exit code
                # to the pidfile
                self._handle_pidfile_error(
                        'autoserv died without writing exit code')


    def _get_pidfile_info(self):
        """\
        After completion, self._state will contain:
            pid=None, exit_status=None if autoserv has not yet run
            pid!=None, exit_status=None if autoserv is running
            pid!=None, exit_status!=None if autoserv has completed
        """
        try:
            self._get_pidfile_info_helper()
        except self._PidfileException:
            self._handle_pidfile_error('Pidfile error', traceback.format_exc())


    def _handle_no_process(self):
        """\
        Called when no pidfile is found or no pid is in the pidfile.
        """
        if time.time() - self._start_time > _get_pidfile_timeout_secs():
            # If we aborted the process, and we find that it has exited without
            # writing a pidfile, then it's because we killed it, and thus this
            # isn't a surprising situation.
            if not self._killed:
                metrics.Counter('chromeos/autotest/errors/scheduler/no_pidfile'
                                ).increment()
            else:
                logging.warning("%s didn't exit after SIGTERM",
                                self.pidfile_id)
            self.on_lost_process()


    def on_lost_process(self, process=None):
        """\
        Called when autoserv has exited without writing an exit status,
        or we've timed out waiting for autoserv to write a pid to the
        pidfile.  In either case, we just return failure and the caller
        should signal some kind of warning.

        process is unimportant here, as it shouldn't be used by anyone.
        """
        self.lost_process = True
        self._state.process = process
        self._state.exit_status = 1
        self._state.num_tests_failed = 0


    def exit_code(self):
        self._get_pidfile_info()
        return self._state.exit_status


    def num_tests_failed(self):
        """@returns The number of tests that failed or -1 if unknown."""
        self._get_pidfile_info()
        if self._state.num_tests_failed is None:
            return -1
        return self._state.num_tests_failed


    def try_copy_results_on_drone(self, **kwargs):
        if self.has_process():
            # copy results logs into the normal place for job results
            self._drone_manager.copy_results_on_drone(self.get_process(),
                                                      **kwargs)


    def try_copy_to_results_repository(self, source, **kwargs):
        if self.has_process():
            self._drone_manager.copy_to_results_repository(self.get_process(),
                                                           source, **kwargs)
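

# Example usage (an illustrative sketch only, kept commented out so nothing
# runs at import time).  It assumes drone_manager.instance() has already been
# initialized by the scheduler; the autoserv command line, results directory,
# and username below are hypothetical:
#
#     results_dir = '/usr/local/autotest/results/123-debug_user'
#     monitor = PidfileRunMonitor()
#     monitor.run(command=['autoserv', '-r', results_dir],
#                 working_directory=results_dir,
#                 num_processes=1,
#                 nice_level=10,
#                 username='autotest')
#     # exit_code() returns None while autoserv is still running.
#     while monitor.exit_code() is None:
#         time.sleep(5)
#     logging.info('autoserv exited with %s; %s tests failed',
#                  monitor.exit_code(), monitor.num_tests_failed())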