1#!/usr/bin/python 2# 3# Copyright (c) 2014 The Chromium OS Authors. All rights reserved. 4# Use of this source code is governed by a BSD-style license that can be 5# found in the LICENSE file. 6 7import datetime as datetime_base 8import logging 9from datetime import datetime 10 11import common 12 13from autotest_lib.client.common_lib import global_config 14from autotest_lib.client.common_lib import host_states 15from autotest_lib.client.common_lib import time_utils 16from autotest_lib.server import utils 17from autotest_lib.server.cros.dynamic_suite import reporting_utils 18from autotest_lib.server.lib import status_history 19 20CONFIG = global_config.global_config 21 22 23class BoardNotAvailableError(utils.TestLabException): 24 """Raised when a board is not available in the lab.""" 25 26 27class NotEnoughDutsError(utils.TestLabException): 28 """Rasied when the lab doesn't have the minimum number of duts.""" 29 30 def __init__(self, board, pool, num_available, num_required, hosts): 31 """Initialize instance. 32 33 Please pass arguments by keyword. 34 35 @param board: Name of board. 36 @param pool: Name of pool. 37 @param num_available: Number of available hosts. 38 @param num_required: Number of hosts required. 39 @param hosts: Sequence of Host instances for given board and pool. 40 """ 41 self.board = board 42 self.pool = pool 43 self.num_available = num_available 44 self.num_required = num_required 45 self.hosts = hosts 46 self.bug_id = None 47 self.suite_name = None 48 self.build = None 49 50 51 def __repr__(self): 52 return ( 53 '<{cls} at 0x{id:x} with' 54 ' board={this.board!r},' 55 ' pool={this.pool!r},' 56 ' num_available={this.num_available!r},' 57 ' num_required={this.num_required!r},' 58 ' bug_id={this.bug_id!r},' 59 ' suite_name={this.suite_name!r},' 60 ' build={this.build!r}>' 61 .format(cls=type(self).__name__, id=id(self), this=self) 62 ) 63 64 65 def __str__(self): 66 msg_parts = [ 67 'Not enough DUTs for board: {this.board}, pool: {this.pool};' 68 ' required: {this.num_required}, found: {this.num_available}' 69 ] 70 format_dict = {'this': self} 71 if self.bug_id is not None: 72 msg_parts.append('bug: {bug_url}') 73 format_dict['bug_url'] = reporting_utils.link_crbug(self.bug_id) 74 if self.suite_name is not None: 75 msg_parts.append('suite: {this.suite_name}') 76 if self.build is not None: 77 msg_parts.append('build: {this.build}') 78 return ', '.join(msg_parts).format(**format_dict) 79 80 81 def add_bug_id(self, bug_id): 82 """Add crbug id associated with this exception. 83 84 @param bug_id crbug id whose str() value is used in a crbug URL. 85 """ 86 self.bug_id = bug_id 87 88 89 def add_suite_name(self, suite_name): 90 """Add name of test suite that needed the DUTs. 91 92 @param suite_name Name of test suite. 93 """ 94 self.suite_name = suite_name 95 96 97 def add_build(self, build): 98 """Add name of build of job that needed the DUTs. 99 100 @param build Name of build. 101 """ 102 self.build = build 103 104 105class SimpleTimer(object): 106 """A simple timer used to periodically check if a deadline has passed.""" 107 108 def _reset(self): 109 """Reset the deadline.""" 110 if not self.interval_hours or self.interval_hours < 0: 111 logging.error('Bad interval %s', self.interval_hours) 112 self.deadline = None 113 return 114 self.deadline = datetime.now() + datetime_base.timedelta( 115 hours=self.interval_hours) 116 117 118 def __init__(self, interval_hours=0.5): 119 """Initialize a simple periodic deadline timer. 120 121 @param interval_hours: Interval of the deadline. 122 """ 123 self.interval_hours = interval_hours 124 self._reset() 125 126 127 def poll(self): 128 """Poll the timer to see if we've hit the deadline. 129 130 This method resets the deadline if it has passed. If the deadline 131 hasn't been set, or the current time is less than the deadline, the 132 method returns False. 133 134 @return: True if the deadline has passed, False otherwise. 135 """ 136 if not self.deadline or datetime.now() < self.deadline: 137 return False 138 self._reset() 139 return True 140 141 142class JobTimer(object): 143 """Utility class capable of measuring job timeouts. 144 """ 145 146 # Format used in datetime - string conversion. 147 time_format = '%m-%d-%Y [%H:%M:%S]' 148 149 def __init__(self, job_created_time, timeout_mins): 150 """JobTimer constructor. 151 152 @param job_created_time: float representing the time a job was 153 created. Eg: time.time() 154 @param timeout_mins: float representing the timeout in minutes. 155 """ 156 self.job_created_time = datetime.fromtimestamp(job_created_time) 157 self.timeout_hours = datetime_base.timedelta(hours=timeout_mins/60.0) 158 self.debug_output_timer = SimpleTimer(interval_hours=0.5) 159 self.past_halftime = False 160 161 162 @classmethod 163 def format_time(cls, datetime_obj): 164 """Get the string formatted version of the datetime object. 165 166 @param datetime_obj: A datetime.datetime object. 167 Eg: datetime.datetime.now() 168 169 @return: A formatted string containing the date/time of the 170 input datetime. 171 """ 172 return datetime_obj.strftime(cls.time_format) 173 174 175 def elapsed_time(self): 176 """Get the time elapsed since this job was created. 177 178 @return: A timedelta object representing the elapsed time. 179 """ 180 return datetime.now() - self.job_created_time 181 182 183 def is_suite_timeout(self): 184 """Check if the suite timed out. 185 186 @return: True if more than timeout_hours has elapsed since the suite job 187 was created. 188 """ 189 if self.elapsed_time() >= self.timeout_hours: 190 logging.info('Suite timed out. Started on %s, timed out on %s', 191 self.format_time(self.job_created_time), 192 self.format_time(datetime.now())) 193 return True 194 return False 195 196 197 def first_past_halftime(self): 198 """Check if we just crossed half time. 199 200 This method will only return True once, the first time it is called 201 after a job's elapsed time is past half its timeout. 202 203 @return True: If this is the first call of the method after halftime. 204 """ 205 if (not self.past_halftime and 206 self.elapsed_time() > self.timeout_hours/2): 207 self.past_halftime = True 208 return True 209 return False 210 211 212class RPCHelper(object): 213 """A class to help diagnose a suite run through the rpc interface. 214 """ 215 216 def __init__(self, rpc_interface): 217 """Constructor for rpc helper class. 218 219 @param rpc_interface: An rpc object, eg: A RetryingAFE instance. 220 """ 221 self.rpc_interface = rpc_interface 222 223 224 def diagnose_pool(self, board, pool, time_delta_hours, limit=10): 225 """Log diagnostic information about a timeout for a board/pool. 226 227 @param board: The board for which the current suite was run. 228 @param pool: The pool against which the current suite was run. 229 @param time_delta_hours: The time from which we should log information. 230 This is a datetime.timedelta object, as stored by the JobTimer. 231 @param limit: The maximum number of jobs per host, to log. 232 233 @raises proxy.JSONRPCException: For exceptions thrown across the wire. 234 """ 235 end_time = datetime.now() 236 start_time = end_time - time_delta_hours 237 get_histories = status_history.HostJobHistory.get_multiple_histories 238 host_histories = get_histories( 239 self.rpc_interface, 240 time_utils.to_epoch_time(start_time), 241 time_utils.to_epoch_time(end_time), 242 board=board, pool=pool) 243 if not host_histories: 244 logging.error('No hosts found for board:%s in pool:%s', 245 board, pool) 246 return 247 status_map = { 248 status_history.UNUSED: 'Unused', 249 status_history.UNKNOWN: 'No job history', 250 status_history.WORKING: 'Working', 251 status_history.BROKEN: 'Failed repair' 252 } 253 for history in host_histories: 254 count = 0 255 job_info ='' 256 for job in history: 257 start_time = ( 258 time_utils.epoch_time_to_date_string(job.start_time)) 259 job_info += ('%s %s started on: %s status %s\n' % 260 (job.id, job.name, start_time, job.job_status)) 261 count += 1 262 if count >= limit: 263 break 264 host = history.host 265 logging.error('host: %s, status: %s, locked: %s ' 266 'diagnosis: %s\n' 267 'labels: %s\nLast %s jobs within %s:\n' 268 '%s', 269 history.hostname, host.status, host.locked, 270 status_map[history.last_diagnosis()[0]], 271 host.labels, limit, time_delta_hours, 272 job_info) 273 274 275 def _is_host_available(self, host): 276 """Check whether DUT host is available. 277 278 @param host: The Host instance for the DUT. 279 @return: bool 280 """ 281 return not (host.locked or host.status in host_states.UNAVAILABLE_STATES) 282 283 284 def check_dut_availability(self, board, pool, minimum_duts=0, skip_duts_check=False): 285 """Check if DUT availability for a given board and pool is less than 286 minimum. 287 288 @param board: The board to check DUT availability. 289 @param pool: The pool to check DUT availability. 290 @param minimum_duts: Minimum Number of available machines required to 291 run the suite. Default is set to 0, which means do 292 not force the check of available machines before 293 running the suite. 294 @param skip_duts_check: If True, skip minimum available DUTs check. 295 @raise: NotEnoughDutsError if DUT availability is lower than minimum. 296 @raise: BoardNotAvailableError if no host found for requested 297 board/pool. 298 """ 299 if minimum_duts == 0: 300 return 301 302 # TODO(ayatane): Replace label prefixes with constants in 303 # site_utils.suite_scheduler.constants 304 hosts = self.rpc_interface.get_hosts( 305 invalid=False, 306 multiple_labels=('pool:%s' % pool, 'board:%s' % board)) 307 if not hosts: 308 raise BoardNotAvailableError( 309 'No hosts found for board:%s in pool:%s. The test lab ' 310 'currently does not cover test for this board and pool.'% 311 (board, pool)) 312 313 if skip_duts_check: 314 # Bypass minimum avilable DUTs check 315 logging.debug('skip_duts_check is on, do not enforce minimum DUTs check.') 316 return 317 318 if len(hosts) < minimum_duts: 319 logging.debug('The total number of DUTs for %s in pool:%s is %d, ' 320 'which is less than %d, the required minimum number of' 321 ' available DUTS', board, pool, len(hosts), 322 minimum_duts) 323 324 available_hosts = 0 325 for host in hosts: 326 if self._is_host_available(host): 327 available_hosts += 1 328 logging.debug('%d of %d DUTs are available for board %s pool %s.', 329 available_hosts, len(hosts), board, pool) 330 if available_hosts < minimum_duts: 331 raise NotEnoughDutsError( 332 board=board, 333 pool=pool, 334 num_available=available_hosts, 335 num_required=minimum_duts, 336 hosts=hosts) 337 338 339 def diagnose_job(self, job_id, instance_server): 340 """Diagnose a suite job. 341 342 Logs information about the jobs that are still to run in the suite. 343 344 @param job_id: The id of the suite job to get information about. 345 No meaningful information gets logged if the id is for a sub-job. 346 @param instance_server: The instance server. 347 Eg: cautotest, cautotest-cq, localhost. 348 """ 349 incomplete_jobs = self.rpc_interface.get_jobs( 350 parent_job_id=job_id, summary=True, 351 hostqueueentry__complete=False) 352 if incomplete_jobs: 353 logging.info('\n%s printing summary of incomplete jobs (%s):\n', 354 JobTimer.format_time(datetime.now()), 355 len(incomplete_jobs)) 356 for job in incomplete_jobs: 357 logging.info('%s: %s', job.testname[job.testname.rfind('/')+1:], 358 reporting_utils.link_job(job.id, instance_server)) 359 else: 360 logging.info('All jobs in suite have already completed.') 361