1#!/usr/bin/python 2# 3# Copyright (c) 2014 The Chromium OS Authors. All rights reserved. 4# Use of this source code is governed by a BSD-style license that can be 5# found in the LICENSE file. 6 7import datetime as datetime_base 8import logging 9from datetime import datetime 10 11import common 12 13from autotest_lib.client.common_lib import global_config 14from autotest_lib.client.common_lib import time_utils 15from autotest_lib.server import utils 16from autotest_lib.server.cros.dynamic_suite import reporting_utils 17from autotest_lib.server.lib import status_history 18from autotest_lib.utils import labellib 19 20CONFIG = global_config.global_config 21 22 23class BoardNotAvailableError(utils.TestLabException): 24 """Raised when a board is not available in the lab.""" 25 26 27class NotEnoughDutsError(utils.TestLabException): 28 """Rasied when the lab doesn't have the minimum number of duts.""" 29 30 def __init__(self, board, pool, num_available, num_required, hosts): 31 """Initialize instance. 32 33 Please pass arguments by keyword. 34 35 @param board: Name of board. 36 @param pool: Name of pool. 37 @param num_available: Number of available hosts. 38 @param num_required: Number of hosts required. 39 @param hosts: Sequence of Host instances for given board and pool. 40 """ 41 self.board = board 42 self.pool = pool 43 self.num_available = num_available 44 self.num_required = num_required 45 self.hosts = hosts 46 self.bug_id = None 47 self.suite_name = None 48 self.build = None 49 50 51 def __repr__(self): 52 return ( 53 '<{cls} at 0x{id:x} with' 54 ' board={this.board!r},' 55 ' pool={this.pool!r},' 56 ' num_available={this.num_available!r},' 57 ' num_required={this.num_required!r},' 58 ' bug_id={this.bug_id!r},' 59 ' suite_name={this.suite_name!r},' 60 ' build={this.build!r}>' 61 .format(cls=type(self).__name__, id=id(self), this=self) 62 ) 63 64 65 def __str__(self): 66 msg_parts = [ 67 'Not enough DUTs for board: {this.board}, pool: {this.pool};' 68 ' required: {this.num_required}, found: {this.num_available}' 69 ] 70 format_dict = {'this': self} 71 if self.bug_id is not None: 72 msg_parts.append('bug: {bug_url}') 73 format_dict['bug_url'] = reporting_utils.link_crbug(self.bug_id) 74 if self.suite_name is not None: 75 msg_parts.append('suite: {this.suite_name}') 76 if self.build is not None: 77 msg_parts.append('build: {this.build}') 78 return ', '.join(msg_parts).format(**format_dict) 79 80 81 def add_bug_id(self, bug_id): 82 """Add crbug id associated with this exception. 83 84 @param bug_id crbug id whose str() value is used in a crbug URL. 85 """ 86 self.bug_id = bug_id 87 88 89 def add_suite_name(self, suite_name): 90 """Add name of test suite that needed the DUTs. 91 92 @param suite_name Name of test suite. 93 """ 94 self.suite_name = suite_name 95 96 97 def add_build(self, build): 98 """Add name of build of job that needed the DUTs. 99 100 @param build Name of build. 101 """ 102 self.build = build 103 104 105class SimpleTimer(object): 106 """A simple timer used to periodically check if a deadline has passed.""" 107 108 def _reset(self): 109 """Reset the deadline.""" 110 if not self.interval_hours or self.interval_hours < 0: 111 logging.error('Bad interval %s', self.interval_hours) 112 self.deadline = None 113 return 114 self.deadline = datetime.now() + datetime_base.timedelta( 115 hours=self.interval_hours) 116 117 118 def __init__(self, interval_hours=0.5): 119 """Initialize a simple periodic deadline timer. 120 121 @param interval_hours: Interval of the deadline. 122 """ 123 self.interval_hours = interval_hours 124 self._reset() 125 126 127 def poll(self): 128 """Poll the timer to see if we've hit the deadline. 129 130 This method resets the deadline if it has passed. If the deadline 131 hasn't been set, or the current time is less than the deadline, the 132 method returns False. 133 134 @return: True if the deadline has passed, False otherwise. 135 """ 136 if not self.deadline or datetime.now() < self.deadline: 137 return False 138 self._reset() 139 return True 140 141 142class JobTimer(object): 143 """Utility class capable of measuring job timeouts. 144 """ 145 146 # Format used in datetime - string conversion. 147 time_format = '%m-%d-%Y [%H:%M:%S]' 148 149 def __init__(self, job_created_time, timeout_mins): 150 """JobTimer constructor. 151 152 @param job_created_time: float representing the time a job was 153 created. Eg: time.time() 154 @param timeout_mins: float representing the timeout in minutes. 155 """ 156 self.job_created_time = datetime.fromtimestamp(job_created_time) 157 self.timeout_hours = datetime_base.timedelta(hours=timeout_mins/60.0) 158 self.debug_output_timer = SimpleTimer(interval_hours=0.5) 159 self.past_halftime = False 160 161 162 @classmethod 163 def format_time(cls, datetime_obj): 164 """Get the string formatted version of the datetime object. 165 166 @param datetime_obj: A datetime.datetime object. 167 Eg: datetime.datetime.now() 168 169 @return: A formatted string containing the date/time of the 170 input datetime. 171 """ 172 return datetime_obj.strftime(cls.time_format) 173 174 175 def elapsed_time(self): 176 """Get the time elapsed since this job was created. 177 178 @return: A timedelta object representing the elapsed time. 179 """ 180 return datetime.now() - self.job_created_time 181 182 183 def is_suite_timeout(self): 184 """Check if the suite timed out. 185 186 @return: True if more than timeout_hours has elapsed since the suite job 187 was created. 188 """ 189 if self.elapsed_time() >= self.timeout_hours: 190 logging.info('Suite timed out. Started on %s, timed out on %s', 191 self.format_time(self.job_created_time), 192 self.format_time(datetime.now())) 193 return True 194 return False 195 196 197 def first_past_halftime(self): 198 """Check if we just crossed half time. 199 200 This method will only return True once, the first time it is called 201 after a job's elapsed time is past half its timeout. 202 203 @return True: If this is the first call of the method after halftime. 204 """ 205 if (not self.past_halftime and 206 self.elapsed_time() > self.timeout_hours/2): 207 self.past_halftime = True 208 return True 209 return False 210 211 212class RPCHelper(object): 213 """A class to help diagnose a suite run through the rpc interface. 214 """ 215 216 def __init__(self, rpc_interface): 217 """Constructor for rpc helper class. 218 219 @param rpc_interface: An rpc object, eg: A RetryingAFE instance. 220 """ 221 self.rpc_interface = rpc_interface 222 223 224 def diagnose_pool(self, board, pool, time_delta_hours, limit=10): 225 """Log diagnostic information about a timeout for a board/pool. 226 227 @param board: The board for which the current suite was run. 228 @param pool: The pool against which the current suite was run. 229 @param time_delta_hours: The time from which we should log information. 230 This is a datetime.timedelta object, as stored by the JobTimer. 231 @param limit: The maximum number of jobs per host, to log. 232 233 @raises proxy.JSONRPCException: For exceptions thrown across the wire. 234 """ 235 end_time = datetime.now() 236 start_time = end_time - time_delta_hours 237 labels = labellib.LabelsMapping() 238 labels['board'] = board 239 labels['pool'] = pool 240 host_histories = status_history.HostJobHistory.get_multiple_histories( 241 self.rpc_interface, 242 time_utils.to_epoch_time(start_time), 243 time_utils.to_epoch_time(end_time), 244 labels.getlabels(), 245 ) 246 if not host_histories: 247 logging.error('No hosts found for board:%s in pool:%s', 248 board, pool) 249 return 250 status_map = { 251 status_history.UNUSED: 'Unused', 252 status_history.UNKNOWN: 'No job history', 253 status_history.WORKING: 'Working', 254 status_history.BROKEN: 'Failed repair' 255 } 256 for history in host_histories: 257 count = 0 258 job_info ='' 259 for job in history: 260 start_time = ( 261 time_utils.epoch_time_to_date_string(job.start_time)) 262 job_info += ('%s %s started on: %s status %s\n' % 263 (job.id, job.name, start_time, job.job_status)) 264 count += 1 265 if count >= limit: 266 break 267 host = history.host 268 logging.error('host: %s, status: %s, locked: %s ' 269 'diagnosis: %s\n' 270 'labels: %s\nLast %s jobs within %s:\n' 271 '%s', 272 history.hostname, host.status, host.locked, 273 status_map[history.last_diagnosis()[0]], 274 host.labels, limit, time_delta_hours, 275 job_info) 276 277 278 def check_dut_availability(self, board, pool, minimum_duts=0, skip_duts_check=False): 279 """Check if DUT availability for a given board and pool is less than 280 minimum. 281 282 @param board: The board to check DUT availability. 283 @param pool: The pool to check DUT availability. 284 @param minimum_duts: Minimum Number of available machines required to 285 run the suite. Default is set to 0, which means do 286 not force the check of available machines before 287 running the suite. 288 @param skip_duts_check: If True, skip minimum available DUTs check. 289 @raise: NotEnoughDutsError if DUT availability is lower than minimum. 290 @raise: BoardNotAvailableError if no host found for requested 291 board/pool. 292 """ 293 if minimum_duts == 0: 294 return 295 296 # TODO(ayatane): Replace label prefixes with constants in 297 # autotest_lib.server.constants 298 hosts = self.rpc_interface.get_hosts( 299 invalid=False, 300 multiple_labels=('pool:%s' % pool, 'board:%s' % board)) 301 if not hosts: 302 raise BoardNotAvailableError( 303 'No hosts found for board:%s in pool:%s. The test lab ' 304 'currently does not cover test for this board and pool.'% 305 (board, pool)) 306 307 if skip_duts_check: 308 # Bypass minimum avilable DUTs check 309 logging.debug('skip_duts_check is on, do not enforce minimum DUTs check.') 310 return 311 312 if len(hosts) < minimum_duts: 313 logging.debug('The total number of DUTs for %s in pool:%s is %d, ' 314 'which is less than %d, the required minimum number of' 315 ' available DUTS', board, pool, len(hosts), 316 minimum_duts) 317 318 available_hosts = 0 319 for host in hosts: 320 if host.is_available(): 321 available_hosts += 1 322 logging.debug('%d of %d DUTs are available for board %s pool %s.', 323 available_hosts, len(hosts), board, pool) 324 if available_hosts < minimum_duts: 325 raise NotEnoughDutsError( 326 board=board, 327 pool=pool, 328 num_available=available_hosts, 329 num_required=minimum_duts, 330 hosts=hosts) 331 332 333 def diagnose_job(self, job_id, instance_server): 334 """Diagnose a suite job. 335 336 Logs information about the jobs that are still to run in the suite. 337 338 @param job_id: The id of the suite job to get information about. 339 No meaningful information gets logged if the id is for a sub-job. 340 @param instance_server: The instance server. 341 Eg: cautotest, cautotest-cq, localhost. 342 """ 343 incomplete_jobs = self.rpc_interface.get_jobs( 344 parent_job_id=job_id, summary=True, 345 hostqueueentry__complete=False) 346 if incomplete_jobs: 347 logging.info('\n%s printing summary of incomplete jobs (%s):\n', 348 JobTimer.format_time(datetime.now()), 349 len(incomplete_jobs)) 350 for job in incomplete_jobs: 351 logging.info('%s: %s', job.testname[job.testname.rfind('/')+1:], 352 reporting_utils.link_job(job.id, instance_server)) 353 else: 354 logging.info('All jobs in suite have already completed.') 355