1#!/usr/bin/python 2# 3# Copyright (c) 2014 The Chromium OS Authors. All rights reserved. 4# Use of this source code is governed by a BSD-style license that can be 5# found in the LICENSE file. 6 7import datetime as datetime_base 8import logging 9from datetime import datetime 10 11import common 12 13from autotest_lib.client.common_lib import global_config 14from autotest_lib.client.common_lib import time_utils 15from autotest_lib.server import utils 16from autotest_lib.server.cros.dynamic_suite import reporting_utils 17from autotest_lib.server.lib import status_history 18 19CONFIG = global_config.global_config 20 21 22class DUTsNotAvailableError(utils.TestLabException): 23 """Raised when a DUT label combination is not available in the lab.""" 24 25 26class NotEnoughDutsError(utils.TestLabException): 27 """Rasied when the lab doesn't have the minimum number of duts.""" 28 29 def __init__(self, labels, num_available, num_required, hosts): 30 """Initialize instance. 31 32 Please pass arguments by keyword. 33 34 @param labels: Labels required, including board an pool labels. 35 @param num_available: Number of available hosts. 36 @param num_required: Number of hosts required. 37 @param hosts: Sequence of Host instances for given board and pool. 38 """ 39 self.labels = labels 40 self.num_available = num_available 41 self.num_required = num_required 42 self.hosts = hosts 43 self.bug_id = None 44 self.suite_name = None 45 self.build = None 46 47 48 def __repr__(self): 49 return ( 50 '<{cls} at 0x{id:x} with' 51 ' labels={this.labels!r},' 52 ' num_available={this.num_available!r},' 53 ' num_required={this.num_required!r},' 54 ' bug_id={this.bug_id!r},' 55 ' suite_name={this.suite_name!r},' 56 ' build={this.build!r}>' 57 .format(cls=type(self).__name__, id=id(self), this=self) 58 ) 59 60 61 def __str__(self): 62 msg_parts = [ 63 'Not enough DUTs for requirements: {this.labels};' 64 ' required: {this.num_required}, found: {this.num_available}' 65 ] 66 format_dict = {'this': self} 67 if self.bug_id is not None: 68 msg_parts.append('bug: {bug_url}') 69 format_dict['bug_url'] = reporting_utils.link_crbug(self.bug_id) 70 if self.suite_name is not None: 71 msg_parts.append('suite: {this.suite_name}') 72 if self.build is not None: 73 msg_parts.append('build: {this.build}') 74 return ', '.join(msg_parts).format(**format_dict) 75 76 77 def add_bug_id(self, bug_id): 78 """Add crbug id associated with this exception. 79 80 @param bug_id crbug id whose str() value is used in a crbug URL. 81 """ 82 self.bug_id = bug_id 83 84 85 def add_suite_name(self, suite_name): 86 """Add name of test suite that needed the DUTs. 87 88 @param suite_name Name of test suite. 89 """ 90 self.suite_name = suite_name 91 92 93 def add_build(self, build): 94 """Add name of build of job that needed the DUTs. 95 96 @param build Name of build. 97 """ 98 self.build = build 99 100 101class SimpleTimer(object): 102 """A simple timer used to periodically check if a deadline has passed.""" 103 104 def _reset(self): 105 """Reset the deadline.""" 106 if not self.interval_hours or self.interval_hours < 0: 107 logging.error('Bad interval %s', self.interval_hours) 108 self.deadline = None 109 return 110 self.deadline = datetime.now() + datetime_base.timedelta( 111 hours=self.interval_hours) 112 113 114 def __init__(self, interval_hours=0.5): 115 """Initialize a simple periodic deadline timer. 116 117 @param interval_hours: Interval of the deadline. 118 """ 119 self.interval_hours = interval_hours 120 self._reset() 121 122 123 def poll(self): 124 """Poll the timer to see if we've hit the deadline. 125 126 This method resets the deadline if it has passed. If the deadline 127 hasn't been set, or the current time is less than the deadline, the 128 method returns False. 129 130 @return: True if the deadline has passed, False otherwise. 131 """ 132 if not self.deadline or datetime.now() < self.deadline: 133 return False 134 self._reset() 135 return True 136 137 138class JobTimer(object): 139 """Utility class capable of measuring job timeouts. 140 """ 141 142 # Format used in datetime - string conversion. 143 time_format = '%m-%d-%Y [%H:%M:%S]' 144 145 def __init__(self, job_created_time, timeout_mins): 146 """JobTimer constructor. 147 148 @param job_created_time: float representing the time a job was 149 created. Eg: time.time() 150 @param timeout_mins: float representing the timeout in minutes. 151 """ 152 self.job_created_time = datetime.fromtimestamp(job_created_time) 153 self.timeout_hours = datetime_base.timedelta(hours=timeout_mins/60.0) 154 self.debug_output_timer = SimpleTimer(interval_hours=0.5) 155 self.past_halftime = False 156 157 158 @classmethod 159 def format_time(cls, datetime_obj): 160 """Get the string formatted version of the datetime object. 161 162 @param datetime_obj: A datetime.datetime object. 163 Eg: datetime.datetime.now() 164 165 @return: A formatted string containing the date/time of the 166 input datetime. 167 """ 168 return datetime_obj.strftime(cls.time_format) 169 170 171 def elapsed_time(self): 172 """Get the time elapsed since this job was created. 173 174 @return: A timedelta object representing the elapsed time. 175 """ 176 return datetime.now() - self.job_created_time 177 178 179 def is_suite_timeout(self): 180 """Check if the suite timed out. 181 182 @return: True if more than timeout_hours has elapsed since the suite job 183 was created. 184 """ 185 if self.elapsed_time() >= self.timeout_hours: 186 logging.info('Suite timed out. Started on %s, timed out on %s', 187 self.format_time(self.job_created_time), 188 self.format_time(datetime.now())) 189 return True 190 return False 191 192 193 def first_past_halftime(self): 194 """Check if we just crossed half time. 195 196 This method will only return True once, the first time it is called 197 after a job's elapsed time is past half its timeout. 198 199 @return True: If this is the first call of the method after halftime. 200 """ 201 if (not self.past_halftime and 202 self.elapsed_time() > self.timeout_hours/2): 203 self.past_halftime = True 204 return True 205 return False 206 207 208class RPCHelper(object): 209 """A class to help diagnose a suite run through the rpc interface. 210 """ 211 212 def __init__(self, rpc_interface): 213 """Constructor for rpc helper class. 214 215 @param rpc_interface: An rpc object, eg: A RetryingAFE instance. 216 """ 217 self.rpc_interface = rpc_interface 218 219 220 def diagnose_pool(self, labels, time_delta_hours, limit=10): 221 """Log diagnostic information about a timeout for a board/pool. 222 223 @param labels: DUT label dependencies, including board and pool 224 labels. 225 @param time_delta_hours: The time from which we should log information. 226 This is a datetime.timedelta object, as stored by the JobTimer. 227 @param limit: The maximum number of jobs per host, to log. 228 229 @raises proxy.JSONRPCException: For exceptions thrown across the wire. 230 """ 231 end_time = datetime.now() 232 start_time = end_time - time_delta_hours 233 host_histories = status_history.HostJobHistory.get_multiple_histories( 234 self.rpc_interface, 235 time_utils.to_epoch_time(start_time), 236 time_utils.to_epoch_time(end_time), 237 labels, 238 ) 239 if not host_histories: 240 logging.error('No hosts found for labels %r', labels) 241 return 242 status_map = { 243 status_history.UNUSED: 'Unused', 244 status_history.UNKNOWN: 'No job history', 245 status_history.WORKING: 'Working', 246 status_history.BROKEN: 'Failed repair' 247 } 248 for history in host_histories: 249 count = 0 250 job_info ='' 251 for job in history: 252 start_time = ( 253 time_utils.epoch_time_to_date_string(job.start_time)) 254 job_info += ('%s %s started on: %s status %s\n' % 255 (job.id, job.name, start_time, job.job_status)) 256 count += 1 257 if count >= limit: 258 break 259 host = history.host 260 logging.error('host: %s, status: %s, locked: %s ' 261 'diagnosis: %s\n' 262 'labels: %s\nLast %s jobs within %s:\n' 263 '%s', 264 history.hostname, host.status, host.locked, 265 status_map[history.last_diagnosis()[0]], 266 host.labels, limit, time_delta_hours, 267 job_info) 268 269 270 def check_dut_availability(self, labels, minimum_duts=0, 271 skip_duts_check=False): 272 """Check if DUT availability for a given board and pool is less than 273 minimum. 274 275 @param labels: DUT label dependencies, including board and pool 276 labels. 277 @param minimum_duts: Minimum Number of available machines required to 278 run the suite. Default is set to 0, which means do 279 not force the check of available machines before 280 running the suite. 281 @param skip_duts_check: If True, skip minimum available DUTs check. 282 @raise: NotEnoughDutsError if DUT availability is lower than minimum. 283 @raise: DUTsNotAvailableError if no host found for requested 284 board/pool. 285 """ 286 if minimum_duts == 0: 287 return 288 289 hosts = self.rpc_interface.get_hosts( 290 invalid=False, multiple_labels=labels) 291 if not hosts: 292 raise DUTsNotAvailableError( 293 'No hosts found for labels %r. The test lab ' 294 'currently does not cover test for those DUTs.' % 295 (labels,)) 296 297 if skip_duts_check: 298 # Bypass minimum avilable DUTs check 299 logging.debug('skip_duts_check is on, do not enforce minimum ' 300 'DUTs check.') 301 return 302 303 if len(hosts) < minimum_duts: 304 logging.debug('The total number of DUTs for %r is %d, ' 305 'which is less than %d, the required minimum ' 306 'number of available DUTS', labels, len(hosts), 307 minimum_duts) 308 309 available_hosts = 0 310 for host in hosts: 311 if host.is_available(): 312 available_hosts += 1 313 logging.debug('%d of %d DUTs are available for %r.', 314 available_hosts, len(hosts), labels) 315 if available_hosts < minimum_duts: 316 raise NotEnoughDutsError( 317 labels=labels, 318 num_available=available_hosts, 319 num_required=minimum_duts, 320 hosts=hosts) 321 322 323 def diagnose_job(self, job_id, instance_server): 324 """Diagnose a suite job. 325 326 Logs information about the jobs that are still to run in the suite. 327 328 @param job_id: The id of the suite job to get information about. 329 No meaningful information gets logged if the id is for a sub-job. 330 @param instance_server: The instance server. 331 Eg: cautotest, cautotest-cq, localhost. 332 """ 333 incomplete_jobs = self.rpc_interface.get_jobs( 334 parent_job_id=job_id, summary=True, 335 hostqueueentry__complete=False) 336 if incomplete_jobs: 337 logging.info('\n%s printing summary of incomplete jobs (%s):\n', 338 JobTimer.format_time(datetime.now()), 339 len(incomplete_jobs)) 340 for job in incomplete_jobs: 341 logging.info('%s: %s', job.testname[job.testname.rfind('/')+1:], 342 reporting_utils.link_job(job.id, instance_server)) 343 else: 344 logging.info('All jobs in suite have already completed.') 345