# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Services relating to DUT status and job history.

The central abstraction of this module is the `HostJobHistory`
class.  This class provides two related pieces of information
regarding a single DUT:
  * A history of tests and special tasks that have run on
    the DUT in a given time range.
  * Whether the DUT was "working" or "broken" at a given
    time.

The "working" or "broken" status of a DUT is determined by
the DUT's special task history.  At the end of any job or
task, the status is indicated as follows:
  * After any successful special task, the DUT is considered
    "working".
  * After any failed Repair task, the DUT is considered "broken".
  * After any other special task or after any regular test job, the
    DUT's status is considered unchanged.

Definitions for terms used in the code below:
  * status task - Any special task that determines the DUT's
    status; that is, any successful task, or any failed Repair.
  * diagnosis interval - A time interval during which DUT status
    changed either from "working" to "broken", or vice versa.  The
    interval starts with the last status task with the old status,
    and ends after the first status task with the new status.

Diagnosis intervals are interesting because they normally contain
the logs explaining a failure or repair event.

"""

import common
from autotest_lib.frontend import setup_django_environment
from django.db import models as django_models

from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib import time_utils
from autotest_lib.frontend.afe import models as afe_models
from autotest_lib.site_utils.suite_scheduler import constants


# Values used to describe the diagnosis of a DUT.  These values are
# used to indicate both DUT status after a job or task, and also
# diagnosis of whether the DUT was working at the end of a given
# time interval.
#
# UNUSED:  Used when there are no events recorded in a given
#     time interval.
# UNKNOWN:  For an individual event, indicates that the DUT status
#     is unchanged from the previous event.  For a time interval,
#     indicates that the DUT's status can't be determined from the
#     DUT's history.
# WORKING:  Indicates that the DUT was working normally after the
#     event, or at the end of the time interval.
# BROKEN:  Indicates that the DUT needed manual repair after the
#     event, or at the end of the time interval.
#
UNUSED = 0
UNKNOWN = 1
WORKING = 2
BROKEN = 3


def parse_time(time_string):
    """Parse time according to a canonical form.

    The "canonical" form is the form in which date/time
    values are stored in the database.

    @param time_string Time to be parsed.

    @return The result of `time_utils.to_epoch_time()` for the given
            time, truncated to an integer.

    """
    return int(time_utils.to_epoch_time(time_string))


class _JobEvent(object):
    """Information about an event in host history.

    This remembers the relevant data from a single event in host
    history.  An event is any change in DUT state caused by a job
    or special task.  The data captured are the start and end times
    of the event, the URL of logs to the job or task causing the
    event, and a diagnosis of whether the DUT was working or failed
    afterwards.

    This class is an adapter around the database model objects
    describing jobs and special tasks.  This is an abstract
    superclass, with concrete subclasses for `HostQueueEntry` and
    `SpecialTask` objects.

    @property start_time  Time the job or task began execution.
    @property end_time    Time the job or task finished execution.
    @property job_url     URL to the logs for the event's job.
    @property diagnosis   Working status of the DUT after the event.

    """

    get_config_value = global_config.global_config.get_config_value
    _LOG_URL_PATTERN = get_config_value('CROS', 'log_url_pattern')

    @classmethod
    def get_log_url(cls, afe_hostname, logdir):
        """Return a URL to job results.

        The URL is constructed from a base URL determined by the
        global config, plus the relative path of the job's log
        directory.

        @param afe_hostname Hostname for autotest frontend
        @param logdir Relative path of the results log directory.

        @return A URL to the requested results log.

        """
        return cls._LOG_URL_PATTERN % (afe_hostname, logdir)


    def __init__(self, start_time, end_time):
        """Record the event's start and end times.

        Both values are converted with `parse_time()`, so the stored
        attributes are integers.

        @param start_time Time the job or task began execution.
        @param end_time   Time the job or task finished execution.

        """
        self.start_time = parse_time(start_time)
        self.end_time = parse_time(end_time)


    def __cmp__(self, other):
        """Compare two jobs by their start time.

        This is a standard Python `__cmp__` method to allow sorting
        `_JobEvent` objects by their times.

        @param other The `_JobEvent` object to compare to `self`.

        """
        return self.start_time - other.start_time


    def __lt__(self, other):
        """Return whether this event started before `other`.

        `list.sort()` orders items with `<`.  Interpreters that
        ignore `__cmp__` need this method for sorting to work; where
        `__cmp__` is honored the resulting order is the same.

        @param other The `_JobEvent` object to compare to `self`.

        """
        return self.start_time < other.start_time


    @property
    def job_url(self):
        """Return the URL for this event's job logs."""
        # `NotImplemented` is a constant, not an exception class;
        # abstract methods must raise `NotImplementedError`.
        raise NotImplementedError()


    @property
    def diagnosis(self):
        """Return the status of the DUT after this event.

        The diagnosis is interpreted as follows:
          UNKNOWN - The DUT status was the same before and after
              the event.
          WORKING - The DUT appeared to be working after the event.
          BROKEN - The DUT likely required manual intervention
              after the event.

        @return A valid diagnosis value.

        """
        # See the note in `job_url` regarding `NotImplementedError`.
        raise NotImplementedError()


class _SpecialTaskEvent(_JobEvent):
    """`_JobEvent` adapter for special tasks.

    This class wraps the standard `_JobEvent` interface around a row
    in the `afe_special_tasks` table.

    """

    @classmethod
    def get_tasks(cls, afe, host_id, start_time, end_time):
        """Return special tasks for a host in a given time range.

        Return a list of `_SpecialTaskEvent` objects representing all
        special tasks that ran on the given host in the given time
        range.  The list is ordered as it was returned by the query
        (i.e. unordered).

        @param afe         Autotest frontend
        @param host_id     Database host id of the desired host.
        @param start_time  Start time of the range of interest.
        @param end_time    End time of the range of interest.

        @return A list of `_SpecialTaskEvent` objects.

        """
        query_start = time_utils.epoch_time_to_date_string(start_time)
        query_end = time_utils.epoch_time_to_date_string(end_time)
        # Only completed tasks that both started and finished inside
        # the requested range are selected.
        tasks = afe.get_host_special_tasks(
                host_id,
                time_started__gte=query_start,
                time_finished__lte=query_end,
                is_complete=1)
        return [cls(afe.server, t) for t in tasks]


    @classmethod
    def get_status_task(cls, afe, host_id, end_time):
        """Return the task indicating a host's status at a given time.

        The task returned determines the status of the DUT; the
        diagnosis on the task indicates the diagnosis for the DUT at
        the given `end_time`.

        @param afe         Autotest frontend
        @param host_id     Database host id of the desired host.
        @param end_time    Find status as of this time.

        @return A `_SpecialTaskEvent` object for the requested task,
                or `None` if no task was found.

        """
        query_end = time_utils.epoch_time_to_date_string(end_time)
        task = afe.get_host_status_task(host_id, query_end)
        return cls(afe.server, task) if task else None


    def __init__(self, afe_hostname, afetask):
        """Wrap a special task object as an event.

        @param afe_hostname Hostname for autotest frontend; used
                            when building the log URL.
        @param afetask      The special task object to be wrapped.

        """
        self._afe_hostname = afe_hostname
        self._afetask = afetask
        super(_SpecialTaskEvent, self).__init__(
                afetask.time_started, afetask.time_finished)


    @property
    def job_url(self):
        """Return the URL for this special task's logs."""
        logdir = ('hosts/%s/%s-%s' %
                  (self._afetask.host.hostname, self._afetask.id,
                   self._afetask.task.lower()))
        return _SpecialTaskEvent.get_log_url(self._afe_hostname, logdir)


    @property
    def diagnosis(self):
        """Return the status of the DUT after this special task.

        A successful task means WORKING, a failed Repair task means
        BROKEN, and any other failed task means UNKNOWN.

        """
        if self._afetask.success:
            return WORKING
        elif self._afetask.task == 'Repair':
            return BROKEN
        else:
            return UNKNOWN


class _TestJobEvent(_JobEvent):
    """`_JobEvent` adapter for regular test jobs.

    This class wraps the standard `_JobEvent` interface around a row
    in the `afe_host_queue_entries` table.

    """

    @classmethod
    def get_hqes(cls, afe, host_id, start_time, end_time):
        """Return HQEs for a host in a given time range.

        Return a list of `_TestJobEvent` objects representing all the
        HQEs of all the jobs that ran on the given host in the given
        time range.  The list is ordered as it was returned by the
        query (i.e. unordered).

        @param afe         Autotest frontend
        @param host_id     Database host id of the desired host.
        @param start_time  Start time of the range of interest.
        @param end_time    End time of the range of interest.

        @return A list of `_TestJobEvent` objects.

        """
        query_start = time_utils.epoch_time_to_date_string(start_time)
        query_end = time_utils.epoch_time_to_date_string(end_time)
        hqelist = afe.get_host_queue_entries(
                host_id=host_id,
                start_time=query_start,
                end_time=query_end,
                complete=1)
        return [cls(afe.server, hqe) for hqe in hqelist]


    def __init__(self, afe_hostname, hqe):
        """Wrap a host queue entry object as an event.

        @param afe_hostname Hostname for autotest frontend; used
                            when building the log URL.
        @param hqe          The host queue entry to be wrapped.

        """
        self._afe_hostname = afe_hostname
        self._hqe = hqe
        super(_TestJobEvent, self).__init__(
                hqe.started_on, hqe.finished_on)


    @property
    def job_url(self):
        """Return the URL for this test job's logs."""
        logdir = '%s-%s' % (self._hqe.job.id, self._hqe.job.owner)
        return _TestJobEvent.get_log_url(self._afe_hostname, logdir)


    @property
    def diagnosis(self):
        """Return `UNKNOWN`; test jobs never change DUT status."""
        return UNKNOWN


class HostJobHistory(object):
    """Class to query and remember DUT execution and status history.

    This class is responsible for querying the database to determine
    the history of a single DUT in a time interval of interest, and
    for remembering the query results for reporting.

    @property hostname    Host name of the DUT.
    @property start_time  Start of the requested time interval.
                          This field may be `None`.
    @property end_time    End of the requested time interval.
    @property _afe        Autotest frontend for queries.
    @property _host       Database host object for the DUT.
    @property _history    A list of jobs and special tasks that
                          ran on the DUT in the requested time
                          interval, ordered in reverse, from latest
                          to earliest.

    @property _status_interval   A list of all the jobs and special
                                 tasks that ran on the DUT in the
                                 last diagnosis interval prior to
                                 `end_time`, ordered from latest to
                                 earliest.
    @property _status_diagnosis  The DUT's status as of `end_time`.
    @property _status_task       The DUT's last status task as of
                                 `end_time`.

    """

    @classmethod
    def get_host_history(cls, afe, hostname, start_time, end_time):
        """Create a `HostJobHistory` instance for a single host.

        Simple factory method to construct host history from a
        hostname.  Simply looks up the host in the AFE database, and
        passes it to the class constructor.

        The first host returned by the lookup is used; the indexing
        fails if the lookup returns no hosts.

        @param afe         Autotest frontend
        @param hostname    Name of the host.
        @param start_time  Start time for the history's time
                           interval.
        @param end_time    End time for the history's time interval.

        @return A new `HostJobHistory` instance.

        """
        afehost = afe.get_hosts(hostname=hostname)[0]
        return cls(afe, afehost, start_time, end_time)


    @classmethod
    def get_multiple_histories(cls, afe, start_time, end_time,
                               board=None, pool=None):
        """Create `HostJobHistory` instances for a set of hosts.

        The set of hosts can be specified as "all hosts of a given
        board type", "all hosts in a given pool", or "all hosts
        of a given board and pool".

        @param afe         Autotest frontend
        @param start_time  Start time for the history's time
                           interval.
        @param end_time    End time for the history's time interval.
        @param board       All hosts must have this board type; if
                           `None`, all boards are allowed.
        @param pool        All hosts must be in this pool; if
                           `None`, all pools are allowed.

        @return A list of new `HostJobHistory` instances.

        """
        # If `board` and `pool` are both `None`, we could search the
        # entire database, which is more expensive than we want.
        # Our caller currently won't (can't) do this, but assert to
        # be safe.
        assert board is not None or pool is not None
        labels = []
        if board is not None:
            labels.append(constants.Labels.BOARD_PREFIX + board)
        if pool is not None:
            labels.append(constants.Labels.POOL_PREFIX + pool)
        kwargs = {'multiple_labels': labels}
        hosts = afe.get_hosts(**kwargs)
        return [cls(afe, h, start_time, end_time) for h in hosts]


    def __init__(self, afe, afehost, start_time, end_time):
        """Remember the host and time interval; run no queries yet.

        @param afe         Autotest frontend
        @param afehost     Database host object for the DUT.
        @param start_time  Start time for the history's time
                           interval.
        @param end_time    End time for the history's time interval.

        """
        self._afe = afe
        self.hostname = afehost.hostname
        self.end_time = end_time
        self.start_time = start_time
        self._host = afehost
        # Don't spend time on queries until they're needed.
        self._history = None
        self._status_interval = None
        self._status_diagnosis = None
        self._status_task = None


    def _get_history(self, start_time, end_time):
        """Get the list of events for the given interval.

        @param start_time  Start time of the range of interest.
        @param end_time    End time of the range of interest.

        @return A list of `_JobEvent` objects for both special tasks
                and test jobs, sorted from latest to earliest start
                time.

        """
        newtasks = _SpecialTaskEvent.get_tasks(
                self._afe, self._host.id, start_time, end_time)
        newhqes = _TestJobEvent.get_hqes(
                self._afe, self._host.id, start_time, end_time)
        newhistory = newtasks + newhqes
        newhistory.sort(reverse=True)
        return newhistory


    def __iter__(self):
        """Iterate over the events in this history, latest first.

        The history query is performed on first use, and the result
        is remembered for later iterations.

        """
        if self._history is None:
            self._history = self._get_history(self.start_time,
                                              self.end_time)
        return iter(self._history)


    def _extract_prefixed_label(self, prefix):
        """Return the remainder of the first host label with a prefix.

        @param prefix  Label prefix to search for.

        @return The text following `prefix` in the first of the
                host's labels that starts with it, or `None` if no
                label matches.

        """
        labels = [l for l in self._host.labels
                      if l.startswith(prefix)]
        return labels[0][len(prefix):] if labels else None


    @property
    def host(self):
        """Return the AFE host object for this history."""
        return self._host


    @property
    def host_board(self):
        """Return the board name for this history's DUT."""
        prefix = constants.Labels.BOARD_PREFIX
        return self._extract_prefixed_label(prefix)


    @property
    def host_pool(self):
        """Return the pool name for this history's DUT."""
        prefix = constants.Labels.POOL_PREFIX
        return self._extract_prefixed_label(prefix)


    def _init_status_task(self):
        """Fill in `self._status_diagnosis` and `_status_task`."""
        # `_status_diagnosis` is never `None` once initialized, so it
        # doubles as the "already queried" marker.
        if self._status_diagnosis is not None:
            return
        self._status_task = _SpecialTaskEvent.get_status_task(
                self._afe, self._host.id, self.end_time)
        if self._status_task is not None:
            self._status_diagnosis = self._status_task.diagnosis
        else:
            self._status_diagnosis = UNKNOWN


    def _init_status_interval(self):
        """Fill in `self._status_interval`."""
        if self._status_interval is not None:
            return
        self._init_status_task()
        # Default to an empty interval; it stays empty when there is
        # no status task or no diagnosis interval to report.
        self._status_interval = []
        if self._status_task is None:
            return
        query_end = time_utils.epoch_time_to_date_string(self.end_time)
        # When the DUT isn't working now, look for the interval that
        # starts with a success; otherwise, one starting with failure.
        interval = self._afe.get_host_diagnosis_interval(
                self._host.id, query_end,
                self._status_diagnosis != WORKING)
        if not interval:
            return
        self._status_interval = self._get_history(
                parse_time(interval[0]),
                parse_time(interval[1]))


    def diagnosis_interval(self):
        """Find this history's most recent diagnosis interval.

        Returns a list of `_JobEvent` instances corresponding to the
        most recent diagnosis interval occurring before this
        history's end time.

        The list is returned as with `self._history`, ordered from
        most to least recent.

        @return The list of the `_JobEvent`s in the diagnosis
                interval.

        """
        self._init_status_interval()
        return self._status_interval


    def last_diagnosis(self):
        """Return the diagnosis of whether the DUT is working.

        This searches the DUT's job history, looking for the most
        recent status task for the DUT.  Return a tuple of
        `(diagnosis, task)`.

        The `diagnosis` entry in the tuple is one of these values:
          * UNUSED - The host's last status task is older than
              `self.start_time`.
          * WORKING - The DUT is working.
          * BROKEN - The DUT likely requires manual intervention.
          * UNKNOWN - No task could be found indicating status for
              the DUT.

        If the DUT was working at last check, but hasn't been used
        inside this history's time interval, the status `UNUSED` is
        returned with the last status task, instead of `WORKING`.

        The `task` entry in the tuple is the status task that led to
        the diagnosis.  The task will be `None` if the diagnosis is
        `UNKNOWN`.

        @return A tuple with the DUT's diagnosis and the task that
                determined it.

        """
        self._init_status_task()
        diagnosis = self._status_diagnosis
        if (self.start_time is not None and
                self._status_task is not None and
                self._status_task.end_time < self.start_time and
                diagnosis == WORKING):
            diagnosis = UNUSED
        return diagnosis, self._status_task


def get_diagnosis_interval(host_id, end_time, success):
    """Return the last diagnosis interval for a given host and time.

    This routine queries the database for the special tasks on a
    given host before a given time.  From those tasks it selects the
    last status task before a change in status, and the first status
    task after the change.  When `success` is true, the change must
    be from "working" to "broken".  When false, the search is for a
    change in the opposite direction.

    A "successful status task" is any successful special task.  A
    "failed status task" is a failed Repair task.  These criteria
    are based on the definition of "status task" in the module-level
    docstring, above.

    This is the RPC endpoint for `AFE.get_host_diagnosis_interval()`.

    @param host_id     Database host id of the desired host.
    @param end_time    Find the last eligible interval before this time.
    @param success     Whether the eligible interval should start with a
                       success or a failure.

    @return A list containing the start time of the earliest job
            selected, and the end time of the latest job.  The list
            is empty if either end of the interval can't be found.

    """
    base_query = afe_models.SpecialTask.objects.filter(
            host_id=host_id, is_complete=True)
    success_query = base_query.filter(success=True)
    failure_query = base_query.filter(success=False, task='Repair')
    if success:
        query0 = success_query
        query1 = failure_query
    else:
        query0 = failure_query
        query1 = success_query
    # Only the first row of each query is needed, so limit the
    # queries rather than fetching every matching task.
    query0 = query0.filter(time_finished__lte=end_time)
    query0 = query0.order_by('time_started').reverse()
    tasks0 = list(query0[0:1])
    if not tasks0:
        return []
    task0 = tasks0[0]
    query1 = query1.filter(time_finished__gt=task0.time_finished)
    tasks1 = list(query1.order_by('time_started')[0:1])
    # There may be no task with the new status after `task0`; report
    # "no interval" instead of failing with `IndexError`.
    if not tasks1:
        return []
    task1 = tasks1[0]
    return [task0.time_started.strftime(time_utils.TIME_FMT),
            task1.time_finished.strftime(time_utils.TIME_FMT)]


def get_status_task(host_id, end_time):
    """Get the last status task for a host before a given time.

    This routine returns a Django query for the AFE database to find
    the last task that finished on the given host before the given
    time that was either a successful task, or a Repair task.  The
    query criteria are based on the definition of "status task" in
    the module-level docstring, above.

    This is the RPC endpoint for `_SpecialTaskEvent.get_status_task()`.

    @param host_id     Database host id of the desired host.
    @param end_time    End time of the range of interest.

    @return A Django query-set selecting the single special task of
            interest.

    """
    # Selects status tasks:  any Repair task, or any successful task.
    status_tasks = (django_models.Q(task='Repair') |
                    django_models.Q(success=True))
    # Our caller needs a Django query set in order to serialize the
    # result, so we don't resolve the query here; we just return a
    # slice with at most one element.
    return afe_models.SpecialTask.objects.filter(
            status_tasks,
            host_id=host_id,
            time_finished__lte=end_time,
            is_complete=True).order_by('time_started').reverse()[0:1]