# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Services relating to DUT status and job history.

The central abstraction of this module is the `HostJobHistory`
class.  This class provides two related pieces of information
regarding a single DUT:
  * A history of tests and special tasks that have run on
    the DUT in a given time range.
  * Whether the DUT was "working" or "broken" at a given
    time.

The "working" or "broken" status of a DUT is determined by
the DUT's special task history.  At the end of any job or
task, the status is indicated as follows:
  * After any successful special task, the DUT is considered
    "working".
  * After any failed Repair task, the DUT is considered "broken".
  * After any other special task or after any regular test job, the
    DUT's status is considered unchanged.

Definitions for terms used in the code below:
  * status task - Any special task that determines the DUT's
    status; that is, any successful task, or any failed Repair.
  * diagnosis interval - A time interval during which DUT status
    changed either from "working" to "broken", or vice versa.  The
    interval starts with the last status task with the old status,
    and ends after the first status task with the new status.

Diagnosis intervals are interesting because they normally contain
the logs explaining a failure or repair event.

"""

import common
import os
from autotest_lib.frontend import setup_django_environment
from django.db import models as django_models

from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib import utils
from autotest_lib.client.common_lib import time_utils
from autotest_lib.frontend.afe import models as afe_models
from autotest_lib.server import constants


# Values used to describe the diagnosis of a DUT.  These values are
# used to indicate both DUT status after a job or task, and also
# diagnosis of whether the DUT was working at the end of a given
# time interval.
#
# UNUSED:  Used when there are no events recorded in a given
#     time interval.
# UNKNOWN:  For an individual event, indicates that the DUT status
#     is unchanged from the previous event.  For a time interval,
#     indicates that the DUT's status can't be determined from the
#     DUT's history.
# WORKING:  Indicates that the DUT was working normally after the
#     event, or at the end of the time interval.
# BROKEN:  Indicates that the DUT needed manual repair after the
#     event, or at the end of the time interval.
#
UNUSED = 0
UNKNOWN = 1
WORKING = 2
BROKEN = 3


status_names = {
    UNUSED: "UNUSED",
    UNKNOWN: "UNKNOWN",
    WORKING: "WORKING",
    BROKEN: "BROKEN",
}


def parse_time(time_string):
    """Parse time according to a canonical form.

    The "canonical" form is the form in which date/time
    values are stored in the database.

    @param time_string  Time to be parsed.
    """
    return int(time_utils.to_epoch_time(time_string))
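

# A minimal illustration of `parse_time` (the timestamp below is made up):
# the canonical database form is converted to a Unix epoch time, and
# `time_utils.epoch_time_to_date_string()` performs the inverse conversion
# when the query helpers below build database queries.
#
#     epoch = parse_time('2015-06-02 00:45:00')
#     query_time = time_utils.epoch_time_to_date_string(epoch)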


class _JobEvent(object):
    """Information about an event in host history.

    This remembers the relevant data from a single event in host
    history.  An event is any change in DUT state caused by a job
    or special task.  The data captured are the start and end times
    of the event, the URL of logs to the job or task causing the
    event, and a diagnosis of whether the DUT was working or failed
    afterwards.

    This class is an adapter around the database model objects
    describing jobs and special tasks.  This is an abstract
    superclass, with concrete subclasses for `HostQueueEntry` and
    `SpecialTask` objects.

    @property start_time  Time the job or task began execution.
    @property end_time    Time the job or task finished execution.
    @property id          id of the event in the AFE database.
    @property name        Name of the event, derived from the AFE database.
    @property job_status  Short string describing the event's final status.
    @property logdir      Relative path to the logs for the event's job.
    @property job_url     URL to the logs for the event's job.
    @property gs_url      GS URL to the logs for the event's job.
    @property job_id      id of the AFE job for HQEs.  None otherwise.
    @property diagnosis   Working status of the DUT after the event.
    @property is_special  Boolean indicating if the event is a special task.

    """

    get_config_value = global_config.global_config.get_config_value
    _LOG_URL_PATTERN = ('%s/browse/chromeos-autotest-results/%%s/'
                        % get_config_value('AUTOTEST_WEB', 'stainless_url',
                                           default=None))

    @classmethod
    def get_gs_url(cls, logdir):
        """Return a GS URL to job results.

        The URL is constructed from a base URL determined by the
        global config, plus the relative path of the job's log
        directory.

        @param logdir  Relative path of the results log directory.

        @return A URL to the requested results log.

        """
        return os.path.join(utils.get_offload_gsuri(), logdir)


    def __init__(self, start_time, end_time):
        self.start_time = parse_time(start_time)
        self.end_time = parse_time(end_time)


    def __cmp__(self, other):
        """Compare two jobs by their start time.

        This is a standard Python `__cmp__` method to allow sorting
        `_JobEvent` objects by their times.

        @param other  The `_JobEvent` object to compare to `self`.

        """
        return self.start_time - other.start_time


    @property
    def id(self):
        """Return the id of the event in the AFE database."""
        raise NotImplementedError()


    @property
    def name(self):
        """Return the name of the event."""
        raise NotImplementedError()


    @property
    def job_status(self):
        """Return a short string describing the event's final status."""
        raise NotImplementedError()


    @property
    def logdir(self):
        """Return the relative path for this event's job logs."""
        raise NotImplementedError()


    @property
    def job_url(self):
        """Return the URL for this event's job logs."""
        return self._LOG_URL_PATTERN % self.logdir


    @property
    def gs_url(self):
        """Return the GS URL for this event's job logs."""
        return self.get_gs_url(self.logdir)


    @property
    def job_id(self):
        """Return the id of the AFE job for HQEs.  None otherwise."""
        raise NotImplementedError()


    @property
    def diagnosis(self):
        """Return the status of the DUT after this event.

        The diagnosis is interpreted as follows:
          UNKNOWN - The DUT status was the same before and after
              the event.
          WORKING - The DUT appeared to be working after the event.
          BROKEN - The DUT likely required manual intervention
              after the event.

        @return A valid diagnosis value.

        """
        raise NotImplementedError()


    @property
    def is_special(self):
        """Return if the event is for a special task."""
        raise NotImplementedError()
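

# Illustrative example of how the log-path properties compose (all values
# hypothetical):  for a 'Verify' special task with database id 42 on host
# 'chromeos1-row1-rack1-host1', `logdir` is
# 'hosts/chromeos1-row1-rack1-host1/42-verify'; `job_url` prefixes that
# path with the stainless browse URL from the global config, and `gs_url`
# joins it to the offload URI from `utils.get_offload_gsuri()`:
#
#     event.logdir  # -> 'hosts/chromeos1-row1-rack1-host1/42-verify'
#     event.gs_url  # -> gs://<offload-bucket>/hosts/.../42-verify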


class _SpecialTaskEvent(_JobEvent):
    """`_JobEvent` adapter for special tasks.

    This class wraps the standard `_JobEvent` interface around a row
    in the `afe_special_tasks` table.

    """

    @classmethod
    def get_tasks(cls, afe, host_id, start_time, end_time):
        """Return special tasks for a host in a given time range.

        Return a list of `_SpecialTaskEvent` objects representing all
        special tasks that ran on the given host in the given time
        range.  The list is ordered as it was returned by the query
        (i.e. unordered).

        @param afe         Autotest frontend
        @param host_id     Database host id of the desired host.
        @param start_time  Start time of the range of interest.
        @param end_time    End time of the range of interest.

        @return A list of `_SpecialTaskEvent` objects.

        """
        query_start = time_utils.epoch_time_to_date_string(start_time)
        query_end = time_utils.epoch_time_to_date_string(end_time)
        tasks = afe.get_host_special_tasks(
                host_id,
                time_started__gte=query_start,
                time_finished__lte=query_end,
                is_complete=1)
        return [cls(t) for t in tasks]


    @classmethod
    def get_status_task(cls, afe, host_id, end_time):
        """Return the task indicating a host's status at a given time.

        The task returned determines the status of the DUT; the
        diagnosis on the task indicates the diagnosis for the DUT at
        the given `end_time`.

        @param afe       Autotest frontend
        @param host_id   Database host id of the desired host.
        @param end_time  Find status as of this time.

        @return A `_SpecialTaskEvent` object for the requested task,
                or `None` if no task was found.

        """
        query_end = time_utils.epoch_time_to_date_string(end_time)
        task = afe.get_host_status_task(host_id, query_end)
        return cls(task) if task else None


    def __init__(self, afetask):
        self._afetask = afetask
        super(_SpecialTaskEvent, self).__init__(
                afetask.time_started, afetask.time_finished)


    @property
    def id(self):
        return self._afetask.id


    @property
    def name(self):
        return self._afetask.task


    @property
    def job_status(self):
        if self._afetask.is_aborted:
            return 'ABORTED'
        elif self._afetask.success:
            return 'PASS'
        else:
            return 'FAIL'


    @property
    def logdir(self):
        return ('hosts/%s/%s-%s' %
                (self._afetask.host.hostname, self._afetask.id,
                 self._afetask.task.lower()))


    @property
    def job_id(self):
        return None


    @property
    def diagnosis(self):
        if self._afetask.success:
            return WORKING
        elif self._afetask.task == 'Repair':
            return BROKEN
        else:
            return UNKNOWN


    @property
    def is_special(self):
        return True
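

# How `_SpecialTaskEvent.diagnosis` plays out in practice (hypothetical
# tasks, shown as task name / success flag -> diagnosis):
#
#     'Cleanup', success=True   ->  WORKING
#     'Repair',  success=False  ->  BROKEN
#     'Verify',  success=False  ->  UNKNOWN
#
# The UNKNOWN case leaves the DUT's status unchanged, to be decided by
# whatever status task follows, per the module-level docstring above.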


class _TestJobEvent(_JobEvent):
    """`_JobEvent` adapter for regular test jobs.

    This class wraps the standard `_JobEvent` interface around a row
    in the `afe_host_queue_entries` table.

    """

    @classmethod
    def get_hqes(cls, afe, host_id, start_time, end_time):
        """Return HQEs for a host in a given time range.

        Return a list of `_TestJobEvent` objects representing all the
        HQEs of all the jobs that ran on the given host in the given
        time range.  The list is ordered as it was returned by the
        query (i.e. unordered).

        @param afe         Autotest frontend
        @param host_id     Database host id of the desired host.
        @param start_time  Start time of the range of interest.
        @param end_time    End time of the range of interest.

        @return A list of `_TestJobEvent` objects.

        """
        query_start = time_utils.epoch_time_to_date_string(start_time)
        query_end = time_utils.epoch_time_to_date_string(end_time)
        hqelist = afe.get_host_queue_entries_by_insert_time(
                host_id=host_id,
                insert_time_after=query_start,
                insert_time_before=query_end,
                started_on__gte=query_start,
                started_on__lte=query_end,
                complete=1)
        return [cls(hqe) for hqe in hqelist]


    def __init__(self, hqe):
        self._hqe = hqe
        super(_TestJobEvent, self).__init__(
                hqe.started_on, hqe.finished_on)


    @property
    def id(self):
        return self._hqe.id


    @property
    def name(self):
        return self._hqe.job.name


    @property
    def job_status(self):
        return self._hqe.status


    @property
    def logdir(self):
        return _get_job_logdir(self._hqe.job)


    @property
    def job_id(self):
        return self._hqe.job.id


    @property
    def diagnosis(self):
        return UNKNOWN


    @property
    def is_special(self):
        return False
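

# The two adapters above are combined by `HostJobHistory` below; a sketch
# of the merge, mirroring `HostJobHistory._get_history()` (hypothetical
# `afe`, `host_id`, `start` and `end` values):
#
#     events = (_SpecialTaskEvent.get_tasks(afe, host_id, start, end) +
#               _TestJobEvent.get_hqes(afe, host_id, start, end))
#     events.sort(reverse=True)   # newest first, via _JobEvent.__cmp__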


class HostJobHistory(object):
    """Class to query and remember DUT execution and status history.

    This class is responsible for querying the database to determine
    the history of a single DUT in a time interval of interest, and
    for remembering the query results for reporting.

    @property hostname    Host name of the DUT.
    @property start_time  Start of the requested time interval, as a unix
                          timestamp (epoch time).  This field may be
                          `None`.
    @property end_time    End of the requested time interval, as a unix
                          timestamp (epoch time).
    @property _afe        Autotest frontend for queries.
    @property _host       Database host object for the DUT.
    @property _history    A list of jobs and special tasks that
                          ran on the DUT in the requested time
                          interval, ordered in reverse, from latest
                          to earliest.

    @property _status_interval   A list of all the jobs and special
                                 tasks that ran on the DUT in the
                                 last diagnosis interval prior to
                                 `end_time`, ordered from latest to
                                 earliest.
    @property _status_diagnosis  The DUT's status as of `end_time`.
    @property _status_task       The DUT's last status task as of
                                 `end_time`.

    """

    @classmethod
    def get_host_history(cls, afe, hostname, start_time, end_time):
        """Create a `HostJobHistory` instance for a single host.

        Simple factory method to construct host history from a
        hostname.  Simply looks up the host in the AFE database, and
        passes it to the class constructor.

        @param afe         Autotest frontend
        @param hostname    Name of the host.
        @param start_time  Start time for the history's time interval.
        @param end_time    End time for the history's time interval.

        @return A new `HostJobHistory` instance.

        """
        afehost = afe.get_hosts(hostname=hostname)[0]
        return cls(afe, afehost, start_time, end_time)


    @classmethod
    def get_multiple_histories(cls, afe, start_time, end_time, labels=()):
        """Create `HostJobHistory` instances for a set of hosts.

        @param afe         Autotest frontend
        @param start_time  Start time for the history's time interval.
        @param end_time    End time for the history's time interval.
        @param labels      type: [str]. AFE labels to constrain the host
                           query.  This option must be non-empty.  An
                           unconstrained search of the DB is too costly.

        @return A list of new `HostJobHistory` instances.

        """
        assert labels, (
            'Must specify labels for get_multiple_histories. '
            'Unconstrained search of the database is prohibitively costly.')

        kwargs = {'multiple_labels': labels}
        hosts = afe.get_hosts(**kwargs)
        return [cls(afe, h, start_time, end_time) for h in hosts]


    def __init__(self, afe, afehost, start_time, end_time):
        self._afe = afe
        self.hostname = afehost.hostname
        self.end_time = end_time
        self.start_time = start_time
        self._host = afehost
        # Don't spend time on queries until they're needed.
        self._history = None
        self._status_interval = None
        self._status_diagnosis = None
        self._status_task = None


    def _get_history(self, start_time, end_time):
        """Get the list of events for the given interval."""
        newtasks = _SpecialTaskEvent.get_tasks(
                self._afe, self._host.id, start_time, end_time)
        newhqes = _TestJobEvent.get_hqes(
                self._afe, self._host.id, start_time, end_time)
        newhistory = newtasks + newhqes
        newhistory.sort(reverse=True)
        return newhistory


    def __iter__(self):
        if self._history is None:
            self._history = self._get_history(self.start_time,
                                              self.end_time)
        return self._history.__iter__()


    def _extract_prefixed_label(self, prefix):
        labels = [l for l in self._host.labels
                      if l.startswith(prefix)]
        return labels[0][len(prefix) : ] if labels else None


    @property
    def host(self):
        """Return the AFE host object for this history."""
        return self._host


    @property
    def host_model(self):
        """Return the model name for this history's DUT."""
        prefix = constants.Labels.MODEL_PREFIX
        return self._extract_prefixed_label(prefix)


    @property
    def host_board(self):
        """Return the board name for this history's DUT."""
        prefix = constants.Labels.BOARD_PREFIX
        return self._extract_prefixed_label(prefix)


    @property
    def host_pool(self):
        """Return the pool name for this history's DUT."""
        prefix = constants.Labels.POOL_PREFIX
        return self._extract_prefixed_label(prefix)


    def _init_status_task(self):
        """Fill in `self._status_diagnosis` and `_status_task`."""
        if self._status_diagnosis is not None:
            return
        self._status_task = _SpecialTaskEvent.get_status_task(
                self._afe, self._host.id, self.end_time)
        if self._status_task is not None:
            self._status_diagnosis = self._status_task.diagnosis
        else:
            self._status_diagnosis = UNKNOWN


    def _init_status_interval(self):
        """Fill in `self._status_interval`."""
        if self._status_interval is not None:
            return
        self._init_status_task()
        self._status_interval = []
        if self._status_task is None:
            return
        query_end = time_utils.epoch_time_to_date_string(self.end_time)
        interval = self._afe.get_host_diagnosis_interval(
                self._host.id, query_end,
                self._status_diagnosis != WORKING)
        if not interval:
            return
        self._status_interval = self._get_history(
                parse_time(interval[0]),
                parse_time(interval[1]))
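

    # Worked example (hypothetical timeline) of the interval computed by
    # `_init_status_interval()` and returned by `diagnosis_interval()`
    # below.  If a DUT's recent history is
    #
    #     10:00  Cleanup   PASS   (status task, WORKING)
    #     10:05  test job  FAIL
    #     10:20  Repair    FAIL   (status task, BROKEN)
    #
    # then the diagnosis interval runs from the 10:00 Cleanup to the
    # 10:20 Repair, and the events inside it are the ones most likely to
    # hold the logs explaining the failure.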


    def diagnosis_interval(self):
        """Find this history's most recent diagnosis interval.

        Returns a list of `_JobEvent` instances corresponding to the
        most recent diagnosis interval occurring before this
        history's end time.

        The list is returned as with `self._history`, ordered from
        most to least recent.

        @return The list of the `_JobEvent`s in the diagnosis
                interval.

        """
        self._init_status_interval()
        return self._status_interval


    def last_diagnosis(self):
        """Return the diagnosis of whether the DUT is working.

        This searches the DUT's job history, looking for the most
        recent status task for the DUT.  Return a tuple of
        `(diagnosis, task)`.

        The `diagnosis` entry in the tuple is one of these values:
          * UNUSED - The host's last status task is older than
            `self.start_time`.
          * WORKING - The DUT is working.
          * BROKEN - The DUT likely requires manual intervention.
          * UNKNOWN - No task could be found indicating status for
            the DUT.

        If the DUT was working at last check, but hasn't been used
        inside this history's time interval, the status `UNUSED` is
        returned with the last status task, instead of `WORKING`.

        The `task` entry in the tuple is the status task that led to
        the diagnosis.  The task will be `None` if the diagnosis is
        `UNKNOWN`.

        @return A tuple with the DUT's diagnosis and the task that
                determined it.

        """
        self._init_status_task()
        diagnosis = self._status_diagnosis
        if (self.start_time is not None and
                self._status_task is not None and
                self._status_task.end_time < self.start_time and
                diagnosis == WORKING):
            diagnosis = UNUSED
        return diagnosis, self._status_task
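

# Illustrative sketch (assumed usage; not called anywhere in this module):
# how a caller might summarize one DUT with `HostJobHistory`.  The `afe`
# argument is assumed to be the same frontend AFE RPC proxy that the query
# methods above rely on; the 24-hour window is an arbitrary choice for the
# example.
def _example_summarize_host(afe, hostname, end_time):
    """Example only:  report a DUT's status and recent events as a string."""
    start_time = end_time - 24 * 60 * 60
    history = HostJobHistory.get_host_history(
            afe, hostname, start_time, end_time)
    diagnosis, task = history.last_diagnosis()
    lines = ['%s: %s' % (history.hostname, status_names[diagnosis])]
    if task is not None:
        lines.append('    status task: %s (%s)' % (task.name, task.job_url))
    for event in history:
        lines.append('    %s  %-9s %s' % (
                time_utils.epoch_time_to_date_string(event.start_time),
                event.job_status, event.name))
    return '\n'.join(lines)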


def get_diagnosis_interval(host_id, end_time, success):
    """Return the last diagnosis interval for a given host and time.

    This routine queries the database for the special tasks on a
    given host before a given time.  From those tasks it selects the
    last status task before a change in status, and the first status
    task after the change.  When `success` is true, the change must
    be from "working" to "broken".  When false, the search is for a
    change in the opposite direction.

    A "successful status task" is any successful special task.  A
    "failed status task" is a failed Repair task.  These criteria
    are based on the definition of "status task" in the module-level
    docstring, above.

    This is the RPC endpoint for `AFE.get_host_diagnosis_interval()`.

    @param host_id   Database host id of the desired host.
    @param end_time  Find the last eligible interval before this time.
    @param success   Whether the eligible interval should start with a
                     success or a failure.

    @return A list containing the start time of the earliest job
            selected, and the end time of the latest job.

    """
    base_query = afe_models.SpecialTask.objects.filter(
            host_id=host_id, is_complete=True)
    success_query = base_query.filter(success=True)
    failure_query = base_query.filter(success=False, task='Repair')
    if success:
        query0 = success_query
        query1 = failure_query
    else:
        query0 = failure_query
        query1 = success_query
    query0 = query0.filter(time_finished__lte=end_time)
    query0 = query0.order_by('time_started').reverse()
    if not query0:
        return []
    task0 = query0[0]
    query1 = query1.filter(time_finished__gt=task0.time_finished)
    task1 = query1.order_by('time_started')[0]
    return [task0.time_started.strftime(time_utils.TIME_FMT),
            task1.time_finished.strftime(time_utils.TIME_FMT)]


def get_status_task(host_id, end_time):
    """Get the last status task for a host before a given time.

    This routine returns a Django query for the AFE database to find
    the last task that finished on the given host before the given
    time that was either a successful task, or a Repair task.  The
    query criteria are based on the definition of "status task" in
    the module-level docstring, above.

    This is the RPC endpoint for `_SpecialTaskEvent.get_status_task()`.

    @param host_id   Database host id of the desired host.
    @param end_time  End time of the range of interest.

    @return A Django query-set selecting the single special task of
            interest.

    """
    # Selects status tasks:  any Repair task, or any successful task.
    status_tasks = (django_models.Q(task='Repair') |
                    django_models.Q(success=True))
    # Our caller needs a Django query set in order to serialize the
    # result, so we don't resolve the query here; we just return a
    # slice with at most one element.
    return afe_models.SpecialTask.objects.filter(
            status_tasks,
            host_id=host_id,
            time_finished__lte=end_time,
            is_complete=True).order_by('time_started').reverse()[0:1]


def _get_job_logdir(job):
    """Gets the logdir for an AFE job.

    @param job  Job object which has id and owner properties.

    @return Relative path of the results log directory.
    """
    return '%s-%s' % (job.id, job.owner)


def get_job_gs_url(job):
    """Gets the GS URL for an AFE job.

    @param job  Job object which has id and owner properties.

    @return Absolute GS URL to the results log directory.
    """
    return _JobEvent.get_gs_url(_get_job_logdir(job))
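

# Example of the log-path helpers above (hypothetical job):  for an AFE job
# with id 123 owned by 'chromeos-test', `_get_job_logdir(job)` returns
# '123-chromeos-test', and `get_job_gs_url(job)` joins that path to the GS
# offload URI, e.g. gs://<results-bucket>/123-chromeos-test.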