# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Services relating to DUT status and job history.

The central abstraction of this module is the `HostJobHistory`
class. This class provides two related pieces of information
regarding a single DUT:
  * A history of tests and special tasks that have run on
    the DUT in a given time range.
  * Whether the DUT was "working" or "broken" at a given
    time.

The "working" or "broken" status of a DUT is determined by
the DUT's special task history. At the end of any job or
task, the status is indicated as follows:
  * After any successful special task, the DUT is considered
    "working".
  * After any failed Repair task, the DUT is considered "broken".
  * After any other special task or after any regular test job, the
    DUT's status is considered unchanged.

Definitions for terms used in the code below:
  * status task - Any special task that determines the DUT's
    status; that is, any successful task, or any failed Repair.
  * diagnosis interval - A time interval during which DUT status
    changed either from "working" to "broken", or vice versa. The
    interval starts with the last status task with the old status,
    and ends after the first status task with the new status.

Diagnosis intervals are interesting because they normally contain
the logs explaining a failure or repair event.

"""

import common
import os
from autotest_lib.frontend import setup_django_environment
from django.db import models as django_models

from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib import utils
from autotest_lib.client.common_lib import time_utils
from autotest_lib.frontend.afe import models as afe_models
from autotest_lib.frontend.afe import rpc_client_lib
from autotest_lib.server import constants


# Values used to describe the diagnosis of a DUT. These values are
# used to indicate both DUT status after a job or task, and also
# diagnosis of whether the DUT was working at the end of a given
# time interval.
#
# UNUSED: Used when there are no events recorded in a given
#     time interval.
# UNKNOWN: For an individual event, indicates that the DUT status
#     is unchanged from the previous event. For a time interval,
#     indicates that the DUT's status can't be determined from the
#     DUT's history.
# WORKING: Indicates that the DUT was working normally after the
#     event, or at the end of the time interval.
# BROKEN: Indicates that the DUT needed manual repair after the
#     event, or at the end of the time interval.
#
UNUSED = 0
UNKNOWN = 1
WORKING = 2
BROKEN = 3
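

# A minimal usage sketch (illustration only, not executed here): given
# an AFE RPC client `afe` and an epoch-time interval, a DUT's history
# and diagnosis might be queried roughly as follows. The hostname and
# variable names below are hypothetical.
#
#     history = HostJobHistory.get_host_history(
#             afe, 'chromeos1-row1-rack1-host1', start_time, end_time)
#     for event in history:
#         print event.name, event.job_status, event.job_url
#     diagnosis, task = history.last_diagnosis()
#     if diagnosis == BROKEN and task is not None:
#         print 'DUT broken; see', task.job_url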


def parse_time(time_string):
    """Parse time according to a canonical form.

    The "canonical" form is the form in which date/time
    values are stored in the database.

    @param time_string Time to be parsed.
    """
    return int(time_utils.to_epoch_time(time_string))


class _JobEvent(object):
    """Information about an event in host history.

    This remembers the relevant data from a single event in host
    history. An event is any change in DUT state caused by a job
    or special task. The data captured are the start and end times
    of the event, the URL of logs to the job or task causing the
    event, and a diagnosis of whether the DUT was working or failed
    afterwards.

    This class is an adapter around the database model objects
    describing jobs and special tasks. This is an abstract
    superclass, with concrete subclasses for `HostQueueEntry` and
    `SpecialTask` objects.

    @property start_time    Time the job or task began execution.
    @property end_time      Time the job or task finished execution.
    @property id            id of the event in the AFE database.
    @property name          Name of the event, derived from the AFE database.
    @property job_status    Short string describing the event's final status.
    @property logdir        Relative path to the logs for the event's job.
    @property job_url       URL to the logs for the event's job.
    @property gs_url        GS URL to the logs for the event's job.
    @property job_id        id of the AFE job for HQEs. None otherwise.
    @property diagnosis     Working status of the DUT after the event.
    @property is_special    Boolean indicating if the event is a special task.

    """

    get_config_value = global_config.global_config.get_config_value
    _LOG_URL_PATTERN = get_config_value('CROS', 'log_url_pattern')

    @classmethod
    def get_log_url(cls, afe_hostname, logdir):
        """Return a URL to job results.

        The URL is constructed from a base URL determined by the
        global config, plus the relative path of the job's log
        directory.

        @param afe_hostname Hostname for autotest frontend
        @param logdir Relative path of the results log directory.

        @return A URL to the requested results log.

        """
        return cls._LOG_URL_PATTERN % (
                rpc_client_lib.add_protocol(afe_hostname),
                logdir,
        )


    @classmethod
    def get_gs_url(cls, logdir):
        """Return a GS URL to job results.

        The URL is constructed from a base URL determined by the
        global config, plus the relative path of the job's log
        directory.

        @param logdir Relative path of the results log directory.

        @return A URL to the requested results log.

        """
        return os.path.join(utils.get_offload_gsuri(), logdir)
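

    # Illustrative example (hypothetical values): with a configured
    # 'log_url_pattern' of '%s/results/%s' and a `logdir` of
    # 'hosts/host1/123-repair', get_log_url('cautotest', logdir) would
    # return 'http://cautotest/results/hosts/host1/123-repair', since
    # rpc_client_lib.add_protocol() supplies the missing 'http://'.
    # get_gs_url(logdir) instead joins the offload bucket from
    # utils.get_offload_gsuri() with `logdir`, e.g.
    # 'gs://<results-bucket>/hosts/host1/123-repair'.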


    def __init__(self, start_time, end_time):
        self.start_time = parse_time(start_time)
        self.end_time = parse_time(end_time)


    def __cmp__(self, other):
        """Compare two jobs by their start time.

        This is a standard Python `__cmp__` method to allow sorting
        `_JobEvent` objects by their times.

        @param other The `_JobEvent` object to compare to `self`.

        """
        return self.start_time - other.start_time


    @property
    def id(self):
        """Return the id of the event in the AFE database."""
        raise NotImplementedError()


    @property
    def name(self):
        """Return the name of the event."""
        raise NotImplementedError()


    @property
    def job_status(self):
        """Return a short string describing the event's final status."""
        raise NotImplementedError()


    @property
    def logdir(self):
        """Return the relative path for this event's job logs."""
        raise NotImplementedError()


    @property
    def job_url(self):
        """Return the URL for this event's job logs."""
        raise NotImplementedError()


    @property
    def gs_url(self):
        """Return the GS URL for this event's job logs."""
        raise NotImplementedError()


    @property
    def job_id(self):
        """Return the id of the AFE job for HQEs. None otherwise."""
        raise NotImplementedError()


    @property
    def diagnosis(self):
        """Return the status of the DUT after this event.

        The diagnosis is interpreted as follows:
          UNKNOWN - The DUT status was the same before and after
              the event.
          WORKING - The DUT appeared to be working after the event.
          BROKEN - The DUT likely required manual intervention
              after the event.

        @return A valid diagnosis value.

        """
        raise NotImplementedError()


    @property
    def is_special(self):
        """Return whether the event is for a special task."""
        raise NotImplementedError()
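

# The `diagnosis` rules above map concrete events to values roughly as
# follows (task names are examples of common special tasks; the exact
# set depends on the scheduler):
#     successful 'Verify' task   -> WORKING
#     failed 'Repair' task       -> BROKEN
#     failed 'Cleanup' task      -> UNKNOWN (status unchanged)
#     any test job (HQE)         -> UNKNOWN (status unchanged)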


class _SpecialTaskEvent(_JobEvent):
    """`_JobEvent` adapter for special tasks.

    This class wraps the standard `_JobEvent` interface around a row
    in the `afe_special_tasks` table.

    """

    @classmethod
    def get_tasks(cls, afe, host_id, start_time, end_time):
        """Return special tasks for a host in a given time range.

        Return a list of `_SpecialTaskEvent` objects representing all
        special tasks that ran on the given host in the given time
        range. The list is ordered as it was returned by the query
        (i.e. unordered).

        @param afe Autotest frontend
        @param host_id Database host id of the desired host.
        @param start_time Start time of the range of interest.
        @param end_time End time of the range of interest.

        @return A list of `_SpecialTaskEvent` objects.

        """
        query_start = time_utils.epoch_time_to_date_string(start_time)
        query_end = time_utils.epoch_time_to_date_string(end_time)
        tasks = afe.get_host_special_tasks(
                host_id,
                time_started__gte=query_start,
                time_finished__lte=query_end,
                is_complete=1)
        return [cls(afe.server, t) for t in tasks]


    @classmethod
    def get_status_task(cls, afe, host_id, end_time):
        """Return the task indicating a host's status at a given time.

        The task returned determines the status of the DUT; the
        diagnosis on the task indicates the diagnosis for the DUT at
        the given `end_time`.

        @param afe Autotest frontend
        @param host_id Database host id of the desired host.
        @param end_time Find status as of this time.

        @return A `_SpecialTaskEvent` object for the requested task,
                or `None` if no task was found.

        """
        query_end = time_utils.epoch_time_to_date_string(end_time)
        task = afe.get_host_status_task(host_id, query_end)
        return cls(afe.server, task) if task else None


    def __init__(self, afe_hostname, afetask):
        self._afe_hostname = afe_hostname
        self._afetask = afetask
        super(_SpecialTaskEvent, self).__init__(
                afetask.time_started, afetask.time_finished)


    @property
    def id(self):
        return self._afetask.id


    @property
    def name(self):
        return self._afetask.task


    @property
    def job_status(self):
        if self._afetask.is_aborted:
            return 'ABORTED'
        elif self._afetask.success:
            return 'PASS'
        else:
            return 'FAIL'


    @property
    def logdir(self):
        return ('hosts/%s/%s-%s' %
                (self._afetask.host.hostname, self._afetask.id,
                 self._afetask.task.lower()))


    @property
    def job_url(self):
        return _SpecialTaskEvent.get_log_url(self._afe_hostname, self.logdir)


    @property
    def gs_url(self):
        return _SpecialTaskEvent.get_gs_url(self.logdir)


    @property
    def job_id(self):
        return None


    @property
    def diagnosis(self):
        if self._afetask.success:
            return WORKING
        elif self._afetask.task == 'Repair':
            return BROKEN
        else:
            return UNKNOWN


    @property
    def is_special(self):
        return True
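

# For example (hypothetical data), a failed, non-aborted 'Repair' task
# with id 123 on host 'chromeos1-row1-rack1-host1' would report
# job_status 'FAIL', diagnosis BROKEN, and a logdir of
# 'hosts/chromeos1-row1-rack1-host1/123-repair'.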


class _TestJobEvent(_JobEvent):
    """`_JobEvent` adapter for regular test jobs.

    This class wraps the standard `_JobEvent` interface around a row
    in the `afe_host_queue_entries` table.

    """

    @classmethod
    def get_hqes(cls, afe, host_id, start_time, end_time):
        """Return HQEs for a host in a given time range.

        Return a list of `_TestJobEvent` objects representing all the
        HQEs of all the jobs that ran on the given host in the given
        time range. The list is ordered as it was returned by the
        query (i.e. unordered).

        @param afe Autotest frontend
        @param host_id Database host id of the desired host.
        @param start_time Start time of the range of interest.
        @param end_time End time of the range of interest.

        @return A list of `_TestJobEvent` objects.

        """
        query_start = time_utils.epoch_time_to_date_string(start_time)
        query_end = time_utils.epoch_time_to_date_string(end_time)
        hqelist = afe.get_host_queue_entries_by_insert_time(
                host_id=host_id,
                insert_time_after=query_start,
                insert_time_before=query_end,
                started_on__gte=query_start,
                started_on__lte=query_end,
                complete=1)
        return [cls(afe.server, hqe) for hqe in hqelist]


    def __init__(self, afe_hostname, hqe):
        self._afe_hostname = afe_hostname
        self._hqe = hqe
        super(_TestJobEvent, self).__init__(
                hqe.started_on, hqe.finished_on)


    @property
    def id(self):
        return self._hqe.id


    @property
    def name(self):
        return self._hqe.job.name


    @property
    def job_status(self):
        return self._hqe.status


    @property
    def logdir(self):
        return _get_job_logdir(self._hqe.job)


    @property
    def job_url(self):
        return _TestJobEvent.get_log_url(self._afe_hostname, self.logdir)


    @property
    def gs_url(self):
        return _TestJobEvent.get_gs_url(self.logdir)


    @property
    def job_id(self):
        return self._hqe.job.id


    @property
    def diagnosis(self):
        return UNKNOWN


    @property
    def is_special(self):
        return False


class HostJobHistory(object):
    """Class to query and remember DUT execution and status history.

    This class is responsible for querying the database to determine
    the history of a single DUT in a time interval of interest, and
    for remembering the query results for reporting.

    @property hostname          Host name of the DUT.
    @property start_time        Start of the requested time interval, as a
                                unix timestamp (epoch time). This field may
                                be `None`.
    @property end_time          End of the requested time interval, as a
                                unix timestamp (epoch time).
    @property _afe              Autotest frontend for queries.
    @property _host             Database host object for the DUT.
    @property _history          A list of jobs and special tasks that ran on
                                the DUT in the requested time interval,
                                ordered in reverse, from latest to earliest.

    @property _status_interval  A list of all the jobs and special tasks
                                that ran on the DUT in the last diagnosis
                                interval prior to `end_time`, ordered from
                                latest to earliest.
    @property _status_diagnosis The DUT's status as of `end_time`.
    @property _status_task      The DUT's last status task as of `end_time`.

    """

    @classmethod
    def get_host_history(cls, afe, hostname, start_time, end_time):
        """Create a `HostJobHistory` instance for a single host.

        Factory method that looks up the host by name in the AFE
        database and passes it to the class constructor.

        @param afe Autotest frontend
        @param hostname Name of the host.
        @param start_time Start time for the history's time interval.
        @param end_time End time for the history's time interval.

        @return A new `HostJobHistory` instance.

        """
        afehost = afe.get_hosts(hostname=hostname)[0]
        return cls(afe, afehost, start_time, end_time)


    @classmethod
    def get_multiple_histories(cls, afe, start_time, end_time, labels=()):
        """Create `HostJobHistory` instances for a set of hosts.

        @param afe Autotest frontend
        @param start_time Start time for the history's time interval.
        @param end_time End time for the history's time interval.
        @param labels type: [str]. AFE labels to constrain the host query.
                      This option must be non-empty. An unconstrained
                      search of the DB is too costly.

        @return A list of new `HostJobHistory` instances.

        """
        assert labels, (
            'Must specify labels for get_multiple_histories. '
            'Unconstrained search of the database is prohibitively costly.')

        kwargs = {'multiple_labels': labels}
        hosts = afe.get_hosts(**kwargs)
        return [cls(afe, h, start_time, end_time) for h in hosts]
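

    # Hypothetical example: histories for every DUT in the bvt pool of a
    # given board could be gathered with something like
    #     HostJobHistory.get_multiple_histories(
    #             afe, start_time, end_time,
    #             labels=['board:eve', 'pool:bvt'])
    # (label names are illustrative; hosts must carry all given labels).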


    def __init__(self, afe, afehost, start_time, end_time):
        self._afe = afe
        self.hostname = afehost.hostname
        self.end_time = end_time
        self.start_time = start_time
        self._host = afehost
        # Don't spend time on queries until they're needed.
        self._history = None
        self._status_interval = None
        self._status_diagnosis = None
        self._status_task = None


    def _get_history(self, start_time, end_time):
        """Get the list of events for the given interval."""
        newtasks = _SpecialTaskEvent.get_tasks(
                self._afe, self._host.id, start_time, end_time)
        newhqes = _TestJobEvent.get_hqes(
                self._afe, self._host.id, start_time, end_time)
        newhistory = newtasks + newhqes
        newhistory.sort(reverse=True)
        return newhistory


    def __iter__(self):
        if self._history is None:
            self._history = self._get_history(self.start_time,
                                               self.end_time)
        return self._history.__iter__()


    def _extract_prefixed_label(self, prefix):
        labels = [l for l in self._host.labels
                  if l.startswith(prefix)]
        return labels[0][len(prefix):] if labels else None


    @property
    def host(self):
        """Return the AFE host object for this history."""
        return self._host


    @property
    def host_model(self):
        """Return the model name for this history's DUT."""
        prefix = constants.Labels.MODEL_PREFIX
        return self._extract_prefixed_label(prefix)


    @property
    def host_board(self):
        """Return the board name for this history's DUT."""
        prefix = constants.Labels.BOARD_PREFIX
        return self._extract_prefixed_label(prefix)


    @property
    def host_pool(self):
        """Return the pool name for this history's DUT."""
        prefix = constants.Labels.POOL_PREFIX
        return self._extract_prefixed_label(prefix)
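

    # As an illustration, a host whose AFE labels include 'board:eve',
    # 'model:eve', and 'pool:bvt' would report host_board 'eve',
    # host_model 'eve', and host_pool 'bvt', assuming the standard
    # 'board:', 'model:', and 'pool:' prefixes in `constants.Labels`.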


    def _init_status_task(self):
        """Fill in `self._status_diagnosis` and `_status_task`."""
        if self._status_diagnosis is not None:
            return
        self._status_task = _SpecialTaskEvent.get_status_task(
                self._afe, self._host.id, self.end_time)
        if self._status_task is not None:
            self._status_diagnosis = self._status_task.diagnosis
        else:
            self._status_diagnosis = UNKNOWN


    def _init_status_interval(self):
        """Fill in `self._status_interval`."""
        if self._status_interval is not None:
            return
        self._init_status_task()
        self._status_interval = []
        if self._status_task is None:
            return
        query_end = time_utils.epoch_time_to_date_string(self.end_time)
        interval = self._afe.get_host_diagnosis_interval(
                self._host.id, query_end,
                self._status_diagnosis != WORKING)
        if not interval:
            return
        self._status_interval = self._get_history(
                parse_time(interval[0]),
                parse_time(interval[1]))


    def diagnosis_interval(self):
        """Find this history's most recent diagnosis interval.

        Returns a list of `_JobEvent` instances corresponding to the
        most recent diagnosis interval occurring before this
        history's end time.

        The list is returned as with `self._history`, ordered from
        most to least recent.

        @return The list of the `_JobEvent`s in the diagnosis
                interval.

        """
        self._init_status_interval()
        return self._status_interval


    def last_diagnosis(self):
        """Return the diagnosis of whether the DUT is working.

        This searches the DUT's job history, looking for the most
        recent status task for the DUT. Return a tuple of
        `(diagnosis, task)`.

        The `diagnosis` entry in the tuple is one of these values:
          * UNUSED - The host's last status task is older than
            `self.start_time`.
          * WORKING - The DUT is working.
          * BROKEN - The DUT likely requires manual intervention.
          * UNKNOWN - No task could be found indicating status for
            the DUT.

        If the DUT was working at last check, but hasn't been used
        inside this history's time interval, the status `UNUSED` is
        returned with the last status task, instead of `WORKING`.

        The `task` entry in the tuple is the status task that led to
        the diagnosis. The task will be `None` if the diagnosis is
        `UNKNOWN`.

        @return A tuple with the DUT's diagnosis and the task that
                determined it.

        """
        self._init_status_task()
        diagnosis = self._status_diagnosis
        if (self.start_time is not None and
                self._status_task is not None and
                self._status_task.end_time < self.start_time and
                diagnosis == WORKING):
            diagnosis = UNUSED
        return diagnosis, self._status_task
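

# A worked example of a diagnosis interval (hypothetical timeline):
# suppose a host's most recent status tasks before `end_time` are a
# successful Verify that finished at 10:00 and a failed Repair that
# finished at 12:00. The DUT's diagnosis at `end_time` is BROKEN, and
# get_diagnosis_interval(host_id, end_time, success=True) would return
# [<start of the Verify task>, <finish of the Repair task>]. The events
# between those two times are what HostJobHistory.diagnosis_interval()
# reports, and their logs normally explain the failure.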


def get_diagnosis_interval(host_id, end_time, success):
    """Return the last diagnosis interval for a given host and time.

    This routine queries the database for the special tasks on a
    given host before a given time. From those tasks it selects the
    last status task before a change in status, and the first status
    task after the change. When `success` is true, the change must
    be from "working" to "broken". When false, the search is for a
    change in the opposite direction.

    A "successful status task" is any successful special task. A
    "failed status task" is a failed Repair task. These criteria
    are based on the definition of "status task" in the module-level
    docstring, above.

    This is the RPC endpoint for `AFE.get_host_diagnosis_interval()`.

    @param host_id Database host id of the desired host.
    @param end_time Find the last eligible interval before this time.
    @param success Whether the eligible interval should start with a
                   success or a failure.

    @return A list containing the start time of the earliest job
            selected, and the end time of the latest job.

    """
    base_query = afe_models.SpecialTask.objects.filter(
            host_id=host_id, is_complete=True)
    success_query = base_query.filter(success=True)
    failure_query = base_query.filter(success=False, task='Repair')
    if success:
        query0 = success_query
        query1 = failure_query
    else:
        query0 = failure_query
        query1 = success_query
    query0 = query0.filter(time_finished__lte=end_time)
    query0 = query0.order_by('time_started').reverse()
    if not query0:
        return []
    task0 = query0[0]
    query1 = query1.filter(time_finished__gt=task0.time_finished)
    task1 = query1.order_by('time_started')[0]
    return [task0.time_started.strftime(time_utils.TIME_FMT),
            task1.time_finished.strftime(time_utils.TIME_FMT)]


def get_status_task(host_id, end_time):
    """Get the last status task for a host before a given time.

    This routine returns a Django query for the AFE database to find
    the last task that finished on the given host before the given
    time that was either a successful task, or a Repair task. The
    query criteria are based on the definition of "status task" in
    the module-level docstring, above.

    This is the RPC endpoint for `_SpecialTaskEvent.get_status_task()`.

    @param host_id Database host id of the desired host.
    @param end_time End time of the range of interest.

    @return A Django query-set selecting the single special task of
            interest.

    """
    # Selects status tasks: any Repair task, or any successful task.
    status_tasks = (django_models.Q(task='Repair') |
                    django_models.Q(success=True))
    # Our caller needs a Django query set in order to serialize the
    # result, so we don't resolve the query here; we just return a
    # slice with at most one element.
    return afe_models.SpecialTask.objects.filter(
            status_tasks,
            host_id=host_id,
            time_finished__lte=end_time,
            is_complete=True).order_by('time_started').reverse()[0:1]


def _get_job_logdir(job):
    """Gets the logdir for an AFE job.

    @param job Job object which has id and owner properties.

    @return Relative path of the results log directory.
    """
    return '%s-%s' % (job.id, job.owner)


def get_job_gs_url(job):
    """Gets the GS URL for an AFE job.

    @param job Job object which has id and owner properties.

    @return Absolute GS URL to the results log directory.
    """
    return _JobEvent.get_gs_url(_get_job_logdir(job))
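

# Example (hypothetical values): for an AFE job with id 12345 owned by
# 'chromeos-test', _get_job_logdir() returns '12345-chromeos-test', and
# get_job_gs_url() joins that onto the offload bucket from
# utils.get_offload_gsuri(), e.g. 'gs://<results-bucket>/12345-chromeos-test'.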