# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Services relating to DUT status and job history.

The central abstraction of this module is the `HostJobHistory`
class.  This class provides two related pieces of information
regarding a single DUT:
  * A history of tests and special tasks that have run on
    the DUT in a given time range.
  * Whether the DUT was "working" or "broken" at a given
    time.

The "working" or "broken" status of a DUT is determined by
the DUT's special task history.  At the end of any job or
task, the status is indicated as follows:
  * After any successful special task, the DUT is considered
    "working".
  * After any failed Repair task, the DUT is considered "broken".
  * After any other special task or after any regular test job, the
    DUT's status is considered unchanged.

Definitions for terms used in the code below:
  * status task - Any special task that determines the DUT's
    status; that is, any successful task, or any failed Repair.
  * diagnosis interval - A time interval during which DUT status
    changed either from "working" to "broken", or vice versa.  The
    interval starts with the last status task with the old status,
    and ends after the first status task with the new status.

Diagnosis intervals are interesting because they normally contain
the logs explaining a failure or repair event.

"""

import common
import os
from autotest_lib.frontend import setup_django_environment
from django.db import models as django_models

from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib import utils
from autotest_lib.client.common_lib import time_utils
from autotest_lib.frontend.afe import models as afe_models
from autotest_lib.server import constants


# Values used to describe the diagnosis of a DUT.  These values are
# used to indicate both DUT status after a job or task, and also
# diagnosis of whether the DUT was working at the end of a given
# time interval.
#
# UNUSED:  Used when there are no events recorded in a given
#     time interval.
# UNKNOWN:  For an individual event, indicates that the DUT status
#     is unchanged from the previous event.  For a time interval,
#     indicates that the DUT's status can't be determined from the
#     DUT's history.
# WORKING:  Indicates that the DUT was working normally after the
#     event, or at the end of the time interval.
# BROKEN:  Indicates that the DUT needed manual repair after the
#     event, or at the end of the time interval.
#
UNUSED = 0
UNKNOWN = 1
WORKING = 2
BROKEN = 3


def parse_time(time_string):
    """Parse time according to a canonical form.

    The "canonical" form is the form in which date/time
    values are stored in the database.

    @param time_string  Time to be parsed.
    """
    return int(time_utils.to_epoch_time(time_string))


class _JobEvent(object):
    """Information about an event in host history.

    This remembers the relevant data from a single event in host
    history.  An event is any change in DUT state caused by a job
    or special task.  The data captured are the start and end times
    of the event, the URL of logs to the job or task causing the
    event, and a diagnosis of whether the DUT was working or failed
    afterwards.

    This class is an adapter around the database model objects
    describing jobs and special tasks.  This is an abstract
    superclass, with concrete subclasses for `HostQueueEntry` and
    `SpecialTask` objects.

    @property start_time  Time the job or task began execution.
    @property end_time    Time the job or task finished execution.
    @property id          id of the event in the AFE database.
    @property name        Name of the event, derived from the AFE database.
    @property job_status  Short string describing the event's final status.
    @property logdir      Relative path to the logs for the event's job.
    @property job_url     URL to the logs for the event's job.
    @property gs_url      GS URL to the logs for the event's job.
    @property job_id      id of the AFE job for HQEs.  None otherwise.
    @property diagnosis   Working status of the DUT after the event.
    @property is_special  Boolean indicating if the event is a special task.

    """

    get_config_value = global_config.global_config.get_config_value
    _LOG_URL_PATTERN = ('%s/browse/chromeos-autotest-results/%%s/'
                        % get_config_value('AUTOTEST_WEB', 'stainless_url',
                                           default=None))

    @classmethod
    def get_gs_url(cls, logdir):
        """Return a GS URL to job results.

        The URL is constructed from a base URL determined by the
        global config, plus the relative path of the job's log
        directory.

        @param logdir Relative path of the results log directory.

        @return A URL to the requested results log.

        """
        return os.path.join(utils.get_offload_gsuri(), logdir)


    def __init__(self, start_time, end_time):
        self.start_time = parse_time(start_time)
        self.end_time = parse_time(end_time)


    def __cmp__(self, other):
        """Compare two jobs by their start time.

        This is a standard Python `__cmp__` method to allow sorting
        `_JobEvent` objects by their times.

        @param other The `_JobEvent` object to compare to `self`.

        """
        return self.start_time - other.start_time


    def __lt__(self, other):
        """Order two jobs by their start time.

        Python 3 ignores `__cmp__` and relies on rich comparisons
        for sorting; this makes `list.sort()` over `_JobEvent`
        objects work under both Python 2 and Python 3.

        @param other The `_JobEvent` object to compare to `self`.

        """
        return self.start_time < other.start_time


    @property
    def id(self):
        """Return the id of the event in the AFE database."""
        raise NotImplementedError()


    @property
    def name(self):
        """Return the name of the event."""
        raise NotImplementedError()


    @property
    def job_status(self):
        """Return a short string describing the event's final status."""
        raise NotImplementedError()


    @property
    def logdir(self):
        """Return the relative path for this event's job logs."""
        raise NotImplementedError()


    @property
    def job_url(self):
        """Return the URL for this event's job logs."""
        return self._LOG_URL_PATTERN % self.logdir


    @property
    def gs_url(self):
        """Return the GS URL for this event's job logs."""
        return self.get_gs_url(self.logdir)


    @property
    def job_id(self):
        """Return the id of the AFE job for HQEs.  None otherwise."""
        raise NotImplementedError()


    @property
    def diagnosis(self):
        """Return the status of the DUT after this event.

        The diagnosis is interpreted as follows:
          UNKNOWN - The DUT status was the same before and after
              the event.
          WORKING - The DUT appeared to be working after the event.
          BROKEN - The DUT likely required manual intervention
              after the event.

        @return A valid diagnosis value.

        """
        raise NotImplementedError()


    @property
    def is_special(self):
        """Return if the event is for a special task."""
        raise NotImplementedError()


class _SpecialTaskEvent(_JobEvent):
    """`_JobEvent` adapter for special tasks.

    This class wraps the standard `_JobEvent` interface around a row
    in the `afe_special_tasks` table.

    """

    @classmethod
    def get_tasks(cls, afe, host_id, start_time, end_time):
        """Return special tasks for a host in a given time range.

        Return a list of `_SpecialTaskEvent` objects representing all
        special tasks that ran on the given host in the given time
        range.  The list is ordered as it was returned by the query
        (i.e. unordered).

        @param afe        Autotest frontend
        @param host_id    Database host id of the desired host.
        @param start_time Start time of the range of interest.
        @param end_time   End time of the range of interest.

        @return A list of `_SpecialTaskEvent` objects.

        """
        query_start = time_utils.epoch_time_to_date_string(start_time)
        query_end = time_utils.epoch_time_to_date_string(end_time)
        tasks = afe.get_host_special_tasks(
                host_id,
                time_started__gte=query_start,
                time_finished__lte=query_end,
                is_complete=1)
        return [cls(t) for t in tasks]


    @classmethod
    def get_status_task(cls, afe, host_id, end_time):
        """Return the task indicating a host's status at a given time.

        The task returned determines the status of the DUT; the
        diagnosis on the task indicates the diagnosis for the DUT at
        the given `end_time`.

        @param afe        Autotest frontend
        @param host_id    Database host id of the desired host.
        @param end_time   Find status as of this time.

        @return A `_SpecialTaskEvent` object for the requested task,
                or `None` if no task was found.

        """
        query_end = time_utils.epoch_time_to_date_string(end_time)
        task = afe.get_host_status_task(host_id, query_end)
        return cls(task) if task else None


    def __init__(self, afetask):
        self._afetask = afetask
        super(_SpecialTaskEvent, self).__init__(
                afetask.time_started, afetask.time_finished)


    @property
    def id(self):
        return self._afetask.id


    @property
    def name(self):
        return self._afetask.task


    @property
    def job_status(self):
        if self._afetask.is_aborted:
            return 'ABORTED'
        elif self._afetask.success:
            return 'PASS'
        else:
            return 'FAIL'


    @property
    def logdir(self):
        return ('hosts/%s/%s-%s' %
                (self._afetask.host.hostname, self._afetask.id,
                 self._afetask.task.lower()))


    @property
    def job_id(self):
        return None


    @property
    def diagnosis(self):
        if self._afetask.success:
            return WORKING
        elif self._afetask.task == 'Repair':
            return BROKEN
        else:
            return UNKNOWN


    @property
    def is_special(self):
        return True


class _TestJobEvent(_JobEvent):
    """`_JobEvent` adapter for regular test jobs.

    This class wraps the standard `_JobEvent` interface around a row
    in the `afe_host_queue_entries` table.

    """

    @classmethod
    def get_hqes(cls, afe, host_id, start_time, end_time):
        """Return HQEs for a host in a given time range.

        Return a list of `_TestJobEvent` objects representing all the
        HQEs of all the jobs that ran on the given host in the given
        time range.  The list is ordered as it was returned by the
        query (i.e. unordered).

        @param afe        Autotest frontend
        @param host_id    Database host id of the desired host.
        @param start_time Start time of the range of interest.
        @param end_time   End time of the range of interest.

        @return A list of `_TestJobEvent` objects.

        """
        query_start = time_utils.epoch_time_to_date_string(start_time)
        query_end = time_utils.epoch_time_to_date_string(end_time)
        hqelist = afe.get_host_queue_entries_by_insert_time(
                host_id=host_id,
                insert_time_after=query_start,
                insert_time_before=query_end,
                started_on__gte=query_start,
                started_on__lte=query_end,
                complete=1)
        return [cls(hqe) for hqe in hqelist]


    def __init__(self, hqe):
        self._hqe = hqe
        super(_TestJobEvent, self).__init__(
                hqe.started_on, hqe.finished_on)


    @property
    def id(self):
        return self._hqe.id


    @property
    def name(self):
        return self._hqe.job.name


    @property
    def job_status(self):
        return self._hqe.status


    @property
    def logdir(self):
        return _get_job_logdir(self._hqe.job)


    @property
    def job_id(self):
        return self._hqe.job.id


    @property
    def diagnosis(self):
        return UNKNOWN


    @property
    def is_special(self):
        return False


class HostJobHistory(object):
    """Class to query and remember DUT execution and status history.

    This class is responsible for querying the database to determine
    the history of a single DUT in a time interval of interest, and
    for remembering the query results for reporting.

    @property hostname    Host name of the DUT.
    @property start_time  Start of the requested time interval, as a unix
                          timestamp (epoch time).
                          This field may be `None`.
    @property end_time    End of the requested time interval, as a unix
                          timestamp (epoch time).
    @property _afe        Autotest frontend for queries.
    @property _host       Database host object for the DUT.
    @property _history    A list of jobs and special tasks that
                          ran on the DUT in the requested time
                          interval, ordered in reverse, from latest
                          to earliest.

    @property _status_interval   A list of all the jobs and special
                                 tasks that ran on the DUT in the
                                 last diagnosis interval prior to
                                 `end_time`, ordered from latest to
                                 earliest.
    @property _status_diagnosis  The DUT's status as of `end_time`.
    @property _status_task       The DUT's last status task as of
                                 `end_time`.

    """

    @classmethod
    def get_host_history(cls, afe, hostname, start_time, end_time):
        """Create a `HostJobHistory` instance for a single host.

        Simple factory method to construct host history from a
        hostname.  Simply looks up the host in the AFE database, and
        passes it to the class constructor.

        @param afe        Autotest frontend
        @param hostname   Name of the host.
        @param start_time Start time for the history's time
                          interval.
        @param end_time   End time for the history's time interval.

        @return A new `HostJobHistory` instance.

        """
        afehost = afe.get_hosts(hostname=hostname)[0]
        return cls(afe, afehost, start_time, end_time)


    @classmethod
    def get_multiple_histories(cls, afe, start_time, end_time, labels=()):
        """Create `HostJobHistory` instances for a set of hosts.

        @param afe        Autotest frontend
        @param start_time Start time for the history's time
                          interval.
        @param end_time   End time for the history's time interval.
        @param labels     type: [str]. AFE labels to constrain the host query.
                          This option must be non-empty. An unconstrained
                          search of the DB is too costly.

        @return A list of new `HostJobHistory` instances.

        """
        assert labels, (
            'Must specify labels for get_multiple_histories. '
            'Unconstrained search of the database is prohibitively costly.')

        kwargs = {'multiple_labels': labels}
        hosts = afe.get_hosts(**kwargs)
        return [cls(afe, h, start_time, end_time) for h in hosts]


    def __init__(self, afe, afehost, start_time, end_time):
        self._afe = afe
        self.hostname = afehost.hostname
        self.end_time = end_time
        self.start_time = start_time
        self._host = afehost
        # Don't spend time on queries until they're needed.
        self._history = None
        self._status_interval = None
        self._status_diagnosis = None
        self._status_task = None


    def _get_history(self, start_time, end_time):
        """Get the list of events for the given interval."""
        newtasks = _SpecialTaskEvent.get_tasks(
                self._afe, self._host.id, start_time, end_time)
        newhqes = _TestJobEvent.get_hqes(
                self._afe, self._host.id, start_time, end_time)
        newhistory = newtasks + newhqes
        # Sorts by start time via `_JobEvent` comparison methods;
        # reverse=True yields latest-to-earliest order.
        newhistory.sort(reverse=True)
        return newhistory


    def __iter__(self):
        if self._history is None:
            self._history = self._get_history(self.start_time,
                                              self.end_time)
        return self._history.__iter__()


    def _extract_prefixed_label(self, prefix):
        """Return the suffix of the first host label with `prefix`.

        @param prefix  Label prefix to search for.

        @return The label text following the prefix, or `None` if no
                label with the prefix is attached to the host.
        """
        labels = [l for l in self._host.labels
                  if l.startswith(prefix)]
        return labels[0][len(prefix) : ] if labels else None


    @property
    def host(self):
        """Return the AFE host object for this history."""
        return self._host


    @property
    def host_model(self):
        """Return the model name for this history's DUT."""
        prefix = constants.Labels.MODEL_PREFIX
        return self._extract_prefixed_label(prefix)


    @property
    def host_board(self):
        """Return the board name for this history's DUT."""
        prefix = constants.Labels.BOARD_PREFIX
        return self._extract_prefixed_label(prefix)


    @property
    def host_pool(self):
        """Return the pool name for this history's DUT."""
        prefix = constants.Labels.POOL_PREFIX
        return self._extract_prefixed_label(prefix)


    def _init_status_task(self):
        """Fill in `self._status_diagnosis` and `_status_task`."""
        if self._status_diagnosis is not None:
            return
        self._status_task = _SpecialTaskEvent.get_status_task(
                self._afe, self._host.id, self.end_time)
        if self._status_task is not None:
            self._status_diagnosis = self._status_task.diagnosis
        else:
            self._status_diagnosis = UNKNOWN


    def _init_status_interval(self):
        """Fill in `self._status_interval`."""
        if self._status_interval is not None:
            return
        self._init_status_task()
        self._status_interval = []
        if self._status_task is None:
            return
        query_end = time_utils.epoch_time_to_date_string(self.end_time)
        interval = self._afe.get_host_diagnosis_interval(
                self._host.id, query_end,
                self._status_diagnosis != WORKING)
        if not interval:
            return
        self._status_interval = self._get_history(
                parse_time(interval[0]),
                parse_time(interval[1]))


    def diagnosis_interval(self):
        """Find this history's most recent diagnosis interval.

        Returns a list of `_JobEvent` instances corresponding to the
        most recent diagnosis interval occurring before this
        history's end time.

        The list is returned as with `self._history`, ordered from
        most to least recent.

        @return The list of the `_JobEvent`s in the diagnosis
                interval.

        """
        self._init_status_interval()
        return self._status_interval


    def last_diagnosis(self):
        """Return the diagnosis of whether the DUT is working.

        This searches the DUT's job history, looking for the most
        recent status task for the DUT.  Return a tuple of
        `(diagnosis, task)`.

        The `diagnosis` entry in the tuple is one of these values:
          * UNUSED - The host's last status task is older than
            `self.start_time`.
          * WORKING - The DUT is working.
          * BROKEN - The DUT likely requires manual intervention.
          * UNKNOWN - No task could be found indicating status for
            the DUT.

        If the DUT was working at last check, but hasn't been used
        inside this history's time interval, the status `UNUSED` is
        returned with the last status task, instead of `WORKING`.

        The `task` entry in the tuple is the status task that led to
        the diagnosis.  The task will be `None` if the diagnosis is
        `UNKNOWN`.

        @return A tuple with the DUT's diagnosis and the task that
                determined it.

        """
        self._init_status_task()
        diagnosis = self._status_diagnosis
        if (self.start_time is not None and
                self._status_task is not None and
                self._status_task.end_time < self.start_time and
                diagnosis == WORKING):
            diagnosis = UNUSED
        return diagnosis, self._status_task


def get_diagnosis_interval(host_id, end_time, success):
    """Return the last diagnosis interval for a given host and time.

    This routine queries the database for the special tasks on a
    given host before a given time.  From those tasks it selects the
    last status task before a change in status, and the first status
    task after the change.  When `success` is true, the change must
    be from "working" to "broken".  When false, the search is for a
    change in the opposite direction.

    A "successful status task" is any successful special task.  A
    "failed status task" is a failed Repair task.  These criteria
    are based on the definition of "status task" in the module-level
    docstring, above.

    This is the RPC endpoint for `AFE.get_host_diagnosis_interval()`.

    @param host_id    Database host id of the desired host.
    @param end_time   Find the last eligible interval before this time.
    @param success    Whether the eligible interval should start with a
                      success or a failure.

    @return A list containing the start time of the earliest job
            selected, and the end time of the latest job.

    """
    base_query = afe_models.SpecialTask.objects.filter(
            host_id=host_id, is_complete=True)
    success_query = base_query.filter(success=True)
    failure_query = base_query.filter(success=False, task='Repair')
    if success:
        query0 = success_query
        query1 = failure_query
    else:
        query0 = failure_query
        query1 = success_query
    query0 = query0.filter(time_finished__lte=end_time)
    query0 = query0.order_by('time_started').reverse()
    if not query0:
        return []
    task0 = query0[0]
    query1 = query1.filter(time_finished__gt=task0.time_finished)
    # NOTE(review): assumes a status task of the opposite outcome
    # exists after `task0`; if not, this indexing raises IndexError.
    # Confirm callers rely on that behavior before guarding it.
    task1 = query1.order_by('time_started')[0]
    return [task0.time_started.strftime(time_utils.TIME_FMT),
            task1.time_finished.strftime(time_utils.TIME_FMT)]


def get_status_task(host_id, end_time):
    """Get the last status task for a host before a given time.

    This routine returns a Django query for the AFE database to find
    the last task that finished on the given host before the given
    time that was either a successful task, or a Repair task.  The
    query criteria are based on the definition of "status task" in
    the module-level docstring, above.

    This is the RPC endpoint for `_SpecialTaskEvent.get_status_task()`.

    @param host_id    Database host id of the desired host.
    @param end_time   End time of the range of interest.

    @return A Django query-set selecting the single special task of
            interest.

    """
    # Selects status tasks:  any Repair task, or any successful task.
    status_tasks = (django_models.Q(task='Repair') |
                    django_models.Q(success=True))
    # Our caller needs a Django query set in order to serialize the
    # result, so we don't resolve the query here; we just return a
    # slice with at most one element.
    return afe_models.SpecialTask.objects.filter(
            status_tasks,
            host_id=host_id,
            time_finished__lte=end_time,
            is_complete=True).order_by('time_started').reverse()[0:1]


def _get_job_logdir(job):
    """Gets the logdir for an AFE job.

    @param job Job object which has id and owner properties.

    @return Relative path of the results log directory.
    """
    return '%s-%s' % (job.id, job.owner)


def get_job_gs_url(job):
    """Gets the GS URL for an AFE job.

    @param job Job object which has id and owner properties.

    @return Absolute GS URL to the results log directory.
    """
    return _JobEvent.get_gs_url(_get_job_logdir(job))