# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Services relating to DUT status and job history.

The central abstraction of this module is the `HostJobHistory`
class.  This class provides two related pieces of information
regarding a single DUT:
  * A history of tests and special tasks that have run on
    the DUT in a given time range.
  * Whether the DUT was "working" or "broken" at a given
    time.

The "working" or "broken" status of a DUT is determined by
the DUT's special task history.  At the end of any job or
task, the status is indicated as follows:
  * After any successful special task, the DUT is considered
    "working".
  * After any failed Repair task, the DUT is considered "broken".
  * After any other special task or after any regular test job, the
    DUT's status is considered unchanged.

Definitions for terms used in the code below:
  * status task - Any special task that determines the DUT's
    status; that is, any successful task, or any failed Repair.
  * diagnosis interval - A time interval during which DUT status
    changed either from "working" to "broken", or vice versa.  The
    interval starts with the last status task with the old status,
    and ends after the first status task with the new status.

Diagnosis intervals are interesting because they normally contain
the logs explaining a failure or repair event.

"""
36
37import common
38import os
39from autotest_lib.frontend import setup_django_environment
40from django.db import models as django_models
41
42from autotest_lib.client.common_lib import global_config
43from autotest_lib.client.common_lib import utils
44from autotest_lib.client.common_lib import time_utils
45from autotest_lib.frontend.afe import models as afe_models
46from autotest_lib.frontend.afe import rpc_client_lib
47from autotest_lib.server import constants
48
49
# Values used to describe the diagnosis of a DUT.  These values are
# used to indicate both DUT status after a job or task, and also
# diagnosis of whether the DUT was working at the end of a given
# time interval.
#
# UNUSED:  Used when there are no events recorded in a given
#     time interval.
# UNKNOWN:  For an individual event, indicates that the DUT status
#     is unchanged from the previous event.  For a time interval,
#     indicates that the DUT's status can't be determined from the
#     DUT's history.
# WORKING:  Indicates that the DUT was working normally after the
#     event, or at the end of the time interval.
# BROKEN:  Indicates that the DUT needed manual repair after the
#     event, or at the end of the time interval.
#
UNUSED = 0
UNKNOWN = 1
WORKING = 2
BROKEN = 3
70
71
def parse_time(time_string):
    """Convert a canonical-form date/time string to a unix timestamp.

    The "canonical" form is the form in which date/time values are
    stored in the database.

    @param time_string Time to be parsed.

    @return The parsed time as an integer unix (epoch) timestamp.
    """
    epoch = time_utils.to_epoch_time(time_string)
    return int(epoch)
81
82
class _JobEvent(object):
    """Information about an event in host history.

    This remembers the relevant data from a single event in host
    history.  An event is any change in DUT state caused by a job
    or special task.  The data captured are the start and end times
    of the event, the URL of logs to the job or task causing the
    event, and a diagnosis of whether the DUT was working or failed
    afterwards.

    This class is an adapter around the database model objects
    describing jobs and special tasks.  This is an abstract
    superclass, with concrete subclasses for `HostQueueEntry` and
    `SpecialTask` objects.

    @property start_time  Time the job or task began execution.
    @property end_time    Time the job or task finished execution.
    @property id          id of the event in the AFE database.
    @property name        Name of the event, derived from the AFE database.
    @property job_status  Short string describing the event's final status.
    @property logdir      Relative path to the logs for the event's job.
    @property job_url     URL to the logs for the event's job.
    @property gs_url      GS URL to the logs for the event's job.
    @property job_id      id of the AFE job for HQEs.  None otherwise.
    @property diagnosis   Working status of the DUT after the event.
    @property is_special  Boolean indicating if the event is a special task.

    """

    # Read the log URL pattern once at class-creation time; the
    # pattern has two '%s' slots, filled in by `get_log_url` below.
    get_config_value = global_config.global_config.get_config_value
    _LOG_URL_PATTERN = get_config_value('CROS', 'log_url_pattern')

    @classmethod
    def get_log_url(cls, afe_hostname, logdir):
        """Return a URL to job results.

        The URL is constructed from a base URL determined by the
        global config, plus the relative path of the job's log
        directory.

        @param afe_hostname Hostname for autotest frontend
        @param logdir Relative path of the results log directory.

        @return A URL to the requested results log.

        """
        return cls._LOG_URL_PATTERN % (
            rpc_client_lib.add_protocol(afe_hostname),
            logdir,
        )


    @classmethod
    def get_gs_url(cls, logdir):
        """Return a GS URL to job results.

        The URL is constructed from a base URL determined by the
        global config, plus the relative path of the job's log
        directory.

        @param logdir Relative path of the results log directory.

        @return A URL to the requested results log.

        """
        return os.path.join(utils.get_offload_gsuri(), logdir)


    def __init__(self, start_time, end_time):
        self.start_time = parse_time(start_time)
        self.end_time = parse_time(end_time)


    def __cmp__(self, other):
        """Compare two jobs by their start time.

        This is a Python 2 `__cmp__` method to allow sorting
        `_JobEvent` objects by their times.  Under Python 3,
        `__cmp__` is ignored; see `__lt__` below.

        @param other The `_JobEvent` object to compare to `self`.

        """
        return self.start_time - other.start_time


    def __lt__(self, other):
        """Order two events by their start time.

        `HostJobHistory._get_history()` sorts lists of these
        objects; `list.sort()` uses `__lt__`, and Python 3 does not
        fall back to `__cmp__`, so this method is required for
        sorting to work on both Python versions.  The ordering is
        consistent with `__cmp__` above.

        @param other The `_JobEvent` object to compare to `self`.

        """
        return self.start_time < other.start_time


    @property
    def id(self):
        """Return the id of the event in the AFE database."""
        raise NotImplementedError()


    @property
    def name(self):
        """Return the name of the event."""
        raise NotImplementedError()


    @property
    def job_status(self):
        """Return a short string describing the event's final status."""
        raise NotImplementedError()


    @property
    def logdir(self):
        """Return the relative path for this event's job logs."""
        raise NotImplementedError()


    @property
    def job_url(self):
        """Return the URL for this event's job logs."""
        raise NotImplementedError()


    @property
    def gs_url(self):
        """Return the GS URL for this event's job logs."""
        raise NotImplementedError()


    @property
    def job_id(self):
        """Return the id of the AFE job for HQEs.  None otherwise."""
        raise NotImplementedError()


    @property
    def diagnosis(self):
        """Return the status of the DUT after this event.

        The diagnosis is interpreted as follows:
          UNKNOWN - The DUT status was the same before and after
              the event.
          WORKING - The DUT appeared to be working after the event.
          BROKEN - The DUT likely required manual intervention
              after the event.

        @return A valid diagnosis value.

        """
        raise NotImplementedError()


    @property
    def is_special(self):
        """Return if the event is for a special task."""
        raise NotImplementedError()
231
232
class _SpecialTaskEvent(_JobEvent):
    """`_JobEvent` adapter for special tasks.

    Wraps the standard `_JobEvent` interface around one row of the
    `afe_special_tasks` table.

    """

    @classmethod
    def get_tasks(cls, afe, host_id, start_time, end_time):
        """Return special tasks for a host in a given time range.

        Queries the AFE for every completed special task that ran on
        `host_id` within the given time range and wraps each one in
        a `_SpecialTaskEvent`.  The result list is in whatever order
        the query returned it (i.e. unordered).

        @param afe         Autotest frontend
        @param host_id     Database host id of the desired host.
        @param start_time  Start time of the range of interest.
        @param end_time    End time of the range of interest.

        @return A list of `_SpecialTaskEvent` objects.

        """
        window_start = time_utils.epoch_time_to_date_string(start_time)
        window_end = time_utils.epoch_time_to_date_string(end_time)
        found = afe.get_host_special_tasks(
                host_id,
                time_started__gte=window_start,
                time_finished__lte=window_end,
                is_complete=1)
        return [cls(afe.server, task) for task in found]


    @classmethod
    def get_status_task(cls, afe, host_id, end_time):
        """Return the task indicating a host's status at a given time.

        The task returned determines the status of the DUT; the
        diagnosis on the task indicates the diagnosis for the DUT at
        the given `end_time`.

        @param afe         Autotest frontend
        @param host_id     Database host id of the desired host.
        @param end_time    Find status as of this time.

        @return A `_SpecialTaskEvent` object for the requested task,
                or `None` if no task was found.

        """
        cutoff = time_utils.epoch_time_to_date_string(end_time)
        found = afe.get_host_status_task(host_id, cutoff)
        if not found:
            return None
        return cls(afe.server, found)


    def __init__(self, afe_hostname, afetask):
        self._afe_hostname = afe_hostname
        self._afetask = afetask
        super(_SpecialTaskEvent, self).__init__(
                afetask.time_started, afetask.time_finished)


    @property
    def id(self):
        """id of the special task in the AFE database."""
        return self._afetask.id


    @property
    def name(self):
        """Name of the task, e.g. 'Repair' or 'Verify'."""
        return self._afetask.task


    @property
    def job_status(self):
        """Short status string: 'ABORTED', 'PASS', or 'FAIL'."""
        if self._afetask.is_aborted:
            return 'ABORTED'
        return 'PASS' if self._afetask.success else 'FAIL'


    @property
    def logdir(self):
        """Relative path of the task's results logs."""
        task = self._afetask
        return ('hosts/%s/%s-%s' %
                (task.host.hostname, task.id, task.task.lower()))


    @property
    def job_url(self):
        """Full AFE URL of the task's results logs."""
        return _SpecialTaskEvent.get_log_url(self._afe_hostname, self.logdir)


    @property
    def gs_url(self):
        """Google Storage URL of the task's results logs."""
        return _SpecialTaskEvent.get_gs_url(self.logdir)


    @property
    def job_id(self):
        """Special tasks have no AFE job id."""
        return None


    @property
    def diagnosis(self):
        """DUT status implied by this task.

        Any successful task means the DUT was working; a failed
        Repair means it was broken; any other failure leaves the
        status unchanged.
        """
        task = self._afetask
        if task.success:
            return WORKING
        if task.task == 'Repair':
            return BROKEN
        return UNKNOWN


    @property
    def is_special(self):
        """This event is a special task."""
        return True
351
352
class _TestJobEvent(_JobEvent):
    """`_JobEvent` adapter for regular test jobs.

    Wraps the standard `_JobEvent` interface around one row of the
    `afe_host_queue_entries` table.

    """

    @classmethod
    def get_hqes(cls, afe, host_id, start_time, end_time):
        """Return HQEs for a host in a given time range.

        Queries the AFE for all the HQEs of all the jobs that ran on
        `host_id` within the given time range and wraps each row in
        a `_TestJobEvent`.  The result list is in whatever order the
        query returned it (i.e. unordered).

        @param afe         Autotest frontend
        @param host_id     Database host id of the desired host.
        @param start_time  Start time of the range of interest.
        @param end_time    End time of the range of interest.

        @return A list of `_TestJobEvent` objects.

        """
        window_start = time_utils.epoch_time_to_date_string(start_time)
        window_end = time_utils.epoch_time_to_date_string(end_time)
        rows = afe.get_host_queue_entries_by_insert_time(
                host_id=host_id,
                insert_time_after=window_start,
                insert_time_before=window_end,
                started_on__gte=window_start,
                started_on__lte=window_end,
                complete=1)
        return [cls(afe.server, row) for row in rows]


    def __init__(self, afe_hostname, hqe):
        self._afe_hostname = afe_hostname
        self._hqe = hqe
        super(_TestJobEvent, self).__init__(
                hqe.started_on, hqe.finished_on)


    @property
    def id(self):
        """id of the HQE in the AFE database."""
        return self._hqe.id


    @property
    def name(self):
        """Name of the job owning this HQE."""
        return self._hqe.job.name


    @property
    def job_status(self):
        """Final status string recorded on the HQE."""
        return self._hqe.status


    @property
    def logdir(self):
        """Relative path of the job's results logs."""
        return _get_job_logdir(self._hqe.job)


    @property
    def job_url(self):
        """Full AFE URL of the job's results logs."""
        return _TestJobEvent.get_log_url(self._afe_hostname, self.logdir)


    @property
    def gs_url(self):
        """Google Storage URL of the job's results logs."""
        return _TestJobEvent.get_gs_url(self.logdir)


    @property
    def job_id(self):
        """id of the AFE job owning this HQE."""
        return self._hqe.job.id


    @property
    def diagnosis(self):
        """Regular test jobs never change DUT status."""
        return UNKNOWN


    @property
    def is_special(self):
        """This event is an HQE, not a special task."""
        return False
440
441
class HostJobHistory(object):
    """Class to query and remember DUT execution and status history.

    This class is responsible for querying the database to determine
    the history of a single DUT in a time interval of interest, and
    for remembering the query results for reporting.

    @property hostname    Host name of the DUT.
    @property start_time  Start of the requested time interval, as a unix
                          timestamp (epoch time).
                          This field may be `None`.
    @property end_time    End of the requested time interval, as a unix
                          timestamp (epoch time).
    @property _afe        Autotest frontend for queries.
    @property _host       Database host object for the DUT.
    @property _history    A list of jobs and special tasks that
                          ran on the DUT in the requested time
                          interval, ordered in reverse, from latest
                          to earliest.

    @property _status_interval   A list of all the jobs and special
                                 tasks that ran on the DUT in the
                                 last diagnosis interval prior to
                                 `end_time`, ordered from latest to
                                 earliest.
    @property _status_diagnosis  The DUT's status as of `end_time`.
    @property _status_task       The DUT's last status task as of
                                 `end_time`.

    """

    @classmethod
    def get_host_history(cls, afe, hostname, start_time, end_time):
        """Create a `HostJobHistory` instance for a single host.

        Simple factory method to construct host history from a
        hostname.  Simply looks up the host in the AFE database, and
        passes it to the class constructor.

        @param afe         Autotest frontend
        @param hostname    Name of the host.
        @param start_time  Start time for the history's time
                           interval.
        @param end_time    End time for the history's time interval.

        @return A new `HostJobHistory` instance.

        """
        afehost = afe.get_hosts(hostname=hostname)[0]
        return cls(afe, afehost, start_time, end_time)


    @classmethod
    def get_multiple_histories(cls, afe, start_time, end_time, labels=()):
        """Create `HostJobHistory` instances for a set of hosts.

        @param afe         Autotest frontend
        @param start_time  Start time for the history's time
                           interval.
        @param end_time    End time for the history's time interval.
        @param labels      type: [str]. AFE labels to constrain the host query.
                           This option must be non-empty. An unconstrained
                           search of the DB is too costly.

        @return A list of new `HostJobHistory` instances.

        """
        assert labels, (
            'Must specify labels for get_multiple_histories. '
            'Unconstrainted search of the database is prohibitively costly.')

        kwargs = {'multiple_labels': labels}
        hosts = afe.get_hosts(**kwargs)
        return [cls(afe, h, start_time, end_time) for h in hosts]


    def __init__(self, afe, afehost, start_time, end_time):
        """Remember the query parameters; defer all DB work.

        @param afe         Autotest frontend for queries.
        @param afehost     AFE database host object for the DUT.
        @param start_time  Start of the time interval of interest.
        @param end_time    End of the time interval of interest.
        """
        self._afe = afe
        self.hostname = afehost.hostname
        self.end_time = end_time
        self.start_time = start_time
        self._host = afehost
        # Don't spend time on queries until they're needed.
        # `None` in each attribute below means "not yet queried";
        # the `_init_*` methods and `__iter__` fill them in lazily.
        self._history = None
        self._status_interval = None
        self._status_diagnosis = None
        self._status_task = None


    def _get_history(self, start_time, end_time):
        """Get the list of events for the given interval.

        Merges the special-task and HQE events for the interval into
        a single list, sorted from latest start time to earliest.

        @param start_time  Start of the interval, as epoch time.
        @param end_time    End of the interval, as epoch time.

        @return List of `_JobEvent` objects, newest first.
        """
        newtasks = _SpecialTaskEvent.get_tasks(
                self._afe, self._host.id, start_time, end_time)
        newhqes = _TestJobEvent.get_hqes(
                self._afe, self._host.id, start_time, end_time)
        newhistory = newtasks + newhqes
        # `_JobEvent` orders by start time; reverse=True puts the
        # most recent event first.
        newhistory.sort(reverse=True)
        return newhistory


    def __iter__(self):
        # Query lazily on first iteration, then reuse the cached
        # `_history` list on subsequent iterations.
        if self._history is None:
            self._history = self._get_history(self.start_time,
                                              self.end_time)
        return self._history.__iter__()


    def _extract_prefixed_label(self, prefix):
        """Return the first host label with `prefix`, minus the prefix.

        Returns `None` if the host has no label with the prefix.
        """
        labels = [l for l in self._host.labels
                    if l.startswith(prefix)]
        return labels[0][len(prefix) : ] if labels else None


    @property
    def host(self):
        """Return the AFE host object for this history."""
        return self._host


    @property
    def host_model(self):
        """Return the model name for this history's DUT."""
        prefix = constants.Labels.MODEL_PREFIX
        return self._extract_prefixed_label(prefix)


    @property
    def host_board(self):
        """Return the board name for this history's DUT."""
        prefix = constants.Labels.BOARD_PREFIX
        return self._extract_prefixed_label(prefix)


    @property
    def host_pool(self):
        """Return the pool name for this history's DUT."""
        prefix = constants.Labels.POOL_PREFIX
        return self._extract_prefixed_label(prefix)


    def _init_status_task(self):
        """Fill in `self._status_diagnosis` and `_status_task`."""
        # `_status_diagnosis` doubles as the "already queried" flag;
        # it is never left `None` once this method has run.
        if self._status_diagnosis is not None:
            return
        self._status_task = _SpecialTaskEvent.get_status_task(
                self._afe, self._host.id, self.end_time)
        if self._status_task is not None:
            self._status_diagnosis = self._status_task.diagnosis
        else:
            self._status_diagnosis = UNKNOWN


    def _init_status_interval(self):
        """Fill in `self._status_interval`."""
        if self._status_interval is not None:
            return
        # The interval query depends on the status task/diagnosis,
        # so that must be initialized first.
        self._init_status_task()
        self._status_interval = []
        if self._status_task is None:
            return
        query_end = time_utils.epoch_time_to_date_string(self.end_time)
        # The boolean selects the direction of the status change;
        # see `get_diagnosis_interval()` below for its meaning.
        interval = self._afe.get_host_diagnosis_interval(
                self._host.id, query_end,
                self._status_diagnosis != WORKING)
        if not interval:
            return
        self._status_interval = self._get_history(
                parse_time(interval[0]),
                parse_time(interval[1]))


    def diagnosis_interval(self):
        """Find this history's most recent diagnosis interval.

        Returns a list of `_JobEvent` instances corresponding to the
        most recent diagnosis interval occurring before this
        history's end time.

        The list is returned as with `self._history`, ordered from
        most to least recent.

        @return The list of the `_JobEvent`s in the diagnosis
                interval.

        """
        self._init_status_interval()
        return self._status_interval


    def last_diagnosis(self):
        """Return the diagnosis of whether the DUT is working.

        This searches the DUT's job history, looking for the most
        recent status task for the DUT.  Return a tuple of
        `(diagnosis, task)`.

        The `diagnosis` entry in the tuple is one of these values:
          * UNUSED - The host's last status task is older than
              `self.start_time`.
          * WORKING - The DUT is working.
          * BROKEN - The DUT likely requires manual intervention.
          * UNKNOWN - No task could be found indicating status for
              the DUT.

        If the DUT was working at last check, but hasn't been used
        inside this history's time interval, the status `UNUSED` is
        returned with the last status task, instead of `WORKING`.

        The `task` entry in the tuple is the status task that led to
        the diagnosis.  The task will be `None` if the diagnosis is
        `UNKNOWN`.

        @return A tuple with the DUT's diagnosis and the task that
                determined it.

        """
        self._init_status_task()
        diagnosis = self._status_diagnosis
        # Downgrade WORKING to UNUSED when the status task predates
        # the requested interval (see the docstring above).
        if (self.start_time is not None and
                self._status_task is not None and
                self._status_task.end_time < self.start_time and
                diagnosis == WORKING):
            diagnosis = UNUSED
        return diagnosis, self._status_task
666
667
def get_diagnosis_interval(host_id, end_time, success):
    """Return the last diagnosis interval for a given host and time.

    This routine queries the database for the special tasks on a
    given host before a given time.  From those tasks it selects the
    last status task before a change in status, and the first status
    task after the change.  When `success` is true, the change must
    be from "working" to "broken".  When false, the search is for a
    change in the opposite direction.

    A "successful status task" is any successful special task.  A
    "failed status task" is a failed Repair task.  These criteria
    are based on the definition of "status task" in the module-level
    docstring, above.

    This is the RPC endpoint for `AFE.get_host_diagnosis_interval()`.

    @param host_id     Database host id of the desired host.
    @param end_time    Find the last eligible interval before this time.
    @param success     Whether the eligible interval should start with a
                       success or a failure.

    @return A list containing the start time of the earliest job
            selected, and the end time of the latest job.  Returns
            an empty list if there is no eligible interval.

    """
    base_query = afe_models.SpecialTask.objects.filter(
            host_id=host_id, is_complete=True)
    success_query = base_query.filter(success=True)
    failure_query = base_query.filter(success=False, task='Repair')
    if success:
        query0 = success_query
        query1 = failure_query
    else:
        query0 = failure_query
        query1 = success_query
    # `task0` is the last status task with the old status before
    # `end_time`; if there is none, there is no interval.
    query0 = query0.filter(time_finished__lte=end_time)
    query0 = query0.order_by('time_started').reverse()
    if not query0:
        return []
    task0 = query0[0]
    # `task1` is the first status task with the new status after
    # `task0`.  Take a one-element slice and check it, rather than
    # indexing `[0]` directly: if the status never changed again
    # after `task0`, direct indexing would raise IndexError instead
    # of reporting "no interval".
    query1 = query1.filter(time_finished__gt=task0.time_finished)
    tasks1 = query1.order_by('time_started')[0:1]
    if not tasks1:
        return []
    task1 = tasks1[0]
    return [task0.time_started.strftime(time_utils.TIME_FMT),
            task1.time_finished.strftime(time_utils.TIME_FMT)]
713
714
def get_status_task(host_id, end_time):
    """Get the last status task for a host before a given time.

    This routine returns a Django query for the AFE database to find
    the last task that finished on the given host before the given
    time that was either a successful task, or a Repair task.  The
    query criteria are based on the definition of "status task" in
    the module-level docstring, above.

    This is the RPC endpoint for `_SpecialTaskEvent.get_status_task()`.

    @param host_id     Database host id of the desired host.
    @param end_time    End time of the range of interest.

    @return A Django query-set selecting the single special task of
            interest.

    """
    # A "status task" is any successful special task, or any Repair
    # task regardless of outcome.
    is_status_task = (django_models.Q(success=True) |
                      django_models.Q(task='Repair'))
    candidates = afe_models.SpecialTask.objects.filter(
            is_status_task,
            host_id=host_id,
            time_finished__lte=end_time,
            is_complete=True)
    # The caller serializes the result, so hand back an unevaluated
    # query set; the slice limits it to at most the one most recent
    # matching task.
    return candidates.order_by('time_started').reverse()[0:1]
744
745
746def _get_job_logdir(job):
747    """Gets the logdir for an AFE job.
748
749    @param job Job object which has id and owner properties.
750
751    @return Relative path of the results log directory.
752    """
753    return '%s-%s' % (job.id, job.owner)
754
755
def get_job_gs_url(job):
    """Gets the GS URL for an AFE job.

    @param job Job object which has id and owner properties.

    @return Absolute GS URL to the results log directory.
    """
    logdir = _get_job_logdir(job)
    return _JobEvent.get_gs_url(logdir)
764