# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Services relating to DUT status and job history.

The central abstraction of this module is the `HostJobHistory`
class.  This class provides two related pieces of information
regarding a single DUT:
  * A history of tests and special tasks that have run on
    the DUT in a given time range.
  * Whether the DUT was "working" or "broken" at a given
    time.

The "working" or "broken" status of a DUT is determined by
the DUT's special task history.  At the end of any job or
task, the status is indicated as follows:
  * After any successful special task, the DUT is considered
    "working".
  * After any failed Repair task, the DUT is considered "broken".
  * After any other special task or after any regular test job, the
    DUT's status is considered unchanged.

Definitions for terms used in the code below:
  * status task - Any special task that determines the DUT's
    status; that is, any successful task, or any failed Repair.
  * diagnosis interval - A time interval during which DUT status
    changed either from "working" to "broken", or vice versa.  The
    interval starts with the last status task with the old status,
    and ends after the first status task with the new status.

Diagnosis intervals are interesting because they normally contain
the logs explaining a failure or repair event.

"""

import common
from autotest_lib.frontend import setup_django_environment
from django.db import models as django_models

from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib import time_utils
from autotest_lib.frontend.afe import models as afe_models
from autotest_lib.site_utils.suite_scheduler import constants

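# Illustrative usage sketch (not executed here).  It assumes the standard
# AFE RPC client from autotest_lib.server.frontend and a hypothetical
# hostname; adapt both to your deployment:
#
#   from autotest_lib.server import frontend
#   afe = frontend.AFE()
#   start = parse_time('2015-06-01 00:00:00')
#   end = parse_time('2015-06-08 00:00:00')
#   history = HostJobHistory.get_host_history(afe, 'chromeos-host1',
#                                             start, end)
#   diagnosis, task = history.last_diagnosis()
#   if diagnosis == BROKEN:
#       print 'Broken; see', task.job_url
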
# Values used to describe the diagnosis of a DUT.  These values are
# used to indicate both DUT status after a job or task, and also
# diagnosis of whether the DUT was working at the end of a given
# time interval.
#
# UNUSED:  Used when there are no events recorded in a given
#     time interval.
# UNKNOWN:  For an individual event, indicates that the DUT status
#     is unchanged from the previous event.  For a time interval,
#     indicates that the DUT's status can't be determined from the
#     DUT's history.
# WORKING:  Indicates that the DUT was working normally after the
#     event, or at the end of the time interval.
# BROKEN:  Indicates that the DUT needed manual repair after the
#     event, or at the end of the time interval.
#
UNUSED = 0
UNKNOWN = 1
WORKING = 2
BROKEN = 3


def parse_time(time_string):
    """Parse time according to a canonical form.

    The "canonical" form is the form in which date/time
    values are stored in the database.

    @param time_string Time to be parsed.
    """
    return int(time_utils.to_epoch_time(time_string))

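# Sketch of expected input and output (assumption: the database's canonical
# date/time form matches time_utils.TIME_FMT, e.g. '2015-06-29 10:44:03'):
#
#   parse_time('2015-06-29 10:44:03')   # -> Unix epoch time, as an int
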

class _JobEvent(object):
    """Information about an event in host history.

    This remembers the relevant data from a single event in host
    history.  An event is any change in DUT state caused by a job
    or special task.  The data captured are the start and end times
    of the event, the URL of the logs for the job or task causing
    the event, and a diagnosis of whether the DUT was working or
    failed afterwards.

    This class is an adapter around the database model objects
    describing jobs and special tasks.  This is an abstract
    superclass, with concrete subclasses for `HostQueueEntry` and
    `SpecialTask` objects.

    @property start_time  Time the job or task began execution.
    @property end_time    Time the job or task finished execution.
    @property job_url     URL to the logs for the event's job.
    @property diagnosis   Working status of the DUT after the event.

    """

    get_config_value = global_config.global_config.get_config_value
    _LOG_URL_PATTERN = get_config_value('CROS', 'log_url_pattern')

    @classmethod
    def get_log_url(cls, afe_hostname, logdir):
        """Return a URL to job results.

        The URL is constructed from a base URL determined by the
        global config, plus the relative path of the job's log
        directory.

        @param afe_hostname Hostname for autotest frontend
        @param logdir Relative path of the results log directory.

        @return A URL to the requested results log.

        """
        return cls._LOG_URL_PATTERN % (afe_hostname, logdir)

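    # Illustrative example (the real pattern comes from the 'log_url_pattern'
    # entry in the CROS global config section, so the exact URL shape is
    # site-specific).  If the pattern were 'http://%s/results/%s', then:
    #
    #   _JobEvent.get_log_url('cautotest', 'hosts/host1/42-repair')
    #   # -> 'http://cautotest/results/hosts/host1/42-repair'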

    def __init__(self, start_time, end_time):
        self.start_time = parse_time(start_time)
        self.end_time = parse_time(end_time)


    def __cmp__(self, other):
        """Compare two jobs by their start time.

        This is a standard Python `__cmp__` method to allow sorting
        `_JobEvent` objects by their times.

        @param other The `_JobEvent` object to compare to `self`.

        """
        return self.start_time - other.start_time

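    # Because of __cmp__ above, a mixed list of _SpecialTaskEvent and
    # _TestJobEvent objects can be ordered chronologically with a plain
    # sort, e.g. (as _get_history() does below):
    #
    #   events = tasks + hqes
    #   events.sort(reverse=True)    # newest event first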

    @property
    def job_url(self):
        """Return the URL for this event's job logs."""
        raise NotImplementedError()


    @property
    def diagnosis(self):
        """Return the status of the DUT after this event.

        The diagnosis is interpreted as follows:
          UNKNOWN - The DUT status was the same before and after
              the event.
          WORKING - The DUT appeared to be working after the event.
          BROKEN - The DUT likely required manual intervention
              after the event.

        @return A valid diagnosis value.

        """
        raise NotImplementedError()

class _SpecialTaskEvent(_JobEvent):
    """`_JobEvent` adapter for special tasks.

    This class wraps the standard `_JobEvent` interface around a row
    in the `afe_special_tasks` table.

    """

    @classmethod
    def get_tasks(cls, afe, host_id, start_time, end_time):
        """Return special tasks for a host in a given time range.

        Return a list of `_SpecialTaskEvent` objects representing all
        special tasks that ran on the given host in the given time
        range.  The list is ordered as it was returned by the query
        (i.e. unordered).

        @param afe         Autotest frontend
        @param host_id     Database host id of the desired host.
        @param start_time  Start time of the range of interest.
        @param end_time    End time of the range of interest.

        @return A list of `_SpecialTaskEvent` objects.

        """
        query_start = time_utils.epoch_time_to_date_string(start_time)
        query_end = time_utils.epoch_time_to_date_string(end_time)
        tasks = afe.get_host_special_tasks(
                host_id,
                time_started__gte=query_start,
                time_finished__lte=query_end,
                is_complete=1)
        return [cls(afe.server, t) for t in tasks]


    @classmethod
    def get_status_task(cls, afe, host_id, end_time):
        """Return the task indicating a host's status at a given time.

        The task returned determines the status of the DUT; the
        diagnosis on the task indicates the diagnosis for the DUT at
        the given `end_time`.

        @param afe         Autotest frontend
        @param host_id     Database host id of the desired host.
        @param end_time    Find status as of this time.

        @return A `_SpecialTaskEvent` object for the requested task,
                or `None` if no task was found.

        """
        query_end = time_utils.epoch_time_to_date_string(end_time)
        task = afe.get_host_status_task(host_id, query_end)
        return cls(afe.server, task) if task else None


    def __init__(self, afe_hostname, afetask):
        self._afe_hostname = afe_hostname
        self._afetask = afetask
        super(_SpecialTaskEvent, self).__init__(
                afetask.time_started, afetask.time_finished)


    @property
    def job_url(self):
        logdir = ('hosts/%s/%s-%s' %
                  (self._afetask.host.hostname, self._afetask.id,
                   self._afetask.task.lower()))
        return _SpecialTaskEvent.get_log_url(self._afe_hostname, logdir)


    @property
    def diagnosis(self):
        if self._afetask.success:
            return WORKING
        elif self._afetask.task == 'Repair':
            return BROKEN
        else:
            return UNKNOWN

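    # Examples of how the diagnosis rules above apply (the task names here
    # are the usual special task types and are only illustrative):
    #   * any successful task (Verify, Cleanup, Reset, Repair)  -> WORKING
    #   * a failed Repair                                        -> BROKEN
    #   * any other failed task (e.g. a failed Verify)           -> UNKNOWN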

class _TestJobEvent(_JobEvent):
    """`_JobEvent` adapter for regular test jobs.

    This class wraps the standard `_JobEvent` interface around a row
    in the `afe_host_queue_entries` table.

    """

    @classmethod
    def get_hqes(cls, afe, host_id, start_time, end_time):
        """Return HQEs for a host in a given time range.

        Return a list of `_TestJobEvent` objects representing all the
        HQEs of all the jobs that ran on the given host in the given
        time range.  The list is ordered as it was returned by the
        query (i.e. unordered).

        @param afe         Autotest frontend
        @param host_id     Database host id of the desired host.
        @param start_time  Start time of the range of interest.
        @param end_time    End time of the range of interest.

        @return A list of `_TestJobEvent` objects.

        """
        query_start = time_utils.epoch_time_to_date_string(start_time)
        query_end = time_utils.epoch_time_to_date_string(end_time)
        hqelist = afe.get_host_queue_entries(
                host_id=host_id,
                start_time=query_start,
                end_time=query_end,
                complete=1)
        return [cls(afe.server, hqe) for hqe in hqelist]


    def __init__(self, afe_hostname, hqe):
        self._afe_hostname = afe_hostname
        self._hqe = hqe
        super(_TestJobEvent, self).__init__(
                hqe.started_on, hqe.finished_on)


    @property
    def job_url(self):
        logdir = '%s-%s' % (self._hqe.job.id, self._hqe.job.owner)
        return _TestJobEvent.get_log_url(self._afe_hostname, logdir)


    @property
    def diagnosis(self):
        return UNKNOWN


class HostJobHistory(object):
    """Class to query and remember DUT execution and status history.

    This class is responsible for querying the database to determine
    the history of a single DUT in a time interval of interest, and
    for remembering the query results for reporting.

    @property hostname    Host name of the DUT.
    @property start_time  Start of the requested time interval.
                          This field may be `None`.
    @property end_time    End of the requested time interval.
    @property _afe        Autotest frontend for queries.
    @property _host       Database host object for the DUT.
    @property _history    A list of jobs and special tasks that
                          ran on the DUT in the requested time
                          interval, ordered in reverse, from latest
                          to earliest.

    @property _status_interval   A list of all the jobs and special
                                 tasks that ran on the DUT in the
                                 last diagnosis interval prior to
                                 `end_time`, ordered from latest to
                                 earliest.
    @property _status_diagnosis  The DUT's status as of `end_time`.
    @property _status_task       The DUT's last status task as of
                                 `end_time`.

    """

    @classmethod
    def get_host_history(cls, afe, hostname, start_time, end_time):
        """Create a `HostJobHistory` instance for a single host.

        Simple factory method to construct host history from a
        hostname:  it looks up the host in the AFE database, and
        passes it to the class constructor.

        @param afe         Autotest frontend
        @param hostname    Name of the host.
        @param start_time  Start time for the history's time
                           interval.
        @param end_time    End time for the history's time interval.

        @return A new `HostJobHistory` instance.

        """
        afehost = afe.get_hosts(hostname=hostname)[0]
        return cls(afe, afehost, start_time, end_time)

    @classmethod
    def get_multiple_histories(cls, afe, start_time, end_time,
                               board=None, pool=None):
        """Create `HostJobHistory` instances for a set of hosts.

        The set of hosts can be specified as "all hosts of a given
        board type", "all hosts in a given pool", or "all hosts
        of a given board and pool".

        @param afe         Autotest frontend
        @param start_time  Start time for the history's time
                           interval.
        @param end_time    End time for the history's time interval.
        @param board       All hosts must have this board type; if
                           `None`, all boards are allowed.
        @param pool        All hosts must be in this pool; if
                           `None`, all pools are allowed.

        @return A list of new `HostJobHistory` instances.

        """
        # If `board` and `pool` were both `None`, we would search the
        # entire database, which is more expensive than we want.
        # Our caller currently won't (can't) do this, but assert to
        # be safe.
        assert board is not None or pool is not None
        labels = []
        if board is not None:
            labels.append(constants.Labels.BOARD_PREFIX + board)
        if pool is not None:
            labels.append(constants.Labels.POOL_PREFIX + pool)
        kwargs = {'multiple_labels': labels}
        hosts = afe.get_hosts(**kwargs)
        return [cls(afe, h, start_time, end_time) for h in hosts]

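    # Illustrative call (the board and pool names are hypothetical; they are
    # turned into label queries via constants.Labels as shown above):
    #
    #   histories = HostJobHistory.get_multiple_histories(
    #           afe, start, end, board='lumpy', pool='bvt')
    #   for h in histories:
    #       print h.hostname, h.last_diagnosis()[0]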

    def __init__(self, afe, afehost, start_time, end_time):
        self._afe = afe
        self.hostname = afehost.hostname
        self.end_time = end_time
        self.start_time = start_time
        self._host = afehost
        # Don't spend time on queries until they're needed.
        self._history = None
        self._status_interval = None
        self._status_diagnosis = None
        self._status_task = None


    def _get_history(self, start_time, end_time):
        """Get the list of events for the given interval."""
        newtasks = _SpecialTaskEvent.get_tasks(
                self._afe, self._host.id, start_time, end_time)
        newhqes = _TestJobEvent.get_hqes(
                self._afe, self._host.id, start_time, end_time)
        newhistory = newtasks + newhqes
        newhistory.sort(reverse=True)
        return newhistory


    def __iter__(self):
        if self._history is None:
            self._history = self._get_history(self.start_time,
                                              self.end_time)
        return self._history.__iter__()

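    # Iterating over a HostJobHistory yields its events newest-first, e.g.
    # (illustrative):
    #
    #   for event in history:
    #       print event.start_time, event.diagnosis, event.job_url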

    def _extract_prefixed_label(self, prefix):
        labels = [l for l in self._host.labels
                    if l.startswith(prefix)]
        return labels[0][len(prefix) : ] if labels else None


    @property
    def host(self):
        """Return the AFE host object for this history."""
        return self._host


    @property
    def host_board(self):
        """Return the board name for this history's DUT."""
        prefix = constants.Labels.BOARD_PREFIX
        return self._extract_prefixed_label(prefix)


    @property
    def host_pool(self):
        """Return the pool name for this history's DUT."""
        prefix = constants.Labels.POOL_PREFIX
        return self._extract_prefixed_label(prefix)


    def _init_status_task(self):
        """Fill in `self._status_diagnosis` and `_status_task`."""
        if self._status_diagnosis is not None:
            return
        self._status_task = _SpecialTaskEvent.get_status_task(
                self._afe, self._host.id, self.end_time)
        if self._status_task is not None:
            self._status_diagnosis = self._status_task.diagnosis
        else:
            self._status_diagnosis = UNKNOWN


    def _init_status_interval(self):
        """Fill in `self._status_interval`."""
        if self._status_interval is not None:
            return
        self._init_status_task()
        self._status_interval = []
        if self._status_task is None:
            return
        query_end = time_utils.epoch_time_to_date_string(self.end_time)
        interval = self._afe.get_host_diagnosis_interval(
                self._host.id, query_end,
                self._status_diagnosis != WORKING)
        if not interval:
            return
        self._status_interval = self._get_history(
                parse_time(interval[0]),
                parse_time(interval[1]))


    def diagnosis_interval(self):
        """Find this history's most recent diagnosis interval.

        Returns a list of `_JobEvent` instances corresponding to the
        most recent diagnosis interval occurring before this
        history's end time.

        The list is returned as with `self._history`, ordered from
        most to least recent.

        @return The list of the `_JobEvent`s in the diagnosis
                interval.

        """
        self._init_status_interval()
        return self._status_interval


    def last_diagnosis(self):
        """Return the diagnosis of whether the DUT is working.

        This searches the DUT's job history, looking for the most
        recent status task for the DUT.  Return a tuple of
        `(diagnosis, task)`.

        The `diagnosis` entry in the tuple is one of these values:
          * UNUSED - The host's last status task is older than
              `self.start_time`.
          * WORKING - The DUT is working.
          * BROKEN - The DUT likely requires manual intervention.
          * UNKNOWN - No task could be found indicating status for
              the DUT.

        If the DUT was working at last check, but hasn't been used
        inside this history's time interval, the status `UNUSED` is
        returned with the last status task, instead of `WORKING`.

        The `task` entry in the tuple is the status task that led to
        the diagnosis.  The task will be `None` if the diagnosis is
        `UNKNOWN`.

        @return A tuple with the DUT's diagnosis and the task that
                determined it.

        """
        self._init_status_task()
        diagnosis = self._status_diagnosis
        if (self.start_time is not None and
                self._status_task is not None and
                self._status_task.end_time < self.start_time and
                diagnosis == WORKING):
            diagnosis = UNUSED
        return diagnosis, self._status_task

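    # Typical handling of the last_diagnosis() result (illustrative):
    #
    #   diagnosis, task = history.last_diagnosis()
    #   if diagnosis == BROKEN:
    #       # task is the failed Repair that marked the DUT broken.
    #       print history.hostname, 'broken:', task.job_url
    #   elif diagnosis == UNKNOWN:
    #       print history.hostname, 'has no status task'   # task is None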

def get_diagnosis_interval(host_id, end_time, success):
    """Return the last diagnosis interval for a given host and time.

    This routine queries the database for the special tasks on a
    given host before a given time.  From those tasks it selects the
    last status task before a change in status, and the first status
    task after the change.  When `success` is true, the change must
    be from "working" to "broken".  When false, the search is for a
    change in the opposite direction.

    A "successful status task" is any successful special task.  A
    "failed status task" is a failed Repair task.  These criteria
    are based on the definition of "status task" in the module-level
    docstring, above.

    This is the RPC endpoint for `AFE.get_host_diagnosis_interval()`.

    @param host_id     Database host id of the desired host.
    @param end_time    Find the last eligible interval before this time.
    @param success     Whether the eligible interval should start with a
                       success or a failure.

    @return A list containing the start time of the earliest job
            selected, and the end time of the latest job.

    """
    base_query = afe_models.SpecialTask.objects.filter(
            host_id=host_id, is_complete=True)
    success_query = base_query.filter(success=True)
    failure_query = base_query.filter(success=False, task='Repair')
    if success:
        query0 = success_query
        query1 = failure_query
    else:
        query0 = failure_query
        query1 = success_query
    query0 = query0.filter(time_finished__lte=end_time)
    query0 = query0.order_by('time_started').reverse()
    if not query0:
        return []
    task0 = query0[0]
    query1 = query1.filter(time_finished__gt=task0.time_finished)
    task1 = query1.order_by('time_started')[0]
    return [task0.time_started.strftime(time_utils.TIME_FMT),
            task1.time_finished.strftime(time_utils.TIME_FMT)]

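# Illustrative return value from get_diagnosis_interval() (the strings are
# formatted with time_utils.TIME_FMT, so the exact shape is whatever that
# format specifies; the values below are made up):
#
#   ['2015-06-29 10:44:03', '2015-06-29 12:02:17']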

def get_status_task(host_id, end_time):
    """Get the last status task for a host before a given time.

    This routine returns a Django query for the AFE database to find
    the last task that finished on the given host before the given
    time that was either a successful task, or a Repair task.  The
    query criteria are based on the definition of "status task" in
    the module-level docstring, above.

    This is the RPC endpoint for `_SpecialTaskEvent.get_status_task()`.

    @param host_id     Database host id of the desired host.
    @param end_time    End time of the range of interest.

    @return A Django query-set selecting the single special task of
            interest.

    """
    # Selects status tasks:  any Repair task, or any successful task.
    status_tasks = (django_models.Q(task='Repair') |
                    django_models.Q(success=True))
    # Our caller needs a Django query set in order to serialize the
    # result, so we don't resolve the query here; we just return a
    # slice with at most one element.
    return afe_models.SpecialTask.objects.filter(
            status_tasks,
            host_id=host_id,
            time_finished__lte=end_time,
            is_complete=True).order_by('time_started').reverse()[0:1]
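
# Sketch of how a caller might consume the query-set above (illustrative;
# the real consumer is the RPC layer behind `AFE.get_host_status_task()`):
#
#   tasks = get_status_task(host_id, end_time)
#   task = tasks[0] if tasks else None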