• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python
2# Copyright 2015 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Create e-mail reports of the Lab's DUT inventory.
7
8Gathers a list of all DUTs of interest in the Lab, segregated by
9model and pool, and determines whether each DUT is working or
10broken.  Then, send one or more e-mail reports summarizing the
11status to e-mail addresses provided on the command line.
12
13usage:  lab_inventory.py [ options ] [ model ... ]
14
15Options:
16--duration / -d <hours>
17    How far back in time to search job history to determine DUT
18    status.
19
20--model-notify <address>[,<address>]
21    Send the "model status" e-mail to all the specified e-mail
22    addresses.
23
24--pool-notify <address>[,<address>]
25    Send the "pool status" e-mail to all the specified e-mail
26    addresses.
27
28--recommend <number>
29    When generating the "model status" e-mail, include a list of
30    <number> specific DUTs to be recommended for repair.
31
32--repair-loops
33    Scan the inventory for DUTs stuck in repair loops, and report them
34    via a Monarch presence metric.
35
36--logdir <directory>
37    Log progress and actions in a file under this directory.  Text
38    of any e-mail sent will also be logged in a timestamped file in
39    this directory.
40
41--debug
42    Suppress all logging, metrics reporting, and sending e-mail.
43    Instead, write the output that would be generated onto stdout.
44
45<model> arguments:
46    With no arguments, gathers the status for all models in the lab.
47    With one or more named models on the command line, restricts
48    reporting to just those models.
49
50"""
51
52
53import argparse
54import collections
55import logging
56import logging.handlers
57import os
58import re
59import sys
60import time
61
62import common
63from autotest_lib.client.bin import utils
64from autotest_lib.client.common_lib import time_utils
65from autotest_lib.server import constants
66from autotest_lib.server import site_utils
67from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
68from autotest_lib.server.hosts import servo_host
69from autotest_lib.server.lib import status_history
70from autotest_lib.site_utils import gmail_lib
71from chromite.lib import metrics
72
73
# Pool names, re-exported from `constants.Pools` for brevity:
#   + CRITICAL_POOLS - The pools reported on, one section per pool,
#     by the "pool inventory" message.
#   + SPARE_POOL - Default spare pool used when calculating the
#     spares buffer and when deciding which models are reportable.
#   + MANAGED_POOLS - The pools whose DUTs are gathered into the
#     inventory.
CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS
SPARE_POOL = constants.Pools.SPARE_POOL
MANAGED_POOLS = constants.Pools.MANAGED_POOLS

# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from
#     monitoring by this script.  Currently, we're excluding these:
#   + 'adb' - We're not ready to monitor Android or Brillo hosts.
#   + 'board:guado_moblab' - These are maintained by a separate
#     process that doesn't use this script.

_EXCLUDED_LABELS = {'adb', 'board:guado_moblab'}

# _DEFAULT_DURATION:
#     Default value used for the --duration command line option.
#     Specifies how far back in time (in hours) to search in order
#     to determine DUT status.

_DEFAULT_DURATION = 24

# _LOGDIR:
#     Relative path used in the calculation of the default setting for
#     the --logdir option.  The full path is relative to the root of the
#     autotest directory, as determined from sys.argv[0].
# _LOGFILE:
#     Basename of a file to which general log information will be
#     written.
# _LOG_FORMAT:
#     Format string for log messages.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

# Pattern describing location-based host names in the Chrome OS test
# labs.  Each DUT hostname designates the DUT's location:
#   * A lab (room) that's physically separated from other labs
#     (i.e. there's a door).
#   * A row (or aisle) of DUTs within the lab.
#   * A vertical rack of shelves on the row.
#   * A specific host on one shelf of the rack.
#
# The pattern captures four groups, in order:  the lab name, then
# the row, rack, and host numbers as decimal digit strings.

_HOSTNAME_PATTERN = re.compile(
        r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

# _REPAIR_LOOP_THRESHOLD:
#    The number of repeated Repair tasks that must be seen to declare
#    that a DUT is stuck in a repair loop.

_REPAIR_LOOP_THRESHOLD = 4
123
124
class _HostSetInventory(object):
    """Holds a group of `HostJobHistory` objects, split by DUT status.

    Every recorded history falls into one of three disjoint
    categories, "working", "broken", or "idle", according to the
    first element of its `last_diagnosis()` result.  For each
    category there is a method returning the list of matching
    histories, and a method returning just the count.

    Performance note:  These methods are potentially expensive:
      * `get_working()`
      * `get_working_list()`
      * `get_broken()`
      * `get_broken_list()`
      * `get_idle()`
      * `get_idle_list()`
    The first call to any of them triggers multiple RPC calls backed
    by relatively expensive database queries.  The query results are
    cached inside the individual `HostJobHistory` objects, so only
    the first call pays the full cost.

    Separately from that RPC caching, the three `get_*_list()`
    methods remember the list they computed, so the filtering isn't
    repeated on every call.  `record_host()` discards the remembered
    lists.

    The RPC cost is deliberately deferred until an accessor is
    called (rather than paid in `record_host()`), so that a complete
    `_LabInventory` can be built without making the expensive
    queries at creation time.  `_populate_model_counts()` assumes
    this behavior.

    In current usage all the DUTs belong to a single scheduling
    pool; however, this class makes no assumptions about the actual
    relationship among the DUTs.
    """

    def __init__(self):
        # All histories, in the order they were recorded.
        self._histories = []
        # Memoized per-category lists; `None` means "not computed".
        self._working_list = None
        self._broken_list = None
        self._idle_list = None


    def record_host(self, host_history):
        """Remember one more `HostJobHistory` object.

        Any previously computed category lists are thrown away, since
        the new host may belong in one of them.

        @param host_history The `HostJobHistory` object to be
                            remembered.

        """
        self._histories.append(host_history)
        self._working_list = None
        self._broken_list = None
        self._idle_list = None


    def get_working_list(self):
        """Return a list of all working DUTs in the pool.

        Selects the histories whose last diagnosis is `WORKING`.
        The list is computed once and then reused.

        @return A list of HostJobHistory objects.

        """
        if self._working_list is not None:
            return self._working_list
        working = []
        for history in self._histories:
            status = history.last_diagnosis()[0]
            if status == status_history.WORKING:
                working.append(history)
        self._working_list = working
        return working


    def get_working(self):
        """Return the number of working DUTs in the pool."""
        working = self.get_working_list()
        return len(working)


    def get_broken_list(self):
        """Return a list of all broken DUTs in the pool.

        Selects the histories whose last diagnosis is `BROKEN`.
        The list is computed once and then reused.

        @return A list of HostJobHistory objects.

        """
        if self._broken_list is not None:
            return self._broken_list
        broken = []
        for history in self._histories:
            status = history.last_diagnosis()[0]
            if status == status_history.BROKEN:
                broken.append(history)
        self._broken_list = broken
        return broken


    def get_broken(self):
        """Return the number of broken DUTs in the pool."""
        broken = self.get_broken_list()
        return len(broken)


    def get_idle_list(self):
        """Return a list of all idle DUTs in the pool.

        Selects the histories whose last diagnosis is either `UNUSED`
        or `UNKNOWN`.  The list is computed once and then reused.

        @return A list of HostJobHistory objects.

        """
        idle_statuses = {status_history.UNUSED, status_history.UNKNOWN}
        if self._idle_list is not None:
            return self._idle_list
        idle = []
        for history in self._histories:
            status = history.last_diagnosis()[0]
            if status in idle_statuses:
                idle.append(history)
        self._idle_list = idle
        return idle


    def get_idle(self):
        """Return the number of idle DUTs in the pool."""
        idle = self.get_idle_list()
        return len(idle)


    def get_total(self):
        """Return the total number of DUTs in the pool."""
        return len(self._histories)
252
253
class _PoolSetInventory(object):
    """Maintains a set of `HostJobHistory`s for a set of pools.

    The collection is segregated into disjoint categories of "working",
    "broken", and "idle" DUTs.  Accessor methods allow finding both the
    list of DUTs in each category, as well as counts of each category.
    Accessor queries can be for an individual pool, or against all
    pools.

    Performance note:  This class relies on `_HostSetInventory`.  Public
    methods in this class generally rely on methods of the same name in
    the underlying class, and so will have the same underlying
    performance characteristics.
    """

    def __init__(self, pools):
        """Create an empty inventory covering the given pools.

        @param pools  Iterable of pool names.  Only hosts belonging
                      to one of these pools can be recorded.
        """
        self._histories_by_pool = {
            pool: _HostSetInventory() for pool in pools
        }


    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        The history is filed under its `host_pool`; a `KeyError` is
        raised if that pool wasn't passed to the constructor.

        @param host_history The `HostJobHistory` object to be
                            remembered.

        """
        pool = host_history.host_pool
        self._histories_by_pool[pool].record_host(host_history)


    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               `_HostSetInventory` object.
        @param pool            The pool to be counted.  If `None`,
                               return the total across all pools.

        """
        if pool is None:
            return sum(get_pool_count(host_set)
                       for host_set in self._histories_by_pool.values())
        return get_pool_count(self._histories_by_pool[pool])


    def _list_pool(self, get_pool_list, pool=None):
        """Internal helper to list hosts in a given pool.

        The list-valued counterpart of `_count_pool()`.

        @param get_pool_list  Function to return a list of
                              `HostJobHistory` objects from a
                              `_HostSetInventory` object.
        @param pool           The pool to be listed.  If `None`,
                              return a new list concatenating the
                              lists of all pools.

        """
        if pool is None:
            histories = []
            # `values()` rather than `itervalues()`, for consistency
            # across the class; it also exists in both Python 2 and 3.
            for host_set in self._histories_by_pool.values():
                histories.extend(get_pool_list(host_set))
            return histories
        return get_pool_list(self._histories_by_pool[pool])


    def get_working_list(self, pool=None):
        """Return a list of all working DUTs in the given pool.

        Go through all HostJobHistory objects in the given pool,
        selecting the ones where the last diagnosis is `WORKING`.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.

        """
        return self._list_pool(_HostSetInventory.get_working_list, pool)


    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(_HostSetInventory.get_working, pool)


    def get_broken_list(self, pool=None):
        """Return a list of all broken DUTs in the given pool.

        Go through all HostJobHistory objects in the given pool,
        selecting the ones where the last diagnosis is `BROKEN`.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.

        """
        return self._list_pool(_HostSetInventory.get_broken_list, pool)


    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_broken, pool)


    def get_idle_list(self, pool=None):
        """Return a list of all idle DUTs in the given pool.

        Go through all HostJobHistory objects in the given pool,
        selecting the ones where the last diagnosis is `UNUSED` or
        `UNKNOWN`.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.

        """
        return self._list_pool(_HostSetInventory.get_idle_list, pool)


    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of idle DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_idle, pool)


    def get_spares_buffer(self, spare_pool=SPARE_POOL):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool.  This number may be negative, indicating a shortfall
        in the critical pools.

        @param spare_pool  The pool to be treated as the spares pool.

        @return The total number DUTs in the spares pool, less the total
                number of broken DUTs in all pools.
        """
        return self.get_total(spare_pool) - self.get_broken()


    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_total, pool)
412
413
def _eligible_host(afehost):
    """Decide whether a host should be monitored by this script.

    To be eligible, a host must carry exactly one 'model' label and
    exactly one 'pool' label, and must carry none of the labels in
    `_EXCLUDED_LABELS`.

    @param afehost  The host to be tested for eligibility.
    @return `True` if the host is eligible, `False` otherwise.
    """
    # A DUT lacking a unique 'model' or 'pool' label isn't supposed
    # to exist in the managed inventory; finding one generally means
    # there's an error in the database.  Regrettably, such errors do
    # turn up from time to time.
    #
    # The _LabInventory constructor needs hosts that obey the label
    # restrictions, and may fail on ones that don't.  A single bad
    # entry shouldn't fail a whole inventory run, so the offenders
    # are filtered out here to keep them out of the inventory.
    model_prefix = constants.Labels.MODEL_PREFIX
    model_count = sum(1 for label in afehost.labels
                      if label.startswith(model_prefix))
    pool_prefix = constants.Labels.POOL_PREFIX
    pool_count = sum(1 for label in afehost.labels
                     if label.startswith(pool_prefix))
    has_no_excluded = _EXCLUDED_LABELS.isdisjoint(afehost.labels)
    if model_count != 1:
        return False
    if pool_count != 1:
        return False
    return has_no_excluded
439
440
# NOTE: `collections.Mapping` is the Python 2 spelling of this base
# class; Python 3 provides it as `collections.abc.Mapping`.
class _LabInventory(collections.Mapping):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    This is a dict-like collection indexed by model.  Indexing returns
    the _PoolSetInventory object associated with the model.
    """

    @classmethod
    def create_inventory(cls, afe, start_time, end_time, modellist=None):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects for
        all DUTs in the `MANAGED_POOLS` list.  If `modellist` is
        supplied, the inventory will be restricted to only the given
        models.

        @param afe          AFE object for constructing the
                            `HostJobHistory` objects.
        @param start_time   Start time for the `HostJobHistory` objects.
        @param end_time     End time for the `HostJobHistory` objects.
        @param modellist    List of models to include.  If empty or
                            `None`, include all available models.
        @return A `_LabInventory` object for the specified models.

        """
        target_pools = MANAGED_POOLS
        label_list = [constants.Labels.POOL_PREFIX + l for l in target_pools]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if modellist:
            # We're deliberately not checking host eligibility in this
            # code path.  This is a debug path, not used in production;
            # it may be useful to include ineligible hosts here.
            modelhosts = []
            for model in modellist:
                model_label = constants.Labels.MODEL_PREFIX + model
                modelhosts.extend(
                        h for h in afehosts if model_label in h.labels)
            afehosts = modelhosts
        else:
            afehosts = [h for h in afehosts if _eligible_host(h)]
        histories = [
                status_history.HostJobHistory(afe, host,
                                              start_time, end_time)
                for host in afehosts]
        return cls(histories, target_pools)


    def __init__(self, histories, pools):
        """Build the inventory from a list of host histories.

        One `_PoolSetInventory` is created for each distinct
        `host_model` found in `histories`, and every history is
        recorded in the entry for its model.

        @param histories  List of `HostJobHistory` objects to be
                          included in the inventory.
        @param pools      The pools to be tracked for every model.

        """
        models = {h.host_model for h in histories}
        self._modeldata = {model: _PoolSetInventory(pools) for model in models}
        self._dut_count = len(histories)
        for h in histories:
            self._modeldata[h.host_model].record_host(h)
        self._boards = {h.host_board for h in histories}


    def __getitem__(self, key):
        return self._modeldata[key]


    def __len__(self):
        return len(self._modeldata)


    def __iter__(self):
        return iter(self._modeldata)


    def reportable_items(self, spare_pool=SPARE_POOL):
        """Iterate over all items subject to reporting.

        Yields the contents of `self.iteritems()` filtered to include
        only reportable models.  A model is reportable if it has DUTs in
        both `spare_pool` and at least one other pool.

        @param spare_pool  The spare pool to be tested for reporting.
        """
        for model, histories in self.iteritems():
            spares = histories.get_total(spare_pool)
            total = histories.get_total()
            # `spares == total` means every DUT is a spare, i.e. the
            # model has no DUTs in any other pool.
            if spares != 0 and spares != total:
                yield model, histories


    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count


    def get_num_models(self):
        """Return the total number of models in the inventory."""
        return len(self)


    def get_pool_models(self, pool):
        """Return all models in `pool`.

        @param pool The pool to be inventoried for models.
        @return The set of models with at least one DUT in `pool`.
        """
        return {m for m, h in self.iteritems() if h.get_total(pool)}


    def get_boards(self):
        """Return the set of boards of all DUTs in the inventory."""
        return self._boards
545
546
def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    Take the given list of `HostJobHistory` objects, separate it
    into a list per lab, and sort each lab's list by location.  The
    order of sorting within a lab is
      * By row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Return a list of the sorted lists.

    DUTs whose hostname doesn't match `_HOSTNAME_PATTERN` have no
    known location, and are omitted from the result.

    Implementation note: host locations are compared as
    `(row, rack, host)` tuples of integers, so the ordering is
    correct no matter how large the individual numbers are.

    @param inventory_list  List of `HostJobHistory` objects to be
                           sorted.
    @return A list of sorted lists of DUTs.

    """
    lab_lists = {}
    for history in inventory_list:
        location = _HOSTNAME_PATTERN.match(history.host.hostname)
        if location is None:
            continue
        lab = location.group(1)
        key = tuple(int(idx) for idx in location.group(2, 3, 4))
        lab_lists.setdefault(lab, []).append((key, history))
    return_list = []
    for dut_list in lab_lists.values():
        # Sort on the location key only; the sort is stable, so DUTs
        # at the same location keep their input order.
        dut_list.sort(key=lambda t: t[0])
        return_list.append([history for _, history in dut_list])
    return return_list
582
583
def _score_repair_set(buffer_counts, repair_list):
    """Compute a numeric rating for a proposed set of DUT repairs.

    The function works out what each model's spares buffer would be
    once every DUT in `repair_list` has been repaired, and then
    condenses the resulting buffer counts into a single score built
    from two numbers:
      * The worst (smallest) buffer count of any model; higher is
        better.  This is the more significant number.
      * How many models sit at that worst count; lower is better.
        This is the less significant number.

    Implementation note:  The score could fail to reflect the intended
    criteria if there are more than 1000 models in the inventory.

    @param buffer_counts  A dictionary mapping model names to the
                          size of the model's spares buffer.
    @param repair_list    A list of `HostJobHistory` objects for the
                          DUTs to be repaired.
    @return A numeric score.
    """
    _NMODELS = 1000
    # Group the proposed repairs by model, so that the number of
    # repaired DUTs per model can be looked up.
    repaired_pools = set(h.host_pool for h in repair_list)
    repaired = _LabInventory(repair_list, repaired_pools)
    # Post-repair buffer count of every model.  Model names play no
    # part in the score, so only the counts are kept.
    new_counts = []
    for model, count in buffer_counts.iteritems():
        gained = repaired[model].get_total() if model in repaired else 0
        new_counts.append(count + gained)
    # After sorting, the worst case is the first entry; then count
    # how many models share it.
    new_counts.sort()
    worst_count = new_counts[0]
    num_worst = new_counts.count(worst_count)
    return _NMODELS * worst_count - num_worst
635
636
def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be repaired.
    The list of DUTs is selected based on these criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical proximity.
      * DUTs for models with a low spares buffer are more important than
        DUTs with larger buffers.

    The algorithm used will guarantee that at least one DUT from a model
    with the lowest spares buffer will be recommended.  If the worst
    spares buffer number is shared by more than one model, the algorithm
    will tend to prefer repair sets that include more of those models
    over sets that cover fewer models.

    If there is nothing to recommend (no reportable model has a
    broken DUT with a location-based hostname), the message consists
    of the headings only.

    @param inventory      `_LabInventory` object from which to generate
                          recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.

    """
    logging.debug('Creating DUT repair recommendations')
    model_buffer_counts = {}
    broken_list = []
    for model, counts in inventory.reportable_items():
        logging.debug('Listing failed DUTs for %s', model)
        if counts.get_broken() != 0:
            model_buffer_counts[model] = counts.get_spares_buffer()
            broken_list.extend(counts.get_broken_list())
    # N.B. The logic inside this loop may seem complicated, but
    # simplification is hard:
    #   * Calculating an initial recommendation outside of
    #     the loop likely would make things more complicated,
    #     not less.
    #   * It's necessary to calculate an initial lab slice once per
    #     lab _before_ the while loop, in case the number of broken
    #     DUTs in a lab is less than `num_recommend`.
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        # Slide a window of `num_recommend` adjacent DUTs along the
        # lab's location-sorted list, keeping the best-scoring one.
        start = 0
        end = num_recommend
        lab_slice = lab_duts[start : end]
        lab_score = _score_repair_set(model_buffer_counts, lab_slice)
        while end < len(lab_duts):
            start += 1
            end += 1
            new_slice = lab_duts[start : end]
            new_score = _score_repair_set(model_buffer_counts, new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    # With no located broken DUTs the loop above never runs; report
    # an empty recommendation instead of iterating over `None`.
    if recommendation is None:
        recommendation = []
    # N.B. The trailing space in `line_fmt` is mandatory:  Without it,
    # Gmail will parse the URL wrong.  Don't ask.  If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n    %s '
    message = ['Repair recommendations:\n',
               line_fmt % ('Hostname', 'Model', 'Servo?', 'Logs URL')]
    for h in recommendation:
        servo_name = servo_host.make_servo_hostname(h.host.hostname)
        servo_present = utils.host_is_in_lab_zone(servo_name)
        _, event = h.last_diagnosis()
        line = line_fmt % (
                h.host.hostname, h.host_model,
                'Yes' if servo_present else 'No', event.job_url)
        message.append(line)
    return '\n'.join(message)
708
709
def _generate_model_inventory_message(inventory):
    """Generate the "model inventory" e-mail message.

    The model inventory is a list by model summarizing the number of
    working, broken, and idle DUTs, and the total shortfall or surplus
    of working devices relative to the minimum critical pool
    requirement.

    The report omits models with no DUTs in the spare pool or with no
    DUTs in a critical pool.  The summary totals cover every
    remaining model, but the per-model table lists only the models
    with at least one broken DUT.

    If the inventory has no reportable DUTs at all, every total and
    percentage in the summary is reported as 0.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating model inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_models = 0
    ntotal_models = 0
    summaries = []
    column_names = (
        'Model', 'Avail', 'Bad', 'Idle', 'Good', 'Spare', 'Total')
    for model, counts in inventory.reportable_items():
        logging.debug('Counting %2d DUTS for model %s',
                      counts.get_total(), model)
        # Summary elements laid out in the same order as the column
        # headers:
        #     Model Avail   Bad  Idle  Good  Spare Total
        #      e[0]  e[1]  e[2]  e[3]  e[4]  e[5]  e[6]
        element = (model,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(SPARE_POOL),
                   counts.get_total())
        if element[2]:
            summaries.append(element)
            nbroken_models += 1
        ntotal_models += 1
        nbroken += element[2]
        nidle += element[3]
        nworking += element[4]
    ntotal = nworking + nbroken + nidle
    # Worst spares buffer first; ties go to the model with more
    # broken DUTs.
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        # Calculated by subtraction, so that the three rounded
        # percentages always add up to exactly 100.
        working_percent = 100 - broken_percent - idle_percent
    else:
        # Nothing to report on; avoid dividing by zero.
        broken_percent = idle_percent = working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Models with failures: %d' % nbroken_models,
               'Models in inventory:  %d' % ntotal_models,
               '', '',
               'Full model inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % column_names]
    message.extend(
            ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)
778
779
# Preamble placed at the top of the "pool inventory" e-mail message.
_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies:  All models shown below are at
less than full strength, please take action to resolve the issues.
Once you're satisfied that failures won't recur, failed DUTs can
be replaced with spares by running `balance_pool`.  Detailed
instructions can be found here:
    http://go/cros-manage-duts
'''
788
789
def _generate_pool_inventory_message(inventory):
    """Build the text of the "pool inventory" e-mail message.

    For each pool in `CRITICAL_POOLS`, the message holds a table by
    model of the number of broken, idle, and working DUTs in that
    pool, plus the pool's total for the model.  A model appears in a
    pool's table only if it has at least one broken or idle DUT in
    that pool; models are listed with the most broken DUTs first.

    N.B. For sample output text formatted as users can expect to see
    it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating pool inventory')
    message = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(CRITICAL_POOLS):
        # Every pool section after the first is set off from the
        # previous one by a blank line.
        separator = '\n' if index else ''
        message.append(
            '%sStatus for pool:%s, by model:' % (separator, pool))
        message.append(
            '%-20s   %5s %5s %5s %5s' % (
                'Model', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # A model with neither broken nor idle DUTs is at full
            # strength, and isn't reported.
            if broken or idle:
                working = counts.get_working(pool)
                total = counts.get_total(pool)
                rows.append((model, broken, idle, working, total))
        if not rows:
            message.append('(All models at full strength)')
            continue
        rows.sort(key=lambda row: -row[1])
        for row in rows:
            message.append('%-20s   %5d %5d %5d %5d' % row)
    return '\n'.join(message)
832
833
# Preamble placed at the top of the "idle inventory" section by
# `_generate_idle_inventory_message()`.
_IDLE_INVENTORY_HEADER = (
    "Notice to Infrastructure deputies:  The hosts shown below haven't\n"
    'run any jobs for at least 24 hours. Please check each host; locked\n'
    'hosts should normally be unlocked; stuck jobs should normally be\n'
    'aborted.\n')
840
841
def _generate_idle_inventory_message(inventory):
    """Generate the "idle inventory" e-mail message.

    The message starts with `_IDLE_INVENTORY_HEADER`, followed by a
    table of idle hosts giving each host's name, model, and pool.
    Hosts are gathered from every pool in `MANAGED_POOLS`, using the
    list returned by each model's `get_idle_list()`.  When no idle
    hosts are found, the table is replaced by "(No idle DUTs)".

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating idle inventory')
    row_format = '%-30s %-20s %s'
    idle_rows = []
    for pool in MANAGED_POOLS:
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            for dut in counts.get_idle_list(pool):
                idle_rows.append((dut.host.hostname, model, pool))
    lines = [_IDLE_INVENTORY_HEADER,
             'Idle Host List:',
             row_format % ('Hostname', 'Model', 'Pool')]
    if idle_rows:
        lines.extend(row_format % row for row in idle_rows)
    else:
        lines.append('(No idle DUTs)')
    return '\n'.join(lines)
871
872
def _send_email(arguments, tag, subject, recipients, body):
    """Send an inventory e-mail message.

    The message is logged in the selected log directory using `tag` for
    the file name.

    If the --debug option was requested, the message is neither logged
    nor sent, but merely printed on stdout.

    Failures to write the log copy or to send the message are logged
    as errors and otherwise ignored; a failure to write the log copy
    does not prevent the attempt to send the message.

    @param arguments   Parsed command-line options.
    @param tag         Tag identifying the inventory for logging
                       purposes.
    @param subject     E-mail Subject: header line.
    @param recipients  E-mail addresses for the To: header line.
    @param body        E-mail message body.
    """
    logging.debug('Generating email: "%s"', subject)
    all_recipients = ', '.join(recipients)
    report_body = '\n'.join([
            'To: %s' % all_recipients,
            'Subject: %s' % subject,
            '', body, ''])
    if arguments.debug:
        # With a single parenthesized argument this behaves the same
        # whether `print` is a statement or a function.
        print(report_body)
        return
    filename = os.path.join(arguments.logdir, tag)
    try:
        # The `with` statement guarantees the file is closed even if
        # the write fails part way through.
        with open(filename, 'w') as report_file:
            report_file.write(report_body)
    except EnvironmentError as e:
        logging.error('Failed to write %s:  %s', filename, e)
    try:
        gmail_lib.send_email(all_recipients, subject, body)
    except Exception as e:
        # Sending is best-effort:  report the failure and carry on so
        # that any remaining reports can still be produced.
        logging.error('Failed to send e-mail to %s:  %s',
                      all_recipients, e)
910
911
def _populate_model_counts(inventory):
    """Gather model counts while providing interactive feedback.

    Gathering the status of all individual DUTs in the lab can take
    considerable time (~30 minutes at the time of this writing).
    Normally, we pay that cost by querying as we go.  However, with
    the `--debug` option, we expect a human being to be watching the
    progress in real time.  So, we force the first (expensive) queries
    to happen up front, and provide simple ASCII output on sys.stdout
    to show a progress bar and results.

    One character is written per model:  '+' for every model whose
    position ends in 5, the tens digit for every tenth model, and '.'
    otherwise.  The total number of broken DUTs is printed at the end.

    @param inventory  `_LabInventory` object from which to gather
                      counts.
    """
    total_broken = 0
    for position, counts in enumerate(inventory.itervalues(), 1):
        tens, ones = divmod(position, 10)
        if ones == 5:
            marker = '+'
        elif ones == 0:
            marker = '%d' % (tens % 10)
        else:
            marker = '.'
        sys.stdout.write(marker)
        sys.stdout.flush()
        # This next call is where all the time goes - it forces all of a
        # model's `HostJobHistory` objects to query the database and
        # cache their results.
        total_broken += counts.get_broken()
    sys.stdout.write('\n')
    sys.stdout.write('Found %d broken DUTs\n' % total_broken)
944
945
def _perform_model_inventory(arguments, inventory, timestamp):
    """Perform the model inventory report.

    The e-mailed report is made of these sections, in order:
      * Repair recommendations from
        `_generate_repair_recommendation()`.  This section is
        optional, and only appears if the `--recommend` option was
        given a non-zero value.
      * The model inventory message from
        `_generate_model_inventory_message()`.

    The report is sent to the `--model-notify` addresses.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp
                      in logs and email output.
    """
    sections = []
    if arguments.recommend:
        sections.append(
            _generate_repair_recommendation(
                inventory, arguments.recommend) + '\n\n\n')
    sections.append(_generate_model_inventory_message(inventory))
    _send_email(arguments,
                'models-%s.txt' % timestamp,
                'DUT model inventory %s' % timestamp,
                arguments.model_notify,
                ''.join(sections))
973
974
def _perform_pool_inventory(arguments, inventory, timestamp):
    """Perform the pool inventory report.

    The e-mailed report is made of these sections, in order:
      * The pool inventory message from
        `_generate_pool_inventory_message()`.
      * The idle host list from
        `_generate_idle_inventory_message()`.

    The report is sent to the `--pool-notify` addresses.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp in
                      logs and email output.
    """
    sections = [
        _generate_pool_inventory_message(inventory),
        _generate_idle_inventory_message(inventory),
    ]
    _send_email(arguments,
                'pools-%s.txt' % timestamp,
                'DUT pool inventory %s' % timestamp,
                arguments.pool_notify,
                '\n\n\n'.join(sections))
997
998
def _dut_in_repair_loop(history):
    """Return whether a DUT's history indicates a repair loop.

    A DUT is considered looping if it runs no tests, and no tasks pass
    other than repair tasks.  The scan reports a loop once it has seen
    `_REPAIR_LOOP_THRESHOLD` 'Repair' tasks without meeting any task
    that rules a loop out.

    @param history  An instance of `status_history.HostJobHistory` to be
                    scanned for a repair loop.  The caller guarantees
                    that this history corresponds to a working DUT.
    @returns  `True` if the DUT's most recent history indicates a
              repair loop, `False` otherwise (including when the
              history ends before the threshold is reached).
    """
    # Our caller passes only histories for working DUTs; that means
    # we've already paid the cost of fetching the diagnosis task, and
    # we know that the task was successful.  The diagnosis task will be
    # one of the tasks we must scan to find a loop, so if the task isn't
    # a repair task, then our history includes a successful non-repair
    # task, and we're not looping.
    #
    # The for loop below is very expensive, because it must fetch the
    # full history, regardless of how many tasks we examine.  At the
    # time of this writing, this check against the diagnosis task
    # reduces the cost of finding loops in the full inventory from hours
    # to minutes.
    if history.last_diagnosis()[1].name != 'Repair':
        return False
    repair_ok_count = 0
    for task in history:
        if not task.is_special:
            # This is a test, so we're not looping.
            return False
        if task.diagnosis == status_history.BROKEN:
            # Failed a repair, so we're not looping.
            return False
        if (task.diagnosis == status_history.WORKING
                and task.name != 'Repair'):
            # Non-repair task succeeded, so we're not looping.
            return False
        # At this point, we have either a failed non-repair task, or
        # a successful repair.
        if task.name == 'Repair':
            repair_ok_count += 1
            if repair_ok_count >= _REPAIR_LOOP_THRESHOLD:
                return True
    # The history ran out before enough repairs were seen to call it
    # a loop.  Say so explicitly rather than falling off the end with
    # an implicit `None`.
    return False
1043
1044
def _perform_repair_loop_report(arguments, inventory):
    """Scan the inventory for DUTs stuck in a repair loop.

    This routine walks through the working DUTs of the given
    inventory and applies `_dut_in_repair_loop()` to each one's
    history.  Every DUT found looping is logged, and reported by
    setting the 'chromeos/autotest/inventory/repair_loops' boolean
    metric with the DUT's hostname, model, and pool as fields.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    """
    loop_presence = metrics.BooleanMetric(
        'chromeos/autotest/inventory/repair_loops',
        'DUTs stuck in repair loops')
    logging.info('Scanning for DUTs in repair loops.')
    for counts in inventory.itervalues():
        for history in counts.get_working_list():
            # Managed DUTs with names that don't match
            # _HOSTNAME_PATTERN shouldn't be possible.  However, we
            # don't want arbitrary strings being attached to the
            # 'dut_hostname' field, so for safety, we exclude all
            # anomalies before doing the (expensive) loop check.
            if (_HOSTNAME_PATTERN.match(history.hostname)
                    and _dut_in_repair_loop(history)):
                fields = {'dut_hostname': history.hostname,
                          'model': history.host_model,
                          'pool': history.host_pool}
                logging.info('Looping DUT: %(dut_hostname)s, '
                             'model: %(model)s, pool: %(pool)s',
                             fields)
                loop_presence.set(True, fields=fields)
1077
1078
def _log_startup(arguments, startup_time):
    """Log the start of this inventory run.

    Emit debug-level log messages announcing the run and which e-mail
    reports it will include.  Return a string based on `startup_time`
    that will be used to identify this run in log files and e-mail
    messages.  The string is the local time formatted as
    '%Y-%m-%d.%H'.

    @param arguments      Command-line arguments as returned by
                          `ArgumentParser`.
    @param startup_time   A UNIX timestamp marking the moment when
                          this inventory run began.
    @returns  A timestamp string that will be used to identify this run
              in logs and email output.
    """
    run_started = time.localtime(startup_time)
    timestamp = time.strftime('%Y-%m-%d.%H', run_started)
    logging.debug('Starting lab inventory for %s', timestamp)
    notes = []
    if arguments.model_notify:
        # Repair recommendations are part of the model report, so
        # they're only announced when that report is requested.
        if arguments.recommend:
            notes.append('Will include repair recommendations')
        notes.append('Will include model inventory')
    if arguments.pool_notify:
        notes.append('Will include pool inventory')
    for note in notes:
        logging.debug(note)
    return timestamp
1101
1102
def _create_inventory(arguments, end_time):
    """Create the `_LabInventory` instance to use for reporting.

    The inventory covers the `--duration` hours leading up to
    `end_time`, restricted to the models named on the command line.
    The number of hosts and models found is logged.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    @param end_time   A UNIX timestamp for the end of the time range
                      to be searched in this inventory run.
    @return The newly created `_LabInventory` object.
    """
    seconds_back = arguments.duration * 60 * 60
    start_time = end_time - seconds_back
    afe = frontend_wrappers.RetryingAFE(server=None)
    inventory = _LabInventory.create_inventory(
            afe, start_time, end_time, arguments.modelnames)
    num_duts = inventory.get_num_duts()
    num_models = inventory.get_num_models()
    logging.info('Found %d hosts across %d models',
                 num_duts, num_models)
    return inventory
1117
1118
def _perform_inventory_reports(arguments):
    """Perform all inventory checks requested on the command line.

    Create the initial inventory and run through the inventory reports
    as called for by the parsed command-line arguments:  the model
    report, then the pool report, then the repair loop scan.  With
    `--debug`, the model counts are gathered up front with progress
    feedback on stdout.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    """
    startup_time = time.time()
    timestamp = _log_startup(arguments, startup_time)
    inventory = _create_inventory(arguments, startup_time)
    if arguments.debug:
        _populate_model_counts(inventory)
    # An e-mail report runs only when its notification list is
    # non-empty.
    email_reports = (
        (arguments.model_notify, _perform_model_inventory),
        (arguments.pool_notify, _perform_pool_inventory),
    )
    for recipients, perform_report in email_reports:
        if recipients:
            perform_report(arguments, inventory, timestamp)
    if arguments.repair_loops:
        _perform_repair_loop_report(arguments, inventory)
1139
1140
def _separate_email_addresses(address_list):
    """Parse a list of comma-separated lists of e-mail addresses.

    Surrounding whitespace is stripped from every address.  Empty
    entries, such as those produced by a trailing or doubled comma
    or by an empty option value, are dropped so that they can't end
    up as blank recipients.

    @param address_list  A list of strings containing comma
                         separated e-mail addresses.
    @return A list of the individual e-mail addresses, in the order
            given.

    """
    addresses = []
    for arg in address_list:
        for email in arg.split(','):
            email = email.strip()
            if email:
                addresses.append(email)
    return addresses
1153
1154
def _verify_arguments(arguments):
    """Validate command-line arguments.

    Join comma separated e-mail addresses for `--model-notify` and
    `--pool-notify` in separate option arguments into a single list;
    the lists in `arguments` are replaced in place.

    For non-debug uses, require that at least one inventory report be
    requested.  For debug, if a report isn't specified, treat it as "run
    all the reports."

    The return value indicates success or failure; in the case of
    failure, we also write an error message to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.

    """
    arguments.model_notify = _separate_email_addresses(
            arguments.model_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)
    requested = (arguments.model_notify, arguments.pool_notify,
                 arguments.repair_loops)
    if any(requested):
        return True
    if arguments.debug:
        # We want to run all the e-mail reports.  An empty notify
        # list will cause a report to be skipped, so make sure the
        # lists are non-empty.
        arguments.model_notify = ['']
        arguments.pool_notify = ['']
        return True
    sys.stderr.write(
        'Must request at least one report via --model-notify, '
        '--pool-notify, or --repair-loops\n')
    return False
1192
1193
def _get_default_logdir(script):
    """Get the default directory for the `--logdir` option.

    The default is the `_LOGDIR` entry in the parent of the directory
    that contains this script.

    @param script  Path to this script file.
    @return A path to a directory.

    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)
1207
1208
def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and check the result with `_verify_arguments()`.

    @param argv Standard command line argument vector; argv[0] is
                assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or `None`
            if the parsed arguments fail verification.

    """
    cli = argparse.ArgumentParser(
        prog=argv[0],
        description='Gather and report lab inventory statistics')
    add = cli.add_argument
    add('-d', '--duration', type=int, default=_DEFAULT_DURATION,
        metavar='HOURS',
        help=('number of hours back to search for status (default: %d)'
              % _DEFAULT_DURATION))
    add('--model-notify', action='append', default=[],
        metavar='ADDRESS',
        help=('Generate model inventory message, and send it '
              'to the given e-mail address(es)'))
    add('--pool-notify', action='append', default=[],
        metavar='ADDRESS',
        help=('Generate pool inventory message, and send it '
              'to the given address(es)'))
    add('-r', '--recommend', type=int, default=None,
        help=('Specify how many DUTs should be recommended for '
              'repair (default: no recommendation)'))
    add('--repair-loops', action='store_true',
        help='Check for devices stuck in repair loops.')
    add('--debug-metrics', action='store_true',
        help=('Include debug information about the metrics that '
              'would be reported '))
    add('--debug', action='store_true',
        help='Print e-mail messages on stdout without sending them.')
    add('--logdir', default=_get_default_logdir(argv[0]),
        help='Directory where logs will be written.')
    add('modelnames', nargs='*', metavar='MODEL',
        help='names of models to report on (default: all models)')
    arguments = cli.parse_args(argv[1:])
    return arguments if _verify_arguments(arguments) else None
1258
1259
def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was provided on
    the command line.
      * Without the option, we configure the logging to capture all
        potentially relevant events (DEBUG level) in a log file under
        `--logdir`, creating the directory if it doesn't exist.  The
        log file is configured to rotate once a week (`when='W4'`,
        i.e. on Friday), preserving 13 old files, or ~3 months worth
        of history.
      * With the option, we expect stdout to contain other
        human-readable output (including the contents of the e-mail
        messages), so we restrict the output to INFO level.

    For convenience, when `--debug` is on, the logging format has
    no adornments, so that a call like `logging.info(msg)` simply writes
    `msg` to stdout, plus a trailing newline.

    Any handlers already installed on the root logger are removed, so
    that the handler configured here is the only one.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        if not os.path.exists(arguments.logdir):
            os.mkdir(arguments.logdir)
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        handler = logging.handlers.TimedRotatingFileHandler(
                logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a copy:  removeHandler() mutates the list, and
    # walking the live list would skip every other handler.
    for h in list(root_logger.handlers):
        root_logger.removeHandler(h)
    root_logger.addHandler(handler)
1303
1304
def main(argv):
    """Standard main routine.

    Parse the command line, configure logging, and run the requested
    inventory reports.  The process exits with status 1 if the
    command line is rejected.  Unless `--debug` is given without
    `--debug-metrics`, the reports run with ts_mon metrics set up,
    and the metrics are flushed afterwards.  A keyboard interrupt
    ends the run quietly; any other exception raised by the reports
    is logged rather than propagated.

    @param argv  Command line arguments, including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)
    try:
        if arguments.debug and not arguments.debug_metrics:
            # Plain debug run:  no metrics reporting at all.
            _perform_inventory_reports(arguments)
        else:
            # With --debug-metrics the metrics go to /dev/null
            # instead of being reported normally.
            if arguments.debug_metrics:
                metrics_file = '/dev/null'
            else:
                metrics_file = None
            with site_utils.SetupTsMonGlobalState(
                    'repair_loops', debug_file=metrics_file,
                    auto_flush=False):
                _perform_inventory_reports(arguments)
            metrics.Flush()
    except KeyboardInterrupt:
        pass
    except EnvironmentError as e:
        logging.exception('Unexpected OS error: %s', e)
    except Exception as e:
        logging.exception('Unexpected exception: %s', e)
1330
1331
def get_inventory(afe):
    """Create a lab inventory covering the most recent 24 hours.

    @param afe  AFE object passed through to
                `_LabInventory.create_inventory()`.
    @return The object returned by `_LabInventory.create_inventory()`
            for the time range from 24 hours ago until now.
    """
    seconds_per_day = 24 * 60 * 60
    now = int(time.time())
    return _LabInventory.create_inventory(
            afe, now - seconds_per_day, now)
1336
1337
def get_managed_boards(afe):
    """Return the boards found in a fresh 24-hour lab inventory.

    @param afe  AFE object used to create the inventory.
    @return Whatever the inventory's `get_boards()` method returns
            for the inventory created by `get_inventory()`.
    """
    inventory = get_inventory(afe)
    return inventory.get_boards()
1340
1341
# Run the inventory only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == '__main__':
    main(sys.argv)
1344