• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python
2# Copyright 2015 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Create e-mail reports of the Lab's DUT inventory.
7
8Gathers a list of all DUTs of interest in the Lab, segregated by
9board and pool, and determines whether each DUT is working or
10broken.  Then, send one or more e-mail reports summarizing the
11status to e-mail addresses provided on the command line.
12
13usage:  lab_inventory.py [ options ] [ board ... ]
14
15Options:
16--duration / -d <hours>
17    How far back in time to search job history to determine DUT
18    status.
19
20--board-notify <address>[,<address>]
21    Send the "board status" e-mail to all the specified e-mail
22    addresses.
23
24--pool-notify <address>[,<address>]
25    Send the "pool status" e-mail to all the specified e-mail
26    addresses.
27
28--recommend <number>
    When generating the "board status" e-mail, include a list of
    <number> specific DUTs to be recommended for repair.
31
32--logdir <directory>
33    Log progress and actions in a file under this directory.  Text
34    of any e-mail sent will also be logged in a timestamped file in
35    this directory.
36
37--debug
38    Suppress all logging and sending e-mail.  Instead, write the
39    output that would be generated onto stdout.
40
41<board> arguments:
42    With no arguments, gathers the status for all boards in the lab.
43    With one or more named boards on the command line, restricts
44    reporting to just those boards.
45
46"""
47
48
49import argparse
50import logging
51import logging.handlers
52import os
53import re
54import sys
55import time
56
57import common
58from autotest_lib.client.bin import utils
59from autotest_lib.client.common_lib import time_utils
60from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
61from autotest_lib.server.hosts import servo_host
62from autotest_lib.server.lib import status_history
63from autotest_lib.site_utils import gmail_lib
64from autotest_lib.site_utils.suite_scheduler import constants
65
66
# Pool names, re-exported from the suite scheduler's constants:
#   CRITICAL_POOLS - the pools reported on, one section per pool, by
#       the "pool inventory" message.
#   SPARE_POOL - the pool whose size is used when calculating a
#       board's spares buffer.
#   MANAGED_POOLS - the pools whose hosts are gathered into the
#       inventory; every `_BoardCounts` object has one entry per
#       pool in this collection.

CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS
SPARE_POOL = constants.Pools.SPARE_POOL
MANAGED_POOLS = constants.Pools.MANAGED_POOLS

# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from
#     monitoring by this script.  Currently, we're excluding any
#     'adb' host, because we're not ready to monitor Android or
#     Brillo hosts.

_EXCLUDED_LABELS = set(['adb'])

# _DEFAULT_DURATION:
#     Default value used for the --duration command line option.
#     Specifies how far back in time (in hours, per the option's
#     description) to search in order to determine DUT status.

_DEFAULT_DURATION = 24

# _LOGDIR:
#     Relative path used in the calculation of the default setting
#     for the --logdir option.  The full path is relative to
#     the root of the autotest directory, as determined from
#     sys.argv[0].
# _LOGFILE:
#     Basename of a file to which general log information will be
#     written.
# _LOG_FORMAT:
#     Format string for log messages.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

# Pattern describing location-based host names in the Chrome OS test
# labs.  Each DUT hostname designates the DUT's location:
#   * A lab (room) that's physically separated from other labs
#     (i.e. there's a door).
#   * A row (or aisle) of DUTs within the lab.
#   * A vertical rack of shelves on the row.
#   * A specific host on one shelf of the rack.
# The four capture groups are, in order: lab, row, rack and host.

_HOSTNAME_PATTERN = re.compile(
        r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

# Default entry for managed pools.  Passing this value as the `pool`
# argument of `_LabInventory.get_managed_boards()` selects boards that
# have DUTs in both the spare pool and at least one other pool, rather
# than the boards of one specific pool.

_MANAGED_POOL_DEFAULT = 'all_pools'
114
115
116class _PoolCounts(object):
117    """Maintains a set of `HostJobHistory` objects for a pool.
118
119    The collected history objects are nominally all part of a single
120    scheduling pool of DUTs.  The collection maintains a list of
121    working DUTs, a list of broken DUTs, and a list of all DUTs.
122
123    Performance note:  Certain methods in this class are potentially
124    expensive:
125      * `get_working()`
126      * `get_working_list()`
127      * `get_broken()`
128      * `get_broken_list()`
129      * `get_idle()`
130      * `get_idle_list()`
131    The first time any one of these methods is called, it causes
132    multiple RPC calls with a relatively expensive set of database
133    queries.  However, the results of the queries are cached in the
134    individual `HostJobHistory` objects, so only the first call
135    actually pays the full cost.
136
137    Additionally, `get_working_list()`, `get_broken_list()` and
138    `get_idle_list()` cache their return values to avoid recalculating
139    lists at every call; this caching is separate from the caching of RPC
140    results described above.
141
142    This class is deliberately constructed to delay the RPC cost
143    until the accessor methods are called (rather than to query in
144    `record_host()`) so that it's possible to construct a complete
145    `_LabInventory` without making the expensive queries at creation
146    time.  `_populate_board_counts()`, below, assumes this behavior.
147
148    """
149
150    def __init__(self):
151        self._histories = []
152        self._working_list = None
153        self._broken_list = None
154        self._idle_list = None
155
156
157    def record_host(self, host_history):
158        """Add one `HostJobHistory` object to the collection.
159
160        @param host_history The `HostJobHistory` object to be
161                            remembered.
162
163        """
164        self._working_list = None
165        self._broken_list = None
166        self._idle_list = None
167        self._histories.append(host_history)
168
169
170    def get_working_list(self):
171        """Return a list of all working DUTs in the pool.
172
173        Filter `self._histories` for histories where the last
174        diagnosis is `WORKING`.
175
176        Cache the result so that we only cacluate it once.
177
178        @return A list of HostJobHistory objects.
179
180        """
181        if self._working_list is None:
182            self._working_list = [h for h in self._histories
183                    if h.last_diagnosis()[0] == status_history.WORKING]
184        return self._working_list
185
186
187    def get_working(self):
188        """Return the number of working DUTs in the pool."""
189        return len(self.get_working_list())
190
191
192    def get_broken_list(self):
193        """Return a list of all broken DUTs in the pool.
194
195        Filter `self._histories` for histories where the last
196        diagnosis is `BROKEN`.
197
198        Cache the result so that we only cacluate it once.
199
200        @return A list of HostJobHistory objects.
201
202        """
203        if self._broken_list is None:
204            self._broken_list = [h for h in self._histories
205                    if h.last_diagnosis()[0] == status_history.BROKEN]
206        return self._broken_list
207
208
209    def get_broken(self):
210        """Return the number of broken DUTs in the pool."""
211        return len(self.get_broken_list())
212
213
214    def get_idle_list(self):
215        """Return a list of all idle DUTs in the pool.
216
217        Filter `self._histories` for histories where the last
218        diagnosis is `UNUSED` or `UNKNOWN`.
219
220        Cache the result so that we only cacluate it once.
221
222        @return A list of HostJobHistory objects.
223
224        """
225        idle_list = [status_history.UNUSED, status_history.UNKNOWN]
226        if self._idle_list is None:
227            self._idle_list = [h for h in self._histories
228                    if h.last_diagnosis()[0] in idle_list]
229        return self._idle_list
230
231
232    def get_idle(self):
233        """Return the number of idle DUTs in the pool."""
234        return len(self.get_idle_list())
235
236
237    def get_total(self):
238        """Return the total number of DUTs in the pool."""
239        return len(self._histories)
240
241
242class _BoardCounts(object):
243    """Maintains a set of `HostJobHistory` objects for a board.
244
245    The collected history objects are nominally all of the same
246    board.  The collection maintains a count of working DUTs, a
247    count of broken DUTs, and a total count.  The counts can be
248    obtained either for a single pool, or as a total across all
249    pools.
250
251    DUTs in the collection must be assigned to one of the pools
252    in `_MANAGED_POOLS`.
253
254    The `get_working()` and `get_broken()` methods rely on the
255    methods of the same name in _PoolCounts, so the performance
256    note in _PoolCounts applies here as well.
257
258    """
259
260    def __init__(self):
261        self._pools = {
262            pool: _PoolCounts() for pool in MANAGED_POOLS
263        }
264
265    def record_host(self, host_history):
266        """Add one `HostJobHistory` object to the collection.
267
268        @param host_history The `HostJobHistory` object to be
269                            remembered.
270
271        """
272        pool = host_history.host_pool
273        self._pools[pool].record_host(host_history)
274
275
276    def _count_pool(self, get_pool_count, pool=None):
277        """Internal helper to count hosts in a given pool.
278
279        The `get_pool_count` parameter is a function to calculate
280        the exact count of interest for the pool.
281
282        @param get_pool_count  Function to return a count from a
283                               _PoolCount object.
284        @param pool            The pool to be counted.  If `None`,
285                               return the total across all pools.
286
287        """
288        if pool is None:
289            return sum([get_pool_count(counts)
290                            for counts in self._pools.values()])
291        else:
292            return get_pool_count(self._pools[pool])
293
294
295    def get_working_list(self):
296        """Return a list of all working DUTs for the board.
297
298        Go through all HostJobHistory objects in the board's pools,
299        selecting the ones where the last diagnosis is `WORKING`.
300
301        @return A list of HostJobHistory objects.
302
303        """
304        l = []
305        for p in self._pools.values():
306            l.extend(p.get_working_list())
307        return l
308
309
310    def get_working(self, pool=None):
311        """Return the number of working DUTs in a pool.
312
313        @param pool  The pool to be counted.  If `None`, return the
314                     total across all pools.
315
316        @return The total number of working DUTs in the selected
317                pool(s).
318        """
319        return self._count_pool(_PoolCounts.get_working, pool)
320
321
322    def get_broken_list(self):
323        """Return a list of all broken DUTs for the board.
324
325        Go through all HostJobHistory objects in the board's pools,
326        selecting the ones where the last diagnosis is `BROKEN`.
327
328        @return A list of HostJobHistory objects.
329
330        """
331        l = []
332        for p in self._pools.values():
333            l.extend(p.get_broken_list())
334        return l
335
336
337    def get_broken(self, pool=None):
338        """Return the number of broken DUTs in a pool.
339
340        @param pool  The pool to be counted.  If `None`, return the
341                     total across all pools.
342
343        @return The total number of broken DUTs in the selected pool(s).
344        """
345        return self._count_pool(_PoolCounts.get_broken, pool)
346
347
348    def get_idle_list(self, pool=None):
349        """Return a list of all idle DUTs for the board.
350
351        Go through all HostJobHistory objects in the board's pools,
352        selecting the ones where the last diagnosis is `UNUSED` or `UNKNOWN`.
353
354        @param pool: The pool to be counted. If `None`, return the total list
355                     across all pools.
356
357        @return A list of HostJobHistory objects.
358
359        """
360        if pool is None:
361            l = []
362            for p in self._pools.values():
363                l.extend(p.get_idle_list())
364            return l
365        else:
366            return _PoolCounts.get_idle_list(self._pools[pool])
367
368
369    def get_idle(self, pool=None):
370        """Return the number of idle DUTs in a pool.
371
372        @param pool: The pool to be counted. If `None`, return the total
373                     across all pools.
374
375        @return The total number of idle DUTs in the selected pool(s).
376        """
377        return self._count_pool(_PoolCounts.get_idle, pool)
378
379
380    def get_spares_buffer(self):
381        """Return the the nominal number of working spares.
382
383        Calculates and returns how many working spares there would
384        be in the spares pool if all broken DUTs were in the spares
385        pool.  This number may be negative, indicating a shortfall
386        in the critical pools.
387
388        @return The total number DUTs in the spares pool, less the total
389                number of broken DUTs in all pools.
390        """
391        return self.get_total(SPARE_POOL) - self.get_broken()
392
393
394    def get_total(self, pool=None):
395        """Return the total number of DUTs in a pool.
396
397        @param pool  The pool to be counted.  If `None`, return the
398                     total across all pools.
399
400        @return The total number of DUTs in the selected pool(s).
401        """
402        return self._count_pool(_PoolCounts.get_total, pool)
403
404
class _LabInventory(dict):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    The collection is indexed by board.  Indexing returns the
    _BoardCounts object associated with the board.

    The collection is also iterable.  The iterator returns all the
    boards in the inventory, in unspecified order.

    """

    @staticmethod
    def _eligible_host(afehost):
        """Return whether this host is eligible for monitoring.

        Hosts with any label that's in `_EXCLUDED_LABELS` aren't
        eligible.

        @param afehost  The host to be tested for eligibility.
        @return `True` if none of the host's labels is excluded.
        """
        return _EXCLUDED_LABELS.isdisjoint(afehost.labels)


    @classmethod
    def create_inventory(cls, afe, start_time, end_time, boardlist=None):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects
        for all DUTs in the `MANAGED_POOLS` list.  If `boardlist`
        is supplied, the inventory will be restricted to only the
        given boards.

        Each host appears in the inventory at most once, even if a
        board is named more than once in `boardlist`.

        @param afe         AFE object for constructing the
                           `HostJobHistory` objects.
        @param start_time  Start time for the `HostJobHistory`
                           objects.
        @param end_time    End time for the `HostJobHistory`
                           objects.
        @param boardlist   List of boards to include.  If empty or
                           `None`, include all available boards.
        @return A `_LabInventory` object for the specified boards.

        """
        label_list = [constants.Labels.POOL_PREFIX + l
                          for l in MANAGED_POOLS]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if boardlist:
            # We're deliberately not checking host eligibility in this
            # code path.  This is a debug path, not used in production;
            # it may be useful to include ineligible hosts here.
            #
            # Hosts are selected in a single pass against the set of
            # wanted labels, so that a repeated board name can't
            # record (and count) the same host twice.
            board_labels = set(constants.Labels.BOARD_PREFIX + board
                               for board in boardlist)
            afehosts = [h for h in afehosts
                            if board_labels.intersection(h.labels)]
        else:
            afehosts = [h for h in afehosts if cls._eligible_host(h)]
        histories = [
                status_history.HostJobHistory(afe, host,
                                              start_time, end_time)
                for host in afehosts]
        return cls(histories)


    def __init__(self, histories):
        """Build the inventory from a list of histories.

        @param histories  Iterable of `HostJobHistory` objects.
                          Objects whose `host_board` is `None` are
                          ignored, and aren't counted by
                          `get_num_duts()`.
        """
        # N.B. The query that finds our hosts is restricted to those
        # with a valid pool: label, but doesn't check for a valid
        # board: label.  In some (insufficiently) rare cases, the
        # AFE hosts table has been known to (incorrectly) have DUTs
        # with a pool: but no board: label.  We explicitly exclude
        # those here.
        histories = [h for h in histories
                     if h.host_board is not None]
        boards = set(h.host_board for h in histories)
        initval = { board: _BoardCounts() for board in boards }
        super(_LabInventory, self).__init__(initval)
        self._dut_count = len(histories)
        # Cache of `get_managed_boards()` results, keyed by pool.
        self._managed_boards = {}
        for h in histories:
            self[h.host_board].record_host(h)


    def get_managed_boards(self, pool=_MANAGED_POOL_DEFAULT):
        """Return the set of "managed" boards.

        Operationally, saying a board is "managed" means that the
        board will be included in the "board" and "repair
        recommendations" reports.  That is, if there are failures in
        the board's inventory then lab techs will be asked to fix
        them without a separate ticket.

        For purposes of implementation, a board is "managed" if it
        has DUTs in both the spare and a non-spare (i.e. critical)
        pool.

        The result is cached per `pool` value; the same set object
        is returned on subsequent calls.

        @param pool: The specified pool for managed boards.
        @return A set of all the boards that have both spare and
                non-spare pools, unless the pool is specified,
                then the set of boards in that pool.
        """
        if self._managed_boards.get(pool) is None:
            managed = set()
            for board, counts in self.items():
                # Get the counts for all pools, otherwise get it for the
                # specified pool.
                if pool == _MANAGED_POOL_DEFAULT:
                    spares = counts.get_total(SPARE_POOL)
                    total = counts.get_total()
                    if spares != 0 and spares != total:
                        managed.add(board)
                elif counts.get_total(pool) != 0:
                    managed.add(board)
            # Only cache a fully computed set:  if a count lookup
            # raised above, nothing is stored for this pool.
            self._managed_boards[pool] = managed
        return self._managed_boards[pool]


    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count


    def get_num_boards(self):
        """Return the total number of boards in the inventory."""
        return len(self)
530
531
532def _sort_by_location(inventory_list):
533    """Return a list of DUTs, organized by location.
534
535    Take the given list of `HostJobHistory` objects, separate it
536    into a list per lab, and sort each lab's list by location.  The
537    order of sorting within a lab is
538      * By row number within the lab,
539      * then by rack number within the row,
540      * then by host shelf number within the rack.
541
542    Return a list of the sorted lists.
543
544    Implementation note: host locations are sorted by converting
545    each location into a base 100 number.  If row, rack or
546    host numbers exceed the range [0..99], then sorting will
547    break down.
548
549    @return A list of sorted lists of DUTs.
550
551    """
552    BASE = 100
553    lab_lists = {}
554    for history in inventory_list:
555        location = _HOSTNAME_PATTERN.match(history.host.hostname)
556        if location:
557            lab = location.group(1)
558            key = 0
559            for idx in location.group(2, 3, 4):
560                key = BASE * key + int(idx)
561            lab_lists.setdefault(lab, []).append((key, history))
562    return_list = []
563    for dut_list in lab_lists.values():
564        dut_list.sort(key=lambda t: t[0])
565        return_list.append([t[1] for t in dut_list])
566    return return_list
567
568
def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    `buffer_counts` is a dictionary mapping board names to the
    size of the board's spares buffer.

    `repair_list` is a list of DUTs to be repaired.

    This function calculates the new set of buffer counts that would
    result from the proposed repairs, and scores the new set using
    two numbers:
      * Worst case buffer count for any board (higher is better).
        This is the more significant number for comparison.
      * Number of boards at the worst case (lower is better).  This
        is the less significant number.

    Implementation note:  The score could fail to reflect the
    intended criteria if there are more than 1000 boards in the
    inventory.

    @param buffer_counts A dictionary mapping boards to buffer
                         counts.  Must contain at least one board.
    @param repair_list   A list of `HostJobHistory` objects for the
                         DUTs to be repaired.
    @return A numeric score.

    """
    _NBOARDS = 1000
    repair_inventory = _LabInventory(repair_list)
    # Calculate the buffer count each board would have after the
    # repairs.  The board names are discarded, as they don't
    # contribute to the final score.
    new_counts = []
    for board, buffer_count in buffer_counts.items():
        if board in repair_inventory:
            repaired = repair_inventory[board].get_total()
        else:
            repaired = 0
        new_counts.append(buffer_count + repaired)
    # Find the worst available spares count, and count how many
    # boards share that worst case.
    worst_count = min(new_counts)
    num_worst = new_counts.count(worst_count)
    return _NBOARDS * worst_count - num_worst
619
620
def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be
    repaired.  The list of DUTs is selected based on these
    criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical
        proximity.
      * DUTs for boards with a low spares buffer are more important
        than DUTs with larger buffers.

    The algorithm used will guarantee that at least one DUT from a
    board with the smallest spares buffer will be recommended.  If
    the worst spares buffer number is shared by more than one board,
    the algorithm will tend to prefer repair sets that include more
    of those boards over sets that cover fewer boards.

    If the managed boards have no broken DUTs with a recognized
    location, the message consists of the headings only.

    @param inventory      Inventory for generating recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.

    """
    logging.debug('Creating DUT repair recommendations')
    board_buffer_counts = {}
    broken_list = []
    for board in inventory.get_managed_boards():
        logging.debug('Listing failed DUTs for %s', board)
        counts = inventory[board]
        if counts.get_broken() != 0:
            board_buffer_counts[board] = counts.get_spares_buffer()
            broken_list.extend(counts.get_broken_list())
    # N.B. The logic inside this loop may seem complicated, but
    # simplification is hard:
    #   * Calculating an initial recommendation outside of
    #     the loop likely would make things more complicated,
    #     not less.
    #   * It's necessary to calculate an initial lab slice once per
    #     lab _before_ the while loop, in case the number of broken
    #     DUTs in a lab is less than `num_recommend`.
    #
    # The recommendation starts out empty (not `None`), so that the
    # message can still be generated when there's nothing to repair.
    recommendation = []
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        start = 0
        end = num_recommend
        lab_slice = lab_duts[start : end]
        lab_score = _score_repair_set(board_buffer_counts,
                                      lab_slice)
        # Slide a window of `num_recommend` neighboring DUTs along
        # the lab, keeping the first window with the best score.
        while end < len(lab_duts):
            start += 1
            end += 1
            new_slice = lab_duts[start : end]
            new_score = _score_repair_set(board_buffer_counts,
                                          new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if best_score is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    # N.B. The trailing space here is manadatory:  Without it, Gmail
    # will parse the URL wrong.  Don't ask.  If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n    %s '
    message = ['Repair recommendations:\n',
               line_fmt % ( 'Hostname', 'Board', 'Servo?', 'Logs URL')]
    for h in recommendation:
        servo_name = servo_host.make_servo_hostname(h.host.hostname)
        servo_present = utils.host_is_in_lab_zone(servo_name)
        _, event = h.last_diagnosis()
        line = line_fmt % (
                h.host.hostname, h.host_board,
                'Yes' if servo_present else 'No', event.job_url)
        message.append(line)
    return '\n'.join(message)
696
697
698def _generate_board_inventory_message(inventory):
699    """Generate the "board inventory" e-mail message.
700
701    The board inventory is a list by board summarizing the number
702    of working and broken DUTs, and the total shortfall or surplus
703    of working devices relative to the minimum critical pool
704    requirement.
705
706    The report omits boards with no DUTs in the spare pool or with
707    no DUTs in a critical pool.
708
709    N.B. For sample output text formattted as users can expect to
710    see it in e-mail and log files, refer to the unit tests.
711
712    @param inventory  _LabInventory object with the inventory to
713                      be reported on.
714    @return String with the inventory message to be sent.
715
716    """
717    logging.debug('Creating board inventory')
718    nworking = 0
719    nbroken = 0
720    nidle = 0
721    nbroken_boards = 0
722    ntotal_boards = 0
723    summaries = []
724    for board in inventory.get_managed_boards():
725        counts = inventory[board]
726        logging.debug('Counting %2d DUTS for board %s',
727                      counts.get_total(), board)
728        # Summary elements laid out in the same order as the text
729        # headers:
730        #     Board Avail   Bad  Idle  Good  Spare Total
731        #      e[0]  e[1]  e[2]  e[3]  e[4]  e[5]  e[6]
732        element = (board,
733                   counts.get_spares_buffer(),
734                   counts.get_broken(),
735                   counts.get_idle(),
736                   counts.get_working(),
737                   counts.get_total(SPARE_POOL),
738                   counts.get_total())
739        if element[2]:
740            summaries.append(element)
741            nbroken_boards += 1
742        ntotal_boards += 1
743        nbroken += element[2]
744        nidle += element[3]
745        nworking += element[4]
746    ntotal = nworking + nbroken + nidle
747    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
748    broken_percent = int(round(100.0 * nbroken / ntotal))
749    idle_percent = int(round(100.0 * nidle / ntotal))
750    working_percent = 100 - broken_percent - idle_percent
751    message = ['Summary of DUTs in inventory:',
752               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
753               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
754                   nbroken, broken_percent,
755                   nidle, idle_percent,
756                   nworking, working_percent,
757                   ntotal),
758               '',
759               'Boards with failures: %d' % nbroken_boards,
760               'Boards in inventory:  %d' % ntotal_boards,
761               '', '',
762               'Full board inventory:\n',
763               '%-22s %5s %5s %5s %5s %5s %5s' % (
764                   'Board', 'Avail', 'Bad', 'Idle', 'Good',
765                   'Spare', 'Total')]
766    message.extend(
767            ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
768    return '\n'.join(message)
769
770
# _POOL_INVENTORY_HEADER:
#     Fixed introductory text placed at the top of the "pool
#     inventory" e-mail by `_generate_pool_inventory_message()`.

_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies:  All boards shown below are at
less than full strength, please take action to resolve the issues.
Once you're satisified that failures won't recur, failed DUTs can
be replaced with spares by running `balance_pool`.  Detailed
instructions can be found here:
    http://go/cros-manage-duts
'''
779
780
def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    For every pool in `CRITICAL_POOLS`, the message has a section
    listing, by board, the number of broken, idle and working DUTs
    in that pool, and the pool's total.  A board is listed only if
    it has at least one broken or idle DUT in the pool; boards with
    the most broken DUTs come first.

    N.B. For sample output text formattted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating pool inventory')
    message = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(CRITICAL_POOLS):
        # Every section but the first is set off by a blank line.
        separator = '\n' if index else ''
        message.append(
            '%sStatus for pool:%s, by board:' % (separator, pool))
        message.append(
            '%-20s   %5s %5s %5s %5s' % (
                'Board', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for board, counts in inventory.items():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), board, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # boards at full strength are not reported
            if broken or idle:
                working = counts.get_working(pool)
                total = counts.get_total(pool)
                rows.append((board, broken, idle, working, total))
        if not rows:
            message.append('(All boards at full strength)')
            continue
        # Stable sort, most broken DUTs first.
        rows.sort(key=lambda row: row[1], reverse=True)
        for row in rows:
            message.append('%-20s   %5d %5d %5d %5d' % row)
    return '\n'.join(message)
825
826
# _IDLE_INVENTORY_HEADER:
#     Fixed introductory text placed at the top of the "idle
#     inventory" e-mail by `_generate_idle_inventory_message()`.
#     NOTE(review): the text says "24 hours", which matches
#     `_DEFAULT_DURATION`, but the search window can be changed with
#     --duration - confirm whether the wording should follow it.

_IDLE_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies:  The hosts shown below haven't
run any jobs for at least 24 hours. Please check each host; locked
hosts should normally be unlocked; stuck jobs should normally be
aborted.
'''
833
834
def _generate_idle_inventory_message(inventory):
    """Generate the "idle inventory" e-mail message.

    The idle inventory is a list of hosts, each shown with its board
    and pool, where the hosts are idle (`UNKNOWN` or `UNUSED`).
    Every board in the inventory is searched once for each pool in
    `MANAGED_POOLS`.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating idle inventory')
    host_format = '%-30s %-20s %s'
    lines = [
        _IDLE_INVENTORY_HEADER,
        'Idle Host List:',
        host_format % ('Hostname', 'Board', 'Pool'),
    ]
    idle_hosts = []
    for pool in MANAGED_POOLS:
        for board, counts in inventory.items():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), board, pool)
            for dut in counts.get_idle_list(pool):
                idle_hosts.append((dut.host.hostname, board, pool))
    if not idle_hosts:
        lines.append('(No idle DUTs)')
    else:
        for entry in idle_hosts:
            lines.append(host_format % entry)
    return '\n'.join(lines)
865
866
def _send_email(arguments, tag, subject, recipients, body):
    """Send an inventory e-mail message.

    The message is logged in the selected log directory using `tag`
    for the file name.

    If the --debug option was requested, the message is neither
    logged nor sent, but merely printed on stdout.

    An `EnvironmentError` raised while writing the log file, or any
    exception raised while sending the message, is reported with
    `logging.error()` and otherwise ignored, so that a failure to
    log the message does not prevent sending it.

    @param arguments   Parsed command-line options.
    @param tag         Tag identifying the inventory for logging
                       purposes.
    @param subject     E-mail Subject: header line.
    @param recipients  E-mail addresses for the To: header line.
    @param body        E-mail message body.

    """
    logging.debug('Generating email: "%s"', subject)
    all_recipients = ', '.join(recipients)
    report_body = '\n'.join([
            'To: %s' % all_recipients,
            'Subject: %s' % subject,
            '', body, ''])
    if arguments.debug:
        # A single parenthesized argument prints identically whether
        # `print` is a statement or a function.
        print(report_body)
        return
    filename = os.path.join(arguments.logdir, tag)
    try:
        # `with` guarantees the file is closed even when the write
        # fails part way through.
        with open(filename, 'w') as report_file:
            report_file.write(report_body)
    except EnvironmentError as e:
        logging.error('Failed to write %s:  %s', filename, e)
    try:
        gmail_lib.send_email(all_recipients, subject, body)
    except Exception as e:
        logging.error('Failed to send e-mail to %s:  %s',
                      all_recipients, e)
905
906
def _separate_email_addresses(address_list):
    """Parse a list of comma-separated lists of e-mail addresses.

    Whitespace around each address is stripped.  Empty entries, such
    as those produced by a trailing or doubled comma or by an empty
    option value, are dropped so that they can't show up as blank
    recipients.

    @param address_list  A list of strings containing comma
                         separated e-mail addresses.
    @return A list of the individual, non-empty e-mail addresses, in
            the order they were given.

    """
    newlist = []
    for arg in address_list:
        for email in arg.split(','):
            email = email.strip()
            # Skip the empty strings that split() yields for stray
            # commas and empty arguments.
            if email:
                newlist.append(email)
    return newlist
919
920
def _verify_arguments(arguments):
    """Validate command-line arguments.

    The `--board-notify` and `--pool-notify` options may each be
    given several times, and each occurrence may hold a comma
    separated list of addresses.  Each option's values are replaced
    here with one flat list of addresses.

    Outside of debug mode, at least one of the two options must
    name a recipient.  In debug mode, naming no recipient at all
    means "run all the reports."

    On failure, an error message is written to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.

    """
    for option in ('board_notify', 'pool_notify'):
        setattr(arguments, option,
                _separate_email_addresses(getattr(arguments, option)))
    if arguments.board_notify or arguments.pool_notify:
        return True
    if arguments.debug:
        # We want to run all the reports.  A report with an empty
        # notify list is skipped, so give each report a non-empty
        # placeholder list.
        arguments.board_notify = ['']
        arguments.pool_notify = ['']
        return True
    sys.stderr.write(
        'Must specify at least one of --board-notify or --pool-notify\n')
    return False
956
957
def _get_logdir(script):
    """Get the default directory for the `--logdir` option.

    The default is the `_LOGDIR` path joined onto the parent of the
    directory that contains this script.

    @param script  Path to this script file.
    @return A path to a directory.

    """
    script_dir = os.path.dirname(os.path.abspath(script))
    return os.path.join(os.path.dirname(script_dir), _LOGDIR)
971
972
def _parse_command(argv):
    """Parse the command line arguments.

    Build an argument parser for this command's syntax, parse the
    command line with it, and then check the result with
    `_verify_arguments()`.

    @param argv Standard command line argument vector; argv[0] is
                assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or
            `None` if the parsed arguments fail verification.

    """
    cmd_parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')
    # Each entry holds the positional and keyword arguments for one
    # add_argument() call; entries are registered in the order
    # listed.
    option_specs = [
        (('-d', '--duration'),
         dict(type=int, default=_DEFAULT_DURATION, metavar='HOURS',
              help=('number of hours back to search for status'
                    ' (default: %d)' % _DEFAULT_DURATION))),
        (('--board-notify',),
         dict(action='append', default=[], metavar='ADDRESS',
              help=('Generate board inventory message, '
                    'and send it to the given e-mail address(es)'))),
        (('--pool-notify',),
         dict(action='append', default=[], metavar='ADDRESS',
              help=('Generate pool inventory message, '
                    'and send it to the given address(es)'))),
        (('-r', '--recommend'),
         dict(type=int, default=None,
              help=('Specify how many DUTs should be '
                    'recommended for repair (default: no '
                    'recommendation)'))),
        (('--debug',),
         dict(action='store_true',
              help=('Print e-mail messages on stdout '
                    'without sending them.'))),
        (('--logdir',),
         dict(default=_get_logdir(argv[0]),
              help='Directory where logs will be written.')),
        (('boardnames',),
         dict(nargs='*', metavar='BOARD',
              help=('names of boards to report on '
                    '(default: all boards)'))),
    ]
    for names, settings in option_specs:
        cmd_parser.add_argument(*names, **settings)
    parsed = cmd_parser.parse_args(argv[1:])
    return parsed if _verify_arguments(parsed) else None
1017
1018
def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was
    provided on the command line.  Without the option, we log all
    messages at DEBUG level or above, and write them to a file in
    the directory specified by the `--logdir` option; the directory
    is created if it doesn't exist.  With the option, we write log
    messages to stdout; messages below INFO level are discarded.

    The log file is configured to rotate once a week on Friday
    evening, preserving ~3 months worth of history.

    Any handlers already installed on the root logger are removed,
    so that the handler created here is the only one left.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`

    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        if not os.path.exists(arguments.logdir):
            os.mkdir(arguments.logdir)
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        handler = logging.handlers.TimedRotatingFileHandler(
                logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a copy of the list:  removeHandler() deletes from
    # `root_logger.handlers` itself, and deleting from a list while
    # iterating over it skips every other entry.
    for h in list(root_logger.handlers):
        root_logger.removeHandler(h)
    root_logger.addHandler(handler)
1059
1060
def _populate_board_counts(inventory):
    """Gather board counts while providing interactive feedback.

    Gathering the status of all individual DUTs in the lab can take
    considerable time (~30 minutes at the time of this writing).

    Normally, that cost is paid piecemeal as the reports ask for
    each status.  When a human being may be watching, we instead
    force the first (expensive) queries to happen up front, and
    write a small ASCII progress bar to stdout:  one character per
    board, with a '+' at every count ending in 5 and the tens digit
    at every multiple of 10.  The bar is followed by the total
    number of broken DUTs found.

    @param inventory  _LabInventory object with the inventory to
                      be gathered.

    """
    broken_count = 0
    for position, counts in enumerate(inventory.values(), 1):
        last_digit = position % 10
        if last_digit == 0:
            marker = '%d' % ((position // 10) % 10)
        elif last_digit == 5:
            marker = '+'
        else:
            marker = '.'
        sys.stdout.write(marker)
        sys.stdout.flush()
        # This next call is where all the time goes - it forces all
        # of a board's HostJobHistory objects to query the database
        # and cache their results.
        broken_count += counts.get_broken()
    sys.stdout.write('\n')
    sys.stdout.write('Found %d broken DUTs\n' % broken_count)
1095
1096
def main(argv):
    """Standard main routine.

    Parse the command line, configure logging, gather the lab
    inventory, and then generate and send each report for which
    recipients were given.  Exits with status 1 if the command line
    is rejected.  A keyboard interrupt ends the run quietly; any
    other exception raised while reporting is logged rather than
    propagated.

    @param argv  Command line arguments including `sys.argv[0]`.
    """
    options = _parse_command(argv)
    if not options:
        sys.exit(1)
    _configure_logging(options)
    try:
        now = int(time.time())
        since = now - options.duration * 60 * 60
        stamp = time.strftime('%Y-%m-%d.%H', time.localtime(now))
        logging.debug('Starting lab inventory for %s', stamp)
        if options.board_notify:
            if options.recommend:
                logging.debug('Will include repair recommendations')
            logging.debug('Will include board inventory')
        if options.pool_notify:
            logging.debug('Will include pool inventory')

        afe = frontend_wrappers.RetryingAFE(server=None)
        inventory = _LabInventory.create_inventory(
                afe, since, now, options.boardnames)
        logging.info('Found %d hosts across %d boards',
                     inventory.get_num_duts(),
                     inventory.get_num_boards())

        if options.debug:
            _populate_board_counts(inventory)

        if options.board_notify:
            # The repair recommendation, when requested, goes ahead
            # of the board inventory in the same message.
            recommendation = ''
            if options.recommend:
                recommendation = _generate_repair_recommendation(
                        inventory, options.recommend) + '\n\n\n'
            board_report = _generate_board_inventory_message(inventory)
            _send_email(options,
                        'boards-%s.txt' % stamp,
                        'DUT board inventory %s' % stamp,
                        options.board_notify,
                        recommendation + board_report)

        if options.pool_notify:
            # The pool message carries both the pool inventory and
            # the idle host list.
            pool_report = _generate_pool_inventory_message(inventory)
            idle_report = _generate_idle_inventory_message(inventory)
            _send_email(options,
                        'pools-%s.txt' % stamp,
                        'DUT pool inventory %s' % stamp,
                        options.pool_notify,
                        pool_report + '\n\n\n' + idle_report)
    except KeyboardInterrupt:
        pass
    except EnvironmentError as e:
        logging.exception('Unexpected OS error: %s', e)
    except Exception as e:
        logging.exception('Unexpected exception: %s', e)
1155
1156
def get_inventory(afe):
    """Create a lab inventory covering the 24 hours ending now.

    @param afe  AFE object passed through to
                `_LabInventory.create_inventory()`.
    @return The object returned by
            `_LabInventory.create_inventory()`.

    """
    now = int(time.time())
    one_day_ago = now - 24 * 60 * 60
    return _LabInventory.create_inventory(afe, one_day_ago, now)
1161
1162
def get_managed_boards(afe):
    """Return the managed boards of a freshly created inventory.

    @param afe  AFE object passed through to `get_inventory()`.
    @return Whatever the inventory's `get_managed_boards()` method
            returns.

    """
    inventory = get_inventory(afe)
    return inventory.get_managed_boards()
1165
1166
# Run the report only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == '__main__':
    main(sys.argv)
1169