#!/usr/bin/env python
# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Create e-mail reports of the Lab's DUT inventory.

Gathers a list of all DUTs of interest in the Lab, segregated by
board and pool, and determines whether each DUT is working or
broken. Then, send one or more e-mail reports summarizing the
status to e-mail addresses provided on the command line.

usage: lab_inventory.py [ options ] [ board ... ]

Options:
--duration / -d <hours>
    How far back in time to search job history to determine DUT
    status.

--board-notify <address>[,<address>]
    Send the "board status" e-mail to all the specified e-mail
    addresses.

--pool-notify <address>[,<address>]
    Send the "pool status" e-mail to all the specified e-mail
    addresses.

--recommend <number>
    When generating the "board status" e-mail, include a list of
    <number> specific DUTs to be recommended for repair.

--logdir <directory>
    Log progress and actions in a file under this directory. Text
    of any e-mail sent will also be logged in a timestamped file in
    this directory.

--debug
    Suppress all logging and sending e-mail. Instead, write the
    output that would be generated onto stdout.

<board> arguments:
    With no arguments, gathers the status for all boards in the lab.
    With one or more named boards on the command line, restricts
    reporting to just those boards.

"""


import argparse
import logging
import logging.handlers
import os
import re
import sys
import time

import common
from autotest_lib.client.bin import utils
from autotest_lib.client.common_lib import time_utils
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.server.hosts import servo_host
from autotest_lib.server.lib import status_history
from autotest_lib.site_utils import gmail_lib
from autotest_lib.site_utils.suite_scheduler import constants


CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS
SPARE_POOL = constants.Pools.SPARE_POOL
MANAGED_POOLS = constants.Pools.MANAGED_POOLS

# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from
#     monitoring by this script. Currently, we're excluding any
#     'adb' host, because we're not ready to monitor Android or
#     Brillo hosts.

_EXCLUDED_LABELS = set(['adb'])

# _DEFAULT_DURATION:
#     Default value used for the --duration command line option.
#     Specifies how far back in time to search in order to determine
#     DUT status.

_DEFAULT_DURATION = 24

# _LOGDIR:
#     Relative path used in the calculation of the default setting
#     for the --logdir option. The full path is relative to
#     the root of the autotest directory, as determined from
#     sys.argv[0].
# _LOGFILE:
#     Basename of a file to which general log information will be
#     written.
# _LOG_FORMAT:
#     Format string for log messages.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

# Pattern describing location-based host names in the Chrome OS test
# labs. Each DUT hostname designates the DUT's location:
#   * A lab (room) that's physically separated from other labs
#     (i.e. there's a door).
#   * A row (or aisle) of DUTs within the lab.
#   * A vertical rack of shelves on the row.
#   * A specific host on one shelf of the rack.

_HOSTNAME_PATTERN = re.compile(
        r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

# Default entry for managed pools.

_MANAGED_POOL_DEFAULT = 'all_pools'


class _PoolCounts(object):
    """Maintains a set of `HostJobHistory` objects for a pool.

    The collected history objects are nominally all part of a single
    scheduling pool of DUTs. The collection maintains a list of
    working DUTs, a list of broken DUTs, and a list of all DUTs.

    Performance note: Certain methods in this class are potentially
    expensive:
      * `get_working()`
      * `get_working_list()`
      * `get_broken()`
      * `get_broken_list()`
      * `get_idle()`
      * `get_idle_list()`
    The first time any one of these methods is called, it causes
    multiple RPC calls with a relatively expensive set of database
    queries. However, the results of the queries are cached in the
    individual `HostJobHistory` objects, so only the first call
    actually pays the full cost.

    Additionally, `get_working_list()`, `get_broken_list()` and
    `get_idle_list()` cache their return values to avoid recalculating
    lists at every call; this caching is separate from the caching of RPC
    results described above.

    This class is deliberately constructed to delay the RPC cost
    until the accessor methods are called (rather than to query in
    `record_host()`) so that it's possible to construct a complete
    `_LabInventory` without making the expensive queries at creation
    time. `_populate_board_counts()`, below, assumes this behavior.

    """

    def __init__(self):
        self._histories = []
        # Cached filter results; `None` means "not yet calculated".
        self._working_list = None
        self._broken_list = None
        self._idle_list = None


    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        Adding a host invalidates any cached working/broken/idle lists.

        @param host_history The `HostJobHistory` object to be
                            remembered.

        """
        self._working_list = None
        self._broken_list = None
        self._idle_list = None
        self._histories.append(host_history)


    def get_working_list(self):
        """Return a list of all working DUTs in the pool.

        Filter `self._histories` for histories where the last
        diagnosis is `WORKING`.

        Cache the result so that we only calculate it once.

        @return A list of HostJobHistory objects.

        """
        if self._working_list is None:
            self._working_list = [
                    h for h in self._histories
                    if h.last_diagnosis()[0] == status_history.WORKING]
        return self._working_list


    def get_working(self):
        """Return the number of working DUTs in the pool."""
        return len(self.get_working_list())


    def get_broken_list(self):
        """Return a list of all broken DUTs in the pool.

        Filter `self._histories` for histories where the last
        diagnosis is `BROKEN`.

        Cache the result so that we only calculate it once.

        @return A list of HostJobHistory objects.

        """
        if self._broken_list is None:
            self._broken_list = [
                    h for h in self._histories
                    if h.last_diagnosis()[0] == status_history.BROKEN]
        return self._broken_list


    def get_broken(self):
        """Return the number of broken DUTs in the pool."""
        return len(self.get_broken_list())


    def get_idle_list(self):
        """Return a list of all idle DUTs in the pool.

        Filter `self._histories` for histories where the last
        diagnosis is `UNUSED` or `UNKNOWN`.

        Cache the result so that we only calculate it once.

        @return A list of HostJobHistory objects.

        """
        if self._idle_list is None:
            idle_states = (status_history.UNUSED, status_history.UNKNOWN)
            self._idle_list = [
                    h for h in self._histories
                    if h.last_diagnosis()[0] in idle_states]
        return self._idle_list


    def get_idle(self):
        """Return the number of idle DUTs in the pool."""
        return len(self.get_idle_list())


    def get_total(self):
        """Return the total number of DUTs in the pool."""
        return len(self._histories)


class _BoardCounts(object):
    """Maintains a set of `HostJobHistory` objects for a board.

    The collected history objects are nominally all of the same
    board. The collection maintains a count of working DUTs, a
    count of broken DUTs, and a total count. The counts can be
    obtained either for a single pool, or as a total across all
    pools.

    DUTs in the collection must be assigned to one of the pools
    in `_MANAGED_POOLS`.

    The `get_working()` and `get_broken()` methods rely on the
    methods of the same name in _PoolCounts, so the performance
    note in _PoolCounts applies here as well.

    """

    def __init__(self):
        self._pools = {
            pool: _PoolCounts() for pool in MANAGED_POOLS
        }


    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        The host's pool must be one of `MANAGED_POOLS`; any other
        pool raises `KeyError`.

        @param host_history The `HostJobHistory` object to be
                            remembered.

        """
        pool = host_history.host_pool
        self._pools[pool].record_host(host_history)


    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               _PoolCount object.
        @param pool            The pool to be counted. If `None`,
                               return the total across all pools.

        """
        if pool is None:
            return sum(get_pool_count(counts)
                       for counts in self._pools.values())
        else:
            return get_pool_count(self._pools[pool])


    def _list_pool(self, get_pool_list, pool=None):
        """Internal helper to list hosts in a given pool.

        @param get_pool_list  Function to return a list of histories
                              from a _PoolCount object.
        @param pool           The pool to be listed. If `None`,
                              return the combined list for all pools.

        @return A list of HostJobHistory objects.

        """
        if pool is None:
            histories = []
            for counts in self._pools.values():
                histories.extend(get_pool_list(counts))
            return histories
        else:
            return get_pool_list(self._pools[pool])


    def get_working_list(self, pool=None):
        """Return a list of all working DUTs for the board.

        Go through all HostJobHistory objects in the board's pools,
        selecting the ones where the last diagnosis is `WORKING`.

        @param pool The pool to be listed. If `None`, return the
                    list across all pools.

        @return A list of HostJobHistory objects.

        """
        return self._list_pool(_PoolCounts.get_working_list, pool)


    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted. If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(_PoolCounts.get_working, pool)


    def get_broken_list(self, pool=None):
        """Return a list of all broken DUTs for the board.

        Go through all HostJobHistory objects in the board's pools,
        selecting the ones where the last diagnosis is `BROKEN`.

        @param pool The pool to be listed. If `None`, return the
                    list across all pools.

        @return A list of HostJobHistory objects.

        """
        return self._list_pool(_PoolCounts.get_broken_list, pool)


    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted. If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected pool(s).
        """
        return self._count_pool(_PoolCounts.get_broken, pool)


    def get_idle_list(self, pool=None):
        """Return a list of all idle DUTs for the board.

        Go through all HostJobHistory objects in the board's pools,
        selecting the ones where the last diagnosis is `UNUSED` or
        `UNKNOWN`.

        @param pool: The pool to be listed. If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.

        """
        return self._list_pool(_PoolCounts.get_idle_list, pool)


    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool: The pool to be counted. If `None`, return the total
                     across all pools.

        @return The total number of idle DUTs in the selected pool(s).
        """
        return self._count_pool(_PoolCounts.get_idle, pool)


    def get_spares_buffer(self):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool. This number may be negative, indicating a shortfall
        in the critical pools.

        @return The total number DUTs in the spares pool, less the total
                number of broken DUTs in all pools.
        """
        return self.get_total(SPARE_POOL) - self.get_broken()


    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted. If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(_PoolCounts.get_total, pool)


class _LabInventory(dict):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    The collection is indexed by board. Indexing returns the
    _BoardCounts object associated with the board.

    The collection is also iterable. The iterator returns all the
    boards in the inventory, in unspecified order.

    """

    @staticmethod
    def _eligible_host(afehost):
        """Return whether this host is eligible for monitoring.

        Hosts with any label that's in `_EXCLUDED_LABELS` aren't
        eligible.

        @param afehost  The host to be tested for eligibility.
        """
        return _EXCLUDED_LABELS.isdisjoint(afehost.labels)


    @classmethod
    def create_inventory(cls, afe, start_time, end_time, boardlist=None):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects
        for all DUTs in the `MANAGED_POOLS` list. If `boardlist`
        is supplied, the inventory will be restricted to only the
        given boards.

        @param afe         AFE object for constructing the
                           `HostJobHistory` objects.
        @param start_time  Start time for the `HostJobHistory`
                           objects.
        @param end_time    End time for the `HostJobHistory`
                           objects.
        @param boardlist   List of boards to include. If empty or
                           `None`, include all available boards.
        @return A `_LabInventory` object for the specified boards.

        """
        label_list = [constants.Labels.POOL_PREFIX + l
                      for l in MANAGED_POOLS]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if boardlist:
            # We're deliberately not checking host eligibility in this
            # code path. This is a debug path, not used in production;
            # it may be useful to include ineligible hosts here.
            boardhosts = []
            for board in boardlist:
                board_label = constants.Labels.BOARD_PREFIX + board
                boardhosts.extend(
                        h for h in afehosts if board_label in h.labels)
            afehosts = boardhosts
        else:
            afehosts = [h for h in afehosts if cls._eligible_host(h)]
        return cls([status_history.HostJobHistory(afe, host,
                                                  start_time, end_time)
                    for host in afehosts])


    def __init__(self, histories):
        # N.B. The query that finds our hosts is restricted to those
        # with a valid pool: label, but doesn't check for a valid
        # board: label. In some (insufficiently) rare cases, the
        # AFE hosts table has been known to (incorrectly) have DUTs
        # with a pool: but no board: label. We explicitly exclude
        # those here.
        histories = [h for h in histories
                     if h.host_board is not None]
        boards = set(h.host_board for h in histories)
        initval = {board: _BoardCounts() for board in boards}
        super(_LabInventory, self).__init__(initval)
        self._dut_count = len(histories)
        # Cache of managed board sets, keyed by pool name.
        self._managed_boards = {}
        for h in histories:
            self[h.host_board].record_host(h)


    def get_managed_boards(self, pool=_MANAGED_POOL_DEFAULT):
        """Return the set of "managed" boards.

        Operationally, saying a board is "managed" means that the
        board will be included in the "board" and "repair
        recommendations" reports. That is, if there are failures in
        the board's inventory then lab techs will be asked to fix
        them without a separate ticket.

        For purposes of implementation, a board is "managed" if it
        has DUTs in both the spare and a non-spare (i.e. critical)
        pool.

        The result is calculated once per pool and cached.

        @param pool: The specified pool for managed boards.
        @return A set of all the boards that have both spare and
                non-spare pools, unless the pool is specified,
                then the set of boards in that pool.
        """
        if pool not in self._managed_boards:
            managed = set()
            for board, counts in self.items():
                # Get the counts for all pools, otherwise get it for the
                # specified pool.
                if pool == _MANAGED_POOL_DEFAULT:
                    spares = counts.get_total(SPARE_POOL)
                    total = counts.get_total()
                    if spares != 0 and spares != total:
                        managed.add(board)
                elif counts.get_total(pool) != 0:
                    managed.add(board)
            self._managed_boards[pool] = managed
        return self._managed_boards[pool]


    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count


    def get_num_boards(self):
        """Return the total number of boards in the inventory."""
        return len(self)


def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    Take the given list of `HostJobHistory` objects, separate it
    into a list per lab, and sort each lab's list by location. The
    order of sorting within a lab is
      * By row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Return a list of the sorted lists.

    DUTs whose hostname doesn't match `_HOSTNAME_PATTERN` are
    omitted from the result.

    Implementation note: host locations are sorted by converting
    each location into a base 100 number. If row, rack or
    host numbers exceed the range [0..99], then sorting will
    break down.

    @return A list of sorted lists of DUTs.

    """
    BASE = 100
    lab_lists = {}
    for history in inventory_list:
        location = _HOSTNAME_PATTERN.match(history.host.hostname)
        if location:
            lab = location.group(1)
            key = 0
            for idx in location.group(2, 3, 4):
                key = BASE * key + int(idx)
            lab_lists.setdefault(lab, []).append((key, history))
    return_list = []
    for dut_list in lab_lists.values():
        dut_list.sort(key=lambda t: t[0])
        return_list.append([t[1] for t in dut_list])
    return return_list


def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    `buffer_counts` is a dictionary mapping board names to the
    size of the board's spares buffer.

    `repair_list` is a list of DUTs to be repaired.

    This function calculates the new set of buffer counts that would
    result from the proposed repairs, and scores the new set using
    two numbers:
      * Worst case buffer count for any board (higher is better).
        This is the more significant number for comparison.
      * Number of boards at the worst case (lower is better). This
        is the less significant number.

    Implementation note: The score could fail to reflect the
    intended criteria if there are more than 1000 boards in the
    inventory.

    @param buffer_counts  A dictionary mapping boards to buffer counts.
                          Must not be empty.
    @param repair_list    A list of DUTs (`HostJobHistory` objects)
                          to be repaired.
    @return A numeric score.

    """
    # Go through `buffer_counts`, and create a list of new counts
    # that records the buffer count for each board after repair.
    # The new list of counts discards the board names, as they don't
    # contribute to the final score.
    _NBOARDS = 1000
    repair_inventory = _LabInventory(repair_list)
    new_counts = []
    for b, c in buffer_counts.items():
        if b in repair_inventory:
            newcount = repair_inventory[b].get_total()
        else:
            newcount = 0
        new_counts.append(c + newcount)
    # Find the worst available spares count, and count how many
    # times that worst case occurs.
    worst_count = min(new_counts)
    num_worst = new_counts.count(worst_count)
    # Return the calculated score
    return _NBOARDS * worst_count - num_worst


def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be
    repaired. The list of DUTs is selected based on these
    criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical
        proximity.
      * DUTs for boards with a low spares buffer are more important
        than DUTs with larger buffers.

    The algorithm used will guarantee that at least one DUT from a
    board with the smallest spares buffer will be recommended. If
    the worst spares buffer number is shared by more than one board,
    the algorithm will tend to prefer repair sets that include more
    of those boards over sets that cover fewer boards.

    If there are no broken DUTs to recommend, the message says so
    rather than listing any hosts.

    @param inventory      Inventory for generating recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.

    """
    logging.debug('Creating DUT repair recommendations')
    board_buffer_counts = {}
    broken_list = []
    for board in inventory.get_managed_boards():
        logging.debug('Listing failed DUTs for %s', board)
        counts = inventory[board]
        if counts.get_broken() != 0:
            board_buffer_counts[board] = counts.get_spares_buffer()
            broken_list.extend(counts.get_broken_list())
    # N.B. The logic inside this loop may seem complicated, but
    # simplification is hard:
    #   * Calculating an initial recommendation outside of
    #     the loop likely would make things more complicated,
    #     not less.
    #   * It's necessary to calculate an initial lab slice once per
    #     lab _before_ the while loop, in case the number of broken
    #     DUTs in a lab is less than `num_recommend`.
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        start = 0
        end = num_recommend
        lab_slice = lab_duts[start:end]
        lab_score = _score_repair_set(board_buffer_counts,
                                      lab_slice)
        # Slide a window of `num_recommend` DUTs along the lab's
        # location-sorted list, keeping the best-scoring window.
        while end < len(lab_duts):
            start += 1
            end += 1
            new_slice = lab_duts[start:end]
            new_score = _score_repair_set(board_buffer_counts,
                                          new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    # N.B. The trailing space here is mandatory: Without it, Gmail
    # will parse the URL wrong. Don't ask. If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ('Hostname', 'Board', 'Servo?', 'Logs URL')]
    if recommendation:
        for h in recommendation:
            servo_name = servo_host.make_servo_hostname(h.host.hostname)
            servo_present = utils.host_is_in_lab_zone(servo_name)
            _, event = h.last_diagnosis()
            line = line_fmt % (
                    h.host.hostname, h.host_board,
                    'Yes' if servo_present else 'No', event.job_url)
            message.append(line)
    else:
        # `recommendation` is still None when no lab had broken DUTs.
        message.append('(No DUTs to repair)')
    return '\n'.join(message)


def _generate_board_inventory_message(inventory):
    """Generate the "board inventory" e-mail message.

    The board inventory is a list by board summarizing the number
    of working and broken DUTs, and the total shortfall or surplus
    of working devices relative to the minimum critical pool
    requirement.

    The report omits boards with no DUTs in the spare pool or with
    no DUTs in a critical pool.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating board inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_boards = 0
    ntotal_boards = 0
    summaries = []
    for board in inventory.get_managed_boards():
        counts = inventory[board]
        logging.debug('Counting %2d DUTS for board %s',
                      counts.get_total(), board)
        # Summary elements laid out in the same order as the text
        # headers:
        #     Board Avail   Bad  Idle  Good Spare Total
        #      e[0]  e[1]  e[2]  e[3]  e[4]  e[5]  e[6]
        element = (board,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(SPARE_POOL),
                   counts.get_total())
        # Only boards with broken DUTs are listed individually.
        if element[2]:
            summaries.append(element)
            nbroken_boards += 1
        ntotal_boards += 1
        nbroken += element[2]
        nidle += element[3]
        nworking += element[4]
    ntotal = nworking + nbroken + nidle
    # Worst spares buffer first; ties broken by most broken DUTs.
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        working_percent = 100 - broken_percent - idle_percent
    else:
        # No managed DUTs at all; avoid dividing by zero.
        broken_percent = idle_percent = working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Boards with failures: %d' % nbroken_boards,
               'Boards in inventory: %d' % ntotal_boards,
               '', '',
               'Full board inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % (
                   'Board', 'Avail', 'Bad', 'Idle', 'Good',
                   'Spare', 'Total')]
    message.extend(
            ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)


_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: All boards shown below are at
less than full strength, please take action to resolve the issues.
Once you're satisified that failures won't recur, failed DUTs can
be replaced with spares by running `balance_pool`. Detailed
instructions can be found here:
 http://go/cros-manage-duts
'''


def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    The pool inventory is a list by pool and board summarizing the
    number of working and broken DUTs in the pool. Only boards with
    at least one broken or idle DUT are included in the list.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating pool inventory')
    message = [_POOL_INVENTORY_HEADER]
    newline = ''
    for pool in CRITICAL_POOLS:
        message.append(
            '%sStatus for pool:%s, by board:' % (newline, pool))
        message.append(
            '%-20s %5s %5s %5s %5s' % (
                'Board', 'Bad', 'Idle', 'Good', 'Total'))
        data_list = []
        for board, counts in inventory.items():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), board, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # boards at full strength are not reported
            if broken == 0 and idle == 0:
                continue
            working = counts.get_working(pool)
            total = counts.get_total(pool)
            data_list.append((board, broken, idle, working, total))
        if data_list:
            # Boards with the most broken DUTs come first.
            data_list = sorted(data_list, key=lambda d: -d[1])
            message.extend(
                ['%-20s %5d %5d %5d %5d' % t for t in data_list])
        else:
            message.append('(All boards at full strength)')
        # Every pool section after the first is preceded by a blank line.
        newline = '\n'
    return '\n'.join(message)


_IDLE_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: The hosts shown below haven't
run any jobs for at least 24 hours. Please check each host; locked
hosts should normally be unlocked; stuck jobs should normally be
aborted.
'''


def _generate_idle_inventory_message(inventory):
    """Generate the "idle inventory" e-mail message.

    The idle inventory is a host list with corresponding pool and board,
    where the hosts are idle (`UNKNOWN` or `UNUSED`).

    N.B. For sample output text format as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating idle inventory')
    message = [_IDLE_INVENTORY_HEADER]
    message.append('Idle Host List:')
    message.append('%-30s %-20s %s' % ('Hostname', 'Board', 'Pool'))
    data_list = []
    for pool in MANAGED_POOLS:
        for board, counts in inventory.items():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), board, pool)
            data_list.extend([(dut.host.hostname, board, pool)
                              for dut in counts.get_idle_list(pool)])
    if data_list:
        message.extend(['%-30s %-20s %s' % t for t in data_list])
    else:
        message.append('(No idle DUTs)')
    return '\n'.join(message)


def _send_email(arguments, tag, subject, recipients, body):
    """Send an inventory e-mail message.

    The message is logged in the selected log directory using `tag`
    for the file name.

    If the --debug option was requested, the message is neither
    logged nor sent, but merely printed on stdout.

    Failures to write the log file or to send the e-mail are logged
    as errors; they don't raise.

    @param arguments   Parsed command-line options.
    @param tag         Tag identifying the inventory for logging
                       purposes.
    @param subject     E-mail Subject: header line.
    @param recipients  E-mail addresses for the To: header line.
    @param body        E-mail message body.

    """
    logging.debug('Generating email: "%s"', subject)
    all_recipients = ', '.join(recipients)
    report_body = '\n'.join([
            'To: %s' % all_recipients,
            'Subject: %s' % subject,
            '', body, ''])
    if arguments.debug:
        # Parenthesized single argument: same output under Python 2's
        # print statement and Python 3's print function.
        print(report_body)
    else:
        filename = os.path.join(arguments.logdir, tag)
        try:
            # `with` guarantees the file is closed even if the write
            # fails part way through.
            with open(filename, 'w') as report_file:
                report_file.write(report_body)
        except EnvironmentError as e:
            logging.error('Failed to write %s: %s', filename, e)
        try:
            gmail_lib.send_email(all_recipients, subject, body)
        except Exception as e:
            # Deliberately broad: a failed send must not abort the
            # remaining reports.
            logging.error('Failed to send e-mail to %s: %s',
                          all_recipients, e)


def _separate_email_addresses(address_list):
    """Parse a list of comma-separated lists of e-mail addresses.

    @param address_list  A list of strings containing comma
                         separate e-mail addresses.
    @return A list of the individual e-mail addresses, stripped of
            surrounding whitespace.

    """
    newlist = []
    for arg in address_list:
        newlist.extend([email.strip() for email in arg.split(',')])
    return newlist


def _verify_arguments(arguments):
    """Validate command-line arguments.

    Join comma separated e-mail addresses for `--board-notify` and
    `--pool-notify` in separate option arguments into a single list.

    For non-debug uses, require that notification be requested for
    at least one report. For debug, if notification isn't specified,
    treat it as "run all the reports."

    The return value indicates success or failure; in the case of
    failure, we also write an error message to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.

    """
    arguments.board_notify = _separate_email_addresses(
            arguments.board_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)
    if not arguments.board_notify and not arguments.pool_notify:
        if not arguments.debug:
            sys.stderr.write('Must specify at least one of '
                             '--board-notify or --pool-notify\n')
            return False
        else:
            # We want to run all the reports. An empty notify list
            # will cause a report to be skipped, so make sure the
            # lists are non-empty.
            arguments.board_notify = ['']
            arguments.pool_notify = ['']
    return True


def _get_logdir(script):
    """Get the default directory for the `--logdir` option.

    The default log directory is based on the parent directory
    containing this script.

    @param script  Path to this script file.
    @return A path to a directory.

    """
    basedir = os.path.dirname(os.path.abspath(script))
    basedir = os.path.dirname(basedir)
    return os.path.join(basedir, _LOGDIR)


def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the ArgumentParser
    parse_args() method.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or `None`
            if the arguments fail `_verify_arguments()`.

    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')
    parser.add_argument('-d', '--duration', type=int,
                        default=_DEFAULT_DURATION, metavar='HOURS',
                        help='number of hours back to search for status'
                             ' (default: %d)' % _DEFAULT_DURATION)
    parser.add_argument('--board-notify', action='append',
                        default=[], metavar='ADDRESS',
                        help='Generate board inventory message, '
                        'and send it to the given e-mail address(es)')
    parser.add_argument('--pool-notify', action='append',
                        default=[], metavar='ADDRESS',
                        help='Generate pool inventory message, '
                        'and send it to the given address(es)')
    parser.add_argument('-r', '--recommend', type=int, default=None,
                        help=('Specify how many DUTs should be '
                              'recommended for repair (default: no '
                              'recommendation)'))
    parser.add_argument('--debug', action='store_true',
                        help='Print e-mail messages on stdout '
                             'without sending them.')
    parser.add_argument('--logdir', default=_get_logdir(argv[0]),
                        help='Directory where logs will be written.')
    parser.add_argument('boardnames', nargs='*',
                        metavar='BOARD',
                        help='names of boards to report on '
                             '(default: all boards)')
    arguments = parser.parse_args(argv[1:])
    if not _verify_arguments(arguments):
        return None
    return arguments


def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was
    provided on the command line. Without the option, we log all
    messages at DEBUG level or above, and write them to a file in
    the directory specified by the `--logdir` option. With the
    option, we write log messages to stdout; messages below INFO
    level are discarded.

    The log file is configured to rotate once a week on Friday
    evening, preserving ~3 months worth of history.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`

    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        if not os.path.exists(arguments.logdir):
            # makedirs, not mkdir: the parent of the default log
            # directory may not exist yet either.
            os.makedirs(arguments.logdir)
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        handler = logging.handlers.TimedRotatingFileHandler(
                logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross. Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*. That gives us an
    # extra logging handler that we don't want. So, clear out all
    # the handlers here.
    #
    # Iterate over a copy: removeHandler() mutates the underlying
    # list, and removing while iterating it would skip handlers.
    for h in list(root_logger.handlers):
        root_logger.removeHandler(h)
    root_logger.addHandler(handler)


def _populate_board_counts(inventory):
    """Gather board counts while providing interactive feedback.

    Gathering the status of all individual DUTs in the lab can take
    considerable time (~30 minutes at the time of this writing).

    Normally, we pay that cost by querying as we go. However, with
    the `--print` option, a human being may be watching the
    progress. So, we force the first (expensive) queries to happen
    up front, and provide a small ASCII progress bar to give an
    indicator of how many boards have been processed.

    @param inventory  _LabInventory object with the inventory to
                      be gathered.

    """
    n = 0
    total_broken = 0
    for counts in inventory.values():
        n += 1
        if n % 10 == 5:
            c = '+'
        elif n % 10 == 0:
            c = '%d' % ((n / 10) % 10)
        else:
            c = '.'
1087 sys.stdout.write(c) 1088 sys.stdout.flush() 1089 # This next call is where all the time goes - it forces all 1090 # of a board's HostJobHistory objects to query the database 1091 # and cache their results. 1092 total_broken += counts.get_broken() 1093 sys.stdout.write('\n') 1094 sys.stdout.write('Found %d broken DUTs\n' % total_broken) 1095 1096 1097def main(argv): 1098 """Standard main routine. 1099 @param argv Command line arguments including `sys.argv[0]`. 1100 """ 1101 arguments = _parse_command(argv) 1102 if not arguments: 1103 sys.exit(1) 1104 _configure_logging(arguments) 1105 try: 1106 end_time = int(time.time()) 1107 start_time = end_time - arguments.duration * 60 * 60 1108 timestamp = time.strftime('%Y-%m-%d.%H', 1109 time.localtime(end_time)) 1110 logging.debug('Starting lab inventory for %s', timestamp) 1111 if arguments.board_notify: 1112 if arguments.recommend: 1113 logging.debug('Will include repair recommendations') 1114 logging.debug('Will include board inventory') 1115 if arguments.pool_notify: 1116 logging.debug('Will include pool inventory') 1117 1118 afe = frontend_wrappers.RetryingAFE(server=None) 1119 inventory = _LabInventory.create_inventory( 1120 afe, start_time, end_time, arguments.boardnames) 1121 logging.info('Found %d hosts across %d boards', 1122 inventory.get_num_duts(), 1123 inventory.get_num_boards()) 1124 1125 if arguments.debug: 1126 _populate_board_counts(inventory) 1127 1128 if arguments.board_notify: 1129 if arguments.recommend: 1130 recommend_message = _generate_repair_recommendation( 1131 inventory, arguments.recommend) + '\n\n\n' 1132 else: 1133 recommend_message = '' 1134 board_message = _generate_board_inventory_message(inventory) 1135 _send_email(arguments, 1136 'boards-%s.txt' % timestamp, 1137 'DUT board inventory %s' % timestamp, 1138 arguments.board_notify, 1139 recommend_message + board_message) 1140 1141 if arguments.pool_notify: 1142 pool_message = _generate_pool_inventory_message(inventory) 1143 idle_message 
= _generate_idle_inventory_message(inventory) 1144 _send_email(arguments, 1145 'pools-%s.txt' % timestamp, 1146 'DUT pool inventory %s' % timestamp, 1147 arguments.pool_notify, 1148 pool_message + '\n\n\n' + idle_message) 1149 except KeyboardInterrupt: 1150 pass 1151 except EnvironmentError as e: 1152 logging.exception('Unexpected OS error: %s', e) 1153 except Exception as e: 1154 logging.exception('Unexpected exception: %s', e) 1155 1156 1157def get_inventory(afe): 1158 end_time = int(time.time()) 1159 start_time = end_time - 24 * 60 * 60 1160 return _LabInventory.create_inventory(afe, start_time, end_time) 1161 1162 1163def get_managed_boards(afe): 1164 return get_inventory(afe).get_managed_boards() 1165 1166 1167if __name__ == '__main__': 1168 main(sys.argv) 1169