1#!/usr/bin/env python 2# Copyright 2015 The Chromium OS Authors. All rights reserved. 3# Use of this source code is governed by a BSD-style license that can be 4# found in the LICENSE file. 5 6"""Create e-mail reports of the Lab's DUT inventory. 7 8Gathers a list of all DUTs of interest in the Lab, segregated by 9model and pool, and determines whether each DUT is working or 10broken. Then, send one or more e-mail reports summarizing the 11status to e-mail addresses provided on the command line. 12 13usage: lab_inventory.py [ options ] [ model ... ] 14 15Options: 16--duration / -d <hours> 17 How far back in time to search job history to determine DUT 18 status. 19 20--model-notify <address>[,<address>] 21 Send the "model status" e-mail to all the specified e-mail 22 addresses. 23 24--pool-notify <address>[,<address>] 25 Send the "pool status" e-mail to all the specified e-mail 26 addresses. 27 28--recommend <number> 29 When generating the "model status" e-mail, include a list of 30 <number> specific DUTs to be recommended for repair. 31 32--repair-loops 33 Scan the inventory for DUTs stuck in repair loops, and report them 34 via a Monarch presence metric. 35 36--logdir <directory> 37 Log progress and actions in a file under this directory. Text 38 of any e-mail sent will also be logged in a timestamped file in 39 this directory. 40 41--debug 42 Suppress all logging, metrics reporting, and sending e-mail. 43 Instead, write the output that would be generated onto stdout. 44 45<model> arguments: 46 With no arguments, gathers the status for all models in the lab. 47 With one or more named models on the command line, restricts 48 reporting to just those models. 
49 50""" 51 52 53import argparse 54import collections 55import logging 56import logging.handlers 57import os 58import re 59import sys 60import time 61 62import common 63from autotest_lib.client.bin import utils 64from autotest_lib.client.common_lib import time_utils 65from autotest_lib.server import constants 66from autotest_lib.server import site_utils 67from autotest_lib.server.cros.dynamic_suite import frontend_wrappers 68from autotest_lib.server.hosts import servo_host 69from autotest_lib.server.lib import status_history 70from autotest_lib.site_utils import gmail_lib 71from chromite.lib import metrics 72 73 74CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS 75SPARE_POOL = constants.Pools.SPARE_POOL 76MANAGED_POOLS = constants.Pools.MANAGED_POOLS 77 78# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from 79# monitoring by this script. Currently, we're excluding these: 80# + 'adb' - We're not ready to monitor Android or Brillo hosts. 81# + 'board:guado_moblab' - These are maintained by a separate 82# process that doesn't use this script. 83 84_EXCLUDED_LABELS = {'adb', 'board:guado_moblab'} 85 86# _DEFAULT_DURATION: 87# Default value used for the --duration command line option. 88# Specifies how far back in time to search in order to determine 89# DUT status. 90 91_DEFAULT_DURATION = 24 92 93# _LOGDIR: 94# Relative path used in the calculation of the default setting for 95# the --logdir option. The full path is relative to the root of the 96# autotest directory, as determined from sys.argv[0]. 97# _LOGFILE: 98# Basename of a file to which general log information will be 99# written. 100# _LOG_FORMAT: 101# Format string for log messages. 102 103_LOGDIR = os.path.join('logs', 'dut-data') 104_LOGFILE = 'lab-inventory.log' 105_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s' 106 107# Pattern describing location-based host names in the Chrome OS test 108# labs. 
Each DUT hostname designates the DUT's location: 109# * A lab (room) that's physically separated from other labs 110# (i.e. there's a door). 111# * A row (or aisle) of DUTs within the lab. 112# * A vertical rack of shelves on the row. 113# * A specific host on one shelf of the rack. 114 115_HOSTNAME_PATTERN = re.compile( 116 r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)') 117 118# _REPAIR_LOOP_THRESHOLD: 119# The number of repeated Repair tasks that must be seen to declare 120# that a DUT is stuck in a repair loop. 121 122_REPAIR_LOOP_THRESHOLD = 4 123 124 125class _HostSetInventory(object): 126 """Maintains a set of related `HostJobHistory` objects. 127 128 The collection is segregated into disjoint categories of "working", 129 "broken", and "idle" DUTs. Accessor methods allow finding both the 130 list of DUTs in each category, as well as counts of each category. 131 132 Performance note: Certain methods in this class are potentially 133 expensive: 134 * `get_working()` 135 * `get_working_list()` 136 * `get_broken()` 137 * `get_broken_list()` 138 * `get_idle()` 139 * `get_idle_list()` 140 The first time any one of these methods is called, it causes 141 multiple RPC calls with a relatively expensive set of database 142 queries. However, the results of the queries are cached in the 143 individual `HostJobHistory` objects, so only the first call 144 actually pays the full cost. 145 146 Additionally, `get_working_list()`, `get_broken_list()` and 147 `get_idle_list()` cache their return values to avoid recalculating 148 lists at every call; this caching is separate from the caching of 149 RPC results described above. 150 151 This class is deliberately constructed to delay the RPC cost until 152 the accessor methods are called (rather than to query in 153 `record_host()`) so that it's possible to construct a complete 154 `_LabInventory` without making the expensive queries at creation 155 time. `_populate_model_counts()`, below, assumes this behavior. 
156 157 Current usage of this class is that all DUTs are part of a single 158 scheduling pool of DUTs; however, this class make no assumptions 159 about the actual relationship among the DUTs. 160 """ 161 162 def __init__(self): 163 self._histories = [] 164 self._working_list = None 165 self._broken_list = None 166 self._idle_list = None 167 168 169 def record_host(self, host_history): 170 """Add one `HostJobHistory` object to the collection. 171 172 @param host_history The `HostJobHistory` object to be 173 remembered. 174 175 """ 176 self._working_list = None 177 self._broken_list = None 178 self._idle_list = None 179 self._histories.append(host_history) 180 181 182 def get_working_list(self): 183 """Return a list of all working DUTs in the pool. 184 185 Filter `self._histories` for histories where the last 186 diagnosis is `WORKING`. 187 188 Cache the result so that we only cacluate it once. 189 190 @return A list of HostJobHistory objects. 191 192 """ 193 if self._working_list is None: 194 self._working_list = [h for h in self._histories 195 if h.last_diagnosis()[0] == status_history.WORKING] 196 return self._working_list 197 198 199 def get_working(self): 200 """Return the number of working DUTs in the pool.""" 201 return len(self.get_working_list()) 202 203 204 def get_broken_list(self): 205 """Return a list of all broken DUTs in the pool. 206 207 Filter `self._histories` for histories where the last 208 diagnosis is `BROKEN`. 209 210 Cache the result so that we only cacluate it once. 211 212 @return A list of HostJobHistory objects. 213 214 """ 215 if self._broken_list is None: 216 self._broken_list = [h for h in self._histories 217 if h.last_diagnosis()[0] == status_history.BROKEN] 218 return self._broken_list 219 220 221 def get_broken(self): 222 """Return the number of broken DUTs in the pool.""" 223 return len(self.get_broken_list()) 224 225 226 def get_idle_list(self): 227 """Return a list of all idle DUTs in the pool. 
228 229 Filter `self._histories` for histories where the last 230 diagnosis is `UNUSED` or `UNKNOWN`. 231 232 Cache the result so that we only cacluate it once. 233 234 @return A list of HostJobHistory objects. 235 236 """ 237 idle_statuses = {status_history.UNUSED, status_history.UNKNOWN} 238 if self._idle_list is None: 239 self._idle_list = [h for h in self._histories 240 if h.last_diagnosis()[0] in idle_statuses] 241 return self._idle_list 242 243 244 def get_idle(self): 245 """Return the number of idle DUTs in the pool.""" 246 return len(self.get_idle_list()) 247 248 249 def get_total(self): 250 """Return the total number of DUTs in the pool.""" 251 return len(self._histories) 252 253 254class _PoolSetInventory(object): 255 """Maintains a set of `HostJobHistory`s for a set of pools. 256 257 The collection is segregated into disjoint categories of "working", 258 "broken", and "idle" DUTs. Accessor methods allow finding both the 259 list of DUTs in each category, as well as counts of each category. 260 Accessor queries can be for an individual pool, or against all 261 pools. 262 263 Performance note: This class relies on `_HostSetInventory`. Public 264 methods in this class generally rely on methods of the same name in 265 the underlying class, and so will have the same underlying 266 performance characteristics. 267 """ 268 269 def __init__(self, pools): 270 self._histories_by_pool = { 271 pool: _HostSetInventory() for pool in pools 272 } 273 274 def record_host(self, host_history): 275 """Add one `HostJobHistory` object to the collection. 276 277 @param host_history The `HostJobHistory` object to be 278 remembered. 279 280 """ 281 pool = host_history.host_pool 282 self._histories_by_pool[pool].record_host(host_history) 283 284 285 def _count_pool(self, get_pool_count, pool=None): 286 """Internal helper to count hosts in a given pool. 287 288 The `get_pool_count` parameter is a function to calculate 289 the exact count of interest for the pool. 
290 291 @param get_pool_count Function to return a count from a 292 _PoolCount object. 293 @param pool The pool to be counted. If `None`, 294 return the total across all pools. 295 296 """ 297 if pool is None: 298 return sum([get_pool_count(cached_history) for cached_history in 299 self._histories_by_pool.values()]) 300 else: 301 return get_pool_count(self._histories_by_pool[pool]) 302 303 304 def get_working_list(self): 305 """Return a list of all working DUTs (across all pools). 306 307 Go through all HostJobHistory objects across all pools, selecting the 308 ones where the last diagnosis is `WORKING`. 309 310 @return A list of HostJobHistory objects. 311 312 """ 313 l = [] 314 for p in self._histories_by_pool.values(): 315 l.extend(p.get_working_list()) 316 return l 317 318 319 def get_working(self, pool=None): 320 """Return the number of working DUTs in a pool. 321 322 @param pool The pool to be counted. If `None`, return the 323 total across all pools. 324 325 @return The total number of working DUTs in the selected 326 pool(s). 327 """ 328 return self._count_pool(_HostSetInventory.get_working, pool) 329 330 331 def get_broken_list(self): 332 """Return a list of all broken DUTs (across all pools). 333 334 Go through all HostJobHistory objects in the across all pools, 335 selecting the ones where the last diagnosis is `BROKEN`. 336 337 @return A list of HostJobHistory objects. 338 339 """ 340 l = [] 341 for p in self._histories_by_pool.values(): 342 l.extend(p.get_broken_list()) 343 return l 344 345 346 def get_broken(self, pool=None): 347 """Return the number of broken DUTs in a pool. 348 349 @param pool The pool to be counted. If `None`, return the 350 total across all pools. 351 352 @return The total number of broken DUTs in the selected pool(s). 353 """ 354 return self._count_pool(_HostSetInventory.get_broken, pool) 355 356 357 def get_idle_list(self, pool=None): 358 """Return a list of all idle DUTs in the given pool. 
359 360 Go through all HostJobHistory objects in the given pool, selecting the 361 ones where the last diagnosis is `UNUSED` or `UNKNOWN`. 362 363 @param pool: The pool to be counted. If `None`, return the total list 364 across all pools. 365 366 @return A list of HostJobHistory objects. 367 368 """ 369 if pool is None: 370 l = [] 371 for p in self._histories_by_pool.itervalues(): 372 l.extend(p.get_idle_list()) 373 return l 374 else: 375 return self._histories_by_pool[pool].get_idle_list() 376 377 378 def get_idle(self, pool=None): 379 """Return the number of idle DUTs in a pool. 380 381 @param pool: The pool to be counted. If `None`, return the total 382 across all pools. 383 384 @return The total number of idle DUTs in the selected pool(s). 385 """ 386 return self._count_pool(_HostSetInventory.get_idle, pool) 387 388 389 def get_spares_buffer(self, spare_pool=SPARE_POOL): 390 """Return the the nominal number of working spares. 391 392 Calculates and returns how many working spares there would 393 be in the spares pool if all broken DUTs were in the spares 394 pool. This number may be negative, indicating a shortfall 395 in the critical pools. 396 397 @return The total number DUTs in the spares pool, less the total 398 number of broken DUTs in all pools. 399 """ 400 return self.get_total(spare_pool) - self.get_broken() 401 402 403 def get_total(self, pool=None): 404 """Return the total number of DUTs in a pool. 405 406 @param pool The pool to be counted. If `None`, return the 407 total across all pools. 408 409 @return The total number of DUTs in the selected pool(s). 410 """ 411 return self._count_pool(_HostSetInventory.get_total, pool) 412 413 414def _eligible_host(afehost): 415 """Return whether this host is eligible for monitoring. 416 417 A host is eligible if it has a (unique) 'model' label, it's in 418 exactly one pool, and it has no labels from the 419 `_EXCLUDED_LABELS` set. 420 421 @param afehost The host to be tested for eligibility. 
422 """ 423 # DUTs without an existing, unique 'model' or 'pool' label 424 # aren't meant to exist in the managed inventory; their presence 425 # generally indicates an error in the database. Unfortunately 426 # such errors have been seen to occur from time to time. 427 # 428 # The _LabInventory constructor requires hosts to conform to the 429 # label restrictions, and may fail if they don't. Failing an 430 # inventory run for a single bad entry is the wrong thing, so we 431 # ignore the problem children here, to keep them out of the 432 # inventory. 433 models = [l for l in afehost.labels 434 if l.startswith(constants.Labels.MODEL_PREFIX)] 435 pools = [l for l in afehost.labels 436 if l.startswith(constants.Labels.POOL_PREFIX)] 437 excluded = _EXCLUDED_LABELS.intersection(afehost.labels) 438 return len(models) == 1 and len(pools) == 1 and not excluded 439 440 441class _LabInventory(collections.Mapping): 442 """Collection of `HostJobHistory` objects for the Lab's inventory. 443 444 This is a dict-like collection indexed by model. Indexing returns 445 the _PoolSetInventory object associated with the model. 446 """ 447 448 @classmethod 449 def create_inventory(cls, afe, start_time, end_time, modellist=[]): 450 """Return a Lab inventory with specified parameters. 451 452 By default, gathers inventory from `HostJobHistory` objects for 453 all DUTs in the `MANAGED_POOLS` list. If `modellist` is 454 supplied, the inventory will be restricted to only the given 455 models. 456 457 @param afe AFE object for constructing the 458 `HostJobHistory` objects. 459 @param start_time Start time for the `HostJobHistory` objects. 460 @param end_time End time for the `HostJobHistory` objects. 461 @param modellist List of models to include. If empty, 462 include all available models. 463 @return A `_LabInventory` object for the specified models. 
464 465 """ 466 target_pools = MANAGED_POOLS 467 label_list = [constants.Labels.POOL_PREFIX + l for l in target_pools] 468 afehosts = afe.get_hosts(labels__name__in=label_list) 469 if modellist: 470 # We're deliberately not checking host eligibility in this 471 # code path. This is a debug path, not used in production; 472 # it may be useful to include ineligible hosts here. 473 modelhosts = [] 474 for model in modellist: 475 model_label = constants.Labels.MODEL_PREFIX + model 476 host_list = [h for h in afehosts 477 if model_label in h.labels] 478 modelhosts.extend(host_list) 479 afehosts = modelhosts 480 else: 481 afehosts = [h for h in afehosts if _eligible_host(h)] 482 create = lambda host: ( 483 status_history.HostJobHistory(afe, host, 484 start_time, end_time)) 485 return cls([create(host) for host in afehosts], target_pools) 486 487 488 def __init__(self, histories, pools): 489 models = {h.host_model for h in histories} 490 self._modeldata = {model: _PoolSetInventory(pools) for model in models} 491 self._dut_count = len(histories) 492 for h in histories: 493 self[h.host_model].record_host(h) 494 self._boards = {h.host_board for h in histories} 495 496 497 def __getitem__(self, key): 498 return self._modeldata.__getitem__(key) 499 500 501 def __len__(self): 502 return self._modeldata.__len__() 503 504 505 def __iter__(self): 506 return self._modeldata.__iter__() 507 508 509 def reportable_items(self, spare_pool=SPARE_POOL): 510 """Iterate over all items subject to reporting. 511 512 Yields the contents of `self.iteritems()` filtered to include 513 only reportable models. A model is reportable if it has DUTs in 514 both `spare_pool` and at least one other pool. 515 516 @param spare_pool The spare pool to be tested for reporting. 
517 """ 518 for model, histories in self.iteritems(): 519 spares = histories.get_total(spare_pool) 520 total = histories.get_total() 521 if spares != 0 and spares != total: 522 yield model, histories 523 524 525 def get_num_duts(self): 526 """Return the total number of DUTs in the inventory.""" 527 return self._dut_count 528 529 530 def get_num_models(self): 531 """Return the total number of models in the inventory.""" 532 return len(self) 533 534 535 def get_pool_models(self, pool): 536 """Return all models in `pool`. 537 538 @param pool The pool to be inventoried for models. 539 """ 540 return {m for m, h in self.iteritems() if h.get_total(pool)} 541 542 543 def get_boards(self): 544 return self._boards 545 546 547def _sort_by_location(inventory_list): 548 """Return a list of DUTs, organized by location. 549 550 Take the given list of `HostJobHistory` objects, separate it 551 into a list per lab, and sort each lab's list by location. The 552 order of sorting within a lab is 553 * By row number within the lab, 554 * then by rack number within the row, 555 * then by host shelf number within the rack. 556 557 Return a list of the sorted lists. 558 559 Implementation note: host locations are sorted by converting 560 each location into a base 100 number. If row, rack or 561 host numbers exceed the range [0..99], then sorting will 562 break down. 563 564 @return A list of sorted lists of DUTs. 
565 566 """ 567 BASE = 100 568 lab_lists = {} 569 for history in inventory_list: 570 location = _HOSTNAME_PATTERN.match(history.host.hostname) 571 if location: 572 lab = location.group(1) 573 key = 0 574 for idx in location.group(2, 3, 4): 575 key = BASE * key + int(idx) 576 lab_lists.setdefault(lab, []).append((key, history)) 577 return_list = [] 578 for dut_list in lab_lists.values(): 579 dut_list.sort(key=lambda t: t[0]) 580 return_list.append([t[1] for t in dut_list]) 581 return return_list 582 583 584def _score_repair_set(buffer_counts, repair_list): 585 """Return a numeric score rating a set of DUTs to be repaired. 586 587 `buffer_counts` is a dictionary mapping model names to the size of 588 the model's spares buffer. 589 590 `repair_list` is a list of `HostJobHistory` objects for the DUTs to 591 be repaired. 592 593 This function calculates the new set of buffer counts that would 594 result from the proposed repairs, and scores the new set using two 595 numbers: 596 * Worst case buffer count for any model (higher is better). This 597 is the more significant number for comparison. 598 * Number of models at the worst case (lower is better). This is 599 the less significant number. 600 601 Implementation note: The score could fail to reflect the intended 602 criteria if there are more than 1000 models in the inventory. 603 604 @param spare_counts A dictionary mapping models to buffer counts. 605 @param repair_list A list of `HostJobHistory` objects for the 606 DUTs to be repaired. 607 @return A numeric score. 608 """ 609 # Go through `buffer_counts`, and create a list of new counts 610 # that records the buffer count for each model after repair. 611 # The new list of counts discards the model names, as they don't 612 # contribute to the final score. 
613 _NMODELS = 1000 614 pools = {h.host_pool for h in repair_list} 615 repair_inventory = _LabInventory(repair_list, pools) 616 new_counts = [] 617 for m, c in buffer_counts.iteritems(): 618 if m in repair_inventory: 619 newcount = repair_inventory[m].get_total() 620 else: 621 newcount = 0 622 new_counts.append(c + newcount) 623 # Go through the new list of counts. Find the worst available 624 # spares count, and count how many times that worst case occurs. 625 worst_count = new_counts[0] 626 num_worst = 1 627 for c in new_counts[1:]: 628 if c == worst_count: 629 num_worst += 1 630 elif c < worst_count: 631 worst_count = c 632 num_worst = 1 633 # Return the calculated score 634 return _NMODELS * worst_count - num_worst 635 636 637def _generate_repair_recommendation(inventory, num_recommend): 638 """Return a summary of selected DUTs needing repair. 639 640 Returns a message recommending a list of broken DUTs to be repaired. 641 The list of DUTs is selected based on these criteria: 642 * No more than `num_recommend` DUTs will be listed. 643 * All DUTs must be in the same lab. 644 * DUTs should be selected for some degree of physical proximity. 645 * DUTs for models with a low spares buffer are more important than 646 DUTs with larger buffers. 647 648 The algorithm used will guarantee that at least one DUT from a model 649 with the lowest spares buffer will be recommended. If the worst 650 spares buffer number is shared by more than one model, the algorithm 651 will tend to prefer repair sets that include more of those models 652 over sets that cover fewer models. 653 654 @param inventory `_LabInventory` object from which to generate 655 recommendations. 656 @param num_recommend Number of DUTs to recommend for repair. 
657 658 """ 659 logging.debug('Creating DUT repair recommendations') 660 model_buffer_counts = {} 661 broken_list = [] 662 for model, counts in inventory.reportable_items(): 663 logging.debug('Listing failed DUTs for %s', model) 664 if counts.get_broken() != 0: 665 model_buffer_counts[model] = counts.get_spares_buffer() 666 broken_list.extend(counts.get_broken_list()) 667 # N.B. The logic inside this loop may seem complicated, but 668 # simplification is hard: 669 # * Calculating an initial recommendation outside of 670 # the loop likely would make things more complicated, 671 # not less. 672 # * It's necessary to calculate an initial lab slice once per 673 # lab _before_ the while loop, in case the number of broken 674 # DUTs in a lab is less than `num_recommend`. 675 recommendation = None 676 best_score = None 677 for lab_duts in _sort_by_location(broken_list): 678 start = 0 679 end = num_recommend 680 lab_slice = lab_duts[start : end] 681 lab_score = _score_repair_set(model_buffer_counts, lab_slice) 682 while end < len(lab_duts): 683 start += 1 684 end += 1 685 new_slice = lab_duts[start : end] 686 new_score = _score_repair_set(model_buffer_counts, new_slice) 687 if new_score > lab_score: 688 lab_slice = new_slice 689 lab_score = new_score 690 if recommendation is None or lab_score > best_score: 691 recommendation = lab_slice 692 best_score = lab_score 693 # N.B. The trailing space in `line_fmt` is manadatory: Without it, 694 # Gmail will parse the URL wrong. Don't ask. If you simply _must_ 695 # know more, go try it yourself... 
696 line_fmt = '%-30s %-16s %-6s\n %s ' 697 message = ['Repair recommendations:\n', 698 line_fmt % ( 'Hostname', 'Model', 'Servo?', 'Logs URL')] 699 for h in recommendation: 700 servo_name = servo_host.make_servo_hostname(h.host.hostname) 701 servo_present = utils.host_is_in_lab_zone(servo_name) 702 _, event = h.last_diagnosis() 703 line = line_fmt % ( 704 h.host.hostname, h.host_model, 705 'Yes' if servo_present else 'No', event.job_url) 706 message.append(line) 707 return '\n'.join(message) 708 709 710def _generate_model_inventory_message(inventory): 711 """Generate the "model inventory" e-mail message. 712 713 The model inventory is a list by model summarizing the number of 714 working, broken, and idle DUTs, and the total shortfall or surplus 715 of working devices relative to the minimum critical pool 716 requirement. 717 718 The report omits models with no DUTs in the spare pool or with no 719 DUTs in a critical pool. 720 721 N.B. For sample output text formattted as users can expect to 722 see it in e-mail and log files, refer to the unit tests. 723 724 @param inventory `_LabInventory` object to be reported on. 725 @return String with the inventory message to be sent. 
726 """ 727 logging.debug('Creating model inventory') 728 nworking = 0 729 nbroken = 0 730 nidle = 0 731 nbroken_models = 0 732 ntotal_models = 0 733 summaries = [] 734 column_names = ( 735 'Model', 'Avail', 'Bad', 'Idle', 'Good', 'Spare', 'Total') 736 for model, counts in inventory.reportable_items(): 737 logging.debug('Counting %2d DUTS for model %s', 738 counts.get_total(), model) 739 # Summary elements laid out in the same order as the column 740 # headers: 741 # Model Avail Bad Idle Good Spare Total 742 # e[0] e[1] e[2] e[3] e[4] e[5] e[6] 743 element = (model, 744 counts.get_spares_buffer(), 745 counts.get_broken(), 746 counts.get_idle(), 747 counts.get_working(), 748 counts.get_total(SPARE_POOL), 749 counts.get_total()) 750 if element[2]: 751 summaries.append(element) 752 nbroken_models += 1 753 ntotal_models += 1 754 nbroken += element[2] 755 nidle += element[3] 756 nworking += element[4] 757 ntotal = nworking + nbroken + nidle 758 summaries = sorted(summaries, key=lambda e: (e[1], -e[2])) 759 broken_percent = int(round(100.0 * nbroken / ntotal)) 760 idle_percent = int(round(100.0 * nidle / ntotal)) 761 working_percent = 100 - broken_percent - idle_percent 762 message = ['Summary of DUTs in inventory:', 763 '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'), 764 '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % ( 765 nbroken, broken_percent, 766 nidle, idle_percent, 767 nworking, working_percent, 768 ntotal), 769 '', 770 'Models with failures: %d' % nbroken_models, 771 'Models in inventory: %d' % ntotal_models, 772 '', '', 773 'Full model inventory:\n', 774 '%-22s %5s %5s %5s %5s %5s %5s' % column_names] 775 message.extend( 776 ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries]) 777 return '\n'.join(message) 778 779 780_POOL_INVENTORY_HEADER = '''\ 781Notice to Infrastructure deputies: All models shown below are at 782less than full strength, please take action to resolve the issues. 
783Once you're satisified that failures won't recur, failed DUTs can 784be replaced with spares by running `balance_pool`. Detailed 785instructions can be found here: 786 http://go/cros-manage-duts 787''' 788 789 790def _generate_pool_inventory_message(inventory): 791 """Generate the "pool inventory" e-mail message. 792 793 The pool inventory is a list by pool and model summarizing the 794 number of working and broken DUTs in the pool. Only models with 795 at least one broken DUT are included in the list. 796 797 N.B. For sample output text formattted as users can expect to see it 798 in e-mail and log files, refer to the unit tests. 799 800 @param inventory `_LabInventory` object to be reported on. 801 @return String with the inventory message to be sent. 802 """ 803 logging.debug('Creating pool inventory') 804 message = [_POOL_INVENTORY_HEADER] 805 newline = '' 806 for pool in CRITICAL_POOLS: 807 message.append( 808 '%sStatus for pool:%s, by model:' % (newline, pool)) 809 message.append( 810 '%-20s %5s %5s %5s %5s' % ( 811 'Model', 'Bad', 'Idle', 'Good', 'Total')) 812 data_list = [] 813 for model, counts in inventory.iteritems(): 814 logging.debug('Counting %2d DUTs for %s, %s', 815 counts.get_total(pool), model, pool) 816 broken = counts.get_broken(pool) 817 idle = counts.get_idle(pool) 818 # models at full strength are not reported 819 if not broken and not idle: 820 continue 821 working = counts.get_working(pool) 822 total = counts.get_total(pool) 823 data_list.append((model, broken, idle, working, total)) 824 if data_list: 825 data_list = sorted(data_list, key=lambda d: -d[1]) 826 message.extend( 827 ['%-20s %5d %5d %5d %5d' % t for t in data_list]) 828 else: 829 message.append('(All models at full strength)') 830 newline = '\n' 831 return '\n'.join(message) 832 833 834_IDLE_INVENTORY_HEADER = '''\ 835Notice to Infrastructure deputies: The hosts shown below haven't 836run any jobs for at least 24 hours. 
def _generate_idle_inventory_message(inventory):
    """Generate the "idle inventory" e-mail message.

    The idle inventory is a host list with corresponding pool and model,
    where the hosts are idle (`UNKNOWN` or `UNUSED`).

    N.B. For sample output text format as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating idle inventory')
    message = [_IDLE_INVENTORY_HEADER]
    message.append('Idle Host List:')
    message.append('%-30s %-20s %s' % ('Hostname', 'Model', 'Pool'))
    data_list = []
    # Pools form the outer loop, so the rows come out grouped by pool.
    for pool in MANAGED_POOLS:
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            data_list.extend([(dut.host.hostname, model, pool)
                              for dut in counts.get_idle_list(pool)])
    if data_list:
        message.extend(['%-30s %-20s %s' % t for t in data_list])
    else:
        message.append('(No idle DUTs)')
    return '\n'.join(message)


def _send_email(arguments, tag, subject, recipients, body):
    """Send an inventory e-mail message.

    The message is logged in the selected log directory using `tag` for
    the file name.

    If the --debug option was requested, the message is neither logged
    nor sent, but merely printed on stdout.

    Failures to write the log file or to send the e-mail are logged
    as errors; they are not propagated to the caller.

    @param arguments   Parsed command-line options.
    @param tag         Tag identifying the inventory for logging
                       purposes.
    @param subject     E-mail Subject: header line.
    @param recipients  E-mail addresses for the To: header line.
    @param body        E-mail message body.
    """
    logging.debug('Generating email: "%s"', subject)
    all_recipients = ', '.join(recipients)
    report_body = '\n'.join([
            'To: %s' % all_recipients,
            'Subject: %s' % subject,
            '', body, ''])
    if arguments.debug:
        # Same output as the `print` statement, but valid syntax in
        # both Python 2 and Python 3.
        sys.stdout.write(report_body + '\n')
    else:
        filename = os.path.join(arguments.logdir, tag)
        try:
            # The context manager guarantees the file is closed even
            # if the write fails part way through.
            with open(filename, 'w') as report_file:
                report_file.write(report_body)
        except EnvironmentError as e:
            logging.error('Failed to write %s: %s', filename, e)
        # A failure to log the message must not prevent sending it,
        # so the e-mail is attempted regardless of the outcome above.
        try:
            gmail_lib.send_email(all_recipients, subject, body)
        except Exception as e:
            logging.error('Failed to send e-mail to %s: %s',
                          all_recipients, e)


def _populate_model_counts(inventory):
    """Gather model counts while providing interactive feedback.

    Gathering the status of all individual DUTs in the lab can take
    considerable time (~30 minutes at the time of this writing).
    Normally, we pay that cost by querying as we go.  However, with
    the `--debug` option, we expect a human being to be watching the
    progress in real time.  So, we force the first (expensive) queries
    to happen up front, and provide simple ASCII output on sys.stdout
    to show a progress bar and results.

    @param inventory  `_LabInventory` object from which to gather
                      counts.
    """
    n = 0
    total_broken = 0
    for counts in inventory.itervalues():
        n += 1
        # Progress bar: '.' per model, '+' at every 5th-of-ten, and
        # the tens digit at every multiple of ten.
        if n % 10 == 5:
            c = '+'
        elif n % 10 == 0:
            # Explicit floor division keeps the digit an integer
            # regardless of Python version.
            c = '%d' % ((n // 10) % 10)
        else:
            c = '.'
        sys.stdout.write(c)
        sys.stdout.flush()
        # This next call is where all the time goes - it forces all of a
        # model's `HostJobHistory` objects to query the database and
        # cache their results.
        total_broken += counts.get_broken()
    sys.stdout.write('\n')
    sys.stdout.write('Found %d broken DUTs\n' % total_broken)


def _perform_model_inventory(arguments, inventory, timestamp):
    """Perform the model inventory report.

    The model inventory report consists of the following:
      * A list of DUTs that are recommended to be repaired.  This list
        is optional, and only appears if the `--recommend` option is
        present.
      * A list of all models that have failed DUTs, with counts
        of working, broken, and spare DUTs, among others.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp
                      in logs and email output.
    """
    if arguments.recommend:
        recommend_message = _generate_repair_recommendation(
                inventory, arguments.recommend) + '\n\n\n'
    else:
        recommend_message = ''
    model_message = _generate_model_inventory_message(inventory)
    _send_email(arguments,
                'models-%s.txt' % timestamp,
                'DUT model inventory %s' % timestamp,
                arguments.model_notify,
                recommend_message + model_message)


def _perform_pool_inventory(arguments, inventory, timestamp):
    """Perform the pool inventory report.

    The pool inventory report consists of the following:
      * A list of all critical pools that have failed DUTs, with counts
        of working, broken, and idle DUTs.
      * A list of all idle DUTs by hostname including the model and
        pool.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp in
                      logs and email output.
    """
    pool_message = _generate_pool_inventory_message(inventory)
    idle_message = _generate_idle_inventory_message(inventory)
    _send_email(arguments,
                'pools-%s.txt' % timestamp,
                'DUT pool inventory %s' % timestamp,
                arguments.pool_notify,
                pool_message + '\n\n\n' + idle_message)


def _dut_in_repair_loop(history):
    """Return whether a DUT's history indicates a repair loop.

    A DUT is considered looping if it runs no tests, and no tasks pass
    other than repair tasks.

    @param history  An instance of `status_history.HostJobHistory` to be
                    scanned for a repair loop.  The caller guarantees
                    that this history corresponds to a working DUT.
    @returns  True if the DUT's most recent history indicates a repair
              loop, otherwise False.
    """
    # Our caller passes only histories for working DUTs; that means
    # we've already paid the cost of fetching the diagnosis task, and
    # we know that the task was successful.  The diagnosis task will be
    # one of the tasks we must scan to find a loop, so if the task isn't
    # a repair task, then our history includes a successful non-repair
    # task, and we're not looping.
    #
    # The for loop below is very expensive, because it must fetch the
    # full history, regardless of how many tasks we examine.  At the
    # time of this writing, this check against the diagnosis task
    # reduces the cost of finding loops in the full inventory from hours
    # to minutes.
    if history.last_diagnosis()[1].name != 'Repair':
        return False
    repair_ok_count = 0
    for task in history:
        if not task.is_special:
            # This is a test, so we're not looping.
            return False
        if task.diagnosis == status_history.BROKEN:
            # Failed a repair, so we're not looping.
            return False
        if (task.diagnosis == status_history.WORKING
                and task.name != 'Repair'):
            # Non-repair task succeeded, so we're not looping.
            return False
        # At this point, we have either a failed non-repair task, or
        # a successful repair.
        if task.name == 'Repair':
            repair_ok_count += 1
            if repair_ok_count >= _REPAIR_LOOP_THRESHOLD:
                return True
    # The history ran out before reaching the threshold of successful
    # repairs, so there isn't enough evidence to call it a loop.
    return False


def _perform_repair_loop_report(arguments, inventory):
    """Scan the inventory for DUTs stuck in a repair loop.

    This routine walks through the given inventory looking for DUTs
    where the most recent history shows that the DUT is regularly
    passing repair tasks, but has not run any tests.

    Each looping DUT is logged and reported through the
    'chromeos/autotest/inventory/repair_loops' presence metric.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.  Not used by this function.
    @param inventory  `_LabInventory` object to be reported on.
    """
    loop_presence = metrics.BooleanMetric(
        'chromeos/autotest/inventory/repair_loops',
        'DUTs stuck in repair loops')
    logging.info('Scanning for DUTs in repair loops.')
    for counts in inventory.itervalues():
        for history in counts.get_working_list():
            # Managed DUTs with names that don't match
            # _HOSTNAME_PATTERN shouldn't be possible.  However, we
            # don't want arbitrary strings being attached to the
            # 'dut_hostname' field, so for safety, we exclude all
            # anomalies.
            if not _HOSTNAME_PATTERN.match(history.hostname):
                continue
            if _dut_in_repair_loop(history):
                fields = {'dut_hostname': history.hostname,
                          'model': history.host_model,
                          'pool': history.host_pool}
                logging.info('Looping DUT: %(dut_hostname)s, '
                             'model: %(model)s, pool: %(pool)s',
                             fields)
                loop_presence.set(True, fields=fields)


def _log_startup(arguments, startup_time):
    """Log the start of this inventory run.

    Print various log messages indicating the start of the run.  Return
    a string based on `startup_time` that will be used to identify this
    run in log files and e-mail messages.

    @param arguments     Command-line arguments as returned by
                         `ArgumentParser`.
    @param startup_time  A UNIX timestamp marking the moment when
                         this inventory run began.
    @returns  A timestamp string that will be used to identify this run
              in logs and email output.
    """
    # The timestamp has hour granularity, in local time.
    timestamp = time.strftime('%Y-%m-%d.%H',
                              time.localtime(startup_time))
    logging.debug('Starting lab inventory for %s', timestamp)
    if arguments.model_notify:
        if arguments.recommend:
            logging.debug('Will include repair recommendations')
        logging.debug('Will include model inventory')
    if arguments.pool_notify:
        logging.debug('Will include pool inventory')
    return timestamp


def _create_inventory(arguments, end_time):
    """Create the `_LabInventory` instance to use for reporting.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`; supplies the `duration` (in
                      hours) and the `modelnames` to report on.
    @param end_time   A UNIX timestamp for the end of the time range
                      to be searched in this inventory run.
    @return The newly created `_LabInventory` object.
    """
    # `duration` is in hours; timestamps are in seconds.
    start_time = end_time - arguments.duration * 60 * 60
    afe = frontend_wrappers.RetryingAFE(server=None)
    inventory = _LabInventory.create_inventory(
            afe, start_time, end_time, arguments.modelnames)
    logging.info('Found %d hosts across %d models',
                 inventory.get_num_duts(),
                 inventory.get_num_models())
    return inventory


def _perform_inventory_reports(arguments):
    """Perform all inventory checks requested on the command line.

    Create the initial inventory and run through the inventory reports
    as called for by the parsed command-line arguments.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    """
    startup_time = time.time()
    timestamp = _log_startup(arguments, startup_time)
    inventory = _create_inventory(arguments, startup_time)
    if arguments.debug:
        _populate_model_counts(inventory)
    if arguments.model_notify:
        _perform_model_inventory(arguments, inventory, timestamp)
    if arguments.pool_notify:
        _perform_pool_inventory(arguments, inventory, timestamp)
    if arguments.repair_loops:
        _perform_repair_loop_report(arguments, inventory)


def _separate_email_addresses(address_list):
    """Parse a list of comma-separated lists of e-mail addresses.

    Surrounding whitespace is stripped from each address.  Empty
    entries (e.g. from a trailing comma) are kept as empty strings.

    @param address_list  A list of strings containing comma-separated
                         e-mail addresses.
    @return A list of the individual e-mail addresses.

    """
    newlist = []
    for arg in address_list:
        newlist.extend([email.strip() for email in arg.split(',')])
    return newlist


def _verify_arguments(arguments):
    """Validate command-line arguments.

    Join comma separated e-mail addresses for `--model-notify` and
    `--pool-notify` in separate option arguments into a single list.

    For non-debug uses, require that at least one inventory report be
    requested.  For debug, if a report isn't specified, treat it as "run
    all the reports."

    The return value indicates success or failure; in the case of
    failure, we also write an error message to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`; the notify lists are updated
                      in place.
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.

    """
    arguments.model_notify = _separate_email_addresses(
            arguments.model_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)
    if not any([arguments.model_notify, arguments.pool_notify,
                arguments.repair_loops]):
        if not arguments.debug:
            sys.stderr.write('Must request at least one report via '
                             '--model-notify, --pool-notify, or '
                             '--repair-loops\n')
            return False
        else:
            # We want to run all the e-mail reports.  An empty notify
            # list will cause a report to be skipped, so make sure the
            # lists are non-empty.
            arguments.model_notify = ['']
            arguments.pool_notify = ['']
    return True


def _get_default_logdir(script):
    """Get the default directory for the `--logdir` option.

    The default log directory is based on the parent directory
    containing this script.

    @param script  Path to this script file.
    @return A path to a directory.

    """
    # Go up two levels: first to the script's directory, then to that
    # directory's parent.
    basedir = os.path.dirname(os.path.abspath(script))
    basedir = os.path.dirname(basedir)
    return os.path.join(basedir, _LOGDIR)


def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the ArgumentParser
    parse_args() method.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or None
            if the parsed arguments fail validation.

    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')
    parser.add_argument('-d', '--duration', type=int,
                        default=_DEFAULT_DURATION, metavar='HOURS',
                        help='number of hours back to search for status'
                             ' (default: %d)' % _DEFAULT_DURATION)
    parser.add_argument('--model-notify', action='append',
                        default=[], metavar='ADDRESS',
                        help='Generate model inventory message, '
                        'and send it to the given e-mail address(es)')
    parser.add_argument('--pool-notify', action='append',
                        default=[], metavar='ADDRESS',
                        help='Generate pool inventory message, '
                             'and send it to the given address(es)')
    parser.add_argument('-r', '--recommend', type=int, default=None,
                        help=('Specify how many DUTs should be '
                              'recommended for repair (default: no '
                              'recommendation)'))
    parser.add_argument('--repair-loops', action='store_true',
                        help='Check for devices stuck in repair loops.')
    parser.add_argument('--debug-metrics', action='store_true',
                        help='Include debug information about the metrics '
                             'that would be reported ')
    parser.add_argument('--debug', action='store_true',
                        help='Print e-mail messages on stdout '
                             'without sending them.')
    parser.add_argument('--logdir', default=_get_default_logdir(argv[0]),
                        help='Directory where logs will be written.')
    parser.add_argument('modelnames', nargs='*',
                        metavar='MODEL',
                        help='names of models to report on '
                             '(default: all models)')
    arguments = parser.parse_args(argv[1:])
    if not _verify_arguments(arguments):
        return None
    return arguments


def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was provided on
    the command line.
      * Without the option, we configure the logging to capture all
        potentially relevant events in a log file.  The log file is
        configured to rotate once a week on Friday evening, preserving
        ~3 months worth of history.
      * With the option, we expect stdout to contain other
        human-readable output (including the contents of the e-mail
        messages), so we restrict the output to INFO level.

    For convenience, when `--debug` is on, the logging format has
    no adornments, so that a call like `logging.info(msg)` simply writes
    `msg` to stdout, plus a trailing newline.

    On return, the handler created here is the only handler attached
    to the root logger.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        if not os.path.exists(arguments.logdir):
            os.mkdir(arguments.logdir)
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        handler = logging.handlers.TimedRotatingFileHandler(
                logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a copy:  removeHandler() mutates
    # `root_logger.handlers`, and removing from the list being
    # iterated would skip every other handler.
    for h in list(root_logger.handlers):
        root_logger.removeHandler(h)
    root_logger.addHandler(handler)


def main(argv):
    """Standard main routine.

    Exits with status 1 if the command line fails validation.
    Exceptions raised while producing the reports are logged rather
    than propagated.

    @param argv  Command line arguments, including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)
    try:
        # Metrics are set up for every non-debug run, and for debug
        # runs that explicitly ask for `--debug-metrics`.
        if arguments.debug_metrics or not arguments.debug:
            metrics_file = None if not arguments.debug_metrics else '/dev/null'
            with site_utils.SetupTsMonGlobalState(
                    'repair_loops', debug_file=metrics_file,
                    auto_flush=False):
                _perform_inventory_reports(arguments)
            # auto_flush is off above, so flush explicitly.
            metrics.Flush()
        else:
            _perform_inventory_reports(arguments)
    except KeyboardInterrupt:
        pass
    except EnvironmentError as e:
        logging.exception('Unexpected OS error: %s', e)
    except Exception as e:
        # Top-level boundary: record the failure with its traceback
        # instead of crashing.
        logging.exception('Unexpected exception: %s', e)


def get_inventory(afe):
    """Create a `_LabInventory` covering the most recent 24 hours.

    @param afe  AFE object passed through to
                `_LabInventory.create_inventory()`.
    @return The object returned by `_LabInventory.create_inventory()`.
    """
    end_time = int(time.time())
    start_time = end_time - 24 * 60 * 60
    return _LabInventory.create_inventory(afe, start_time, end_time)


def get_managed_boards(afe):
    """Return the result of `get_boards()` on a fresh 24-hour inventory.

    @param afe  AFE object passed through to `get_inventory()`.
    @return Whatever the inventory's `get_boards()` method returns.
    """
    return get_inventory(afe).get_boards()


if __name__ == '__main__':
    main(sys.argv)