1#!/usr/bin/env python 2# Copyright 2015 The Chromium OS Authors. All rights reserved. 3# Use of this source code is governed by a BSD-style license that can be 4# found in the LICENSE file. 5 6"""Create e-mail reports of the Lab's DUT inventory. 7 8Gathers a list of all DUTs of interest in the Lab, segregated by 9model and pool, and determines whether each DUT is working or 10broken. Then, send one or more e-mail reports summarizing the 11status to e-mail addresses provided on the command line. 12 13usage: lab_inventory.py [ options ] [ model ... ] 14 15Options: 16--duration / -d <hours> 17 How far back in time to search job history to determine DUT 18 status. 19 20--model-notify <address>[,<address>] 21 Send the "model status" e-mail to all the specified e-mail 22 addresses. 23 24--pool-notify <address>[,<address>] 25 Send the "pool status" e-mail to all the specified e-mail 26 addresses. 27 28--recommend <number> 29 When generating the "model status" e-mail, include a list of 30 <number> specific DUTs to be recommended for repair. 31 32--report-untestable 33 Scan the inventory for DUTs that can't test because they're stuck in 34 repair loops, or because the scheduler can't give them work. 35 36--logdir <directory> 37 Log progress and actions in a file under this directory. Text 38 of any e-mail sent will also be logged in a timestamped file in 39 this directory. 40 41--debug 42 Suppress all logging, metrics reporting, and sending e-mail. 43 Instead, write the output that would be generated onto stdout. 44 45<model> arguments: 46 With no arguments, gathers the status for all models in the lab. 47 With one or more named models on the command line, restricts 48 reporting to just those models. 
49""" 50 51 52import argparse 53import collections 54import logging 55import logging.handlers 56import os 57import re 58import sys 59import time 60 61import common 62from autotest_lib.client.bin import utils 63from autotest_lib.client.common_lib import time_utils 64from autotest_lib.frontend.afe.json_rpc import proxy 65from autotest_lib.server import constants 66from autotest_lib.server import site_utils 67from autotest_lib.server.cros.dynamic_suite import frontend_wrappers 68from autotest_lib.server.hosts import servo_host 69from autotest_lib.server.lib import status_history 70from autotest_lib.site_utils import gmail_lib 71from chromite.lib import metrics 72 73 74CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS 75SPARE_POOL = constants.Pools.SPARE_POOL 76MANAGED_POOLS = constants.Pools.MANAGED_POOLS 77 78# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from 79# monitoring by this script. Currently, we're excluding these: 80# + 'adb' - We're not ready to monitor Android or Brillo hosts. 81# + 'board:guado_moblab' - These are maintained by a separate 82# process that doesn't use this script. 83# + 'board:veyron_rialto' due to crbug.com/854404 84 85_EXCLUDED_LABELS = {'adb', 'board:guado_moblab', 86 'board:veyron_rialto'} 87 88# _DEFAULT_DURATION: 89# Default value used for the --duration command line option. 90# Specifies how far back in time to search in order to determine 91# DUT status. 92 93_DEFAULT_DURATION = 24 94 95# _LOGDIR: 96# Relative path used in the calculation of the default setting for 97# the --logdir option. The full path is relative to the root of the 98# autotest directory, as determined from sys.argv[0]. 99# _LOGFILE: 100# Basename of a file to which general log information will be 101# written. 102# _LOG_FORMAT: 103# Format string for log messages. 
_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

# Pattern describing location-based host names in the Chrome OS test
# labs.  Each DUT hostname designates the DUT's location:
#   * A lab (room) that's physically separated from other labs
#     (i.e. there's a door).
#   * A row (or aisle) of DUTs within the lab.
#   * A vertical rack of shelves on the row.
#   * A specific host on one shelf of the rack.

_HOSTNAME_PATTERN = re.compile(
        r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

# _REPAIR_LOOP_THRESHOLD:
#   The number of repeated Repair tasks that must be seen to declare
#   that a DUT is stuck in a repair loop.

_REPAIR_LOOP_THRESHOLD = 4


_METRICS_PREFIX = 'chromeos/autotest/inventory'
_UNTESTABLE_PRESENCE_METRIC = metrics.BooleanMetric(
    _METRICS_PREFIX + '/untestable',
    'DUTs that cannot be scheduled for testing')

_MISSING_DUT_METRIC = metrics.Counter(
    _METRICS_PREFIX + '/missing', 'DUTs which cannot be found by lookup queries'
    ' because they are invalid or deleted')

# _Diagnosis - namedtuple corresponding to the return value from
# `HostHistory.last_diagnosis()`
_Diagnosis = collections.namedtuple('_Diagnosis', ['status', 'task'])


def _get_diagnosis(history):
    """Return the most recent diagnosis for one DUT.

    A DUT whose last diagnosis is "broken", but whose diagnosing task
    ended before `history.start_time`, is reported as unused instead.

    Every call increments `_MISSING_DUT_METRIC`, tagged with the
    hostname and with whether the lookup succeeded.

    @param history  The `HostJobHistory` object to be diagnosed.
    @return A `_Diagnosis` tuple.  If the lookup raises
            `proxy.JSONRPCException`, the error is logged and
            `_Diagnosis(None, None)` is returned.
    """
    dut_present = True
    try:
        diagnosis = _Diagnosis(*history.last_diagnosis())
        if (diagnosis.status == status_history.BROKEN
                and diagnosis.task.end_time < history.start_time):
            return _Diagnosis(status_history.UNUSED, diagnosis.task)
        return diagnosis
    except proxy.JSONRPCException as e:
        # `logging.warn` is a deprecated alias; use `warning`.
        logging.warning(e)
        dut_present = False
    finally:
        # Runs on both the success and the failure path.
        _MISSING_DUT_METRIC.increment(
            fields={'host': history.hostname, 'presence': dut_present})
    return _Diagnosis(None, None)


def _host_is_working(history):
    """Return whether the DUT for `history` is diagnosed as working."""
    return _get_diagnosis(history).status == status_history.WORKING


def _host_is_broken(history):
    """Return whether the DUT for `history` is diagnosed as broken."""
    return _get_diagnosis(history).status == status_history.BROKEN


def _host_is_idle(history):
    """Return whether the DUT for `history` is unused or unknown."""
    idle_statuses = {status_history.UNUSED, status_history.UNKNOWN}
    return _get_diagnosis(history).status in idle_statuses


class _HostSetInventory(object):
    """Maintains a set of related `HostJobHistory` objects.

    Current usage of this class is that all DUTs are part of a single
    scheduling pool of DUTs for a single model; however, this class
    makes no assumptions about the actual relationship among the DUTs.

    The collection is segregated into disjoint categories of "working",
    "broken", and "idle" DUTs.  Accessor methods allow finding both the
    list of DUTs in each category, as well as counts of each category.

    Performance note:  Certain methods in this class are potentially
    expensive:
      * `get_working()`
      * `get_working_list()`
      * `get_broken()`
      * `get_broken_list()`
      * `get_idle()`
      * `get_idle_list()`
    The first time any one of these methods is called, it causes
    multiple RPC calls with a relatively expensive set of database
    queries.  However, the results of the queries are cached in the
    individual `HostJobHistory` objects, so only the first call
    actually pays the full cost.

    Additionally, `get_working_list()`, `get_broken_list()` and
    `get_idle_list()` cache their return values to avoid recalculating
    lists at every call; this caching is separate from the caching of
    RPC results described above.

    This class is deliberately constructed to delay the RPC cost until
    the accessor methods are called (rather than to query in
    `record_host()`) so that it's possible to construct a complete
    `_LabInventory` without making the expensive queries at creation
    time.  `_populate_model_counts()`, below, assumes this behavior.
    """

    def __init__(self):
        self._histories = []
        self._working_list = None
        self._broken_list = None
        self._idle_list = None

    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        @param host_history The `HostJobHistory` object to be
                            remembered.
        """
        # A new host invalidates every cached category list.
        self._working_list = None
        self._broken_list = None
        self._idle_list = None
        self._histories.append(host_history)

    def get_working_list(self):
        """Return a list of all working DUTs in the pool.

        Filter `self._histories` for histories where the DUT is
        diagnosed as working.

        Cache the result so that we only calculate it once.

        @return A list of HostJobHistory objects.
        """
        if self._working_list is None:
            self._working_list = [h for h in self._histories
                                  if _host_is_working(h)]
        return self._working_list

    def get_working(self):
        """Return the number of working DUTs in the pool."""
        return len(self.get_working_list())

    def get_broken_list(self):
        """Return a list of all broken DUTs in the pool.

        Filter `self._histories` for histories where the DUT is
        diagnosed as broken.

        Cache the result so that we only calculate it once.

        @return A list of HostJobHistory objects.
        """
        if self._broken_list is None:
            self._broken_list = [h for h in self._histories
                                 if _host_is_broken(h)]
        return self._broken_list

    def get_broken(self):
        """Return the number of broken DUTs in the pool."""
        return len(self.get_broken_list())

    def get_idle_list(self):
        """Return a list of all idle DUTs in the pool.

        Filter `self._histories` for histories where the DUT is
        diagnosed as idle.

        Cache the result so that we only calculate it once.

        @return A list of HostJobHistory objects.
        """
        if self._idle_list is None:
            self._idle_list = [h for h in self._histories
                               if _host_is_idle(h)]
        return self._idle_list

    def get_idle(self):
        """Return the number of idle DUTs in the pool."""
        return len(self.get_idle_list())

    def get_total(self):
        """Return the total number of DUTs in the pool."""
        return len(self._histories)

    def get_all_histories(self):
        """Return the list of every recorded `HostJobHistory` object."""
        return self._histories


class _PoolSetInventory(object):
    """Maintains a set of `HostJobHistory`s for a set of pools.

    The collection is segregated into disjoint categories of "working",
    "broken", and "idle" DUTs.  Accessor methods allow finding both the
    list of DUTs in each category, as well as counts of each category.
    Accessor queries can be for an individual pool, or against all
    pools.

    Performance note:  This class relies on `_HostSetInventory`.  Public
    methods in this class generally rely on methods of the same name in
    the underlying class, and so will have the same underlying
    performance characteristics.
    """

    def __init__(self, pools):
        self._histories_by_pool = {
            pool: _HostSetInventory() for pool in pools
        }

    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        @param host_history The `HostJobHistory` object to be
                            remembered.
        """
        pool = host_history.host_pool
        self._histories_by_pool[pool].record_host(host_history)

    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               `_HostSetInventory` object.
        @param pool            The pool to be counted.  If `None`,
                               return the total across all pools.
        """
        if pool is None:
            return sum(get_pool_count(host_set)
                       for host_set in self._histories_by_pool.values())
        return get_pool_count(self._histories_by_pool[pool])

    def _list_pool(self, get_pool_list, pool=None):
        """Internal helper to list hosts in a given pool.

        @param get_pool_list  Function to return a list of histories
                              from a `_HostSetInventory` object.
        @param pool           The pool to be listed.  If `None`,
                              return a new list combining all pools.
        """
        if pool is None:
            histories = []
            for host_set in self._histories_by_pool.values():
                histories.extend(get_pool_list(host_set))
            return histories
        return get_pool_list(self._histories_by_pool[pool])

    def get_working_list(self, pool=None):
        """Return a list of all working DUTs in the given pool.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_working_list, pool)

    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(_HostSetInventory.get_working, pool)

    def get_broken_list(self, pool=None):
        """Return a list of all broken DUTs in the given pool.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_broken_list, pool)

    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_broken, pool)

    def get_idle_list(self, pool=None):
        """Return a list of all idle DUTs in the given pool.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.
        """
        return self._list_pool(_HostSetInventory.get_idle_list, pool)

    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of idle DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_idle, pool)

    def get_spares_buffer(self, spare_pool=SPARE_POOL):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool.  This number may be negative, indicating a shortfall
        in the critical pools.

        @param spare_pool  The pool to be treated as the spares pool.

        @return The total number DUTs in the spares pool, less the total
                number of broken DUTs in all pools.
        """
        return self.get_total(spare_pool) - self.get_broken()

    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_total, pool)

    def get_all_histories(self, pool=None):
        """Iterate over every `HostJobHistory` in the selected pool(s).

        @param pool  The pool to be traversed.  If `None`, traverse
                     all pools.
        """
        if pool is None:
            host_sets = self._histories_by_pool.values()
        else:
            host_sets = [self._histories_by_pool[pool]]
        for host_set in host_sets:
            for h in host_set.get_all_histories():
                yield h


def _is_migrated_to_skylab(afehost):
    """Return True if the provided frontend.Host has been migrated to skylab."""
    return afehost.hostname.endswith('-migrated-do-not-use')


def _eligible_host(afehost):
    """Return whether this host is eligible for monitoring.

    @param afehost  The host to be tested for eligibility.
    """
    if _is_migrated_to_skylab(afehost):
        return False

    # DUTs without an existing, unique 'model' or 'pool' label aren't meant to
    # exist in the managed inventory; their presence generally indicates an
    # error in the database. The _LabInventory constructor requires hosts to
    # conform to the label restrictions. Failing an inventory run for a single
    # bad entry is wrong, so we ignore these hosts.
    models = [l for l in afehost.labels
              if l.startswith(constants.Labels.MODEL_PREFIX)]
    pools = [l for l in afehost.labels
             if l.startswith(constants.Labels.POOL_PREFIX)]
    excluded = _EXCLUDED_LABELS.intersection(afehost.labels)
    return len(models) == 1 and len(pools) == 1 and not excluded


class _LabInventory(collections.Mapping):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    This is a dict-like collection indexed by model.  Indexing returns
    the _PoolSetInventory object associated with the model.
    """

    @classmethod
    def create_inventory(cls, afe, start_time, end_time, modellist=None):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects for
        all DUTs in the `MANAGED_POOLS` list.  If `modellist` is
        supplied, the inventory will be restricted to only the given
        models.

        @param afe          AFE object for constructing the
                            `HostJobHistory` objects.
        @param start_time   Start time for the `HostJobHistory` objects.
        @param end_time     End time for the `HostJobHistory` objects.
        @param modellist    List of models to include.  If empty or
                            `None`, include all available models.
        @return A `_LabInventory` object for the specified models.
        """
        target_pools = MANAGED_POOLS
        label_list = [constants.Labels.POOL_PREFIX + l for l in target_pools]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if modellist:
            # We're deliberately not checking host eligibility in this
            # code path.  This is a debug path, not used in production;
            # it may be useful to include ineligible hosts here.
            modelhosts = []
            for model in modellist:
                model_label = constants.Labels.MODEL_PREFIX + model
                modelhosts.extend(h for h in afehosts
                                  if model_label in h.labels)
            afehosts = modelhosts
        else:
            afehosts = [h for h in afehosts if _eligible_host(h)]
        histories = [
            status_history.HostJobHistory(afe, host, start_time, end_time)
            for host in afehosts]
        return cls(histories, target_pools)

    def __init__(self, histories, pools):
        """Build the inventory from a list of histories.

        @param histories  List of `HostJobHistory` objects to record.
        @param pools      Pools to create in each model's
                          `_PoolSetInventory`.
        """
        models = {h.host_model for h in histories}
        self._modeldata = {model: _PoolSetInventory(pools) for model in models}
        self._dut_count = len(histories)
        for h in histories:
            self[h.host_model].record_host(h)
        self._boards = {h.host_board for h in histories}

    def __getitem__(self, key):
        return self._modeldata.__getitem__(key)

    def __len__(self):
        return self._modeldata.__len__()

    def __iter__(self):
        return self._modeldata.__iter__()

    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count

    def get_num_models(self):
        """Return the total number of models in the inventory."""
        return len(self)

    def get_pool_models(self, pool):
        """Return all models in `pool`.

        @param pool The pool to be inventoried for models.
        @return The set of models with at least one DUT in `pool`.
        """
        return {m for m, h in self.items() if h.get_total(pool)}

    def get_boards(self):
        """Return the set of boards seen across all recorded DUTs."""
        return self._boards


def _reportable_models(inventory, spare_pool=SPARE_POOL):
    """Iterate over all models subject to reporting.

    Yields the contents of `inventory.items()` filtered to include
    only reportable models.  A model is reportable if it has DUTs in
    both `spare_pool` and at least one other pool.

    @param inventory   `_LabInventory` object to be filtered.
    @param spare_pool  The spare pool to be tested for reporting.
    """
    for model, poolset in inventory.items():
        spares = poolset.get_total(spare_pool)
        total = poolset.get_total()
        if spares != 0 and spares != total:
            yield model, poolset


def _all_dut_histories(inventory):
    """Iterate over every `HostJobHistory` object in `inventory`."""
    for poolset in inventory.values():
        for h in poolset.get_all_histories():
            yield h


def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    Take the given list of `HostJobHistory` objects, separate it
    into a list per lab, and sort each lab's list by location.  The
    order of sorting within a lab is
      * By row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Return a list of the sorted lists.

    DUTs whose hostname doesn't match `_HOSTNAME_PATTERN` are omitted
    from the result.

    Implementation note: locations are compared as `(row, rack, host)`
    integer tuples, so the ordering is correct for numbers of any size.

    @param inventory_list  List of `HostJobHistory` objects to sort.
    @return A list of sorted lists of DUTs.
    """
    lab_lists = {}
    for history in inventory_list:
        location = _HOSTNAME_PATTERN.match(history.host.hostname)
        if location:
            lab = location.group(1)
            key = tuple(int(idx) for idx in location.group(2, 3, 4))
            lab_lists.setdefault(lab, []).append((key, history))
    return_list = []
    for dut_list in lab_lists.values():
        # Sort on the location key only; histories are never compared.
        dut_list.sort(key=lambda t: t[0])
        return_list.append([t[1] for t in dut_list])
    return return_list


def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    `buffer_counts` is a dictionary mapping model names to the size of
    the model's spares buffer.

    `repair_list` is a list of `HostJobHistory` objects for the DUTs to
    be repaired.

    This function calculates the new set of buffer counts that would
    result from the proposed repairs, and scores the new set using two
    numbers:
      * Worst case buffer count for any model (higher is better).  This
        is the more significant number for comparison.
      * Number of models at the worst case (lower is better).  This is
        the less significant number.

    Implementation note:  The score could fail to reflect the intended
    criteria if there are more than 1000 models in the inventory.

    @param buffer_counts  A dictionary mapping models to buffer counts.
    @param repair_list    A list of `HostJobHistory` objects for the
                          DUTs to be repaired.
    @return A numeric score, or 0 if `buffer_counts` is empty.
    """
    # Go through `buffer_counts`, and create a list of new counts
    # that records the buffer count for each model after repair.
    # The new list of counts discards the model names, as they don't
    # contribute to the final score.
    _NMODELS = 1000
    pools = {h.host_pool for h in repair_list}
    repair_inventory = _LabInventory(repair_list, pools)
    new_counts = []
    for model, count in buffer_counts.items():
        if model in repair_inventory:
            repaired = repair_inventory[model].get_total()
        else:
            repaired = 0
        new_counts.append(count + repaired)
    if not new_counts:
        # No models to score; there is no worst case to report.
        return 0
    # Find the worst available spares count, and count how many times
    # that worst case occurs.
    worst_count = min(new_counts)
    num_worst = new_counts.count(worst_count)
    # Return the calculated score
    return _NMODELS * worst_count - num_worst


def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be repaired.
    The list of DUTs is selected based on these criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical proximity.
      * DUTs for models with a low spares buffer are more important than
        DUTs with larger buffers.

    The algorithm used will guarantee that at least one DUT from a model
    with the lowest spares buffer will be recommended.  If the worst
    spares buffer number is shared by more than one model, the algorithm
    will tend to prefer repair sets that include more of those models
    over sets that cover fewer models.

    @param inventory      `_LabInventory` object from which to generate
                          recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.
    """
    logging.debug('Creating DUT repair recommendations')
    model_buffer_counts = {}
    broken_list = []
    for model, counts in _reportable_models(inventory):
        logging.debug('Listing failed DUTs for %s', model)
        if counts.get_broken() != 0:
            model_buffer_counts[model] = counts.get_spares_buffer()
            broken_list.extend(counts.get_broken_list())
    # N.B. The logic inside this loop may seem complicated, but
    # simplification is hard:
    #   * Calculating an initial recommendation outside of
    #     the loop likely would make things more complicated,
    #     not less.
    #   * It's necessary to calculate an initial lab slice once per
    #     lab _before_ the while loop, in case the number of broken
    #     DUTs in a lab is less than `num_recommend`.
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        start = 0
        end = num_recommend
        lab_slice = lab_duts[start : end]
        lab_score = _score_repair_set(model_buffer_counts, lab_slice)
        # Slide a window of `num_recommend` DUTs along the lab's
        # location-sorted list, keeping the best-scoring window.
        while end < len(lab_duts):
            start += 1
            end += 1
            new_slice = lab_duts[start : end]
            new_score = _score_repair_set(model_buffer_counts, new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    # N.B. The trailing space in `line_fmt` is mandatory:  Without it,
    # Gmail will parse the URL wrong.  Don't ask.  If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ('Hostname', 'Model', 'Servo?', 'Logs URL')]
    if recommendation:
        for h in recommendation:
            servo_name = servo_host.make_servo_hostname(h.host.hostname)
            servo_present = utils.host_is_in_lab_zone(servo_name)
            event = _get_diagnosis(h).task
            line = line_fmt % (
                    h.host.hostname, h.host_model,
                    'Yes' if servo_present else 'No', event.job_url)
            message.append(line)
    else:
        message.append('(No DUTs to repair)')
    return '\n'.join(message)


def _generate_model_inventory_message(inventory):
    """Generate the "model inventory" e-mail message.

    The model inventory is a list by model summarizing the number of
    working, broken, and idle DUTs, and the total shortfall or surplus
    of working devices relative to the minimum critical pool
    requirement.

    The report omits models with no DUTs in the spare pool or with no
    DUTs in a critical pool.

    If no working, broken, or idle DUTs are counted at all, every
    percentage in the summary is reported as 0.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating model inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_models = 0
    ntotal_models = 0
    summaries = []
    column_names = (
        'Model', 'Avail', 'Bad', 'Idle', 'Good', 'Spare', 'Total')
    for model, counts in _reportable_models(inventory):
        logging.debug('Counting %2d DUTS for model %s',
                      counts.get_total(), model)
        # Summary elements laid out in the same order as the column
        # headers:
        #     Model  Avail   Bad  Idle  Good  Spare Total
        #      e[0]   e[1]  e[2]  e[3]  e[4]  e[5]  e[6]
        element = (model,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(SPARE_POOL),
                   counts.get_total())
        # Only models with broken DUTs appear in the detailed table,
        # but every reportable model contributes to the totals.
        if element[2]:
            summaries.append(element)
            nbroken_models += 1
        ntotal_models += 1
        nbroken += element[2]
        nidle += element[3]
        nworking += element[4]
    ntotal = nworking + nbroken + nidle
    # Worst spares buffer first; ties broken by most broken DUTs.
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        # Computed as the remainder so the three figures sum to 100.
        working_percent = 100 - broken_percent - idle_percent
    else:
        # Nothing was counted; avoid dividing by zero.
        broken_percent = idle_percent = working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Models with failures: %d' % nbroken_models,
               'Models in inventory: %d' % ntotal_models,
               '', '',
               'Full model inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % column_names]
    message.extend(
            ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)


_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: All models shown below are at
less than full strength, please take action to resolve the issues.
Once you're satisified that failures won't recur, failed DUTs can
be replaced with spares by running `balance_pool`. Detailed
instructions can be found here:
    http://go/cros-manage-duts
'''


def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    The pool inventory is a list by pool and model summarizing the
    number of broken, idle, and working DUTs in each critical pool.
    Only models with at least one broken or idle DUT are included in
    the list; within a pool, models are ordered by descending count
    of broken DUTs.

    N.B. For sample output text formatted as users can expect to see it
    in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating pool inventory')
    message = [_POOL_INVENTORY_HEADER]
    # `newline` is empty for the first pool only, so that a blank line
    # separates each subsequent pool's section from the previous one.
    newline = ''
    for pool in CRITICAL_POOLS:
        message.append(
            '%sStatus for pool:%s, by model:' % (newline, pool))
        message.append(
            '%-20s %5s %5s %5s %5s' % (
                'Model', 'Bad', 'Idle', 'Good', 'Total'))
        data_list = []
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # models at full strength are not reported
            if not broken and not idle:
                continue
            working = counts.get_working(pool)
            total = counts.get_total(pool)
            data_list.append((model, broken, idle, working, total))
        if data_list:
            # Most broken DUTs first.
            data_list = sorted(data_list, key=lambda d: -d[1])
            message.extend(
                ['%-20s %5d %5d %5d %5d' % t for t in data_list])
        else:
            message.append('(All models at full strength)')
        newline = '\n'
    return '\n'.join(message)


_IDLE_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: The hosts shown below haven't
run any jobs for at least 24 hours. Please check each host; locked
hosts should normally be unlocked; stuck jobs should normally be
aborted.
'''


def _generate_idle_inventory_message(inventory):
    """Generate the "idle inventory" e-mail message.

    The idle inventory is a host list with corresponding pool and model,
    where the hosts are identified as idle.  All managed pools are
    scanned.

    N.B. For sample output text format as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating idle inventory')
    message = [_IDLE_INVENTORY_HEADER]
    message.append('Idle Host List:')
    message.append('%-30s %-20s %s' % ('Hostname', 'Model', 'Pool'))
    data_list = []
    for pool in MANAGED_POOLS:
        for model, counts in inventory.iteritems():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            data_list.extend([(dut.host.hostname, model, pool)
                              for dut in counts.get_idle_list(pool)])
    if data_list:
        message.extend(['%-30s %-20s %s' % t for t in data_list])
    else:
        message.append('(No idle DUTs)')
    return '\n'.join(message)


def _send_email(arguments, tag, subject, recipients, body):
    """Send an inventory e-mail message.

    The message is logged in the selected log directory using `tag` for
    the file name.

    If the --debug option was requested, the message is neither logged
    nor sent, but merely printed on stdout.

    Failures to write the log file or to send the e-mail are logged as
    errors, but are otherwise ignored; neither failure prevents the
    other action from being attempted.

    @param arguments   Parsed command-line options.
    @param tag         Tag identifying the inventory for logging
                       purposes.
    @param subject     E-mail Subject: header line.
    @param recipients  E-mail addresses for the To: header line.
    @param body        E-mail message body.
    """
    logging.debug('Generating email: "%s"', subject)
    all_recipients = ', '.join(recipients)
    report_body = '\n'.join([
            'To: %s' % all_recipients,
            'Subject: %s' % subject,
            '', body, ''])
    if arguments.debug:
        # Parenthesized single-argument form prints identically under
        # the Python 2 print statement and the Python 3 function.
        print(report_body)
    else:
        filename = os.path.join(arguments.logdir, tag)
        try:
            # The context manager guarantees the file is closed even
            # if the write fails part way through.
            with open(filename, 'w') as report_file:
                report_file.write(report_body)
        except EnvironmentError as e:
            logging.error('Failed to write %s: %s', filename, e)
        try:
            gmail_lib.send_email(all_recipients, subject, body)
        except Exception as e:
            # Best effort:  a failure to send one report must not
            # abort the remaining reports.
            logging.error('Failed to send e-mail to %s: %s',
                          all_recipients, e)


def _populate_model_counts(inventory):
    """Gather model counts while providing interactive feedback.

    Gathering the status of all individual DUTs in the lab can take
    considerable time (~30 minutes at the time of this writing).
    Normally, we pay that cost by querying as we go.  However, with
    the `--debug` option, we expect a human being to be watching the
    progress in real time.  So, we force the first (expensive) queries
    to happen up front, and provide simple ASCII output on sys.stdout
    to show a progress bar and results.

    The progress bar prints one character per model:  '+' for every
    model whose ordinal ends in 5, the tens digit for every tenth
    model, and '.' otherwise.

    @param inventory  `_LabInventory` object from which to gather
                      counts.
    """
    n = 0
    total_broken = 0
    for counts in inventory.itervalues():
        n += 1
        if n % 10 == 5:
            c = '+'
        elif n % 10 == 0:
            # Explicit floor division; same result as the Python 2
            # integer `/`, without depending on it.
            c = '%d' % ((n // 10) % 10)
        else:
            c = '.'
        sys.stdout.write(c)
        sys.stdout.flush()
        # This next call is where all the time goes - it forces all of a
        # model's `HostJobHistory` objects to query the database and
        # cache their results.
        total_broken += counts.get_broken()
    sys.stdout.write('\n')
    sys.stdout.write('Found %d broken DUTs\n' % total_broken)


def _perform_model_inventory(arguments, inventory, timestamp):
    """Perform the model inventory report.

    The model inventory report consists of the following:
      * A list of DUTs that are recommended to be repaired.  This list
        is optional, and only appears if the `--recommend` option is
        present.
      * A list of all models that have failed DUTs, with counts
        of working, broken, and spare DUTs, among others.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp
                      in logs and email output.
    """
    if arguments.recommend:
        recommend_message = _generate_repair_recommendation(
                inventory, arguments.recommend) + '\n\n\n'
    else:
        recommend_message = ''
    model_message = _generate_model_inventory_message(inventory)
    _send_email(arguments,
                'models-%s.txt' % timestamp,
                'DUT model inventory %s' % timestamp,
                arguments.model_notify,
                recommend_message + model_message)


def _perform_pool_inventory(arguments, inventory, timestamp):
    """Perform the pool inventory report.

    The pool inventory report consists of the following:
      * A list of all critical pools that have failed DUTs, with counts
        of working, broken, and idle DUTs.
      * A list of all idle DUTs by hostname including the model and
        pool.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp in
                      logs and email output.
    """
    pool_message = _generate_pool_inventory_message(inventory)
    idle_message = _generate_idle_inventory_message(inventory)
    _send_email(arguments,
                'pools-%s.txt' % timestamp,
                'DUT pool inventory %s' % timestamp,
                arguments.pool_notify,
                pool_message + '\n\n\n' + idle_message)


def _dut_in_repair_loop(history):
    """Return whether a DUT's history indicates a repair loop.

    A DUT is considered looping if it runs no tests, and no tasks pass
    other than repair tasks.

    @param history  An instance of `status_history.HostJobHistory` to be
                    scanned for a repair loop.  The caller guarantees
                    that this history corresponds to a working DUT.
    @returns  `True` if the DUT's most recent history indicates a
              repair loop, or `False` otherwise.
    """
    # Our caller passes only histories for working DUTs; that means
    # we've already paid the cost of fetching the diagnosis task, and
    # we know that the task was successful.  The diagnosis task will be
    # one of the tasks we must scan to find a loop, so if the task isn't
    # a repair task, then our history includes a successful non-repair
    # task, and we're not looping.
    #
    # The for loop below is very expensive, because it must fetch the
    # full history, regardless of how many tasks we examine.  At the
    # time of this writing, this check against the diagnosis task
    # reduces the cost of finding loops in the full inventory from hours
    # to minutes.
    if _get_diagnosis(history).task.name != 'Repair':
        return False
    repair_ok_count = 0
    for task in history:
        if not task.is_special:
            # This is a test, so we're not looping.
            return False
        if task.diagnosis == status_history.BROKEN:
            # Failed a repair, so we're not looping.
            return False
        if (task.diagnosis == status_history.WORKING
                and task.name != 'Repair'):
            # Non-repair task succeeded, so we're not looping.
            return False
        # At this point, we have either a failed non-repair task, or
        # a successful repair.
        if task.name == 'Repair':
            repair_ok_count += 1
            if repair_ok_count >= _REPAIR_LOOP_THRESHOLD:
                return True
    # The history ran out before reaching the threshold of successful
    # repairs; that's not enough evidence to call it a loop.
    return False


def _report_untestable_dut(history, state):
    """Log and report a metric for one untestable DUT.

    Writes an INFO log message describing the DUT, and sets
    `_UNTESTABLE_PRESENCE_METRIC` to true with fields identifying the
    DUT's hostname, model, pool, and the given state.

    @param history  History object for the DUT to report; provides
                    the `hostname`, `host_model`, and `host_pool`
                    attributes used for the metric fields.
    @param state    String naming the reason the DUT is untestable.
    """
    fields = {
        'dut_hostname': history.hostname,
        'model': history.host_model,
        'pool': history.host_pool,
        'state': state,
    }
    logging.info('DUT in state %(state)s: %(dut_hostname)s, '
                 'model: %(model)s, pool: %(pool)s', fields)
    _UNTESTABLE_PRESENCE_METRIC.set(True, fields=fields)


def _report_untestable_dut_metrics(inventory):
    """Scan the inventory for DUTs unable to run tests.

    DUTs in the inventory are judged "untestable" if they meet one of
    two criteria:
      * The DUT is stuck in a repair loop; that is, it regularly passes
        repair, but never passes other operations.
      * The DUT runs no tasks at all, but is not locked.

    This routine walks through the given inventory looking for DUTs in
    either of these states.  Results are reported via a Monarch presence
    metric.

    Note:  To make sure that DUTs aren't flagged as "idle" merely
    because there's no work, a separate job runs prior to regular
    inventory runs which schedules trivial work on any DUT that appears
    idle.

    @param inventory  `_LabInventory` object to be reported on.
    """
    logging.info('Scanning for untestable DUTs.')
    for history in _all_dut_histories(inventory):
        # Managed DUTs with names that don't match
        # _HOSTNAME_PATTERN shouldn't be possible.  However, we
        # don't want arbitrary strings being attached to the
        # 'dut_hostname' field, so for safety, we exclude all
        # anomalies.
        if not _HOSTNAME_PATTERN.match(history.hostname):
            continue
        if _host_is_working(history):
            if _dut_in_repair_loop(history):
                _report_untestable_dut(history, 'repair_loop')
        elif _host_is_idle(history):
            if not history.host.locked:
                _report_untestable_dut(history, 'idle_unlocked')


def _log_startup(arguments, startup_time):
    """Log the start of this inventory run.

    Print various log messages indicating the start of the run.  Return
    a string based on `startup_time` that will be used to identify this
    run in log files and e-mail messages.

    @param arguments     Command-line arguments as returned by
                         `ArgumentParser`.
    @param startup_time  A UNIX timestamp marking the moment when
                         this inventory run began.
    @returns  A timestamp string that will be used to identify this run
              in logs and email output.
    """
    timestamp = time.strftime('%Y-%m-%d.%H',
                              time.localtime(startup_time))
    logging.debug('Starting lab inventory for %s', timestamp)
    if arguments.model_notify:
        if arguments.recommend:
            logging.debug('Will include repair recommendations')
        logging.debug('Will include model inventory')
    if arguments.pool_notify:
        logging.debug('Will include pool inventory')
    return timestamp


def _create_inventory(arguments, end_time):
    """Create the `_LabInventory` instance to use for reporting.

    The inventory covers the `arguments.duration` hours leading up to
    `end_time`, restricted to `arguments.modelnames`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    @param end_time   A UNIX timestamp for the end of the time range
                      to be searched in this inventory run.
    @return The inventory returned by
            `_LabInventory.create_inventory()`.
    """
    start_time = end_time - arguments.duration * 60 * 60
    afe = frontend_wrappers.RetryingAFE(server=None)
    inventory = _LabInventory.create_inventory(
            afe, start_time, end_time, arguments.modelnames)
    logging.info('Found %d hosts across %d models',
                 inventory.get_num_duts(),
                 inventory.get_num_models())
    return inventory


def _perform_inventory_reports(arguments):
    """Perform all inventory checks requested on the command line.

    Create the initial inventory and run through the inventory reports
    as called for by the parsed command-line arguments.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    """
    startup_time = time.time()
    timestamp = _log_startup(arguments, startup_time)
    inventory = _create_inventory(arguments, startup_time)
    if arguments.debug:
        _populate_model_counts(inventory)
    if arguments.model_notify:
        _perform_model_inventory(arguments, inventory, timestamp)
    if arguments.pool_notify:
        _perform_pool_inventory(arguments, inventory, timestamp)
    if arguments.report_untestable:
        _report_untestable_dut_metrics(inventory)


def _separate_email_addresses(address_list):
    """Parse a list of comma-separated lists of e-mail addresses.

    Surrounding whitespace is stripped from each address.

    @param address_list  A list of strings containing comma
                         separate e-mail addresses.
    @return A list of the individual e-mail addresses.
    """
    newlist = []
    for arg in address_list:
        newlist.extend([email.strip() for email in arg.split(',')])
    return newlist


def _verify_arguments(arguments):
    """Validate command-line arguments.

    Join comma separated e-mail addresses for `--model-notify` and
    `--pool-notify` in separate option arguments into a single list.
    The `arguments` object is updated in place.

    For non-debug uses, require that at least one inventory report be
    requested.  For debug, if a report isn't specified, treat it as "run
    all the reports."

    The return value indicates success or failure; in the case of
    failure, we also write an error message to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.
    """
    arguments.model_notify = _separate_email_addresses(
            arguments.model_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)
    if not any([arguments.model_notify, arguments.pool_notify,
                arguments.report_untestable]):
        if not arguments.debug:
            sys.stderr.write('Must request at least one report via '
                             '--model-notify, --pool-notify, or '
                             '--report-untestable\n')
            return False
        else:
            # We want to run all the e-mail reports.  An empty notify
            # list will cause a report to be skipped, so make sure the
            # lists are non-empty.
            arguments.model_notify = ['']
            arguments.pool_notify = ['']
    return True


def _get_default_logdir(script):
    """Get the default directory for the `--logdir` option.

    The default log directory is based on the parent directory
    containing this script.

    @param script  Path to this script file.
    @return A path to a directory.
    """
    basedir = os.path.dirname(os.path.abspath(script))
    basedir = os.path.dirname(basedir)
    return os.path.join(basedir, _LOGDIR)


def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the ArgumentParser
    parse_args() method.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or `None`
            if the arguments fail validation by `_verify_arguments()`.
    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')
    parser.add_argument('-d', '--duration', type=int,
                        default=_DEFAULT_DURATION, metavar='HOURS',
                        help='number of hours back to search for status'
                             ' (default: %d)' % _DEFAULT_DURATION)
    parser.add_argument('--model-notify', action='append',
                        default=[], metavar='ADDRESS',
                        help='Generate model inventory message, '
                        'and send it to the given e-mail address(es)')
    parser.add_argument('--pool-notify', action='append',
                        default=[], metavar='ADDRESS',
                        help='Generate pool inventory message, '
                             'and send it to the given address(es)')
    parser.add_argument('-r', '--recommend', type=int, default=None,
                        help=('Specify how many DUTs should be '
                              'recommended for repair (default: no '
                              'recommendation)'))
    parser.add_argument('--report-untestable', action='store_true',
                        help='Check for devices unable to run tests.')
    parser.add_argument('--debug', action='store_true',
                        help='Print e-mail, metrics messages on stdout '
                             'without sending them.')
    parser.add_argument('--no-metrics', action='store_false',
                        dest='use_metrics',
                        help='Suppress generation of Monarch metrics.')
    parser.add_argument('--logdir', default=_get_default_logdir(argv[0]),
                        help='Directory where logs will be written.')
    parser.add_argument('modelnames', nargs='*',
                        metavar='MODEL',
                        help='names of models to report on '
                             '(default: all models)')
    arguments = parser.parse_args(argv[1:])
    if not _verify_arguments(arguments):
        return None
    return arguments


def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was provided on
    the command line.
      * Without the option, we configure the logging to capture all
        potentially relevant events in a log file.  The log file is
        configured to rotate once a week on Friday evening, preserving
        ~3 months worth of history.
      * With the option, we expect stdout to contain other
        human-readable output (including the contents of the e-mail
        messages), so we restrict the output to INFO level.

    For convenience, when `--debug` is on, the logging format has
    no adornments, so that a call like `logging.info(msg)` simply writes
    `msg` to stdout, plus a trailing newline.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        if not os.path.exists(arguments.logdir):
            os.mkdir(arguments.logdir)
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        handler = logging.handlers.TimedRotatingFileHandler(
                logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a copy:  removeHandler() mutates the very list
    # we'd otherwise be iterating, which would skip every other
    # handler and leave some of them installed.
    for h in list(root_logger.handlers):
        root_logger.removeHandler(h)
    root_logger.addHandler(handler)


def main(argv):
    """Standard main routine.

    Parse the command line, configure logging, and perform the
    requested inventory reports.  Unless `--no-metrics` was given, the
    reports run with ts_mon set up, and a duration timer and a
    success/failure tick counter are recorded and flushed.

    Exits with status 1 if the command line fails validation.
    `KeyboardInterrupt` is silently swallowed; any other exception is
    logged and re-raised.

    @param argv  Command line arguments, including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)

    try:
        if arguments.use_metrics:
            if arguments.debug:
                logging.info('Debug mode: Will not report metrics to monarch.')
                metrics_file = '/dev/null'
            else:
                metrics_file = None
            with site_utils.SetupTsMonGlobalState(
                    'lab_inventory', debug_file=metrics_file,
                    auto_flush=False):
                success = False
                try:
                    with metrics.SecondsTimer('%s/duration' % _METRICS_PREFIX):
                        _perform_inventory_reports(arguments)
                    success = True
                finally:
                    # Record the tick whether or not the reports
                    # succeeded, then flush explicitly since
                    # auto_flush is off.
                    metrics.Counter('%s/tick' % _METRICS_PREFIX).increment(
                            fields={'success': success})
                    metrics.Flush()
        else:
            _perform_inventory_reports(arguments)
    except KeyboardInterrupt:
        # Deliberate:  an interactive interrupt ends the run quietly.
        pass
    except Exception:
        # Our cron setup doesn't preserve stderr, so drop extra breadcrumbs.
        logging.exception('Error escaped main')
        raise


def get_inventory(afe):
    """Create an inventory covering the 24 hours ending now.

    @param afe  AFE object passed through to
                `_LabInventory.create_inventory()`.
    @return The inventory returned by
            `_LabInventory.create_inventory()`.
    """
    end_time = int(time.time())
    start_time = end_time - 24 * 60 * 60
    return _LabInventory.create_inventory(afe, start_time, end_time)


def get_managed_boards(afe):
    """Return the result of `get_boards()` on a fresh inventory.

    @param afe  AFE object passed through to `get_inventory()`.
    @return Whatever the inventory's `get_boards()` method returns.
    """
    return get_inventory(afe).get_boards()


if __name__ == '__main__':
    main(sys.argv)