#!/usr/bin/env python
# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Adjust pool balances to cover DUT shortfalls.

This command takes all broken DUTs in a specific pool for specific
boards and swaps them with working DUTs taken from a selected pool
of spares.  The command is meant primarily for replacing broken DUTs
in critical pools like BVT or CQ, but it can also be used to adjust
pool sizes, or to create or remove pools.

usage:  balance_pool.py [ options ] POOL [ BOARD ... ]

positional arguments:
  POOL                  Name of the pool to balance.  Use
                        all_critical_pools to balance all critical
                        pools
  BOARD                 Names of boards to balance

optional arguments:
  -h, --help            show this help message and exit
  -t COUNT, --total COUNT
                        Set the number of DUTs in the pool to the specified
                        count for every BOARD
  -a COUNT, --grow COUNT
                        Add the specified number of DUTs to the pool for every
                        BOARD
  -d COUNT, --shrink COUNT
                        Remove the specified number of DUTs from the pool for
                        every BOARD
  -s POOL, --spare POOL
                        Pool from which to draw replacement spares (default:
                        pool:suites)
  -n, --dry-run         Report actions to take in the form of shell commands
  -v, --verbose         Print more detail about calculations for debug
                        purposes
  -m COUNT, --max-broken COUNT
                        Only rebalance a pool if it has at most COUNT
                        broken DUTs (default: 2)
  -f, --force-rebalance
                        Rebalance a pool even if it has a large number
                        of broken DUTs
  --all-boards          Rebalance all managed boards; cannot be combined
                        with explicit BOARD arguments
  --max-broken-boards COUNT
                        With --all-boards, only rebalance if the number
                        of boards with broken DUTs in the pool is at
                        most COUNT; 0 skips the check


The command attempts to remove all broken DUTs from the target POOL
for every BOARD, and replace them with enough working DUTs taken
from the spare pool to bring the strength of POOL to the requested
total COUNT.

If no COUNT options are supplied (i.e. there are no --total, --grow,
or --shrink options), the command will maintain the current totals of
DUTs for every BOARD in the target POOL.

If not enough working spares are available, broken DUTs may be left
in the pool to keep the pool at the target COUNT.

When reducing pool size, working DUTs will be returned after broken
DUTs, if it's necessary to achieve the target COUNT.

"""


import argparse
import sys
import time

import common
from autotest_lib.server import frontend
from autotest_lib.server.lib import status_history
from autotest_lib.site_utils import lab_inventory
from autotest_lib.site_utils.suite_scheduler import constants

from chromite.lib import parallel


_POOL_PREFIX = constants.Labels.POOL_PREFIX
# This is the ratio of all boards we should calculate the default max number of
# broken boards against.  It seemed like the best choice that was neither too
# strict nor lax.
_MAX_BROKEN_BOARDS_DEFAULT_RATIO = 3.0 / 8.0

_ALL_CRITICAL_POOLS = 'all_critical_pools'
_SPARE_DEFAULT = lab_inventory.SPARE_POOL


def _log_message(message, *args):
    """Log a message with optional format arguments to stdout.

    This function logs a single line to stdout, with formatting
    if necessary, and without adornments.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    if args:
        message = message % args
    sys.stdout.write('%s\n' % message)


def _log_info(dry_run, message, *args):
    """Log information in a dry-run dependent fashion.

    This function logs a single line to stdout, with formatting
    if necessary.  When logging for a dry run, the message is
    printed as a shell comment, rather than as unadorned text.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param dry_run  Whether the logging is for a dry run; if true,
                    the message is prefixed with '# '.
    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    if dry_run:
        # The prefix is added before formatting, so the whole line
        # becomes a shell comment.
        message = '# ' + message
    _log_message(message, *args)


def _log_error(message, *args):
    """Log an error to stderr, with optional format arguments.

    This function logs a single line to stderr, prefixed to indicate
    that it is an error message.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    if args:
        message = message % args
    sys.stderr.write('ERROR: %s\n' % message)


class _DUTPool(object):
    """Information about a pool of DUTs for a given board.

    This class collects information about all DUTs for a given
    board and pool pair, and divides them into three categories:
      + Working - the DUT is working for testing, and not locked.
      + Broken - the DUT is unable to run tests, or it is locked.
      + Ineligible - the DUT is not available to be removed from
          this pool.  The DUT may be either working or broken.

    DUTs with more than one pool: label are ineligible for exchange
    during balancing.  This is done for the sake of chameleon hosts,
    which must always be assigned to pool:suites.  These DUTs are
    always marked with pool:chameleon to prevent their reassignment.

    TODO(jrbarnette):  The use of `pool:chameleon` (instead of just
    the `chameleon` label) is a hack that should be eliminated.

    _DUTPool instances are used to track both main pools that need
    to be resupplied with working DUTs and spare pools that supply
    those DUTs.

    @property board               Name of the board associated with
                                  this pool of DUTs.
    @property pool                Name of the pool associated with
                                  this pool of DUTs.
    @property working_hosts       The list of this pool's working
                                  DUTs.
    @property broken_hosts        The list of this pool's broken
                                  DUTs.
    @property ineligible_hosts    The list of this pool's ineligible DUTs.
    @property pool_labels         A list of labels that identify a DUT
                                  as part of this pool.
    @property total_hosts         The total number of hosts in pool.

    """

    def __init__(self, afe, board, pool, start_time, end_time):
        """Query the AFE and classify every DUT in the board/pool pair.

        @param afe         AFE object used to look up host histories.
        @param board       Name of the board for this pool.
        @param pool        Name of the pool (without the pool prefix).
        @param start_time  Start time for HostJobHistory objects.
        @param end_time    End time for HostJobHistory objects.

        """
        self.board = board
        self.pool = pool
        self.working_hosts = []
        self.broken_hosts = []
        self.ineligible_hosts = []
        # _get_hosts() fills in the three host lists as a side effect.
        self.total_hosts = self._get_hosts(afe, start_time, end_time)
        self._labels = [_POOL_PREFIX + self.pool]


    def _get_hosts(self, afe, start_time, end_time):
        """Sort this pool's DUTs into working, broken, and ineligible.

        @param afe         AFE object used to look up host histories.
        @param start_time  Start time for HostJobHistory objects.
        @param end_time    End time for HostJobHistory objects.

        @return The total number of DUTs found for the board and pool.

        """
        all_histories = (
            status_history.HostJobHistory.get_multiple_histories(
                    afe, start_time, end_time,
                    board=self.board, pool=self.pool))
        for h in all_histories:
            host = h.host
            host_pools = [label for label in host.labels
                          if label.startswith(_POOL_PREFIX)]
            if len(host_pools) != 1:
                # Hosts in more than one pool must never be exchanged.
                self.ineligible_hosts.append(host)
            else:
                diag = h.last_diagnosis()[0]
                if (diag == status_history.WORKING and
                        not host.locked):
                    self.working_hosts.append(host)
                else:
                    self.broken_hosts.append(host)
        return len(all_histories)


    @property
    def pool_labels(self):
        """Return the AFE labels that identify this pool.

        The returned labels are the labels that must be removed
        to remove a DUT from the pool, or added to add a DUT.

        @return A list of AFE labels suitable for AFE.add_labels()
                or AFE.remove_labels().

        """
        return self._labels

    def calculate_spares_needed(self, target_total):
        """Calculate and log the spares needed to achieve a target.

        Return how many working spares are needed to achieve the
        given `target_total` with all DUTs working.

        The spares count may be positive or negative.  Positive
        values indicate spares are needed to replace broken DUTs in
        order to reach the target; negative numbers indicate that
        no spares are needed, and that a corresponding number of
        working devices can be returned.

        If the new target total would require returning ineligible
        DUTs, an error is logged, and the target total is adjusted
        so that those DUTs are not exchanged.

        @param target_total  The new target pool size.

        @return The number of spares needed.

        """
        num_ineligible = len(self.ineligible_hosts)
        if target_total < num_ineligible:
            _log_error('%s %s pool: Target of %d is below '
                       'minimum of %d DUTs.',
                       self.board, self.pool,
                       target_total, num_ineligible)
            _log_error('Adjusting target to %d DUTs.', num_ineligible)
            target_total = num_ineligible
        adjustment = target_total - self.total_hosts
        return len(self.broken_hosts) + adjustment

    def allocate_surplus(self, num_broken):
        """Allocate a list of DUTs that can be returned as surplus.

        Return a list of devices that can be returned in order to
        reduce this pool's supply.  Broken DUTs will be preferred
        over working ones.

        The `num_broken` parameter indicates the number of broken
        DUTs to be left in the pool.  If this number exceeds the
        number of broken DUTs actually in the pool, the returned
        list will be empty.  If this number is negative, it
        indicates a number of working DUTs to be returned in
        addition to all broken ones.

        @param num_broken    Total number of broken DUTs to be left in
                             this pool.

        @return A list of DUTs to be returned as surplus.

        """
        if num_broken >= 0:
            # Keep the first `num_broken` broken DUTs; return the rest.
            return self.broken_hosts[num_broken:]
        # Negative: give back every broken DUT plus -num_broken
        # working ones.
        return (self.broken_hosts +
                self.working_hosts[:-num_broken])


def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
    """Reassign a list of DUTs from one pool to another.

    For all the given hosts, remove all labels associated with
    `spare_pool`, and add the labels for `target_pool`.

    If `dry_run` is true, perform no changes, but log the `atest`
    commands needed to accomplish the necessary label changes.

    @param dry_run      Whether the logging is for a dry run or
                        for actual execution.
    @param hosts        List of DUTs (AFE hosts) to be reassigned.
    @param target_pool  The `_DUTPool` object to which the hosts
                        will be added.
    @param spare_pool   The `_DUTPool` object from which the hosts
                        are drawn.

    """
    if not hosts:
        return
    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
              len(hosts), spare_pool.pool, target_pool.pool)
    additions = target_pool.pool_labels
    removals = spare_pool.pool_labels
    for host in hosts:
        if not dry_run:
            _log_message('Updating host: %s.', host.hostname)
            host.remove_labels(removals)
            host.add_labels(additions)
        else:
            _log_message('atest label remove -m %s %s',
                         host.hostname, ' '.join(removals))
            _log_message('atest label add -m %s %s',
                         host.hostname, ' '.join(additions))


def _balance_board(arguments, afe, board, pool, start_time, end_time):
    """Balance one board as requested by command line arguments.

    Works out how many spares the main pool needs, reports the plan,
    and then exchanges pool labels between the main pool and the
    spare pool.  No exchange is made if the main pool has more than
    `arguments.max_broken` broken DUTs, unless
    `arguments.force_rebalance` is set.

    @param arguments    Parsed command line arguments.
    @param afe          AFE object to be used for the changes.
    @param board        Board to be balanced.
    @param pool         Pool of the board to be balanced.
    @param start_time   Start time for HostJobHistory objects in
                        the DUT pools.
    @param end_time     End time for HostJobHistory objects in the
                        DUT pools.

    """
    dry_run = arguments.dry_run
    spare_pool = _DUTPool(afe, board, arguments.spare,
                          start_time, end_time)
    main_pool = _DUTPool(afe, board, pool,
                         start_time, end_time)

    # The count options are mutually exclusive on the command line.
    target_total = main_pool.total_hosts
    if arguments.total is not None:
        target_total = arguments.total
    elif arguments.grow:
        target_total += arguments.grow
    elif arguments.shrink:
        target_total -= arguments.shrink

    spares_needed = main_pool.calculate_spares_needed(target_total)
    if spares_needed > 0:
        spare_duts = spare_pool.working_hosts[:spares_needed]
        # Broken DUTs that must stay because spares ran out.
        shortfall = spares_needed - len(spare_duts)
    else:
        spare_duts = []
        # Zero or negative: a negative value is the number of working
        # DUTs to hand back along with all the broken ones.
        shortfall = spares_needed

    surplus_duts = main_pool.allocate_surplus(shortfall)

    if spares_needed or surplus_duts or arguments.verbose:
        _log_message('')

        _log_info(dry_run, 'Balancing %s %s pool:', board, main_pool.pool)
        _log_info(dry_run,
                  'Total %d DUTs, %d working, %d broken, %d reserved.',
                  main_pool.total_hosts, len(main_pool.working_hosts),
                  len(main_pool.broken_hosts), len(main_pool.ineligible_hosts))

        if spares_needed > 0:
            add_msg = 'grow pool by %d DUTs' % spares_needed
        elif spares_needed < 0:
            add_msg = 'shrink pool by %d DUTs' % -spares_needed
        else:
            add_msg = 'no change to pool size'
        _log_info(dry_run, 'Target is %d working DUTs; %s.',
                  target_total, add_msg)

        _log_info(dry_run,
                  '%s %s pool has %d spares available.',
                  board, main_pool.pool, len(spare_pool.working_hosts))

        if spares_needed > len(spare_duts):
            _log_error('Not enough spares: need %d, only have %d.',
                       spares_needed, len(spare_duts))
        elif shortfall >= 0:
            _log_info(dry_run,
                      '%s %s pool will return %d broken DUTs, '
                      'leaving %d still in the pool.',
                      board, main_pool.pool,
                      len(surplus_duts),
                      len(main_pool.broken_hosts) - len(surplus_duts))
        else:
            _log_info(dry_run,
                      '%s %s pool will return %d surplus DUTs, '
                      'including %d working DUTs.',
                      board, main_pool.pool,
                      len(main_pool.broken_hosts) - shortfall,
                      -shortfall)

    # Safety valve: many broken DUTs on one board may point to a bug
    # that is bricking devices, so do not drain the spare pool.
    if (len(main_pool.broken_hosts) > arguments.max_broken and
            not arguments.force_rebalance):
        _log_error('%s %s pool: Refusing to act on pool with %d broken DUTs.',
                   board, main_pool.pool, len(main_pool.broken_hosts))
        _log_error('Please investigate this board to see if there is a bug ')
        _log_error('that is bricking devices. Once you have finished your ')
        _log_error('investigation, you can force a rebalance with ')
        _log_error('--force-rebalance')
        return

    if not spare_duts and not surplus_duts:
        if arguments.verbose:
            _log_info(dry_run, 'No exchange required.')
        return

    # Return surplus DUTs to the spare pool first, then pull working
    # spares into the main pool.
    _exchange_labels(dry_run, surplus_duts,
                     spare_pool, main_pool)
    _exchange_labels(dry_run, spare_duts,
                     main_pool, spare_pool)


def _too_many_broken_boards(inventory, pool, arguments):
    """
    Get the inventory of boards and check if too many boards are broken.

    The check is skipped (returning False) when
    `arguments.force_rebalance` is set or when
    `arguments.max_broken_boards` is 0.

    @param inventory: inventory object to determine board status inventory.
    @param pool: The pool to check on for the board.
    @param arguments Parsed command line arguments.

    @return True if the number of boards with 1 or more broken duts exceed
            max_broken_boards, False otherwise.
    """
    # Let's check if we even need to check for this max_broken_boards.
    if arguments.force_rebalance or arguments.max_broken_boards == 0:
        return False

    # Let's get the number of broken duts for the specified pool and
    # check that it's less than arguments.max_broken_boards.  Or if
    # it's not specified, calculate the default number of max broken
    # boards based on the total number of boards per pool.
    # TODO(kevcheng): Revisit to see if there's a better way to
    # calculate the default max_broken_boards.
    max_broken_boards = arguments.max_broken_boards
    if max_broken_boards is None:
        total_num_boards = len(inventory.get_managed_boards(pool=pool))
        max_broken_boards = int(_MAX_BROKEN_BOARDS_DEFAULT_RATIO *
                                total_num_boards)
    _log_info(arguments.dry_run,
              'Default max broken boards calculated to be %d for '
              '%s pool',
              max_broken_boards, pool)


    broken_boards = [board for board, counts in inventory.items()
                     if counts.get_broken(pool) != 0]
    broken_boards.sort()
    num_of_broken_boards = len(broken_boards)
    # TODO(kevcheng): Track which boards have broken duts, we can limit the
    # number of boards we go through in the main loop with this knowledge.
    _log_message('There are %d boards in the %s pool with at least 1 '
                 'broken DUT (max threshold %d)', num_of_broken_boards,
                 pool, max_broken_boards)
    for broken_board in broken_boards:
        _log_message(broken_board)
    return num_of_broken_boards > max_broken_boards


def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the `ArgumentParser`
    `parse_args()` method.

    Invalid combinations of arguments are reported with
    `ArgumentParser.error()`, which exits the program.

    @param argv Standard command line argument vector; `argv[0]` is
                assumed to be the command name.

    @return Result returned by `ArgumentParser.parse_args()`.

    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Balance pool shortages from spares on reserve')

    count_group = parser.add_mutually_exclusive_group()
    count_group.add_argument('-t', '--total', type=int,
                             metavar='COUNT', default=None,
                             help='Set the number of DUTs in the '
                                  'pool to the specified count for '
                                  'every BOARD')
    count_group.add_argument('-a', '--grow', type=int,
                             metavar='COUNT', default=None,
                             help='Add the specified number of DUTs '
                                  'to the pool for every BOARD')
    count_group.add_argument('-d', '--shrink', type=int,
                             metavar='COUNT', default=None,
                             help='Remove the specified number of DUTs '
                                  'from the pool for every BOARD')

    parser.add_argument('-s', '--spare', default=_SPARE_DEFAULT,
                        metavar='POOL',
                        help='Pool from which to draw replacement '
                             'spares (default: pool:%s)' % _SPARE_DEFAULT)
    parser.add_argument('-n', '--dry-run', action='store_true',
                        help='Report actions to take in the form of '
                             'shell commands')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print more detail about calculations for debug '
                             'purposes.')

    parser.add_argument('-m', '--max-broken', default=2, type=int,
                        metavar='COUNT',
                        help='Only rebalance a pool if it has at most '
                             'COUNT broken DUTs.')
    parser.add_argument('-f', '--force-rebalance', action='store_true',
                        help='Forcefully rebalance all DUTs in a pool, even '
                             'if it has a large number of broken DUTs. '
                             'Before doing this, please investigate whether '
                             'there is a bug that is bricking devices in the '
                             'lab.')

    parser.add_argument('--all-boards', action='store_true',
                        help='Rebalance all managed boards.  This will do a '
                             'very expensive check to see how many boards have '
                             'at least one broken DUT. To bypass that check, '
                             'set --max-broken-boards to 0.')
    parser.add_argument('--max-broken-boards',
                        default=None, type=int,
                        metavar='COUNT',
                        help='Only rebalance all boards if number of boards '
                             'with broken DUTs in the specified pool '
                             'is at most COUNT.  Set to 0 to skip the '
                             'check.')

    parser.add_argument('pool',
                        metavar='POOL',
                        help='Name of the pool to balance.  Use %s to balance '
                             'all critical pools' % _ALL_CRITICAL_POOLS)
    parser.add_argument('boards', nargs='*',
                        metavar='BOARD',
                        help='Names of boards to balance.')

    arguments = parser.parse_args(argv[1:])

    # Error-check arguments.
    if not arguments.boards and not arguments.all_boards:
        parser.error('No boards specified. To balance all boards, use '
                     '--all-boards')
    if arguments.boards and arguments.all_boards:
        parser.error('Cannot specify boards with --all-boards.')
    if (arguments.pool == _ALL_CRITICAL_POOLS and
            arguments.spare != _SPARE_DEFAULT):
        # Balancing every critical pool only works with the default
        # spare pool.
        parser.error('Cannot specify a --spare pool other than %s when '
                     'balancing all critical pools.' % _SPARE_DEFAULT)
    return arguments


def main(argv):
    """Standard main routine.

    Builds one (index, board, pool) task for every board/pool pair to
    be balanced and runs the tasks in a process pool.

    @param argv  Command line arguments including `sys.argv[0]`.

    """
    def balancer(i, board, pool):
        """Balance the specified board.

        @param i The index of the board.
        @param board The board name.
        @param pool The pool to rebalance for the board.
        """
        if i > 0:
            _log_message('')
        _balance_board(arguments, afe, board, pool, start_time, end_time)

    def add_tasks(boards, pool):
        """Append one balancer task per board, with a running index.

        @param boards The board names to schedule.
        @param pool The pool to rebalance for each board.
        """
        offset = len(board_info)
        board_info.extend([(offset + i, board, pool)
                           for i, board in enumerate(boards)])

    arguments = _parse_command(argv)
    end_time = time.time()
    # Host histories are examined over the last 24 hours.
    start_time = end_time - 24 * 60 * 60
    afe = frontend.AFE(server=None)
    pools = (lab_inventory.CRITICAL_POOLS
             if arguments.pool == _ALL_CRITICAL_POOLS
             else [arguments.pool])
    board_info = []
    if arguments.all_boards:
        inventory = lab_inventory.get_inventory(afe)
        for pool in pools:
            if _too_many_broken_boards(inventory, pool, arguments):
                _log_error('Refusing to balance all boards for %s pool, '
                           'too many boards with at least 1 broken DUT '
                           'detected.', pool)
            else:
                add_tasks(inventory.get_managed_boards(pool=pool), pool)
    else:
        # We have specified boards with a specified pool, setup the args to the
        # balancer properly.
        for pool in pools:
            add_tasks(arguments.boards, pool)
    try:
        parallel.RunTasksInProcessPool(balancer, board_info, processes=8)
    except KeyboardInterrupt:
        # Deliberately exit without a traceback on Ctrl-C.
        pass


if __name__ == '__main__':
    main(sys.argv)