• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python
2# Copyright 2015 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Adjust pool balances to cover DUT shortfalls.
7
8This command takes all broken DUTs in a specific pool for specific
9boards and swaps them with working DUTs taken from a selected pool
10of spares.  The command is meant primarily for replacing broken DUTs
11in critical pools like BVT or CQ, but it can also be used to adjust
12pool sizes, or to create or remove pools.
13
14usage:  balance_pool.py [ options ] POOL BOARD [ BOARD ... ]
15
16positional arguments:
17  POOL                  Name of the pool to balance
18  BOARD                 Names of boards to balance
19
20optional arguments:
21  -h, --help            show this help message and exit
22  -t COUNT, --total COUNT
23                        Set the number of DUTs in the pool to the specified
24                        count for every BOARD
25  -a COUNT, --grow COUNT
26                        Add the specified number of DUTs to the pool for every
27                        BOARD
28  -d COUNT, --shrink COUNT
29                        Remove the specified number of DUTs from the pool for
30                        every BOARD
31  -s POOL, --spare POOL
32                        Pool from which to draw replacement spares (default:
33                        pool:suites)
34  -n, --dry-run         Report actions to take in the form of shell commands
35
36
37The command attempts to remove all broken DUTs from the target POOL
38for every BOARD, and replace them with enough working DUTs taken
39from the spare pool to bring the strength of POOL to the requested
40total COUNT.
41
42If no COUNT options are supplied (i.e. there are no --total, --grow,
43or --shrink options), the command will maintain the current totals of
44DUTs for every BOARD in the target POOL.
45
46If not enough working spares are available, broken DUTs may be left
47in the pool to keep the pool at the target COUNT.
48
49When reducing pool size, working DUTs will be returned after broken
50DUTs, if it's necessary to achieve the target COUNT.
51
52"""
53
54
55import argparse
56import sys
57import time
58
59import common
60from autotest_lib.server import frontend
61from autotest_lib.server.lib import status_history
62from autotest_lib.site_utils import lab_inventory
63from autotest_lib.site_utils.suite_scheduler import constants
64
65from chromite.lib import parallel
66
67
# Prefix that marks an AFE label as a pool label.
_POOL_PREFIX = constants.Labels.POOL_PREFIX
# Fraction of a pool's managed boards used to compute the default limit on
# the number of boards with broken DUTs when --max-broken-boards is not
# given.  It seemed like the best choice that was neither too strict nor
# too lax.
_MAX_BROKEN_BOARDS_DEFAULT_RATIO = 3.0 / 8.0

# Special value of the POOL argument that selects all critical pools.
_ALL_CRITICAL_POOLS = 'all_critical_pools'
# Pool from which replacement spares are drawn unless --spare is given.
_SPARE_DEFAULT = lab_inventory.SPARE_POOL
76
77
def _log_message(message, *args):
    """Write one unadorned line to stdout.

    The text is emitted exactly as given, followed by a newline.
    %-formatting is applied only when format arguments are present,
    so a message containing a literal `%` is safe to pass on its own.

    @param message  Text to log, or a %-style format string when
                    `args` are supplied.
    @param args     Optional format arguments for `message`.

    """
    text = message % args if args else message
    sys.stdout.write('%s\n' % text)
95
96
def _log_info(dry_run, message, *args):
    """Log an informational line, as a shell comment on dry runs.

    The line goes to stdout via `_log_message()`.  For a dry run the
    message is prefixed with `# ` so that it reads as a comment among
    the shell commands being reported; otherwise it is logged as is.

    @param dry_run  Whether the logging is for a dry run or for
                    actual execution.
    @param message  Text to log, or a %-style format string when
                    `args` are supplied.
    @param args     Optional format arguments for `message`.

    """
    prefix = '# ' if dry_run else ''
    _log_message(prefix + message, *args)
115
116
def _log_error(message, *args):
    """Write one line to stderr, tagged as an error.

    The line is prefixed with `ERROR: ` and terminated with a
    newline.  %-formatting is applied only when format arguments
    are present.

    @param message  Text to log, or a %-style format string when
                    `args` are supplied.
    @param args     Optional format arguments for `message`.

    """
    text = (message % args) if args else message
    sys.stderr.write('ERROR: %s\n' % text)
134
135
class _DUTPool(object):
    """Snapshot of the DUTs belonging to one board in one pool.

    At construction, every DUT reported for the board and pool pair
    is placed in exactly one of three lists:
      + Working - the DUT is diagnosed as working, and is not locked.
      + Broken - the DUT is not diagnosed as working, or it is locked.
      + Ineligible - the DUT is not available to be removed from
          this pool.  The DUT may be either working or broken.

    DUTs with more than one pool: label are ineligible for exchange
    during balancing.  This is done for the sake of chameleon hosts,
    which must always be assigned to pool:suites.  These DUTs are
    always marked with pool:chameleon to prevent their reassignment.

    TODO(jrbarnette):  The use of `pool:chameleon` (instead of just
    the `chameleon` label) is a hack that should be eliminated.

    _DUTPool instances are used to track both main pools that need
    to be resupplied with working DUTs and spare pools that supply
    those DUTs.

    @property board               Name of the board associated with
                                  this pool of DUTs.
    @property pool                Name of the pool associated with
                                  this pool of DUTs.
    @property working_hosts       The list of this pool's working
                                  DUTs.
    @property broken_hosts        The list of this pool's broken
                                  DUTs.
    @property ineligible_hosts    The list of this pool's ineligible
                                  DUTs.
    @property pool_labels         A list of labels that identify a
                                  DUT as part of this pool.
    @property total_hosts         The total number of hosts in pool,
                                  counting all three categories.

    """

    def __init__(self, afe, board, pool, start_time, end_time):
        self.board = board
        self.pool = pool
        self.working_hosts = []
        self.broken_hosts = []
        self.ineligible_hosts = []
        # _get_hosts() fills the three lists above as a side effect.
        self.total_hosts = self._get_hosts(afe, start_time, end_time)
        self._labels = [_POOL_PREFIX + self.pool]

    def _get_hosts(self, afe, start_time, end_time):
        """Fetch this pool's DUTs and sort them into categories.

        @param afe         AFE object used to query host histories.
        @param start_time  Start time for the HostJobHistory objects.
        @param end_time    End time for the HostJobHistory objects.

        @return The number of histories (i.e. DUTs) found.

        """
        histories = status_history.HostJobHistory.get_multiple_histories(
                afe, start_time, end_time,
                board=self.board, pool=self.pool)
        for history in histories:
            dut = history.host
            pool_label_count = len([label for label in dut.labels
                                    if label.startswith(_POOL_PREFIX)])
            if pool_label_count != 1:
                # Anything but exactly one pool: label pins the DUT.
                self.ineligible_hosts.append(dut)
                continue
            diagnosis = history.last_diagnosis()[0]
            if diagnosis != status_history.WORKING or dut.locked:
                self.broken_hosts.append(dut)
            else:
                self.working_hosts.append(dut)
        return len(histories)

    @property
    def pool_labels(self):
        """Return the AFE labels that identify this pool.

        These are the labels to remove in order to take a DUT out
        of the pool, or to add in order to put a DUT into it.

        @return A list of AFE labels suitable for AFE.add_labels()
                or AFE.remove_labels().

        """
        return self._labels

    def calculate_spares_needed(self, target_total):
        """Calculate the spares needed to achieve a target size.

        Return how many working spares are needed to bring the pool
        to `target_total` DUTs with all of them working.

        A positive result is the number of spares required to
        replace broken DUTs and reach the target.  A zero or
        negative result means no spares are needed; the magnitude
        of a negative result is the number of working DUTs that can
        be returned.

        Ineligible DUTs can't be returned, so a target below the
        number of ineligible DUTs is reported as an error and raised
        to that number before the calculation.

        @param target_total  The new target pool size.

        @return The number of spares needed.

        """
        floor = len(self.ineligible_hosts)
        if target_total < floor:
            _log_error('%s %s pool: Target of %d is below '
                       'minimum of %d DUTs.',
                       self.board, self.pool,
                       target_total, floor)
            _log_error('Adjusting target to %d DUTs.', floor)
            target_total = floor
        size_change = target_total - self.total_hosts
        return len(self.broken_hosts) + size_change

    def allocate_surplus(self, num_broken):
        """Allocate a list of DUTs that can be returned as surplus.

        Return the DUTs to be given up in order to reduce this
        pool's supply.  Broken DUTs are preferred over working ones.

        A non-negative `num_broken` is the number of broken DUTs to
        be left in the pool; if it is at least the number of broken
        DUTs actually present, the result is empty.  A negative
        `num_broken` asks for all broken DUTs plus that many working
        DUTs.

        @param num_broken    Total number of broken DUTs to be left
                             in this pool.

        @return A list of DUTs to be returned as surplus.

        """
        if num_broken < 0:
            extra_working = self.working_hosts[:-num_broken]
            return self.broken_hosts + extra_working
        return self.broken_hosts[num_broken:]
275
276
def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
    """Move a list of DUTs out of one pool and into another.

    Each host has the labels of `spare_pool` removed and the labels
    of `target_pool` added, in that order.  Nothing is done or
    logged when `hosts` is empty.

    If `dry_run` is true, no changes are made; instead, the `atest`
    commands that would make the label changes are logged.

    @param dry_run       Whether the logging is for a dry run or
                         for actual execution.
    @param hosts         List of DUTs (AFE hosts) to be reassigned.
    @param target_pool   The `_DUTPool` object to which the hosts
                         will be added.
    @param spare_pool    The `_DUTPool` object from which the hosts
                         are drawn.

    """
    if not hosts:
        return
    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
              len(hosts), spare_pool.pool, target_pool.pool)
    labels_to_add = target_pool.pool_labels
    labels_to_drop = spare_pool.pool_labels
    if dry_run:
        # Report the equivalent shell commands without touching the AFE.
        for dut in hosts:
            _log_message('atest label remove -m %s %s',
                         dut.hostname, ' '.join(labels_to_drop))
            _log_message('atest label add -m %s %s',
                         dut.hostname, ' '.join(labels_to_add))
    else:
        for dut in hosts:
            _log_message('Updating host: %s.', dut.hostname)
            dut.remove_labels(labels_to_drop)
            dut.add_labels(labels_to_add)
311
312
def _balance_board(arguments, afe, board, pool, start_time, end_time):
    """Balance one board as requested by command line arguments.

    Works out the target size of the main pool from the --total,
    --grow or --shrink option (defaulting to the current size),
    selects working spares to bring in and surplus DUTs to send
    back, logs the plan, and then swaps pool labels accordingly.

    No exchange is made if the main pool has more than
    `arguments.max_broken` broken DUTs, unless
    `arguments.force_rebalance` is set.

    @param arguments     Parsed command line arguments.
    @param afe           AFE object to be used for the changes.
    @param board         Board to be balanced.
    @param pool          Pool of the board to be balanced.
    @param start_time    Start time for HostJobHistory objects in
                         the DUT pools.
    @param end_time      End time for HostJobHistory objects in the
                         DUT pools.

    """
    spare_pool = _DUTPool(afe, board, arguments.spare,
                          start_time, end_time)
    main_pool = _DUTPool(afe, board, pool,
                         start_time, end_time)

    # An explicit --total wins; otherwise adjust the current size.
    if arguments.total is not None:
        target_total = arguments.total
    elif arguments.grow:
        target_total = main_pool.total_hosts + arguments.grow
    elif arguments.shrink:
        target_total = main_pool.total_hosts - arguments.shrink
    else:
        target_total = main_pool.total_hosts

    spares_needed = main_pool.calculate_spares_needed(target_total)
    # `shortfall` is the number of broken DUTs that must stay behind
    # for want of spares; when negative, it is the number of working
    # DUTs to give back in addition to all the broken ones.
    spare_duts = []
    shortfall = spares_needed
    if spares_needed > 0:
        spare_duts = spare_pool.working_hosts[:spares_needed]
        shortfall = spares_needed - len(spare_duts)

    surplus_duts = main_pool.allocate_surplus(shortfall)
    num_broken = len(main_pool.broken_hosts)

    if spares_needed or surplus_duts or arguments.verbose:
        dry_run = arguments.dry_run
        _log_message('')

        _log_info(dry_run, 'Balancing %s %s pool:', board, main_pool.pool)
        _log_info(dry_run,
                  'Total %d DUTs, %d working, %d broken, %d reserved.',
                  main_pool.total_hosts, len(main_pool.working_hosts),
                  num_broken, len(main_pool.ineligible_hosts))

        if spares_needed == 0:
            add_msg = 'no change to pool size'
        elif spares_needed > 0:
            add_msg = 'grow pool by %d DUTs' % spares_needed
        else:
            add_msg = 'shrink pool by %d DUTs' % -spares_needed
        _log_info(dry_run, 'Target is %d working DUTs; %s.',
                  target_total, add_msg)

        _log_info(dry_run,
                  '%s %s pool has %d spares available.',
                  board, main_pool.pool, len(spare_pool.working_hosts))

        if spares_needed > len(spare_duts):
            _log_error('Not enough spares: need %d, only have %d.',
                       spares_needed, len(spare_duts))
        elif shortfall < 0:
            _log_info(dry_run,
                      '%s %s pool will return %d surplus DUTs, '
                      'including %d working DUTs.',
                      board, main_pool.pool,
                      num_broken - shortfall,
                      -shortfall)
        else:
            _log_info(dry_run,
                      '%s %s pool will return %d broken DUTs, '
                      'leaving %d still in the pool.',
                      board, main_pool.pool,
                      len(surplus_duts),
                      num_broken - len(surplus_duts))

    # Safety valve:  many broken DUTs may mean something is bricking
    # devices, so don't feed it more spares unless told to.
    if (num_broken > arguments.max_broken and
            not arguments.force_rebalance):
        _log_error('%s %s pool: Refusing to act on pool with %d broken DUTs.',
                   board, main_pool.pool, num_broken)
        for advice in ('Please investigate this board to see if there is a bug ',
                       'that is bricking devices. Once you have finished your ',
                       'investigation, you can force a rebalance with ',
                       '--force-rebalance'):
            _log_error(advice)
        return

    if spare_duts or surplus_duts:
        # Send the surplus back to the spare pool first, then bring
        # the replacement spares into the main pool.
        _exchange_labels(arguments.dry_run, surplus_duts,
                         spare_pool, main_pool)
        _exchange_labels(arguments.dry_run, spare_duts,
                         main_pool, spare_pool)
    elif arguments.verbose:
        _log_info(arguments.dry_run, 'No exchange required.')
411
412
def _too_many_broken_boards(inventory, pool, arguments):
    """Check whether too many boards in a pool have broken DUTs.

    The limit is `arguments.max_broken_boards`.  If that is `None`,
    a default is calculated as `_MAX_BROKEN_BOARDS_DEFAULT_RATIO`
    times the number of managed boards in the pool, rounded down.
    The check is skipped entirely (returning False) when
    `arguments.force_rebalance` is set or the limit is 0.

    All output goes through `_log_info()`, so that on a dry run the
    summary and the board names are shell comments, in keeping with
    the rest of the dry run output.

    @param inventory     Inventory object used to find the managed
                         boards of the pool and the broken DUT
                         counts for each board.
    @param pool          The pool to check on for the boards.
    @param arguments     Parsed command line arguments.

    @return True if the number of boards with 1 or more broken DUTs
            exceeds the limit, False otherwise.

    """
    # Let's check if we even need to check for this max_broken_boards.
    if arguments.force_rebalance or arguments.max_broken_boards == 0:
        return False

    dry_run = arguments.dry_run

    # TODO(kevcheng): Revisit to see if there's a better way to
    # calculate the default max_broken_boards.
    max_broken_boards = arguments.max_broken_boards
    if max_broken_boards is None:
        total_num_boards = len(inventory.get_managed_boards(pool=pool))
        max_broken_boards = int(_MAX_BROKEN_BOARDS_DEFAULT_RATIO *
                                total_num_boards)
        _log_info(dry_run,
                  'Default max broken boards calculated to be %d for '
                  '%s pool',
                  max_broken_boards, pool)

    broken_boards = sorted(board for board, counts in inventory.items()
                           if counts.get_broken(pool) != 0)
    num_of_broken_boards = len(broken_boards)
    # TODO(kevcheng): Track which boards have broken duts, we can limit the
    # number of boards we go through in the main loop with this knowledge.
    _log_info(dry_run,
              'There are %d boards in the %s pool with at least 1 '
              'broken DUT (max threshold %d)', num_of_broken_boards,
              pool, max_broken_boards)
    for broken_board in broken_boards:
        # No format arguments, so the board name is logged verbatim.
        _log_info(dry_run, broken_board)
    return num_of_broken_boards > max_broken_boards
457
458
def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the `ArgumentParser`
    `parse_args()` method.

    After parsing, combinations of arguments that make no sense are
    rejected via `ArgumentParser.error()`:  no boards without
    --all-boards, boards together with --all-boards, and a
    non-default --spare pool when balancing all critical pools.

    @param argv Standard command line argument vector; `argv[0]` is
                assumed to be the command name.

    @return Result returned by `ArgumentParser.parse_args()`.

    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Balance pool shortages from spares on reserve')

    # At most one of the pool size options may be given.
    count_group = parser.add_mutually_exclusive_group()
    count_group.add_argument('-t', '--total', type=int,
                             metavar='COUNT', default=None,
                             help='Set the number of DUTs in the '
                                  'pool to the specified count for '
                                  'every BOARD')
    count_group.add_argument('-a', '--grow', type=int,
                             metavar='COUNT', default=None,
                             help='Add the specified number of DUTs '
                                  'to the pool for every BOARD')
    count_group.add_argument('-d', '--shrink', type=int,
                             metavar='COUNT', default=None,
                             help='Remove the specified number of DUTs '
                                  'from the pool for every BOARD')

    parser.add_argument('-s', '--spare', default=_SPARE_DEFAULT,
                        metavar='POOL',
                        help='Pool from which to draw replacement '
                             'spares (default: pool:%s)' % _SPARE_DEFAULT)
    parser.add_argument('-n', '--dry-run', action='store_true',
                        help='Report actions to take in the form of '
                             'shell commands')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print more detail about calculations for debug '
                             'purposes.')

    parser.add_argument('-m', '--max-broken', default=2, type=int,
                        metavar='COUNT',
                        help='Only rebalance a pool if it has at most '
                             'COUNT broken DUTs.')
    parser.add_argument('-f', '--force-rebalance', action='store_true',
                        help='Forcefully rebalance all DUTs in a pool, even '
                             'if it has a large number of broken DUTs. '
                             'Before doing this, please investigate whether '
                             'there is a bug that is bricking devices in the '
                             'lab.')

    parser.add_argument('--all-boards', action='store_true',
                        help='Rebalance all managed boards.  This will do a '
                             'very expensive check to see how many boards have '
                             'at least one broken DUT.  To bypass that check, '
                             'set --max-broken-boards to 0.')
    # The check made with this value allows the number of boards with
    # broken DUTs to equal COUNT, hence "at most".
    parser.add_argument('--max-broken-boards',
                        default=None, type=int,
                        metavar='COUNT',
                        help='Only rebalance all boards if number of boards '
                             'with broken DUTs in the specified pool '
                             'is at most COUNT.')

    parser.add_argument('pool',
                        metavar='POOL',
                        help='Name of the pool to balance.  Use %s to balance '
                             'all critical pools' % _ALL_CRITICAL_POOLS)
    parser.add_argument('boards', nargs='*',
                        metavar='BOARD',
                        help='Names of boards to balance.')

    arguments = parser.parse_args(argv[1:])

    # Error-check arguments.
    if not arguments.boards and not arguments.all_boards:
        parser.error('No boards specified. To balance all boards, use '
                     '--all-boards')
    if arguments.boards and arguments.all_boards:
        parser.error('Cannot specify boards with --all-boards.')
    if (arguments.pool == _ALL_CRITICAL_POOLS and
            arguments.spare != _SPARE_DEFAULT):
        # Only the default spare pool is accepted in this mode.
        parser.error('Cannot specify a --spare pool other than %s when '
                     'balancing all critical pools.' % _SPARE_DEFAULT)
    return arguments
545
546
def main(argv):
    """Standard main routine.

    Parses the command line, builds the list of (board, pool) pairs
    to be balanced, and balances them in a pool of 8 processes.
    DUT status is judged from the 24 hours of history ending now.

    With --all-boards, the boards are the managed boards of each
    pool, and a pool is skipped with an error if too many of its
    boards have broken DUTs.  Otherwise, the boards named on the
    command line are balanced in every selected pool.

    @param argv  Command line arguments including `sys.argv[0]`.

    """
    def balancer(i, board, pool):
        """Balance the specified board.

        @param i The index of the board.
        @param board The board name.
        @param pool The pool to rebalance for the board.
        """
        # Separate the output for successive boards with a blank line.
        if i > 0:
            _log_message('')
        _balance_board(arguments, afe, board, pool, start_time, end_time)

    arguments = _parse_command(argv)
    seconds_per_day = 24 * 60 * 60
    end_time = time.time()
    start_time = end_time - seconds_per_day
    afe = frontend.AFE(server=None)

    if arguments.pool == _ALL_CRITICAL_POOLS:
        pools = lab_inventory.CRITICAL_POOLS
    else:
        pools = [arguments.pool]

    # The inventory is only needed (and only fetched) for --all-boards.
    inventory = None
    if arguments.all_boards:
        inventory = lab_inventory.get_inventory(afe)

    # Each entry is the argument tuple for one call to balancer();
    # the index runs continuously across all pools.
    board_info = []
    for pool in pools:
        if arguments.all_boards:
            if _too_many_broken_boards(inventory, pool, arguments):
                _log_error('Refusing to balance all boards for %s pool, '
                           'too many boards with at least 1 broken DUT '
                           'detected.', pool)
                continue
            pool_boards = inventory.get_managed_boards(pool=pool)
        else:
            pool_boards = arguments.boards
        first_index = len(board_info)
        board_info.extend(
                [(index, board, pool)
                 for index, board in enumerate(pool_boards, first_index)])

    try:
        parallel.RunTasksInProcessPool(balancer, board_info, processes=8)
    except KeyboardInterrupt:
        # An interrupt from the user just ends the run quietly.
        pass
596
597
# Run the command only when executed as a script, not when imported.
if __name__ == '__main__':
    main(sys.argv)
600