• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python
2# Copyright 2015 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Adjust pool balances to cover DUT shortfalls.
7
8This command takes all broken DUTs in a specific pool for specific
9models and swaps them with working DUTs taken from a selected pool
10of spares.  The command is meant primarily for replacing broken DUTs
11in critical pools like BVT or CQ, but it can also be used to adjust
12pool sizes, or to create or remove pools.
13
14usage:  balance_pool.py [ options ] POOL MODEL [ MODEL ... ]
15
16positional arguments:
17  POOL                  Name of the pool to balance
18  MODEL                 Names of models to balance
19
20optional arguments:
21  -h, --help            show this help message and exit
22  -t COUNT, --total COUNT
23                        Set the number of DUTs in the pool to the specified
24                        count for every MODEL
25  -a COUNT, --grow COUNT
26                        Add the specified number of DUTs to the pool for every
27                        MODEL
28  -d COUNT, --shrink COUNT
29                        Remove the specified number of DUTs from the pool for
30                        every MODEL
31  -s POOL, --spare POOL
32                        Pool from which to draw replacement spares (default:
33                        pool:suites)
34  --sku SKU             The specific SKU we intend to swap with
35  -n, --dry-run         Report actions to take in the form of shell commands
36
37
38The command attempts to remove all broken DUTs from the target POOL
39for every MODEL, and replace them with enough working DUTs taken
40from the spare pool to bring the strength of POOL to the requested
41total COUNT.
42
43If no COUNT options are supplied (i.e. there are no --total, --grow,
44or --shrink options), the command will maintain the current totals of
45DUTs for every MODEL in the target POOL.
46
47If not enough working spares are available, broken DUTs may be left
48in the pool to keep the pool at the target COUNT.
49
50When reducing pool size, working DUTs will be returned after broken
51DUTs, if it's necessary to achieve the target COUNT.
52
53"""
54
55
56import argparse
57import sys
58import time
59
60import common
61from autotest_lib.server import constants
62from autotest_lib.server import frontend
63from autotest_lib.server import site_utils
64from autotest_lib.server.lib import status_history
65from autotest_lib.site_utils import lab_inventory
66from autotest_lib.utils import labellib
67from chromite.lib import metrics
68from chromite.lib import parallel
69
70#This must be imported after chromite.lib.metrics
71from infra_libs import ts_mon
72
73_POOL_PREFIX = constants.Labels.POOL_PREFIX
# Fraction of the models in a pool that is used to compute the default
# limit on models with broken DUTs when --max-broken-models is not
# given.  It seemed like the best choice that was neither too strict
# nor too lax.
77_MAX_BROKEN_DEFAULT_RATIO = 3.0 / 8.0
78
79_ALL_CRITICAL_POOLS = 'all_critical_pools'
80_SPARE_DEFAULT = lab_inventory.SPARE_POOL
81
82
def _log_message(message, *args):
    """Write one unadorned line to stdout.

    The line written is `message` followed by a newline.  When any
    `args` are given, `message` is first treated as a %-format string
    and formatted with them; otherwise it is written verbatim.

    @param message  Text to be logged, or a %-format string when
                    `args` is non-empty.
    @param args     Optional format arguments for `message`.

    """
    line = message % args if args else message
    sys.stdout.write('%s\n' % line)
100
101
def _log_info(dry_run, message, *args):
    """Log an informational line, as a shell comment on dry runs.

    The line goes to stdout via `_log_message()`.  For a dry run the
    message is prefixed with '# ' so that it reads as a comment among
    the shell commands printed by the dry run; otherwise it is logged
    unchanged.

    @param dry_run  Whether the logging is for a dry run.
    @param message  Text to be logged, or a %-format string when
                    `args` is non-empty.
    @param args     Optional format arguments for `message`.

    """
    prefix = '# ' if dry_run else ''
    _log_message(prefix + message, *args)
120
121
def _log_error(message, *args):
    """Write one line to stderr, marked as an error.

    The line written is 'ERROR: ' followed by `message` and a
    newline.  When any `args` are given, `message` is first treated as
    a %-format string and formatted with them.

    @param message  Text to be logged, or a %-format string when
                    `args` is non-empty.
    @param args     Optional format arguments for `message`.

    """
    text = message % args if args else message
    sys.stderr.write('ERROR: %s\n' % text)
139
140
class _DUTPool(object):
    """Snapshot of the DUTs in one pool that match a set of labels.

    On construction, the DUTs carrying the pool's label plus the given
    labels are looked up and sorted into three lists:
      + Working - the DUT's last diagnosis is working, and it is not
            locked.
      + Broken - the DUT is unable to run tests, or it is locked.
      + Ineligible - the DUT may not be removed from this pool.  The
            DUT may be either working or broken.

    A DUT is ineligible when it does not carry exactly one `pool:`
    label.  This is done for the sake of chameleon hosts, which must
    always be assigned to pool:suites.  These DUTs are always marked
    with pool:chameleon to prevent their reassignment.

    TODO(jrbarnette):  The use of `pool:chameleon` (instead of just
    the `chameleon` label) is a hack that should be eliminated.

    _DUTPool instances are used to track both main pools that need
    to be resupplied with working DUTs and spare pools that supply
    those DUTs.

    @property pool                Name of the pool associated with
                                  this pool of DUTs.
    @property labels              Labels that constrain the DUTs to
                                  consider, including the pool itself.
    @property working_hosts       The list of this pool's working DUTs.
    @property broken_hosts        The list of this pool's broken DUTs.
    @property ineligible_hosts    The list of this pool's ineligible DUTs.
    @property pool_labels         A list of labels that identify a DUT
                                  as part of this pool.
    @property total_hosts         The total number of hosts in pool,
                                  ineligible ones included.

    """

    def __init__(self, afe, pool, labels, start_time, end_time):
        self.pool = pool
        self._pool_labels = [_POOL_PREFIX + pool]

        # The pool is folded into the label mapping so that the host
        # query in _get_hosts() is restricted to this pool.
        self.labels = labellib.LabelsMapping(labels)
        self.labels['pool'] = pool

        self.working_hosts = []
        self.broken_hosts = []
        self.ineligible_hosts = []
        self.total_hosts = self._get_hosts(afe, start_time, end_time)


    def _get_hosts(self, afe, start_time, end_time):
        """Query the pool's DUTs and sort them into the host lists.

        @param afe         AFE object used for the query.
        @param start_time  Start time for the HostJobHistory objects.
        @param end_time    End time for the HostJobHistory objects.

        @return The total number of DUTs found.

        """
        histories = status_history.HostJobHistory.get_multiple_histories(
                afe, start_time, end_time, self.labels.getlabels())
        for history in histories:
            dut = history.host
            pool_label_count = sum(
                    1 for label in dut.labels
                    if label.startswith(_POOL_PREFIX))
            if pool_label_count != 1:
                self.ineligible_hosts.append(dut)
                continue
            is_working = (
                    history.last_diagnosis()[0] == status_history.WORKING
                    and not dut.locked)
            if is_working:
                self.working_hosts.append(dut)
            else:
                self.broken_hosts.append(dut)
        return len(histories)


    @property
    def pool_labels(self):
        """Return the AFE labels that identify this pool.

        These are the labels to remove in order to take a DUT out of
        the pool, or to add in order to put a DUT into it.

        @return A list of AFE labels suitable for AFE.add_labels()
                or AFE.remove_labels().

        """
        return self._pool_labels

    def calculate_spares_needed(self, target_total):
        """Calculate and log the spares needed to achieve a target.

        Return how many working spares are needed to achieve the
        given `target_total` with all DUTs working.

        The result may be positive or negative.  A positive value is
        the number of spares needed to replace broken DUTs in order
        to reach the target; a negative value means no spares are
        needed, and that many working devices can be returned.

        If `target_total` is smaller than the number of ineligible
        DUTs, an error is logged and the target is raised to that
        number, so that ineligible DUTs are never exchanged.  The
        outcome of that check is also reported through the
        `exhausted_pools` metric.

        @param target_total  The new target pool size.

        @return The number of spares needed.

        """
        reserved = len(self.ineligible_hosts)
        target_ok = target_total >= reserved
        model = self.labels.get('model', '')
        exhausted_metric = metrics.Boolean(
            'chromeos/autotest/balance_pools/exhausted_pools',
            'True for each pool/model which requests more DUTs than supplied',
            # TODO(jrbarnette) The 'board' field is a legacy.  We need
            # to leave it here until we do the extra work Monarch
            # requires to delete a field.
            field_spec=[
                    ts_mon.StringField('pool'),
                    ts_mon.StringField('board'),
                    ts_mon.StringField('model'),
            ])
        exhausted_metric.set(
                not target_ok,
                fields={
                        'pool': self.pool,
                        'board': model,
                        'model': model,
                },
        )
        if target_ok:
            _log_message('%s %s pool: Target of %d is above minimum.',
                         model, self.pool, target_total)
        else:
            _log_error(
                    '%s pool (%s): Target of %d is below minimum of %d DUTs.',
                    self.pool, self.labels, target_total, reserved,
            )
            _log_error('Adjusting target to %d DUTs.', reserved)
            target_total = reserved
        # Every broken DUT needs a replacement, plus or minus whatever
        # it takes to move the pool from its current size to the target.
        return len(self.broken_hosts) + target_total - self.total_hosts

    def allocate_surplus(self, num_broken):
        """Allocate a list of DUTs that can be returned as surplus.

        Return the devices to give back in order to reduce this
        pool's supply.  Broken DUTs are preferred over working ones.

        `num_broken` is the number of broken DUTs to be left in the
        pool.  If it is at least the number of broken DUTs actually in
        the pool, the result is empty.  If it is negative, its
        magnitude is the number of working DUTs to return in addition
        to all of the broken ones.

        @param num_broken    Total number of broken DUTs to be left in
                             this pool.

        @return A list of DUTs to be returned as surplus.

        """
        if num_broken < 0:
            extra_working = self.working_hosts[:-num_broken]
            return self.broken_hosts + extra_working
        return self.broken_hosts[num_broken:]
299
300
def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
    """Reassign a list of DUTs from one pool to another.

    For each of the given hosts, remove the labels that identify
    `spare_pool` and add the labels that identify `target_pool`; that
    is, the hosts move out of `spare_pool` and into `target_pool`.

    If `dry_run` is true, perform no changes, but log the `atest`
    commands needed to accomplish the necessary label changes.

    The transfer is logged and counted in the `duts_moved` metric
    even when `hosts` is empty.

    @param dry_run       Whether the logging is for a dry run or
                         for actual execution.
    @param hosts         List of DUTs (AFE hosts) to be reassigned.
    @param target_pool   The `_DUTPool` object whose labels are added
                         to the hosts (the destination).
    @param spare_pool    The `_DUTPool` object whose labels are removed
                         from the hosts (the source).

    """
    host_count = len(hosts)
    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
              host_count, spare_pool.pool, target_pool.pool)

    model = target_pool.labels.get('model', '')
    moved_counter = metrics.Counter(
        'chromeos/autotest/balance_pools/duts_moved',
        'DUTs transferred between pools',
        # TODO(jrbarnette) The 'board' field is a legacy.  We need to
        # leave it here until we do the extra work Monarch requires to
        # delete a field.
        field_spec=[
                ts_mon.StringField('board'),
                ts_mon.StringField('model'),
                ts_mon.StringField('source_pool'),
                ts_mon.StringField('target_pool'),
        ]
    )
    moved_counter.increment_by(
            host_count,
            fields={
                    'board': model,
                    'model': model,
                    'source_pool': spare_pool.pool,
                    'target_pool': target_pool.pool,
            },
    )
    if not hosts:
        return

    labels_to_add = target_pool.pool_labels
    labels_to_remove = spare_pool.pool_labels
    for host in hosts:
        if dry_run:
            # Dry run: print the equivalent commands instead of acting.
            _log_message('atest label remove -m %s %s',
                         host.hostname, ' '.join(labels_to_remove))
            _log_message('atest label add -m %s %s',
                         host.hostname, ' '.join(labels_to_add))
            continue
        _log_message('Updating host: %s.', host.hostname)
        host.remove_labels(labels_to_remove)
        host.add_labels(labels_to_add)
357
358
def _balance_model(arguments, afe, pool, labels, start_time, end_time):
    """Balance one model as requested by command line arguments.

    Works out the target size of the pool from `--total`, `--grow` or
    `--shrink` (defaulting to the current size), picks the working
    spares to bring in and the DUTs to send back, logs the plan, and
    then exchanges the pool labels.  If the pool has more broken DUTs
    than `--max-broken` allows and `--force-rebalance` was not given,
    the refusal is logged and no DUTs are exchanged.

    @param arguments     Parsed command line arguments.
    @param afe           AFE object to be used for the changes.
    @param pool          Pool of the model to be balanced.
    @param labels        Restrict the balancing operation within DUTs
                         that have these labels.
    @param start_time    Start time for HostJobHistory objects in
                         the DUT pools.
    @param end_time      End time for HostJobHistory objects in the
                         DUT pools.

    """
    spare_pool = _DUTPool(afe, arguments.spare, labels, start_time, end_time)
    main_pool = _DUTPool(afe, pool, labels, start_time, end_time)

    if arguments.total is not None:
        target_total = arguments.total
    elif arguments.grow:
        target_total = main_pool.total_hosts + arguments.grow
    elif arguments.shrink:
        target_total = main_pool.total_hosts - arguments.shrink
    else:
        target_total = main_pool.total_hosts

    spares_needed = main_pool.calculate_spares_needed(target_total)
    if spares_needed > 0:
        spare_duts = spare_pool.working_hosts[:spares_needed]
    else:
        spare_duts = []
    # Non-negative: broken DUTs that must stay for lack of spares.
    # Negative: working DUTs to hand back on top of the broken ones.
    shortfall = spares_needed - len(spare_duts)

    surplus_duts = main_pool.allocate_surplus(shortfall)

    dry_run = arguments.dry_run
    broken_count = len(main_pool.broken_hosts)

    if spares_needed or surplus_duts or arguments.verbose:
        _log_message('')

        _log_info(dry_run, 'Balancing %s %s pool:', labels, main_pool.pool)
        _log_info(dry_run,
                  'Total %d DUTs, %d working, %d broken, %d reserved.',
                  main_pool.total_hosts, len(main_pool.working_hosts),
                  broken_count, len(main_pool.ineligible_hosts))

        if spares_needed > 0:
            size_change = 'grow pool by %d DUTs' % spares_needed
        elif spares_needed < 0:
            size_change = 'shrink pool by %d DUTs' % -spares_needed
        else:
            size_change = 'no change to pool size'
        _log_info(dry_run, 'Target is %d working DUTs; %s.',
                  target_total, size_change)

        _log_info(dry_run,
                  '%s %s pool has %d spares available for balancing pool %s',
                  labels, spare_pool.pool, len(spare_pool.working_hosts),
                  main_pool.pool)

        if spares_needed > len(spare_duts):
            _log_error('Not enough spares: need %d, only have %d.',
                       spares_needed, len(spare_duts))
        elif shortfall >= 0:
            _log_info(dry_run,
                      '%s %s pool will return %d broken DUTs, '
                      'leaving %d still in the pool.',
                      labels, main_pool.pool,
                      len(surplus_duts),
                      broken_count - len(surplus_duts))
        else:
            _log_info(dry_run,
                      '%s %s pool will return %d surplus DUTs, '
                      'including %d working DUTs.',
                      labels, main_pool.pool,
                      broken_count - shortfall,
                      -shortfall)

    refuse = (broken_count > arguments.max_broken and
              not arguments.force_rebalance)
    if refuse:
        _log_error('%s %s pool: Refusing to act on pool with %d broken DUTs.',
                   labels, main_pool.pool, broken_count)
        for fragment in (
                'Please investigate this model to for a bug ',
                'that is bricking devices. Once you have finished your ',
                'investigation, you can force a rebalance with ',
                '--force-rebalance'):
            _log_error(fragment)
        # Empty both lists so that the exchanges below move nothing.
        spare_duts = []
        surplus_duts = []

    if arguments.verbose and not (spare_duts or surplus_duts):
        _log_info(dry_run, 'No exchange required.')

    # Both exchanges run even with empty lists; each call still logs
    # the transfer and reports it to the metrics.
    _exchange_labels(dry_run, surplus_duts, spare_pool, main_pool)
    _exchange_labels(dry_run, spare_duts, main_pool, spare_pool)
455
456
def _too_many_broken(inventory, pool, args):
    """
    Check whether too many models in a pool have broken DUTs.

    The check is skipped (and False returned) when `--force-rebalance`
    was given, or when `--all-models` was given together with
    `--max-broken-models 0`.  When `--max-broken-models` was not given
    at all, the limit defaults to `_MAX_BROKEN_DEFAULT_RATIO` of the
    number of models in the pool, rounded down.

    @param inventory: _LabInventory object.
    @param pool: The pool to check.
    @param args: Parsed command line arguments.

    @return True if the number of models with 1 or more broken duts
            exceed max_broken_models, False otherwise.
    """
    check_disabled = (
            args.force_rebalance or
            (args.all_models and args.max_broken_models == 0))
    if check_disabled:
        return False

    threshold = args.max_broken_models
    if threshold is None:
        model_count = len(inventory.get_pool_models(pool))
        threshold = int(_MAX_BROKEN_DEFAULT_RATIO * model_count)
    _log_info(args.dry_run,
              'Max broken models for pool %s: %d',
              pool, threshold)

    broken_models = []
    for model, counts in inventory.iteritems():
        if counts.get_broken(pool) != 0:
            broken_models.append(model)
    _log_message('There are %d models in the %s pool with at least 1 '
                 'broken DUT (max threshold %d)',
                 len(broken_models), pool, threshold)
    for name in sorted(broken_models):
        _log_message(name)
    return len(broken_models) > threshold
489
490
def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the `ArgumentParser`
    `parse_args()` method.

    After parsing, two combinations are rejected with a usage error:
    naming individual models together with `--all-models`, and
    choosing a non-default `--spare` pool while balancing all critical
    pools.

    @param argv Standard command line argument vector; `argv[0]` is
                assumed to be the command name.

    @return Result returned by `ArgumentParser.parse_args()`.

    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Balance pool shortages from spares on reserve')

    parser.add_argument(
        '-w', '--web', type=str, default=None,
        help='AFE host to use. Default comes from shadow_config.',
    )

    # At most one of the pool-size options may be given.
    count_group = parser.add_mutually_exclusive_group()
    count_group.add_argument('-t', '--total', type=int,
                             metavar='COUNT', default=None,
                             help='Set the number of DUTs in the '
                                  'pool to the specified count for '
                                  'every MODEL')
    count_group.add_argument('-a', '--grow', type=int,
                             metavar='COUNT', default=None,
                             help='Add the specified number of DUTs '
                                  'to the pool for every MODEL')
    count_group.add_argument('-d', '--shrink', type=int,
                             metavar='COUNT', default=None,
                             help='Remove the specified number of DUTs '
                                  'from the pool for every MODEL')

    parser.add_argument('-s', '--spare', default=_SPARE_DEFAULT,
                        metavar='POOL',
                        help='Pool from which to draw replacement '
                             'spares (default: pool:%s)' % _SPARE_DEFAULT)
    parser.add_argument('-n', '--dry-run', action='store_true',
                        help='Report actions to take in the form of '
                             'shell commands')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print more detail about calculations for debug '
                             'purposes.')

    parser.add_argument('-m', '--max-broken', default=2, type=int,
                        metavar='COUNT',
                        help='Only rebalance a pool if it has at most '
                             'COUNT broken DUTs.')
    parser.add_argument('-f', '--force-rebalance', action='store_true',
                        help='Forcefully rebalance all DUTs in a pool, even '
                             'if it has a large number of broken DUTs. '
                             'Before doing this, please investigate whether '
                             'there is a bug that is bricking devices in the '
                             'lab.')
    parser.add_argument('--production', action='store_true',
                        help='Treat this as a production run. This will '
                             'collect metrics.')

    parser.add_argument(
            '--all-models',
            action='store_true',
            help='Rebalance all managed models.  This will do a very expensive '
                 'check to see how many models have at least one broken DUT. '
                 'To bypass that check, set --max-broken-models to 0.',
    )
    # The help says "at most" because the check in _too_many_broken()
    # only refuses when the number of such models is greater than COUNT.
    parser.add_argument(
            '--max-broken-models', default=None, type=int, metavar='COUNT',
            help='Only rebalance all models if number of models with broken '
                 'DUTs in the specified pool is at most COUNT.',
    )

    parser.add_argument('pool',
                        metavar='POOL',
                        help='Name of the pool to balance.  Use %s to balance '
                             'all critical pools' % _ALL_CRITICAL_POOLS)
    parser.add_argument('models', nargs='*', metavar='MODEL',
                        help='Names of models to balance.')

    parser.add_argument('--sku', type=str,
                        help='Optional name of sku to restrict to.')

    arguments = parser.parse_args(argv[1:])

    # Error-check arguments.
    if arguments.models and arguments.all_models:
        parser.error('Cannot specify individual models on the command line '
                     'when using --all-models.')
    if (arguments.pool == _ALL_CRITICAL_POOLS and
            arguments.spare != _SPARE_DEFAULT):
        # Only the default spare pool is accepted in this mode, so the
        # message names the one value that is allowed.
        parser.error('Cannot specify a --spare pool other than %s when '
                     'balancing all critical pools.' % _SPARE_DEFAULT)
    return arguments
586
587
def infer_balancer_targets(afe, arguments, pools):
    """Take some arguments and translate them to a list of models to balance

    With `--all-models`, every model that the lab inventory reports
    for a pool becomes a target, unless too many models in that pool
    have broken DUTs; in that case the pool is skipped and an error is
    logged.  The outcome of that check is reported in the
    `unchanged_pools` metric and logged.  Without `--all-models`, the
    targets are the models named on the command line, optionally
    restricted to `--sku`.

    The lab inventory is fetched at most once and shared by all of
    the pools, since taking it is expensive.

    @param afe           AFE object to be used for taking inventory.
    @param arguments     Parsed command line arguments.
    @param pools         The list of pools to balance.

    @returns    a list of (pool, labels) tuples to be balanced

    """
    balancer_targets = []
    inventory = None

    for pool in pools:
        if arguments.all_models:
            if inventory is None:
                inventory = lab_inventory.get_inventory(afe)
            quarantine = _too_many_broken(inventory, pool, arguments)
            if quarantine:
                _log_error('Refusing to balance all models for %s pool, '
                           'too many models with at least 1 broken DUT '
                           'detected.', pool)
            else:
                for model in inventory.get_models(pool):
                    labels = labellib.LabelsMapping()
                    labels['model'] = model
                    balancer_targets.append((pool, labels.getlabels()))
            metrics.Boolean(
                'chromeos/autotest/balance_pools/unchanged_pools').set(
                    quarantine, fields={'pool': pool})
            _log_message('Pool %s quarantine status: %s', pool, quarantine)
        else:
            for model in arguments.models:
                labels = labellib.LabelsMapping()
                labels['model'] = model
                if arguments.sku:
                    labels['sku'] = arguments.sku
                balancer_targets.append((pool, labels.getlabels()))
    return balancer_targets
626
627
def main(argv):
    """Standard main routine.

    Parses the command line, works out which (pool, labels) targets
    to balance, and balances them in a pool of worker processes.
    Metrics are set up only for `--production` runs, and are flushed
    once the workers finish or the run is interrupted.

    @param argv  Command line arguments including `sys.argv[0]`.

    """
    arguments = _parse_command(argv)
    if arguments.production:
        metrics_manager = site_utils.SetupTsMonGlobalState(
                'balance_pools',
                indirect=False,
                auto_flush=False,
        )
    else:
        metrics_manager = site_utils.TrivialContextManager()

    with metrics_manager:
        # DUT status is judged from the job history of the last 24 hours.
        seconds_per_day = 24 * 60 * 60
        end_time = time.time()
        start_time = end_time - seconds_per_day
        afe = frontend.AFE(server=arguments.web)

        def balancer(pool, labels):
            """Balance the specified model.

            @param pool: The pool to rebalance for the model.
            @param labels: labels to restrict to balancing operations
                    within.
            """
            _balance_model(arguments, afe, pool, labels,
                           start_time, end_time)
            _log_message('')

        if arguments.pool == _ALL_CRITICAL_POOLS:
            pools = lab_inventory.CRITICAL_POOLS
        else:
            pools = [arguments.pool]
        balancer_targets = infer_balancer_targets(afe, arguments, pools)
        try:
            parallel.RunTasksInProcessPool(
                    balancer,
                    balancer_targets,
                    processes=8,
            )
        except KeyboardInterrupt:
            # An interrupt ends the run quietly; metrics are still
            # flushed below.
            pass
        finally:
            metrics.Flush()
674
675
676if __name__ == '__main__':
677    main(sys.argv)
678