• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright 2016 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5"""
6Framework for host verification and repair in Autotest.
7
8The framework provides implementation code in support of `Host.verify()`
9and `Host.repair()` used in Verify and Repair special tasks.
10
11The framework consists of these classes:
12  * `Verifier`: A class representing a single verification check.
13  * `RepairAction`: A class representing a repair operation that can fix
14    a failed verification check.
15  * `RepairStrategy`:  A class for organizing a collection of `Verifier`
16    and `RepairAction` instances, and invoking them in order.
17
18Individual operations during verification and repair are handled by
19instances of `Verifier` and `RepairAction`.  `Verifier` objects are
20meant to test for specific conditions that may cause tests to fail.
21`RepairAction` objects provide operations designed to fix one or
22more failures identified by a `Verifier` object.
23"""
24
25import collections
26import logging
27import re
28
29import common
30from autotest_lib.client.common_lib import error
31
32try:
33    from chromite.lib import metrics
34except ImportError:
35    from autotest_lib.client.bin.utils import metrics_mock as metrics
36
# Regular expression pattern to filter out unwanted hostnames.
38_HOSTNAME_PATTERN = 'chromeos[0-9]-row[0-9]+[a-z]?-rack[0-9]+[a-z]?-host[0-9]+'
39_DISALLOWED_HOSTNAME = 'disallowed_hostname'
40
41
class AutoservVerifyError(error.AutoservError):
    """
    Generic Exception for failures from `Verifier` objects.

    Instances of this exception can be raised when a `verify()`
    method fails, if no more specific exception is available.
    """
    pass
50
51
# Record describing one failed dependency.  Instances are built in
# `_DependencyNode._verify_list()` from a failed verifier:
#   * `dependency`: the verifier's description string.
#   * `error`: the string value of the exception the verifier raised.
#   * `tag`: the verifier's tag.
_DependencyFailure = collections.namedtuple(
        '_DependencyFailure', ('dependency', 'error', 'tag'))
54
55
class AutoservVerifyDependencyError(error.AutoservError):
    """
    Exception raised for failures in dependencies.

    This exception is used to distinguish an original failure from a
    failure being passed back from a verification dependency.  That is,
    if 'B' depends on 'A', and 'A' fails, 'B' will raise this exception
    to signal that the original failure is further down the dependency
    chain.

    The `failures` argument to the constructor for this class is a set
    of instances of `_DependencyFailure`, each corresponding to one
    failed dependency:
      * The `dependency` attribute of each failure is the description
        of the failed dependency.
      * The `error` attribute of each failure is the string value of
        the exception from the failed dependency.
      * The `tag` attribute of each failure is the tag of the failed
        dependency.

    Multiple methods in this module recognize and handle this exception
    specially.

    @property failures  Set of failures passed to the constructor.
    @property _node     Instance of `_DependencyNode` reporting the
                        failed dependencies.
    """

    def __init__(self, node, failures):
        """
        Constructor for `AutoservVerifyDependencyError`.

        The exception message is the `error` text of every failure,
        joined with newlines.

        @param node       Instance of _DependencyNode reporting the
                          failed dependencies.
        @param failures   Set of failure tuples as described above.
        """
        super(AutoservVerifyDependencyError, self).__init__(
                '\n'.join(f.error for f in failures))
        self.failures = failures
        self._node = node

    def log_dependencies(self, action, deps):
        """
        Log an `AutoservVerifyDependencyError`.

        This writes a short summary of the dependency failures captured
        in this exception, using standard Python logging.

        The passed in `action` string plus `self._node.description`
        are logged at INFO level.  The `action` argument should
        introduce or describe an action relative to `self._node`.

        The passed in `deps` string and the description of each failed
        dependency in `self` are logged at DEBUG level.  The `deps`
        argument is used to introduce the various failed dependencies.

        @param action   A string mentioning the action being logged
                        relative to `self._node`.
        @param deps     A string introducing the dependencies that
                        failed.
        """
        logging.info('%s: %s', action, self._node.description)
        logging.debug('%s:', deps)
        for failure in self.failures:
            logging.debug('    %s', failure.dependency)
119
120
class AutoservRepairError(error.AutoservError):
    """
    Generic Exception for failures from `RepairAction` objects.

    Instances of this exception can be raised when a `repair()`
    method fails, if no more specific exception is available.

    @property tag   Short identifier passed to the constructor.
    """
    def __init__(self, description, tag):
        """
        @param description  Message describing the exception.
        @param tag          A short identifier used for metrics purposes.
        """
        super(AutoservRepairError, self).__init__(description)
        self.tag = tag
135
136
class _DependencyNode(object):
    """
    An object that can depend on verifiers.

    Both repair and verify operations have the notion of dependencies
    that must pass before the operation proceeds.  This class captures
    the shared behaviors required by both classes.

    @property tag               Short identifier to be used in logging.
    @property description       Text summary of this node's action, to be
                                used in debug logs.
    @property _dependency_list  Dependency pre-requisites.
    @property _record_tag       Operation tag passed to `host.record()`;
                                this is `record_type + '.' + tag`.
    """

    def __init__(self, tag, record_type, dependencies):
        """
        @param tag           Short identifier for this node.
        @param record_type   Prefix combined with `tag` to form the
                             operation tag used in status records.
        @param dependencies  Verifiers that must pass before this
                             node's operation proceeds.
        """
        self._dependency_list = dependencies
        self._tag = tag
        self._record_tag = record_type + '.' + tag

    def _record(self, host, silent, status_code, *record_args):
        """
        Log a status record for `host`.

        Call `host.record()` using the given status_code, and
        operation tag `self._record_tag`, plus any extra arguments in
        `record_args`.  Do nothing if `silent` is a true value.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        @param status_code  Value for the `status_code` parameter to
                            `host.record()`.
        @param record_args  Additional arguments to pass to
                            `host.record()`.
        """
        if not silent:
            host.record(status_code, None, self._record_tag,
                        *record_args)

    def _record_good(self, host, silent):
        """Log a 'GOOD' status line.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        """
        self._record(host, silent, 'GOOD')

    def _record_fail(self, host, silent, exc):
        """Log a 'FAIL' status line.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        @param exc          Exception describing the cause of failure.
        """
        self._record(host, silent, 'FAIL', str(exc))

    def _verify_list(self, host, verifiers, silent):
        """
        Test a list of verifiers against a given host.

        This invokes `_verify_host()` on every verifier in the given
        list.  If any verifier in the transitive closure of dependencies
        in the list fails, an `AutoservVerifyDependencyError` is raised
        containing the description of each failed verifier.  Only
        original failures are reported; verifiers that don't run due
        to a failed dependency are omitted.

        By design, original failures are logged once in `_verify_host()`
        when `verify()` originally fails.  The additional data gathered
        here is for the debug logs to indicate why a subsequent
        operation never ran.

        @param host       The host to be tested against the verifiers.
        @param verifiers  List of verifiers to be checked.
        @param silent     If true, don't log host status records.

        @raises AutoservVerifyDependencyError   Raised when at least
                        one verifier in the list has failed.
        """
        failures = set()
        for v in verifiers:
            try:
                v._verify_host(host, silent)
            except AutoservVerifyDependencyError as e:
                # `v` itself never ran; pass along the original
                # failures from further down the dependency chain.
                failures.update(e.failures)
            except Exception as e:
                # `v` is the original point of failure.
                failures.add(_DependencyFailure(v.description, str(e), v.tag))
        if failures:
            raise AutoservVerifyDependencyError(self, failures)

    def _verify_dependencies(self, host, silent):
        """
        Verify that all of this node's dependencies pass for a host.

        If any dependency fails, the failures are logged and the
        `AutoservVerifyDependencyError` is re-raised.

        @param host     The host to be verified.
        @param silent   If true, don't log host status records.
        """
        try:
            self._verify_list(host, self._dependency_list, silent)
        except AutoservVerifyDependencyError as e:
            e.log_dependencies(
                    'Skipping this operation',
                    'The following dependencies failed')
            raise

    @property
    def tag(self):
        """
        Tag for use in logging status records.

        This is a property with a short string used to identify the node
        in the 'status.log' file and during node construction.  The tag
        should contain only letters, digits, and '_' characters.  This
        tag is not used alone, but is combined with other identifiers,
        based on the operation being logged.

        @return A short identifier-like string.
        """
        return self._tag

    @property
    def description(self):
        """
        Text description of this node for log messages.

        This string will be logged with failures, and should describe
        the condition required for success.

        N.B. Subclasses are required to override this method, but we
        _don't_ raise NotImplementedError here.  Various methods fail in
        inscrutable ways if this method raises any exception, so for
        debugging purposes, it's better to return a default value.

        @return A descriptive string.
        """
        return ('Class %s fails to implement description().' %
                type(self).__name__)
276
277
class Verifier(_DependencyNode):
    """
    Abstract class embodying one verification check.

    A concrete subclass of `Verifier` provides a simple check that can
    determine a host's fitness for testing.  Failure indicates that the
    check found a problem that can cause at least one test to fail.

    `Verifier` objects are organized in a DAG identifying dependencies
    among operations.  The DAG controls ordering and prevents wasted
    effort:  If verification operation V2 requires that verification
    operation V1 pass, then a) V1 will run before V2, and b) if V1
    fails, V2 won't run at all.  The `_verify_host()` method ensures
    that all dependencies run and pass before invoking the `verify()`
    method.

    A `Verifier` object caches its result the first time it calls
    `verify()`.  Subsequent calls return the cached result, without
    re-running the check code.  The `_reverify()` method clears the
    cached result in the current node, and in all dependencies.

    Subclasses must supply these properties and methods:
      * `verify()`: This is the method to perform the actual
        verification check.
      * `description`:  A one-line summary of the verification check for
        debug log messages.

    Subclasses must override all of the above attributes; subclasses
    should not override or extend any other attributes of this class.

    The description string should be a simple sentence explaining what
    must be true for the verifier to pass.  Do not include a terminating
    period.  For example:

        Host is available via ssh

    The base class manages the following private data:
      * `_result`:  The cached result of verification.
      * `_dependency_list`:  The list of dependencies.
    Subclasses should not use these attributes.

    @property _result           Cached result of verification.
    """

    def __init__(self, tag, dependencies):
        """
        @param tag           Short identifier for this verifier.
        @param dependencies  Verifiers that must pass before this one
                             runs.
        """
        super(Verifier, self).__init__(tag, 'verify', dependencies)
        # `_result` is one of:
        #   * None:  `_verify_host()` has not run since construction or
        #     the last `_reverify()`.
        #   * False:  `_verify_host()` started, but `verify()` has not
        #     completed (e.g. a dependency failed).
        #   * True:  `verify()` passed.
        #   * an Exception instance:  `verify()` raised that exception.
        self._result = None

    def _reverify(self):
        """
        Discard cached verification results.

        Reset the cached verification result for this node, and for the
        transitive closure of all dependencies.
        """
        # A node with no cached result has nothing to discard, so the
        # walk down the dependencies stops there.
        if self._result is not None:
            self._result = None
            for v in self._dependency_list:
                v._reverify()

    def _verify_host(self, host, silent):
        """
        Determine the result of verification, and log results.

        If this verifier does not have a cached verification result,
        check dependencies, and if they pass, run `verify()`.  Log
        informational messages regarding failed dependencies.  If we
        call `verify()`, log the result in `status.log`.

        If we already have a cached result, return that result without
        logging any message.

        @param host     The host to be tested for a problem.
        @param silent   If true, don't log host status records.
        """
        if self._result is not None:
            if isinstance(self._result, Exception):
                raise self._result  # cached failure
            elif self._result:
                return              # cached success
        # Mark the node as visited before checking dependencies, so
        # that a later `_reverify()` will reset it even if `verify()`
        # never gets to run.
        self._result = False
        self._verify_dependencies(host, silent)
        logging.info('Verifying this condition: %s', self.description)
        try:
            logging.debug('Start verify task: %s.', type(self).__name__)
            self.verify(host)
            self._record_good(host, silent)
        except Exception as e:
            logging.exception('Failed: %s', self.description)
            self._result = e
            self._record_fail(host, silent, e)
            raise
        finally:
            logging.debug('Finished verify task: %s.', type(self).__name__)

        self._result = True

    def verify(self, host):
        """
        Unconditionally perform a verification check.

        This method is responsible for testing for a single problem on a
        host.  Implementations should follow these guidelines:
          * The check should find a problem that will cause testing to
            fail.
          * Verification checks on a working system should run quickly
            and should be optimized for success; a check that passes
            should finish within seconds.
          * Verification checks are not expected to have side effects,
            but may apply trivial fixes if they will finish within the
            time constraints above.

        A verification check should normally trigger a single set of
        repair actions.  If two different failures can require two
        different repairs, ideally they should use two different
        subclasses of `Verifier`.

        Implementations indicate failure by raising an exception.  The
        exception text should be a short, 1-line summary of the error.
        The text should be concise and diagnostic, as it will appear in
        `status.log` files.

        If this method finds no problems, it returns without raising any
        exception.

        Implementations should avoid most logging actions, but can log
        DEBUG level messages if they provide significant information for
        diagnosing failures.

        @param host   The host to be tested for a problem.
        """
        raise NotImplementedError('Class %s does not implement '
                                  'verify()' % type(self).__name__)
411
412
class RepairAction(_DependencyNode):
    """
    Abstract class embodying one repair procedure.

    A `RepairAction` is responsible for fixing one or more failed
    `Verifier` checks, in order to make those checks pass.

    Each repair action includes one or more verifier triggers that
    determine when the repair action should run.  A repair action
    will call its `repair()` method if one or more of its triggers
    fails.  A repair action is successful if all of its triggers pass
    after calling `repair()`.

    A `RepairAction` is a subclass of `_DependencyNode`; if any of a
    repair action's dependencies fail, the action does not check its
    triggers, and doesn't call `repair()`.

    Subclasses must supply these attributes:
      * `repair()`: This is the method to perform the necessary
        repair.  The method should avoid most logging actions, but
        can log DEBUG level messages if they provide significant
        information for diagnosing failures.
      * `description`:  A one-line summary of the repair action for
        debug log messages.

    Subclasses must override both of the above attributes and should
    not override any other attributes of this class.

    The description string should be a simple sentence explaining the
    operation that will be performed.  Do not include a terminating
    period.  For example:

        Re-install the stable build via AU

    @property _trigger_list   List of verification checks that will
                              trigger this repair when they fail.
    @property host_class      A string identifier that will be
                              used as a field to send repair metrics.
    @property status          Outcome of the most recent call to
                              `_repair_host()`; not set until that
                              method has been called.
    """

    def __init__(self, tag, dependencies, triggers, host_class):
        """
        @param tag           Short identifier for this repair action.
        @param dependencies  Verifiers that must pass before the
                             triggers are checked.
        @param triggers      Verifiers whose failure triggers
                             `repair()`.
        @param host_class    String identifier sent as a field with
                             repair metrics.
        """
        super(RepairAction, self).__init__(tag, 'repair', dependencies)
        self._trigger_list = triggers
        self._failure_modes_counter = metrics.Counter(
            'chromeos/autotest/repair/failure_modes')
        self._failure_detail_counter = metrics.Counter(
            'chromeos/autotest/repair/failure_detail')
        self.host_class = host_class

    def _record_start(self, host, silent):
        """Log a 'START' status line.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        """
        self._record(host, silent, 'START')

    def _record_end_good(self, host, silent):
        """Log an 'END GOOD' status line, and set status to 'repaired'.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        """
        self._record(host, silent, 'END GOOD')
        self.status = 'repaired'

    def _record_end_fail(self, host, silent, status, *args):
        """Log an 'END FAIL' status line, and set `self.status`.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        @param status       Value to be assigned to `self.status`.
        @param args         Extra arguments to `self._record()`
        """
        self._record(host, silent, 'END FAIL', *args)
        self.status = status

    def _send_failure_metrics(self, host, error, stage):
        """Send failure mode metrics to monarch

        @param host         Host which this RepairAction targeted to.
        @param error        An exception that was caught in
                            `_repair_host()`.
        @param stage        In which stage we caught above exception.
                            Can be one of below value:
                                'dep'    during verify dependencies
                                'pre'    during pre-repair trigger verification
                                'repair' during repair() process itself
                                'post'   during post-repair trigger verification
        """
        # N.B. Within this method the `error` parameter hides the
        # module-level `error` import.

        def get_fields(vf_tag):
            fields = {
                'ra_tag': self.tag,
                'vf_tag': vf_tag,
                'hostname': _filter_metrics_hostname(host),
                'stage': stage,
                'host_class': self.host_class
            }
            return fields

        if isinstance(error, AutoservVerifyDependencyError):
            # We'll catch all failure tags here for a dependencies error
            for f in error.failures:
                self._failure_modes_counter.increment(fields=get_fields(f.tag))
        else:
            # When there is a failure during repair, or an unknown
            # failure, there will be no Verifier, so vf_tag is set to
            # 'unknown'.
            self._failure_modes_counter.increment(fields=get_fields('unknown'))

        if stage == 'repair':
            self._send_failure_detail(error)

    def _send_failure_detail(self, error):
        """Send reason of failure inside repair() to monarch.

        The failure tag is `error.tag` for an `AutoservRepairError`,
        and 'unknown' for any other exception.

        @param error    The exception caught inside repair().
        """
        tag = error.tag if isinstance(error, AutoservRepairError) else 'unknown'
        fields = {'repair_action_tag': self.tag, 'repair_failure_tag': tag}
        self._failure_detail_counter.increment(fields=fields)

    def _repair_host(self, host, silent):
        """
        Apply this repair action if any triggers fail.

        Repair is triggered when all dependencies are successful, and at
        least one trigger fails.

        If the `repair()` method triggers, the success or failure of
        this operation is logged in `status.log` bracketed by 'START'
        and 'END' records.  Details of whether or why `repair()`
        triggered are written to the debug logs.   If repair doesn't
        trigger, nothing is logged to `status.log`.

        On return or exception, `self.status` is one of 'blocked',
        'skipped', 'repaired', 'repair_failure', 'verify_failure', or
        'unknown'.

        @param host     The host to be repaired.
        @param silent   If true, don't log host status records.

        @raises AutoservRepairError   Raised when triggers still fail
                        after `repair()` returned successfully.
        """
        # Note:  Every exit path from the method must set `self.status`.
        # There's a lot of exit paths, so be careful.
        #
        # Each handler below binds its exception to a distinct name, so
        # that the metrics always report the exception that was
        # actually caught by that handler.
        #
        # If we're blocked by a failed dependency, we exit with an
        # exception.  So set status to 'blocked' first.
        self.status = 'blocked'
        try:
            self._verify_dependencies(host, silent)
        except Exception as dep_error:
            self._send_failure_metrics(host, dep_error, 'dep')
            raise
        # This is a defensive action.  Every path below should overwrite
        # this setting, but if it doesn't, we want our status to reflect
        # a coding error.
        self.status = 'unknown'
        try:
            self._verify_list(host, self._trigger_list, silent)
        except AutoservVerifyDependencyError as trigger_error:
            trigger_error.log_dependencies(
                    'Attempting this repair action',
                    'Repairing because these triggers failed')
            self._send_failure_metrics(host, trigger_error, 'pre')
            self._record_start(host, silent)
            try:
                self.repair(host)
            except Exception as repair_error:
                logging.exception('Repair failed: %s', self.description)
                self._record_fail(host, silent, repair_error)
                self._record_end_fail(host, silent, 'repair_failure')
                self._send_failure_metrics(host, repair_error, 'repair')
                raise
            try:
                # Discard cached results so the triggers are actually
                # re-checked against the repaired host.
                for v in self._trigger_list:
                    v._reverify()
                self._verify_list(host, self._trigger_list, silent)
                self._record_end_good(host, silent)
            except AutoservVerifyDependencyError as post_error:
                post_error.log_dependencies(
                        'This repair action reported success',
                        'However, these triggers still fail')
                self._record_end_fail(host, silent, 'verify_failure')
                self._send_failure_metrics(host, post_error, 'post')
                raise AutoservRepairError(
                        'Some verification checks still fail', 'post_verify')
            except Exception as internal_error:
                # The specification for `self._verify_list()` says
                # that this can't happen; this is a defensive
                # precaution.
                self._record_end_fail(host, silent, 'unknown',
                                      'Internal error in repair')
                self._send_failure_metrics(host, internal_error, 'post')
                raise
        else:
            self.status = 'skipped'
            logging.info('No failed triggers, skipping repair:  %s',
                         self.description)

    def repair(self, host):
        """
        Apply this repair action to the given host.

        This method is responsible for applying changes to fix failures
        in one or more verification checks.  The repair is considered
        successful if the DUT passes the specific checks after this
        method completes.

        Implementations indicate failure by raising an exception.  The
        exception text should be a short, 1-line summary of the error.
        The text should be concise and diagnostic, as it will appear in
        `status.log` files.

        If this method completes successfully, it returns without
        raising any exception.

        Implementations should avoid most logging actions, but can log
        DEBUG level messages if they provide significant information for
        diagnosing failures.

        @param host   The host to be repaired.
        """
        raise NotImplementedError('Class %s does not implement '
                                  'repair()' % type(self).__name__)
634
635
class _RootVerifier(Verifier):
    """
    Utility class used by `RepairStrategy`.

    A node of this class by itself does nothing; it always passes (if it
    can run).  This class exists merely to be the root of a DAG of
    dependencies in an instance of `RepairStrategy`.
    """

    def verify(self, host):
        """
        Do nothing; the root node has no check of its own.

        @param host   Unused.
        """
        pass

    @property
    def description(self):
        """
        Text description of the root node for log messages.

        @return A descriptive string.
        """
        return 'All host verification checks pass'
651
652
653class RepairStrategy(object):
654    """
655    A class for organizing `Verifier` and `RepairAction` objects.
656
657    An instance of `RepairStrategy` is organized as a DAG of `Verifier`
658    objects, plus a list of `RepairAction` objects.  The class provides
659    methods for invoking those objects in the required order, when
660    needed:
661      * The `verify()` method walks the verifier DAG in dependency
662        order.
663      * The `repair()` method invokes the repair actions in list order.
664        Each repair action will invoke its dependencies and triggers as
665        needed.
666
667    # The Verifier DAG
    The verifier DAG is constructed from the first argument passed to
    the `RepairStrategy` constructor.  That argument is an
670    iterable consisting of three-element tuples in the form
671    `(constructor, tag, deps)`:
672      * The `constructor` value is a callable that creates a `Verifier`
673        as for the interface of the class constructor.  For classes
674        that inherit the default constructor from `Verifier`, this can
675        be the class itself.
676      * The `tag` value is the tag to be associated with the constructed
677        verifier.
678      * The `deps` value is an iterable (e.g. list or tuple) of strings.
679        Each string corresponds to the `tag` member of a `Verifier`
680        dependency.
681
682    The tag names of verifiers in the constructed DAG must all be
683    unique.  The tag name defined by `RepairStrategy.ROOT_TAG` is
684    reserved and may not be used by any verifier.
685
686    In the input data for the constructor, dependencies must appear
687    before the nodes that depend on them.  Thus:
688
689        ((A, 'a', ()), (B, 'b', ('a',)))     # This is valid
690        ((B, 'b', ('a',)), (A, 'a', ()))     # This will fail!
691
    Internally, the DAG of verifiers is given a unique root node.  So,
693    given this input:
694
695        ((C, 'c', ()),
696         (A, 'a', ('c',)),
697         (B, 'b', ('c',)))
698
699    The following DAG is constructed:
700
701          Root
702          /  \
703         A    B
704          \  /
705           C
706
707    Since nothing depends on `A` or `B`, the root node guarantees that
708    these two verifiers will both be called and properly logged.
709
710    The root node is not directly accessible; however repair actions can
711    trigger on it by using `RepairStrategy.ROOT_TAG`.  Additionally, the
712    node will be logged in `status.log` whenever `verify()` succeeds.
713
714    # The Repair Actions List
    The list of repair actions is constructed from the second argument
    passed to the `RepairStrategy` constructor.  That
717    argument is an iterable consisting of four-element tuples in the
718    form `(constructor, tag, deps, triggers)`:
719      * The `constructor` value is a callable that creates a
720        `RepairAction` as for the interface of the class constructor.
721        For classes that inherit the default constructor from
722        `RepairAction`, this can be the class itself.
723      * The `tag` value is the tag to be associated with the constructed
724        repair action.
725      * The `deps` value is an iterable (e.g. list or tuple) of strings.
726        Each string corresponds to the `tag` member of a `Verifier` that
727        the repair action depends on.
728      * The `triggers` value is an iterable (e.g. list or tuple) of
729        strings.  Each string corresponds to the `tag` member of a
730        `Verifier` that can trigger the repair action.
731
732    `RepairStrategy` deps and triggers can only refer to verifiers,
733    not to other repair actions.
734    """
735
736    # This name is reserved; clients may not use it.
737    ROOT_TAG = 'PASS'
738
739    @staticmethod
740    def _add_verifier(verifiers, constructor, tag, dep_tags):
741        """
742        Construct and remember a verifier.
743
744        Create a `Verifier` using `constructor` and `tag`.  Dependencies
745        for construction are found by looking up `dep_tags` in the
746        `verifiers` dictionary.
747
748        After construction, the new verifier is added to `verifiers`.
749
750        @param verifiers    Dictionary of verifiers, indexed by tag.
751        @param constructor  Verifier construction function.
752        @param tag          Tag parameter for the construction function.
753        @param dep_tags     Tags of dependencies for the constructor, to
754                            be found in `verifiers`.
755        """
756        assert tag not in verifiers
757        deps = [verifiers[d] for d in dep_tags]
758        verifiers[tag] = constructor(tag, deps)
759
760    def __init__(self, verifier_data, repair_data, host_class):
761        """
762        Construct a `RepairStrategy` from simplified DAG data.
763
764        The input `verifier_data` object describes how to construct
765        verify nodes and the dependencies that relate them, as detailed
766        above.
767
768        The input `repair_data` object describes how to construct repair
769        actions and their dependencies and triggers, as detailed above.
770
771        @param verifier_data  Iterable value with constructors for the
772                              elements of the verification DAG and their
773                              dependencies.
774        @param repair_data    Iterable value with constructors for the
775                              elements of the repair action list, and
776                              their dependencies and triggers.
777        @property host_class  A string identifier that identify what
778                              class of host this repair strategy target
779                              on, will be used as a field to send repair
780                              metrics.
781        """
782        # Metrics - we report on 'actions' for every repair action
783        # we execute; we report on 'strategy' for every complete
784        # repair operation.
785        self._strategy_counter = metrics.Counter(
786            'chromeos/autotest/repair/repair_strategy_v2')
787        self._actions_counter = metrics.Counter(
788            'chromeos/autotest/repair/repair_actions')
789        self.host_class = host_class
790        # We use the `all_verifiers` list to guarantee that our root
791        # verifier will execute its dependencies in the order provided
792        # to us by our caller.
793        verifier_map = {}
794        all_tags = []
795        dependencies = set()
796        for constructor, tag, deps in verifier_data:
797            self._add_verifier(verifier_map, constructor, tag, deps)
798            dependencies.update(deps)
799            all_tags.append(tag)
800        # Capture all the verifiers that have nothing depending on them.
801        root_tags = [t for t in all_tags if t not in dependencies]
802        self._add_verifier(verifier_map, _RootVerifier,
803                           self.ROOT_TAG, root_tags)
804        self._verify_root = verifier_map[self.ROOT_TAG]
805        self._repair_actions = []
806        for constructor, tag, deps, triggers in repair_data:
807            r = constructor(tag,
808                            [verifier_map[d] for d in deps],
809                            [verifier_map[t] for t in triggers],
810                            self.host_class)
811            self._repair_actions.append(r)
812
813    def _send_strategy_metrics(self, host, result):
814        """Send repair strategy metrics to monarch
815
816        @param host     The target to be repaired.
817        @param result   A String that describe a final result for the
818                        RepairStrategy.
819        """
820        info = host.host_info_store.get()
821        board = info.board if info.board else 'unknown'
822        model = info.model if info.model else 'unknown'
823        fields = {
824            'board': board,
825            'host_class': self.host_class,
826            'hostname': _filter_metrics_hostname(host),
827            'model': model,
828            'result': result,
829        }
830        self._strategy_counter.increment(fields=fields)
831
832    def _send_action_metrics(self, host, ra):
833        """Send repair action metrics to monarch
834
835        @param host     The target to be repaired.
836        @param ra       an RepairAction instance.
837        """
838        fields = {
839            'tag': ra.tag,
840            'status': ra.status,
841            'hostname': _filter_metrics_hostname(host),
842            'host_class': self.host_class
843        }
844        self._actions_counter.increment(fields=fields)
845
846    def verify(self, host, silent=False):
847        """
848        Run the verifier DAG on the given host.
849
850        @param host     The target to be verified.
851        @param silent   If true, don't log host status records.
852        """
853        self._verify_root._reverify()
854        self._verify_root._verify_host(host, silent)
855
856    def repair(self, host, silent=False):
857        """
858        Run the repair list on the given host.
859
860        @param host     The target to be repaired.
861        @param silent   If true, don't log host status records.
862        """
863        self._verify_root._reverify()
864        attempted = False
865        for ra in self._repair_actions:
866            try:
867                logging.debug('Start repair task: %s.', type(ra).__name__)
868                ra._repair_host(host, silent)
869            except Exception as e:
870                # all logging and exception handling was done at
871                # lower levels
872                pass
873            finally:
874                self._send_action_metrics(host, ra)
875                logging.debug('Finished repair task: %s.', type(ra).__name__)
876                if ra.status not in ('skipped', 'blocked'):
877                    attempted = True
878
879        result = 'failure'
880        try:
881            self._verify_root._verify_host(host, silent)
882            result = 'success' if attempted else 'not_attempted'
883        except:
884            if not attempted:
885                result = 'attempt_blocked'
886            raise
887        finally:
888            self._send_strategy_metrics(host, result)
889
890
def _filter_metrics_hostname(host):
    """
    Restrict the format of hostnames we'll send to monarch.

    The hostname is returned unchanged when it starts with text
    matching `_HOSTNAME_PATTERN`; anything else is replaced with
    `_DISALLOWED_HOSTNAME`.

    @param host    A host instance (e.g. ServoHost, CrosHost).

    @return The hostname to report in metrics.
    """
    hostname = host.hostname
    # `re.match` anchors only at the start of the string, so a
    # hostname with extra text after the matching prefix is accepted.
    if re.match(_HOSTNAME_PATTERN, hostname) is None:
        return _DISALLOWED_HOSTNAME
    return hostname
901
902