• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright 2016 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5"""
6Framework for host verification and repair in Autotest.
7
8The framework provides implementation code in support of `Host.verify()`
9and `Host.repair()` used in Verify and Repair special tasks.
10
11The framework consists of these classes:
12  * `Verifier`: A class representing a single verification check.
13  * `RepairAction`: A class representing a repair operation that can fix
14    a failed verification check.
15  * `RepairStrategy`:  A class for organizing a collection of `Verifier`
16    and `RepairAction` instances, and invoking them in order.
17
18Individual operations during verification and repair are handled by
19instances of `Verifier` and `RepairAction`.  `Verifier` objects are
20meant to test for specific conditions that may cause tests to fail.
21`RepairAction` objects provide operations designed to fix one or
22more failures identified by a `Verifier` object.
23"""
24
25import collections
26import logging
27import re
28
29import common
30from autotest_lib.client.common_lib import error
31
32try:
33    from chromite.lib import metrics
34except ImportError:
35    from autotest_lib.client.bin.utils import metrics_mock as metrics
36
# Regular expression pattern used to filter out unwanted hostnames.
_HOSTNAME_PATTERN = 'chromeos[0-9]-row[0-9]+[a-z]?-rack[0-9]+[a-z]?-host[0-9]+'
# NOTE(review): presumably the placeholder reported in place of a hostname
# that does not match the pattern above; the code consuming it is outside
# this view -- confirm.
_DISALLOWED_HOSTNAME = 'disallowed_hostname'
40
# States of verifiers:
#   True  - verifier ran and passed
#   False - verifier ran and failed
#   None  - verifier did not run, or a dependency failed
VERIFY_SUCCESS = True
VERIFY_FAILED = False
VERIFY_NOT_RUN = None
48
49
class AutoservVerifyError(error.AutoservError):
    """
    Catch-all exception for failures reported by `Verifier` objects.

    A `verify()` method may raise an instance of this class to signal
    failure when no more specific exception type is available.
    """
58
59
class AutoservNonCriticalVerifyError(error.AutoservError):
    """
    Exception for `Verifier` failures that are not critical enough to
    conclude that the target host is in a bad state.
    """
66
67
# Record of one failed dependency:
#   dependency - description of the verifier that failed
#   error      - string value of the exception the verifier raised
#   tag        - tag of the verifier that failed
_DependencyFailure = collections.namedtuple(
        '_DependencyFailure', 'dependency error tag')


# Same fields as `_DependencyFailure`, but a distinct type so that
# failures raised as `AutoservNonCriticalVerifyError` can be told apart
# with `isinstance()`.
_NonCriticalDependencyFailure = collections.namedtuple(
        '_NonCriticalDependencyFailure', 'dependency error tag')
74
75
class AutoservVerifyDependencyError(error.AutoservError):
    """
    Exception reporting failures that originated in dependencies.

    Raising this exception separates a failure passed back from a
    verification dependency from an original failure.  For example,
    if 'B' depends on 'A' and 'A' fails, then 'B' raises this exception
    to indicate that the original failure lies further down the
    dependency chain.

    The constructor's `failures` argument is a set of
    `_DependencyFailure` instances, one per failed dependency:
      * `dependency` is the description of the failed dependency.
      * `error` is the string value of the exception raised by the
        failed dependency.

    Several methods in this module recognize this exception and handle
    it specially.

    @property failures  The set of failures given to the constructor.
    @property _node     The `_DependencyNode` instance that reported
                        the failed dependencies.
    """

    def __init__(self, node, failures):
        """
        Build the exception from a reporting node and its failures.

        The exception message is the `error` text of every failure,
        joined one per line.

        @param node       Instance of _DependencyNode reporting the
                          failed dependencies.
        @param failures   Collection of failure tuples as described in
                          the class documentation.
        """
        message = '\n'.join([failed.error for failed in failures])
        super(AutoservVerifyDependencyError, self).__init__(message)
        self.failures = failures
        self._node = node

    def log_dependencies(self, action, deps):
        """
        Write a short summary of this exception to the Python logs.

        The `action` string, followed by `self._node.description`, is
        logged at INFO level; `action` should introduce or describe an
        action relative to `self._node`.

        The `deps` string, followed by the description of each failed
        dependency held by `self`, is logged at DEBUG level; `deps`
        should introduce the list of failed dependencies.

        @param action   A string mentioning the action being logged
                        relative to `self._node`.
        @param deps     A string introducing the dependencies that
                        failed.
        """
        logging.info('%s: %s', action, self._node.description)
        logging.debug('%s:', deps)
        for failed in self.failures:
            logging.debug('    %s', failed.dependency)

    def is_critical(self, silent=False):
        """
        Check whether this error is critical to the repair process.

        The error is critical as soon as one recorded failure is not a
        `_NonCriticalDependencyFailure`.  Each non-critical failure
        examined before that point is logged as a warning, unless
        `silent` is true.

        @param silent   If true, don't log the forgiven failures.
        @return True if at least one failure is critical, else False.
        """
        for failed in self.failures:
            if not isinstance(failed, _NonCriticalDependencyFailure):
                return True
            if silent:
                continue
            logging.warning("%s is still failing but forgiven because"
                            " it raised a non-critical error.",
                            failed.tag)
        return False
151
152
class AutoservRepairError(error.AutoservError):
    """
    Catch-all exception for failures reported by `RepairAction` objects.

    A `repair()` method may raise an instance of this class to signal
    failure when no more specific exception type is available.

    @property tag   Short identifier of the failure, used for metrics.
    """

    def __init__(self, description, tag):
        """
        @param description  Message describing the exception.
        @param tag          A short identifier used for metrics.
        """
        message = description
        super(AutoservRepairError, self).__init__(message)
        self.tag = tag
167
168
class _DependencyNode(object):
    """
    Base class for objects that depend on verifiers.

    Repair and verify operations both require a set of dependencies to
    pass before the operation itself proceeds.  The behavior common to
    both kinds of operation lives here.

    @property tag               Short identifier to be used in logging.
    @property description       Text summary of this node's action, to be
                                used in debug logs.
    @property _dependency_list  Dependency pre-requisites.
    """

    def __init__(self, tag, record_type, dependencies):
        """
        @param tag            Short identifier for this node.
        @param record_type    Prefix joined to `tag` with a '.' to form
                              the operation tag used in status records.
        @param dependencies   Verifiers that must pass before this
                              node's operation proceeds.
        """
        self._tag = tag
        self._record_tag = '.'.join((record_type, tag))
        self._dependency_list = dependencies

    def _is_applicable(self, host):
        """
        Tell whether this action applies to the target host.

        The base implementation accepts every host; subclasses can
        override this method per their need.

        @param host     Target host to check.
        @return         A bool value.
        """
        return True

    def _record(self, host, silent, status_code, *record_args):
        """
        Write a status record for `host`.

        Unless `silent` is a true value, call `host.record()` with the
        given status code, the operation tag `self._record_tag`, and
        whatever extra arguments were supplied in `record_args`.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        @param status_code  Value for the `status_code` parameter to
                            `host.record()`.
        @param record_args  Additional arguments to pass to
                            `host.record()`.
        """
        if silent:
            return
        host.record(status_code, None, self._record_tag, *record_args)

    def _record_good(self, host, silent):
        """Write a 'GOOD' status line.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        """
        self._record(host, silent, 'GOOD')

    def _record_fail(self, host, silent, exc):
        """Write a 'FAIL' status line.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        @param exc          Exception describing the cause of failure.
        """
        reason = str(exc)
        self._record(host, silent, 'FAIL', reason)

    def _verify_list(self, host, verifiers, silent):
        """
        Run a list of verifiers against a host.

        `_verify_host()` is invoked on each verifier in the list.  When
        any verifier in the transitive closure of the list's
        dependencies fails, an `AutoservVerifyDependencyError` carrying
        one record per failed verifier is raised.  Only original
        failures are reported; a verifier that never ran because one of
        its dependencies failed contributes its dependency's record
        rather than one of its own.

        By design, an original failure is logged once, in
        `_verify_host()`, at the time `verify()` fails.  The data
        collected here exists so the debug logs can explain why a
        subsequent operation never ran.

        @param host       The host to be tested against the verifiers.
        @param verifiers  List of verifiers to be checked.
        @param silent     If true, don't log host status records.

        @raises AutoservVerifyDependencyError   Raised when at least
                        one verifier in the list has failed.
        """
        collected = set()
        for verifier in verifiers:
            try:
                verifier._verify_host(host, silent)
            except AutoservNonCriticalVerifyError as exc:
                forgiven = _NonCriticalDependencyFailure(
                        verifier.description, str(exc), verifier.tag)
                collected.add(forgiven)
            except AutoservVerifyDependencyError as exc:
                # The verifier was blocked; pass along the original
                # failures instead of recording a new one.
                collected.update(exc.failures)
            except Exception as exc:
                original = _DependencyFailure(
                        verifier.description, str(exc), verifier.tag)
                collected.add(original)
        if collected:
            raise AutoservVerifyDependencyError(self, collected)

    def _verify_dependencies(self, host, silent):
        """
        Check that every dependency of this node passes for a host.

        A dependency failure is summarized in the logs and then
        re-raised to the caller.

        @param host     The host to be verified.
        @param silent   If true, don't log host status records.
        """
        try:
            self._verify_list(host, self._dependency_list, silent)
        except AutoservVerifyDependencyError as blocked:
            blocked.log_dependencies(
                    'Skipping this operation',
                    'The following dependencies failed')
            raise

    @property
    def tag(self):
        """
        Tag for use in logging status records.

        A short string that identifies the node in the 'status.log'
        file and during node construction.  It should contain only
        letters, digits, and '_' characters.  The tag is never used on
        its own; it is combined with other identifiers that depend on
        the operation being logged.

        @return A short identifier-like string.
        """
        return self._tag

    @property
    def description(self):
        """
        Text description of this node for log messages.

        The string is logged with failures, and should describe the
        condition required for success.

        N.B. Subclasses are required to override this method, but we
        _don't_ raise NotImplementedError here.  Various methods fail in
        inscrutable ways if this method raises any exception, so for
        debugging purposes, it's better to return a default value.

        @return A descriptive string.
        """
        class_name = type(self).__name__
        return 'Class %s fails to implement description().' % class_name

    def _get_node_by_tag(self, tag):
        """Find a node by tag, searching dependencies recursively.

        @param tag  Tag of the node to look for.
        @return This node or the first dependency found with a matching
                tag, or None if there is no match.
        """
        if self._tag == tag:
            return self
        for dependency in self._dependency_list:
            found = dependency._get_node_by_tag(tag)
            if found is None:
                continue
            return found
        return None
331
332
class Verifier(_DependencyNode):
    """
    Abstract class embodying one verification check.

    Each concrete subclass of `Verifier` supplies one simple check of a
    host's fitness for testing.  A failure means the check found a
    problem that can cause at least one test to fail.

    `Verifier` objects form a DAG that identifies dependencies among
    operations.  The DAG determines ordering and avoids wasted effort:
    if verification operation V2 requires verification operation V1 to
    pass, then a) V1 runs before V2, and b) V2 doesn't run at all when
    V1 fails.  `_verify_host()` makes sure all dependencies run and
    pass before it invokes `verify()`.

    The result of `verify()` is cached the first time it is called.
    Later calls use the cached result and don't re-run the check code.
    `_reverify()` discards the cached result in the current node and in
    all of its dependencies.

    Subclasses must supply these properties and methods:
      * `verify()`: This is the method to perform the actual
        verification check.
      * `description`:  A one-line summary of the verification check for
        debug log messages.

    Subclasses must override all of the above attributes; subclasses
    should not override or extend any other attributes of this class.

    The description string should be a simple sentence stating what
    must be true for the verifier to pass, with no terminating period.
    For example:

        Host is available via ssh

    The base class manages the following private data:
      * `_result`:  The cached result of verification.
                    None - did not run
                    True - successful pass
                    Exception - fail during execution
      * `_dependency_list`:  The list of dependencies.
    Subclasses should not use these attributes.

    @property _result           Cached result of verification.
    """

    def __init__(self, tag, dependencies):
        """
        @param tag            Short identifier for this verifier.
        @param dependencies   Verifiers that must pass before this one
                              runs.
        """
        super(Verifier, self).__init__(tag, 'verify', dependencies)
        self._result = None

    def _reverify(self):
        """
        Discard cached verification results.

        Clears the cached result of this node and of the transitive
        closure of all its dependencies.
        """
        self._result = None
        for dependency in self._dependency_list:
            dependency._reverify()

    def _verify_host(self, host, silent):
        """
        Determine the result of verification, and log results.

        A verifier that is not applicable to the host, or whose
        applicability check raises, is skipped: the method logs the
        fact and returns without running anything.

        Without a cached result, the dependencies are checked and, if
        they pass, `verify()` is run.  Informational messages are
        logged about failed dependencies.  Whenever `verify()` is
        called, its outcome is logged in `status.log`.

        With a cached result, that result is reported (by returning, or
        by re-raising the cached exception) without logging anything.

        @param host     The host to be tested for a problem.
        @param silent   If true, don't log host status records.
        """
        try:
            applicable = self._is_applicable(host)
            if not applicable:
                logging.info('Verify %s is not applicable to %s, skipping...',
                             self.description, host.hostname)
                return
        except Exception as exc:
            logging.error('Skipping %s verifier due to unexpect error during'
                          ' check applicability; %s', self.tag, exc)
            return

        cached = self._result
        if isinstance(cached, Exception):
            raise cached        # cached failure
        if cached:
            return              # cached success

        self._verify_dependencies(host, silent)
        logging.info('Verifying this condition: %s', self.description)
        task_name = type(self).__name__
        try:
            logging.debug('Start verify task: %s.', task_name)
            self.verify(host)
            self._record_good(host, silent)
        except Exception as exc:
            if isinstance(exc, AutoservNonCriticalVerifyError):
                template = '(Non-critical)Failed: %s'
            else:
                template = 'Failed: %s'
            logging.exception(template, self.description)
            self._result = exc
            self._record_fail(host, silent, exc)
            # Bump the verifier's failure count when the host class
            # provides a device health profile.
            if hasattr(host, 'health_profile') and host.health_profile:
                host.health_profile.insert_failed_verifier(self.tag)
            raise
        finally:
            logging.debug('Finished verify task: %s.', task_name)

        self._result = True

    def verify(self, host):
        """
        Unconditionally perform a verification check.

        Each implementation tests for a single problem on a host, and
        should follow these guidelines:
          * The check should find a problem that will cause testing to
            fail.
          * On a working system the check should run quickly and be
            optimized for success; a check that passes should finish
            within seconds.
          * Checks are not expected to have side effects, but may apply
            trivial fixes if they will finish within the time
            constraints above.

        A verification check should normally trigger a single set of
        repair actions.  When two different failures can require two
        different repairs, they should ideally be handled by two
        different subclasses of `Verifier`.

        Failure is indicated by raising an exception.  The exception
        text should be a short, 1-line summary of the error; keep it
        concise and diagnostic, since it appears in `status.log` files.

        When no problem is found, the method returns without raising
        any exception.

        Implementations should avoid most logging actions, but can log
        DEBUG level messages if they provide significant information for
        diagnosing failures.

        @param host   The host to be tested for a problem.
        """
        class_name = type(self).__name__
        raise NotImplementedError(
                'Class %s does not implement verify()' % class_name)

    def _is_good(self):
        """Provide result of the verifier

        @returns: a boolean or None value:
            True - verifier passed
            False - verifier did not pass
            None - verifier did not run because it is not applicable
                   or blocked due to dependency failure
        """
        outcome = self._result
        if isinstance(outcome, bool):
            return outcome
        if isinstance(outcome, Exception):
            return False
        return None
500
501
class RepairAction(_DependencyNode):
    """
    Abstract class embodying one repair procedure.

    A `RepairAction` is responsible for fixing one or more failed
    `Verifier` checks, in order to make those checks pass.

    Each repair action includes one or more verifier triggers that
    determine when the repair action should run.  A repair action
    will call its `repair()` method if one or more of its triggers
    fails.  A repair action is successful if all of its triggers pass
    after calling `repair()`.

    A `RepairAction` is a subclass of `_DependencyNode`; if any of a
    repair action's dependencies fail, the action does not check its
    triggers, and doesn't call `repair()`.

    Subclasses must supply these attributes:
      * `repair()`: This is the method to perform the necessary
        repair.  The method should avoid most logging actions, but
        can log DEBUG level messages if they provide significant
        information for diagnosing failures.
      * `description`:  A one-line summary of the repair action for
        debug log messages.

    Subclasses must override both of the above attributes and should
    not override any other attributes of this class.

    The description string should be a simple sentence explaining the
    operation that will be performed.  Do not include a terminating
    period.  For example:

        Re-install the stable build via AU

    @property _trigger_list   List of verification checks that will
                              trigger this repair when they fail.
    @property host_class      A string identifier that will be
                              used as a field to send repair metrics.
    @property status          Outcome of the most recent call to
                              `_repair_host()`; one of 'skipped',
                              'blocked', 'repaired', 'repair_failure',
                              'verify_failure' or 'unknown'.  The
                              attribute does not exist until
                              `_repair_host()` has been called.
    """

    def __init__(self, tag, dependencies, triggers, host_class):
        """
        @param tag            Short identifier for this repair action.
        @param dependencies   Verifiers that must pass before the
                              triggers are checked.
        @param triggers       Verifiers whose failure triggers the
                              repair.
        @param host_class     String identifier sent as a field with
                              the repair metrics.
        """
        super(RepairAction, self).__init__(tag, 'repair', dependencies)
        self._trigger_list = triggers
        self._failure_modes_counter = metrics.Counter(
            'chromeos/autotest/repair/failure_modes')
        self._failure_detail_counter = metrics.Counter(
            'chromeos/autotest/repair/failure_detail')
        self.host_class = host_class

    def _record_start(self, host, silent):
        """Log a 'START' status line.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        """
        self._record(host, silent, 'START')

    def _record_end_good(self, host, silent):
        """Log an 'END GOOD' status line and set status to 'repaired'.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        """
        self._record(host, silent, 'END GOOD')
        self.status = 'repaired'

    def _record_end_fail(self, host, silent, status, *args):
        """Log an 'END FAIL' status line and store the given status.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        @param status       Value to store in `self.status`.
        @param args         Extra arguments to `self._record()`
        """
        self._record(host, silent, 'END FAIL', *args)
        self.status = status

    def _send_failure_metrics(self, host, error, stage):
        """Send failure mode metrics to monarch

        For an `AutoservVerifyDependencyError`, one counter increment
        is sent per failure it carries, tagged with that failure's
        verifier tag.  For any other exception a single increment is
        sent with the verifier tag 'unknown'.  Failures in the 'repair'
        stage additionally report failure detail.

        @param host         Host which this RepairAction targeted to.
        @param error        An exception that caught in _repair_host.
        @param stage        In which stage we caught above exception.
                            Can be one of below value:
                                'dep'    during verify dependencies
                                'pre'    during pre-repair trigger verification
                                'repair' during repair() process itself
                                'post'   during post-repair trigger verification
        """

        def get_fields(vf_tag):
            fields = {
                'ra_tag': self.tag,
                'vf_tag': vf_tag,
                'hostname': _filter_metrics_hostname(host),
                'stage': stage,
                'host_class': self.host_class
            }
            return fields

        if isinstance(error, AutoservVerifyDependencyError):
            # We'll catch all failure tags here for a dependencies error
            for f in error.failures:
                self._failure_modes_counter.increment(fields=get_fields(f.tag))
        else:
            # When there is failure during repair or unknown failure. there
            # will be no Verifier, so vf_tag set to 'unknown'.
            self._failure_modes_counter.increment(fields=get_fields('unknown'))

        if stage == 'repair':
            self._send_failure_detail(error)

    def _send_failure_detail(self, error):
        """Send reason of failure inside repair() to monarch.

        The failure tag is `error.tag` for an `AutoservRepairError`,
        and 'unknown' for any other exception.

        @param error    The exception caught inside repair().
        """
        tag = error.tag if isinstance(error, AutoservRepairError) else 'unknown'
        fields = {'repair_action_tag': self.tag, 'repair_failure_tag': tag}
        self._failure_detail_counter.increment(fields=fields)

    def _repair_host(self, host, silent):
        """
        Apply this repair action if any triggers fail.

        Repair is triggered when all dependencies are successful, and at
        least one trigger fails.

        If the `repair()` method triggers, the success or failure of
        this operation is logged in `status.log` bracketed by 'START'
        and 'END' records.  Details of whether or why `repair()`
        triggered are written to the debug logs.   If repair doesn't
        trigger, nothing is logged to `status.log`.

        @param host     The host to be repaired.
        @param silent   If true, don't log host status records.

        @raises AutoservRepairError   Raised when `repair()` completed
                        but at least one trigger still fails.
        """
        # Note:  Every exit path from the method must set `self.status`.
        # There's a lot of exit paths, so be careful.
        #
        # An action that is not applicable, or whose applicability
        # check raises, returns early; that counts as 'skipped'.
        self.status = 'skipped'
        try:
            if not self._is_applicable(host):
                logging.info('RepairAction is not applicable, skipping repair: %s',
                             self.description)
                return
        except Exception as e:
            logging.error('Skipping %s repair action due to unexpect error'
                          ' during check applicability; %s', self.tag, e)
            return

        # If we're blocked by a failed dependency, we exit with an
        # exception.  So set status to 'blocked' first.
        self.status = 'blocked'
        try:
            self._verify_dependencies(host, silent)
        except Exception as e:
            self._send_failure_metrics(host, e, 'dep')
            raise
        # This is a defensive action.  Every path below should overwrite
        # this setting, but if it doesn't, we want our status to reflect
        # a coding error.
        self.status = 'unknown'
        # Each handler below binds its exception to a distinct name, so
        # that no handler can report an exception caught by another.
        try:
            self._verify_list(host, self._trigger_list, silent)
        except AutoservVerifyDependencyError as trigger_error:
            trigger_error.log_dependencies(
                    'Attempting this repair action',
                    'Repairing because these triggers failed')
            self._send_failure_metrics(host, trigger_error, 'pre')
            self._record_start(host, silent)
            try:
                self.repair(host)
                # Increase action success count if device health profile is
                # available to the host class.
                if hasattr(host, 'health_profile') and host.health_profile:
                    host.health_profile.insert_succeed_repair_action(self.tag)
            except Exception as repair_error:
                logging.exception('Repair failed: %s', self.description)
                self._record_fail(host, silent, repair_error)
                self._record_end_fail(host, silent, 'repair_failure')
                self._send_failure_metrics(host, repair_error, 'repair')
                # Increase action fail count if device health profile is
                # available to the host class.
                if hasattr(host, 'health_profile') and host.health_profile:
                    host.health_profile.insert_failed_repair_action(self.tag)
                raise
            try:
                # Drop cached results so the triggers are really re-run.
                for v in self._trigger_list:
                    v._reverify()
                self._verify_list(host, self._trigger_list, silent)
                self._record_end_good(host, silent)
            except AutoservVerifyDependencyError as post_error:
                post_error.log_dependencies(
                        'This repair action reported success',
                        'However, these triggers still fail')
                self._record_end_fail(host, silent, 'verify_failure')
                self._send_failure_metrics(host, post_error, 'post')
                raise AutoservRepairError(
                        'Some verification checks still fail', 'post_verify')
            except Exception as internal_error:
                # The specification for `self._verify_list()` says
                # that this can't happen; this is a defensive
                # precaution.  Report the exception caught here, not
                # the pre-repair trigger failure.
                self._record_end_fail(host, silent, 'unknown',
                                      'Internal error in repair')
                self._send_failure_metrics(host, internal_error, 'post')
                raise
        else:
            self.status = 'skipped'
            logging.info('No failed triggers, skipping repair: %s',
                         self.description)

    def repair(self, host):
        """
        Apply this repair action to the given host.

        This method is responsible for applying changes to fix failures
        in one or more verification checks.  The repair is considered
        successful if the DUT passes the specific checks after this
        method completes.

        Implementations indicate failure by raising an exception.  The
        exception text should be a short, 1-line summary of the error.
        The text should be concise and diagnostic, as it will appear in
        `status.log` files.

        If this method completes successfully, it returns without
        raising any exception.

        Implementations should avoid most logging actions, but can log
        DEBUG level messages if they provide significant information for
        diagnosing failures.

        @param host   The host to be repaired.
        """
        raise NotImplementedError('Class %s does not implement '
                                  'repair()' % type(self).__name__)
742
743
class _RootVerifier(Verifier):
    """
    Utility class used by `RepairStrategy`.

    On its own a node of this class does nothing: whenever it can run,
    it passes.  Its only purpose is to serve as the root of a DAG of
    dependencies in an instance of `RepairStrategy`.
    """

    @property
    def description(self):
        """Fixed description used for the root of the verifier DAG."""
        return 'All host verification checks pass'

    def verify(self, host):
        """Pass unconditionally; the root has no check of its own."""
        return None
759
760
class RepairStrategy(object):
    r"""
    A class for organizing `Verifier` and `RepairAction` objects.

    An instance of `RepairStrategy` is organized as a DAG of `Verifier`
    objects, plus a list of `RepairAction` objects.  The class provides
    methods for invoking those objects in the required order, when
    needed:
      * The `verify()` method walks the verifier DAG in dependency
        order.
      * The `repair()` method invokes the repair actions in list order.
        Each repair action will invoke its dependencies and triggers as
        needed.

    # The Verifier DAG
    The verifier DAG is constructed from the first argument passed to
    the `RepairStrategy` constructor.  That argument is an iterable
    consisting of three-element tuples in the form
    `(constructor, tag, deps)`:
      * The `constructor` value is a callable that creates a `Verifier`
        as for the interface of the class constructor.  For classes
        that inherit the default constructor from `Verifier`, this can
        be the class itself.
      * The `tag` value is the tag to be associated with the constructed
        verifier.
      * The `deps` value is an iterable (e.g. list or tuple) of strings.
        Each string corresponds to the `tag` member of a `Verifier`
        dependency.

    The tag names of verifiers in the constructed DAG must all be
    unique.  The tag name defined by `RepairStrategy.ROOT_TAG` is
    reserved and may not be used by any verifier.

    In the input data for the constructor, dependencies must appear
    before the nodes that depend on them.  Thus:

        ((A, 'a', ()), (B, 'b', ('a',)))     # This is valid
        ((B, 'b', ('a',)), (A, 'a', ()))     # This will fail!

    Internally, the DAG of verifiers is given a unique root node.  So,
    given this input:

        ((C, 'c', ()),
         (A, 'a', ('c',)),
         (B, 'b', ('c',)))

    The following DAG is constructed:

          Root
          /  \
         A    B
          \  /
           C

    Since nothing depends on `A` or `B`, the root node guarantees that
    these two verifiers will both be called and properly logged.

    The root node is not directly accessible; however repair actions can
    trigger on it by using `RepairStrategy.ROOT_TAG`.  Additionally, the
    node will be logged in `status.log` whenever `verify()` succeeds.

    # The Repair Actions List
    The list of repair actions is constructed from the second argument
    passed to the `RepairStrategy` constructor.  That argument is an
    iterable consisting of four-element tuples in the form
    `(constructor, tag, deps, triggers)`:
      * The `constructor` value is a callable that creates a
        `RepairAction` as for the interface of the class constructor.
        For classes that inherit the default constructor from
        `RepairAction`, this can be the class itself.
      * The `tag` value is the tag to be associated with the constructed
        repair action.
      * The `deps` value is an iterable (e.g. list or tuple) of strings.
        Each string corresponds to the `tag` member of a `Verifier` that
        the repair action depends on.
      * The `triggers` value is an iterable (e.g. list or tuple) of
        strings.  Each string corresponds to the `tag` member of a
        `Verifier` that can trigger the repair action.

    `RepairAction` deps and triggers can only refer to verifiers,
    not to other repair actions.
    """
    # NOTE: the docstring above is a raw string on purpose.  The DAG
    # diagram contains backslashes; in a normal string the one at the
    # end of a line would swallow the newline and `\ ` would be an
    # invalid escape sequence.

    # This name is reserved; clients may not use it.
    ROOT_TAG = 'PASS'

    @staticmethod
    def _add_verifier(verifiers, constructor, tag, dep_tags):
        """
        Construct and remember a verifier.

        Create a `Verifier` using `constructor` and `tag`.  Dependencies
        for construction are found by looking up `dep_tags` in the
        `verifiers` dictionary; a tag that is not yet present there
        raises `KeyError`, which is why dependencies must be added
        before the nodes that depend on them.

        After construction, the new verifier is added to `verifiers`.

        @param verifiers    Dictionary of verifiers, indexed by tag.
        @param constructor  Verifier construction function.
        @param tag          Tag parameter for the construction function.
        @param dep_tags     Tags of dependencies for the constructor, to
                            be found in `verifiers`.
        """
        assert tag not in verifiers, 'Duplicate verifier tag: %r' % (tag,)
        deps = [verifiers[d] for d in dep_tags]
        verifiers[tag] = constructor(tag, deps)

    def __init__(self, verifier_data, repair_data, host_class):
        """
        Construct a `RepairStrategy` from simplified DAG data.

        The input `verifier_data` object describes how to construct
        verify nodes and the dependencies that relate them, as detailed
        above.

        The input `repair_data` object describes how to construct repair
        actions and their dependencies and triggers, as detailed above.

        @param verifier_data  Iterable value with constructors for the
                              elements of the verification DAG and their
                              dependencies.
        @param repair_data    Iterable value with constructors for the
                              elements of the repair action list, and
                              their dependencies and triggers.
        @param host_class     A string identifying the class of host
                              this repair strategy targets; it is sent
                              as a field with the repair metrics.
        """
        # Metrics - we report on 'actions' for every repair action
        # we execute; we report on 'strategy' for every complete
        # repair operation.
        self._strategy_counter = metrics.Counter(
            'chromeos/autotest/repair/repair_strategy_v2')
        self._actions_counter = metrics.Counter(
            'chromeos/autotest/repair/repair_actions')
        self.host_class = host_class
        # We use the `all_tags` list to guarantee that our root
        # verifier will execute its dependencies in the order provided
        # to us by our caller.
        verifier_map = {}
        all_tags = []
        dependencies = set()
        for constructor, tag, deps in verifier_data:
            # `deps` is walked twice below; pin it down so that a
            # one-shot iterator is not silently exhausted by the first
            # walk.
            deps = tuple(deps)
            self._add_verifier(verifier_map, constructor, tag, deps)
            dependencies.update(deps)
            all_tags.append(tag)
        # Capture all the verifiers that have nothing depending on them.
        root_tags = [t for t in all_tags if t not in dependencies]
        self._add_verifier(verifier_map, _RootVerifier,
                           self.ROOT_TAG, root_tags)
        self._verify_root = verifier_map[self.ROOT_TAG]
        self._repair_actions = []
        for constructor, tag, deps, triggers in repair_data:
            r = constructor(tag,
                            [verifier_map[d] for d in deps],
                            [verifier_map[t] for t in triggers],
                            self.host_class)
            self._repair_actions.append(r)

    def _send_strategy_metrics(self, host, result):
        """Send repair strategy metrics to monarch

        The board and model fields come from the host's info store and
        are reported as 'unknown' when the stored value is empty.

        @param host     The target to be repaired.
        @param result   A String that describe a final result for the
                        RepairStrategy.
        """
        info = host.host_info_store.get()
        fields = {
            'board': info.board or 'unknown',
            'host_class': self.host_class,
            'hostname': _filter_metrics_hostname(host),
            'model': info.model or 'unknown',
            'result': result,
        }
        self._strategy_counter.increment(fields=fields)

    def _send_action_metrics(self, host, ra):
        """Send repair action metrics to monarch

        @param host     The target to be repaired.
        @param ra       an RepairAction instance.
        """
        fields = {
            'tag': ra.tag,
            'status': ra.status,
            'hostname': _filter_metrics_hostname(host),
            'host_class': self.host_class
        }
        self._actions_counter.increment(fields=fields)

    def verify(self, host, silent=False):
        """
        Run the verifier DAG on the given host.

        The root verifier is first asked to `_reverify()` (presumably
        discarding results cached from an earlier run; the method is
        defined on `Verifier`), and is then run against `host`.  Any
        exception raised by the root verifier propagates to the caller.

        @param host     The target to be verified.
        @param silent   If true, don't log host status records.
        """
        self._verify_root._reverify()
        self._verify_root._verify_host(host, silent)

    def repair(self, host, silent=False):
        """
        Run the repair list on the given host.

        Every repair action is invoked in list order, and an action
        metric is sent for each one whether or not it raised.  After
        that the full verifier DAG is run once more, and a strategy
        metric is sent with one of these results:
          * 'success': final verification passed, and at least one
            action ended with a status other than 'skipped' or
            'blocked'.
          * 'not_attempted': final verification passed, and no action
            was attempted.
          * 'failure': final verification raised after at least one
            action was attempted.
          * 'attempt_blocked': final verification raised and no action
            was attempted.

        An exception raised by the final verification propagates to the
        caller after the strategy metric has been sent.

        @param host     The target to be repaired.
        @param silent   If true, don't log host status records.
        """
        self._verify_root._reverify()
        attempted = False
        for ra in self._repair_actions:
            try:
                logging.debug('Start repair task: %s.', type(ra).__name__)
                ra._repair_host(host, silent)
            except Exception:
                # All logging and exception handling was done at
                # lower levels; a failed action must not prevent the
                # remaining actions from running.
                pass
            finally:
                self._send_action_metrics(host, ra)
                logging.debug('Finished repair task: %s.', type(ra).__name__)
                if ra.status not in ('skipped', 'blocked'):
                    attempted = True

        # Assume the final verification will fail; the label is only
        # replaced once verification has actually passed.  This keeps
        # the metric correct for any exception without needing a
        # catch-all `except` clause.
        result = 'failure' if attempted else 'attempt_blocked'
        try:
            self._verify_root._verify_host(host, silent)
            result = 'success' if attempted else 'not_attempted'
        finally:
            self._send_strategy_metrics(host, result)

    def verifier_is_good(self, tag):
        """Find and return result of a verifier.

        @param tag: key to be associated with verifier

        @returns: a boolean or None value:
            True - verifier passed
            False - verifier did not pass
            None - verifier did not run because it is not applicable
                   or blocked due to dependency failure; also returned
                   when no verifier with the given tag is found
        """
        verifier = self._verify_root._get_node_by_tag(tag)
        if verifier is None:
            logging.debug('Verifier with associated tag: %s not found', tag)
            return None
        result = verifier._is_good()
        logging.debug('Verifier with associated tag: %s found', tag)
        if result is None:
            logging.debug('%s did not run; it is not applicable to run '
                          'or blocked due to dependency failure', tag)
        elif result:
            logging.debug('Cached result of %s verifier is pass', tag)
        else:
            logging.debug('Cached result of %s verifier is fail', tag)
        return result
1023
1024
def _filter_metrics_hostname(host):
    """
       Limit which hostnames are reported to monarch.

       The host's name is passed through only when it begins with the
       format described by `_HOSTNAME_PATTERN` (the pattern is matched
       at the start of the name only); any other name is replaced by
       the `_DISALLOWED_HOSTNAME` placeholder.

       @param host    An host instance(i.e. ServoHost, CrosHost)

       @returns The hostname string to use in metrics fields.
    """
    name = host.hostname
    if re.match(_HOSTNAME_PATTERN, name) is None:
        return _DISALLOWED_HOSTNAME
    return name
1035