# Copyright 2016 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""
Framework for host verification and repair in Autotest.

The framework provides implementation code in support of `Host.verify()`
and `Host.repair()` used in Verify and Repair special tasks.

The framework consists of these classes:
  * `Verifier`: A class representing a single verification check.
  * `RepairAction`: A class representing a repair operation that can fix
    a failed verification check.
  * `RepairStrategy`: A class for organizing a collection of `Verifier`
    and `RepairAction` instances, and invoking them in order.

Individual operations during verification and repair are handled by
instances of `Verifier` and `RepairAction`.  `Verifier` objects are
meant to test for specific conditions that may cause tests to fail.
`RepairAction` objects provide operations designed to fix one or
more failures identified by a `Verifier` object.
"""

import collections
import logging
import re

import common
from autotest_lib.client.common_lib import error

try:
    from chromite.lib import metrics
except ImportError:
    from autotest_lib.client.bin.utils import metrics_mock as metrics

# Regular expression pattern to filter out unwanted hostnames.
_HOSTNAME_PATTERN = 'chromeos[0-9]-row[0-9]+[a-z]?-rack[0-9]+[a-z]?-host[0-9]+'
_DISALLOWED_HOSTNAME = 'disallowed_hostname'


class AutoservVerifyError(error.AutoservError):
    """
    Generic Exception for failures from `Verifier` objects.

    Instances of this exception can be raised when a `verify()`
    method fails, if no more specific exception is available.
    """
    pass


_DependencyFailure = collections.namedtuple(
        '_DependencyFailure', ('dependency', 'error', 'tag'))


class AutoservVerifyDependencyError(error.AutoservError):
    """
    Exception raised for failures in dependencies.

    This exception is used to distinguish an original failure from a
    failure being passed back from a verification dependency.  That is,
    if 'B' depends on 'A', and 'A' fails, 'B' will raise this exception
    to signal that the original failure is further down the dependency
    chain.

    The `failures` argument to the constructor for this class is a set
    of instances of `_DependencyFailure`, each corresponding to one
    failed dependency:
      * The `dependency` attribute of each failure is the description
        of the failed dependency.
      * The `error` attribute of each failure is the string value of
        the exception from the failed dependency.
      * The `tag` attribute of each failure is the tag of the failed
        dependency.

    Multiple methods in this module recognize and handle this exception
    specially.

    @property failures  Set of failures passed to the constructor.
    @property _node     Instance of `_DependencyNode` reporting the
                        failed dependencies.
    """

    def __init__(self, node, failures):
        """
        Constructor for `AutoservVerifyDependencyError`.

        The exception message is the `error` text of every failure,
        joined with newlines.

        @param node       Instance of _DependencyNode reporting the
                          failed dependencies.
        @param failures   Set of failure tuples as described above.
        """
        super(AutoservVerifyDependencyError, self).__init__(
                '\n'.join([f.error for f in failures]))
        self.failures = failures
        self._node = node

    def log_dependencies(self, action, deps):
        """
        Log an `AutoservVerifyDependencyError`.

        This writes a short summary of the dependency failures captured
        in this exception, using standard Python logging.

        The passed in `action` string plus `self._node.description`
        are logged at INFO level.  The `action` argument should
        introduce or describe an action relative to `self._node`.

        The passed in `deps` string and the description of each failed
        dependency in `self` are logged at DEBUG level.  The `deps`
        argument is used to introduce the various failed dependencies.

        @param action   A string mentioning the action being logged
                        relative to `self._node`.
        @param deps     A string introducing the dependencies that
                        failed.
        """
        logging.info('%s: %s', action, self._node.description)
        logging.debug('%s:', deps)
        for failure in self.failures:
            logging.debug(' %s', failure.dependency)


class AutoservRepairError(error.AutoservError):
    """
    Generic Exception for failures from `RepairAction` objects.

    Instances of this exception can be raised when a `repair()`
    method fails, if no more specific exception is available.

    @property tag   Short identifier passed to the constructor.
    """

    def __init__(self, description, tag):
        """
        @param description  Message describing the exception.
        @param tag          A short identifier used for metric purposes.
        """
        super(AutoservRepairError, self).__init__(description)
        self.tag = tag


class _DependencyNode(object):
    """
    An object that can depend on verifiers.

    Both repair and verify operations have the notion of dependencies
    that must pass before the operation proceeds.  This class captures
    the shared behaviors required by both classes.

    @property tag               Short identifier to be used in logging.
    @property description       Text summary of this node's action, to be
                                used in debug logs.
    @property _dependency_list  Dependency pre-requisites.
    """

    def __init__(self, tag, record_type, dependencies):
        """
        @param tag           Short identifier for this node.
        @param record_type   Prefix joined to `tag` with a '.' to form
                             the operation tag used in status records.
        @param dependencies  Verifiers that must pass before this node's
                             operation proceeds.
        """
        self._dependency_list = dependencies
        self._tag = tag
        self._record_tag = record_type + '.' + tag

    def _record(self, host, silent, status_code, *record_args):
        """
        Log a status record for `host`.

        Call `host.record()` using the given status_code, and
        operation tag `self._record_tag`, plus any extra arguments in
        `record_args`.  Do nothing if `silent` is a true value.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        @param status_code  Value for the `status_code` parameter to
                            `host.record()`.
        @param record_args  Additional arguments to pass to
                            `host.record()`.
        """
        if not silent:
            host.record(status_code, None, self._record_tag,
                        *record_args)

    def _record_good(self, host, silent):
        """Log a 'GOOD' status line.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        """
        self._record(host, silent, 'GOOD')

    def _record_fail(self, host, silent, exc):
        """Log a 'FAIL' status line.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        @param exc          Exception describing the cause of failure.
        """
        self._record(host, silent, 'FAIL', str(exc))

    def _verify_list(self, host, verifiers, silent):
        """
        Test a list of verifiers against a given host.

        This invokes `_verify_host()` on every verifier in the given
        list.  If any verifier in the transitive closure of dependencies
        in the list fails, an `AutoservVerifyDependencyError` is raised
        containing the description of each failed verifier.  Only
        original failures are reported; verifiers that don't run due
        to a failed dependency are omitted.

        By design, original failures are logged once in `_verify_host()`
        when `verify()` originally fails.  The additional data gathered
        here is for the debug logs to indicate why a subsequent
        operation never ran.

        @param host       The host to be tested against the verifiers.
        @param verifiers  List of verifiers to be checked.
        @param silent     If true, don't log host status records.

        @raises AutoservVerifyDependencyError   Raised when at least
                        one verifier in the list has failed.
        """
        failures = set()
        for v in verifiers:
            try:
                v._verify_host(host, silent)
            except AutoservVerifyDependencyError as e:
                # `v` itself never ran; pass along the original
                # failures from further down the chain.
                failures.update(e.failures)
            except Exception as e:
                # `v` is itself an original failure.
                failures.add(_DependencyFailure(v.description, str(e), v.tag))
        if failures:
            raise AutoservVerifyDependencyError(self, failures)

    def _verify_dependencies(self, host, silent):
        """
        Verify that all of this node's dependencies pass for a host.

        @param host     The host to be verified.
        @param silent   If true, don't log host status records.

        @raises AutoservVerifyDependencyError   Re-raised, after
                        logging, when at least one dependency fails.
        """
        try:
            self._verify_list(host, self._dependency_list, silent)
        except AutoservVerifyDependencyError as e:
            e.log_dependencies(
                    'Skipping this operation',
                    'The following dependencies failed')
            raise

    @property
    def tag(self):
        """
        Tag for use in logging status records.

        This is a property with a short string used to identify the node
        in the 'status.log' file and during node construction.  The tag
        should contain only letters, digits, and '_' characters.  This
        tag is not used alone, but is combined with other identifiers,
        based on the operation being logged.

        @return A short identifier-like string.
        """
        return self._tag

    @property
    def description(self):
        """
        Text description of this node for log messages.

        This string will be logged with failures, and should describe
        the condition required for success.

        N.B. Subclasses are required to override this method, but we
        _don't_ raise NotImplementedError here.  Various methods fail in
        inscrutable ways if this method raises any exception, so for
        debugging purposes, it's better to return a default value.

        @return A descriptive string.
        """
        return ('Class %s fails to implement description().' %
                type(self).__name__)


class Verifier(_DependencyNode):
    """
    Abstract class embodying one verification check.

    A concrete subclass of `Verifier` provides a simple check that can
    determine a host's fitness for testing.  Failure indicates that the
    check found a problem that can cause at least one test to fail.

    `Verifier` objects are organized in a DAG identifying dependencies
    among operations.  The DAG controls ordering and prevents wasted
    effort:  If verification operation V2 requires that verification
    operation V1 pass, then a) V1 will run before V2, and b) if V1
    fails, V2 won't run at all.  The `_verify_host()` method ensures
    that all dependencies run and pass before invoking the `verify()`
    method.

    A `Verifier` object caches its result the first time it calls
    `verify()`.  Subsequent calls return the cached result, without
    re-running the check code.  The `_reverify()` method clears the
    cached result in the current node, and in all dependencies.

    Subclasses must supply these properties and methods:
      * `verify()`: This is the method to perform the actual
        verification check.
      * `description`:  A one-line summary of the verification check for
        debug log messages.

    Subclasses must override all of the above attributes; subclasses
    should not override or extend any other attributes of this class.

    The description string should be a simple sentence explaining what
    must be true for the verifier to pass.  Do not include a terminating
    period.  For example:

        Host is available via ssh

    The base class manages the following private data:
      * `_result`:  The cached result of verification.
      * `_dependency_list`:  The list of dependencies.
    Subclasses should not use these attributes.

    @property _result           Cached result of verification.
    """

    def __init__(self, tag, dependencies):
        """
        @param tag           Short identifier for this verifier.
        @param dependencies  Verifiers that must pass before this one
                             runs.
        """
        super(Verifier, self).__init__(tag, 'verify', dependencies)
        self._result = None

    def _reverify(self):
        """
        Discard cached verification results.

        Reset the cached verification result for this node, and for the
        transitive closure of all dependencies.
        """
        # A node whose result is already `None` is not descended into.
        if self._result is not None:
            self._result = None
            for v in self._dependency_list:
                v._reverify()

    def _verify_host(self, host, silent):
        """
        Determine the result of verification, and log results.

        If this verifier does not have a cached verification result,
        check dependencies, and if they pass, run `verify()`.  Log
        informational messages regarding failed dependencies.  If we
        call `verify()`, log the result in `status.log`.

        If we already have a cached result, return that result without
        logging any message.

        @param host     The host to be tested for a problem.
        @param silent   If true, don't log host status records.
        """
        if self._result is not None:
            if isinstance(self._result, Exception):
                raise self._result  # cached failure
            elif self._result:
                return              # cached success
        # `False` marks the node as visited without a final result, so
        # that `_reverify()` will still descend through it even when a
        # dependency failure below prevents `verify()` from running.
        self._result = False
        self._verify_dependencies(host, silent)
        logging.info('Verifying this condition: %s', self.description)
        try:
            logging.debug('Start verify task: %s.', type(self).__name__)
            self.verify(host)
            self._record_good(host, silent)
        except Exception as e:
            logging.exception('Failed: %s', self.description)
            self._result = e
            self._record_fail(host, silent, e)
            raise
        finally:
            logging.debug('Finished verify task: %s.', type(self).__name__)

        self._result = True

    def verify(self, host):
        """
        Unconditionally perform a verification check.

        This method is responsible for testing for a single problem on a
        host.  Implementations should follow these guidelines:
          * The check should find a problem that will cause testing to
            fail.
          * Verification checks on a working system should run quickly
            and should be optimized for success; a check that passes
            should finish within seconds.
          * Verification checks are not expected to have side effects,
            but may apply trivial fixes if they will finish within the
            time constraints above.

        A verification check should normally trigger a single set of
        repair actions.  If two different failures can require two
        different repairs, ideally they should use two different
        subclasses of `Verifier`.

        Implementations indicate failure by raising an exception.  The
        exception text should be a short, 1-line summary of the error.
        The text should be concise and diagnostic, as it will appear in
        `status.log` files.

        If this method finds no problems, it returns without raising any
        exception.

        Implementations should avoid most logging actions, but can log
        DEBUG level messages if they provide significant information for
        diagnosing failures.

        @param host   The host to be tested for a problem.
        """
        raise NotImplementedError('Class %s does not implement '
                                  'verify()' % type(self).__name__)


class RepairAction(_DependencyNode):
    """
    Abstract class embodying one repair procedure.

    A `RepairAction` is responsible for fixing one or more failed
    `Verifier` checks, in order to make those checks pass.

    Each repair action includes one or more verifier triggers that
    determine when the repair action should run.  A repair action
    will call its `repair()` method if one or more of its triggers
    fails.  A repair action is successful if all of its triggers pass
    after calling `repair()`.

    A `RepairAction` is a subclass of `_DependencyNode`; if any of a
    repair action's dependencies fail, the action does not check its
    triggers, and doesn't call `repair()`.

    Subclasses must supply these attributes:
      * `repair()`: This is the method to perform the necessary
        repair.  The method should avoid most logging actions, but
        can log DEBUG level messages if they provide significant
        information for diagnosing failures.
      * `description`:  A one-line summary of the repair action for
        debug log messages.

    Subclasses must override both of the above attributes and should
    not override any other attributes of this class.

    The description string should be a simple sentence explaining the
    operation that will be performed.  Do not include a terminating
    period.  For example:

        Re-install the stable build via AU

    @property _trigger_list   List of verification checks that will
                              trigger this repair when they fail.
    @property host_class      A string identifier that will be
                              used as a field to send repair metrics.
    @property status          Outcome of the most recent call to
                              `_repair_host()`; not set until that
                              method has been called.  One of
                              'blocked', 'skipped', 'repaired',
                              'repair_failure', 'verify_failure' or
                              'unknown'.
    """

    def __init__(self, tag, dependencies, triggers, host_class):
        """
        @param tag           Short identifier for this repair action.
        @param dependencies  Verifiers that must pass before the
                             triggers are checked.
        @param triggers      Verifiers whose failure triggers the
                             repair.
        @param host_class    String sent as the 'host_class' field of
                             repair metrics.
        """
        super(RepairAction, self).__init__(tag, 'repair', dependencies)
        self._trigger_list = triggers
        self._failure_modes_counter = metrics.Counter(
            'chromeos/autotest/repair/failure_modes')
        self._failure_detail_counter = metrics.Counter(
            'chromeos/autotest/repair/failure_detail')
        self.host_class = host_class

    def _record_start(self, host, silent):
        """Log a 'START' status line.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        """
        self._record(host, silent, 'START')

    def _record_end_good(self, host, silent):
        """Log an 'END GOOD' status line, and set status to 'repaired'.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        """
        self._record(host, silent, 'END GOOD')
        self.status = 'repaired'

    def _record_end_fail(self, host, silent, status, *args):
        """Log an 'END FAIL' status line, and set `self.status`.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        @param status       Value to be stored in `self.status`.
        @param args         Extra arguments to `self._record()`
        """
        self._record(host, silent, 'END FAIL', *args)
        self.status = status

    def _send_failure_metrics(self, host, error, stage):
        """Send failure mode metrics to monarch

        @param host   Host which this RepairAction targeted to.
        @param error  An exception that was caught in _repair_host.
        @param stage  In which stage we caught above exception.
                      Can be one of below value:
                        'dep'    during verify dependencies
                        'pre'    during pre-repair trigger verification
                        'repair' during repair() process itself
                        'post'   during post-repair trigger verification
        """

        def get_fields(vf_tag):
            fields = {
                'ra_tag': self.tag,
                'vf_tag': vf_tag,
                'hostname': _filter_metrics_hostname(host),
                'stage': stage,
                'host_class': self.host_class
            }
            return fields

        if isinstance(error, AutoservVerifyDependencyError):
            # We'll catch all failure tags here for a dependencies error
            for f in error.failures:
                self._failure_modes_counter.increment(fields=get_fields(f.tag))
        else:
            # When there is failure during repair or unknown failure. there
            # will be no Verifier, so vf_tag set to 'unknown'.
            self._failure_modes_counter.increment(fields=get_fields('unknown'))

        if stage == 'repair':
            self._send_failure_detail(error)

    def _send_failure_detail(self, error):
        """Send reason of failure inside repair() to monarch.

        @param error    The exception caught inside repair().
        """
        tag = error.tag if isinstance(error, AutoservRepairError) else 'unknown'
        fields = {'repair_action_tag': self.tag, 'repair_failure_tag': tag}
        self._failure_detail_counter.increment(fields=fields)

    def _repair_host(self, host, silent):
        """
        Apply this repair action if any triggers fail.

        Repair is triggered when all dependencies are successful, and at
        least one trigger fails.

        If the `repair()` method triggers, the success or failure of
        this operation is logged in `status.log` bracketed by 'START'
        and 'END' records.  Details of whether or why `repair()`
        triggered are written to the debug logs.  If repair doesn't
        trigger, nothing is logged to `status.log`.

        @param host     The host to be repaired.
        @param silent   If true, don't log host status records.

        @raises AutoservRepairError   Raised when `repair()` returned
                        normally, but at least one trigger still fails.
        """
        # Note:  Every exit path from the method must set `self.status`.
        # There's a lot of exit paths, so be careful.
        #
        # If we're blocked by a failed dependency, we exit with an
        # exception.  So set status to 'blocked' first.
        self.status = 'blocked'
        try:
            self._verify_dependencies(host, silent)
        except Exception as dep_error:
            self._send_failure_metrics(host, dep_error, 'dep')
            raise
        # This is a defensive action.  Every path below should overwrite
        # this setting, but if it doesn't, we want our status to reflect
        # a coding error.
        self.status = 'unknown'
        # Each handler below binds its exception to a distinct name so
        # that the metrics for one stage can never pick up the
        # exception that belongs to a different stage.
        try:
            self._verify_list(host, self._trigger_list, silent)
        except AutoservVerifyDependencyError as trigger_error:
            trigger_error.log_dependencies(
                    'Attempting this repair action',
                    'Repairing because these triggers failed')
            self._send_failure_metrics(host, trigger_error, 'pre')
            self._record_start(host, silent)
            try:
                self.repair(host)
            except Exception as repair_error:
                logging.exception('Repair failed: %s', self.description)
                self._record_fail(host, silent, repair_error)
                self._record_end_fail(host, silent, 'repair_failure')
                self._send_failure_metrics(host, repair_error, 'repair')
                raise
            try:
                # Drop cached results so the triggers really re-run.
                for v in self._trigger_list:
                    v._reverify()
                self._verify_list(host, self._trigger_list, silent)
                self._record_end_good(host, silent)
            except AutoservVerifyDependencyError as post_error:
                post_error.log_dependencies(
                        'This repair action reported success',
                        'However, these triggers still fail')
                self._record_end_fail(host, silent, 'verify_failure')
                self._send_failure_metrics(host, post_error, 'post')
                raise AutoservRepairError(
                        'Some verification checks still fail', 'post_verify')
            except Exception as internal_error:
                # The specification for `self._verify_list()` says
                # that this can't happen; this is a defensive
                # precaution.
                self._record_end_fail(host, silent, 'unknown',
                                      'Internal error in repair')
                self._send_failure_metrics(host, internal_error, 'post')
                raise
        else:
            self.status = 'skipped'
            logging.info('No failed triggers, skipping repair:  %s',
                         self.description)

    def repair(self, host):
        """
        Apply this repair action to the given host.

        This method is responsible for applying changes to fix failures
        in one or more verification checks.  The repair is considered
        successful if the DUT passes the specific checks after this
        method completes.

        Implementations indicate failure by raising an exception.  The
        exception text should be a short, 1-line summary of the error.
        The text should be concise and diagnostic, as it will appear in
        `status.log` files.

        If this method completes successfully, it returns without
        raising any exception.

        Implementations should avoid most logging actions, but can log
        DEBUG level messages if they provide significant information for
        diagnosing failures.

        @param host   The host to be repaired.
        """
        raise NotImplementedError('Class %s does not implement '
                                  'repair()' % type(self).__name__)


class _RootVerifier(Verifier):
    """
    Utility class used by `RepairStrategy`.

    A node of this class by itself does nothing; it always passes (if it
    can run).  This class exists merely to be the root of a DAG of
    dependencies in an instance of `RepairStrategy`.
    """

    def verify(self, host):
        """Do nothing; the root passes whenever its dependencies do."""
        pass

    @property
    def description(self):
        """Return the fixed description of the root node."""
        return 'All host verification checks pass'


class RepairStrategy(object):
    r"""
    A class for organizing `Verifier` and `RepairAction` objects.

    An instance of `RepairStrategy` is organized as a DAG of `Verifier`
    objects, plus a list of `RepairAction` objects.  The class provides
    methods for invoking those objects in the required order, when
    needed:
      * The `verify()` method walks the verifier DAG in dependency
        order.
      * The `repair()` method invokes the repair actions in list order.
        Each repair action will invoke its dependencies and triggers as
        needed.

    # The Verifier DAG
    The verifier DAG is constructed from the first argument passed to
    the `RepairStrategy` constructor.  That argument is an iterable
    consisting of three-element tuples in the form
    `(constructor, tag, deps)`:
      * The `constructor` value is a callable that creates a `Verifier`
        as for the interface of the class constructor.  For classes
        that inherit the default constructor from `Verifier`, this can
        be the class itself.
      * The `tag` value is the tag to be associated with the constructed
        verifier.
      * The `deps` value is an iterable (e.g. list or tuple) of strings.
        Each string corresponds to the `tag` member of a `Verifier`
        dependency.

    The tag names of verifiers in the constructed DAG must all be
    unique.  The tag name defined by `RepairStrategy.ROOT_TAG` is
    reserved and may not be used by any verifier.

    In the input data for the constructor, dependencies must appear
    before the nodes that depend on them.  Thus:

        ((A, 'a', ()), (B, 'b', ('a',)))     # This is valid
        ((B, 'b', ('a',)), (A, 'a', ()))     # This will fail!

    Internally, the DAG of verifiers is given a unique root node.  So,
    given this input:

        ((C, 'c', ()),
         (A, 'a', ('c',)),
         (B, 'b', ('c',)))

    The following DAG is constructed:

          Root
          /  \
         A    B
          \  /
           C

    Since nothing depends on `A` or `B`, the root node guarantees that
    these two verifiers will both be called and properly logged.

    The root node is not directly accessible; however repair actions can
    trigger on it by using `RepairStrategy.ROOT_TAG`.  Additionally, the
    node will be logged in `status.log` whenever `verify()` succeeds.

    # The Repair Actions List
    The list of repair actions is constructed from the second argument
    passed to the `RepairStrategy` constructor.  That argument is an
    iterable consisting of four-element tuples in the form
    `(constructor, tag, deps, triggers)`:
      * The `constructor` value is a callable that creates a
        `RepairAction` as for the interface of the class constructor.
        For classes that inherit the default constructor from
        `RepairAction`, this can be the class itself.
      * The `tag` value is the tag to be associated with the constructed
        repair action.
      * The `deps` value is an iterable (e.g. list or tuple) of strings.
        Each string corresponds to the `tag` member of a `Verifier` that
        the repair action depends on.
      * The `triggers` value is an iterable (e.g. list or tuple) of
        strings.  Each string corresponds to the `tag` member of a
        `Verifier` that can trigger the repair action.

    `RepairStrategy` deps and triggers can only refer to verifiers,
    not to other repair actions.
    """

    # This name is reserved; clients may not use it.
    ROOT_TAG = 'PASS'

    @staticmethod
    def _add_verifier(verifiers, constructor, tag, dep_tags):
        """
        Construct and remember a verifier.

        Create a `Verifier` using `constructor` and `tag`.  Dependencies
        for construction are found by looking up `dep_tags` in the
        `verifiers` dictionary.

        After construction, the new verifier is added to `verifiers`.

        @param verifiers    Dictionary of verifiers, indexed by tag.
        @param constructor  Verifier construction function.
        @param tag          Tag parameter for the construction function.
        @param dep_tags     Tags of dependencies for the constructor, to
                            be found in `verifiers`.
        """
        assert tag not in verifiers
        deps = [verifiers[d] for d in dep_tags]
        verifiers[tag] = constructor(tag, deps)

    def __init__(self, verifier_data, repair_data, host_class):
        """
        Construct a `RepairStrategy` from simplified DAG data.

        The input `verifier_data` object describes how to construct
        verify nodes and the dependencies that relate them, as detailed
        above.

        The input `repair_data` object describes how to construct repair
        actions and their dependencies and triggers, as detailed above.

        @param verifier_data  Iterable value with constructors for the
                              elements of the verification DAG and their
                              dependencies.
        @param repair_data    Iterable value with constructors for the
                              elements of the repair action list, and
                              their dependencies and triggers.
        @param host_class     A string identifier that identifies what
                              class of host this repair strategy
                              targets; used as a field when sending
                              repair metrics.
        """
        # Metrics - we report on 'actions' for every repair action
        # we execute; we report on 'strategy' for every complete
        # repair operation.
        self._strategy_counter = metrics.Counter(
            'chromeos/autotest/repair/repair_strategy_v2')
        self._actions_counter = metrics.Counter(
            'chromeos/autotest/repair/repair_actions')
        self.host_class = host_class
        # We use the `all_tags` list to guarantee that our root
        # verifier will execute its dependencies in the order provided
        # to us by our caller.
        verifier_map = {}
        all_tags = []
        dependencies = set()
        for constructor, tag, deps in verifier_data:
            self._add_verifier(verifier_map, constructor, tag, deps)
            dependencies.update(deps)
            all_tags.append(tag)
        # Capture all the verifiers that have nothing depending on them.
        root_tags = [t for t in all_tags if t not in dependencies]
        self._add_verifier(verifier_map, _RootVerifier,
                           self.ROOT_TAG, root_tags)
        self._verify_root = verifier_map[self.ROOT_TAG]
        self._repair_actions = []
        for constructor, tag, deps, triggers in repair_data:
            r = constructor(tag,
                            [verifier_map[d] for d in deps],
                            [verifier_map[t] for t in triggers],
                            self.host_class)
            self._repair_actions.append(r)

    def _send_strategy_metrics(self, host, result):
        """Send repair strategy metrics to monarch

        @param host     The target to be repaired.
        @param result   A String that describes the final result for
                        the RepairStrategy.
        """
        info = host.host_info_store.get()
        board = info.board if info.board else 'unknown'
        model = info.model if info.model else 'unknown'
        fields = {
            'board': board,
            'host_class': self.host_class,
            'hostname': _filter_metrics_hostname(host),
            'model': model,
            'result': result,
        }
        self._strategy_counter.increment(fields=fields)

    def _send_action_metrics(self, host, ra):
        """Send repair action metrics to monarch

        @param host     The target to be repaired.
        @param ra       A RepairAction instance.
        """
        fields = {
            'tag': ra.tag,
            'status': ra.status,
            'hostname': _filter_metrics_hostname(host),
            'host_class': self.host_class
        }
        self._actions_counter.increment(fields=fields)

    def verify(self, host, silent=False):
        """
        Run the verifier DAG on the given host.

        @param host     The target to be verified.
        @param silent   If true, don't log host status records.
        """
        self._verify_root._reverify()
        self._verify_root._verify_host(host, silent)

    def repair(self, host, silent=False):
        """
        Run the repair list on the given host.

        Every repair action is given a chance to run, in list order;
        a failure of one action doesn't stop the others.  Afterwards
        the full verifier DAG is checked, and any failure from that
        check is raised to the caller.

        @param host     The target to be repaired.
        @param silent   If true, don't log host status records.
        """
        self._verify_root._reverify()
        attempted = False
        for ra in self._repair_actions:
            try:
                logging.debug('Start repair task: %s.', type(ra).__name__)
                ra._repair_host(host, silent)
            except Exception:
                # all logging and exception handling was done at
                # lower levels
                pass
            finally:
                self._send_action_metrics(host, ra)
                logging.debug('Finished repair task: %s.', type(ra).__name__)
                if ra.status not in ('skipped', 'blocked'):
                    attempted = True

        result = 'failure'
        try:
            self._verify_root._verify_host(host, silent)
            result = 'success' if attempted else 'not_attempted'
        except:
            # Deliberately bare:  we only classify the result for the
            # metrics sent below, and always re-raise.
            if not attempted:
                result = 'attempt_blocked'
            raise
        finally:
            self._send_strategy_metrics(host, result)


def _filter_metrics_hostname(host):
    """
    Restrict format of hostnames we'll send to monarch

    @param host    A host instance (i.e. ServoHost, CrosHost)

    @return `host.hostname` if it starts with text matching
            `_HOSTNAME_PATTERN`, otherwise `_DISALLOWED_HOSTNAME`.
    """
    if re.match(_HOSTNAME_PATTERN, host.hostname):
        return host.hostname
    else:
        return _DISALLOWED_HOSTNAME