# Copyright 2016 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""
Framework for host verification and repair in Autotest.

The framework provides implementation code in support of `Host.verify()`
and `Host.repair()` used in Verify and Repair special tasks.

The framework consists of these classes:
  * `Verifier`: A class representing a single verification check.
  * `RepairAction`: A class representing a repair operation that can fix
    a failed verification check.
  * `RepairStrategy`: A class for organizing a collection of `Verifier`
    and `RepairAction` instances, and invoking them in order.

Individual operations during verification and repair are handled by
instances of `Verifier` and `RepairAction`.  `Verifier` objects are
meant to test for specific conditions that may cause tests to fail.
`RepairAction` objects provide operations designed to fix one or
more failures identified by a `Verifier` object.
"""

import collections
import logging
import re

import common
from autotest_lib.client.common_lib import error

try:
    from chromite.lib import metrics
except ImportError:
    from autotest_lib.client.bin.utils import metrics_mock as metrics

# Regular expression pattern to filter out unwanted hostnames.  Only
# hostnames matching this pattern are reported in metrics; anything else
# is replaced with `_DISALLOWED_HOSTNAME`.
_HOSTNAME_PATTERN = 'chromeos[0-9]-row[0-9]+[a-z]?-rack[0-9]+[a-z]?-host[0-9]+'
_DISALLOWED_HOSTNAME = 'disallowed_hostname'


class AutoservVerifyError(error.AutoservError):
    """
    Generic Exception for failures from `Verifier` objects.

    Instances of this exception can be raised when a `verify()`
    method fails, if no more specific exception is available.
    """
    pass


# Record of one failed dependency:
#   dependency - description of the failed verifier.
#   error      - string value of the exception the verifier raised.
#   tag        - tag of the failed verifier (used as a metrics field).
_DependencyFailure = collections.namedtuple(
        '_DependencyFailure', ('dependency', 'error', 'tag'))


class AutoservVerifyDependencyError(error.AutoservError):
    """
    Exception raised for failures in dependencies.

    This exception is used to distinguish an original failure from a
    failure being passed back from a verification dependency.  That is,
    if 'B' depends on 'A', and 'A' fails, 'B' will raise this exception
    to signal that the original failure is further down the dependency
    chain.

    The `failures` argument to the constructor for this class is a set
    of instances of `_DependencyFailure`, each corresponding to one
    failed dependency:
      * The `dependency` attribute of each failure is the description
        of the failed dependency.
      * The `error` attribute of each failure is the string value of
        the exception from the failed dependency.
      * The `tag` attribute of each failure is the tag of the failed
        dependency.

    Multiple methods in this module recognize and handle this exception
    specially.

    @property failures  Set of failures passed to the constructor.
    @property _node     Instance of `_DependencyNode` reporting the
                        failed dependencies.
    """

    def __init__(self, node, failures):
        """
        Constructor for `AutoservVerifyDependencyError`.

        The exception message is the `error` text of every failure,
        joined with newlines.

        @param node       Instance of _DependencyNode reporting the
                          failed dependencies.
        @param failures   Collection of failure tuples as described
                          above.
        """
        super(AutoservVerifyDependencyError, self).__init__(
                '\n'.join([f.error for f in failures]))
        self.failures = failures
        self._node = node

    def log_dependencies(self, action, deps):
        """
        Log an `AutoservVerifyDependencyError`.

        This writes a short summary of the dependency failures captured
        in this exception, using standard Python logging.

        The passed in `action` string plus `self._node.description`
        are logged at INFO level.  The `action` argument should
        introduce or describe an action relative to `self._node`.

        The passed in `deps` string and the description of each failed
        dependency in `self` are logged at DEBUG level.  The `deps`
        argument is used to introduce the various failed dependencies.

        @param action   A string mentioning the action being logged
                        relative to `self._node`.
        @param deps     A string introducing the dependencies that
                        failed.
        """
        logging.info('%s: %s', action, self._node.description)
        logging.debug('%s:', deps)
        for failure in self.failures:
            logging.debug('    %s', failure.dependency)


class AutoservRepairError(error.AutoservError):
    """
    Generic Exception for failures from `RepairAction` objects.

    Instances of this exception can be raised when a `repair()`
    method fails, if no more specific exception is available.

    @property tag   A short identifier used for metric purposes.
    """

    def __init__(self, description, tag):
        """
        @param description  Message describing the exception.
        @param tag          A short identifier used for metric purposes.
        """
        super(AutoservRepairError, self).__init__(description)
        self.tag = tag


class _DependencyNode(object):
    """
    An object that can depend on verifiers.

    Both repair and verify operations have the notion of dependencies
    that must pass before the operation proceeds.  This class captures
    the shared behaviors required by both classes.

    @property tag               Short identifier to be used in logging.
    @property description       Text summary of this node's action, to be
                                used in debug logs.
    @property _dependency_list  Dependency pre-requisites.
    """

    def __init__(self, tag, record_type, dependencies):
        """
        @param tag           Short identifier for this node.
        @param record_type   Prefix combined with `tag` (as
                             `record_type.tag`) to form the operation
                             name used in status records.
        @param dependencies  List of verifiers that must pass before
                             this node's operation runs.
        """
        self._dependency_list = dependencies
        self._tag = tag
        self._record_tag = record_type + '.' + tag

    def _record(self, host, silent, status_code, *record_args):
        """
        Log a status record for `host`.

        Call `host.record()` using the given status_code, and
        operation tag `self._record_tag`, plus any extra arguments in
        `record_args`.  Do nothing if `silent` is a true value.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        @param status_code  Value for the `status_code` parameter to
                            `host.record()`.
        @param record_args  Additional arguments to pass to
                            `host.record()`.
        """
        if not silent:
            host.record(status_code, None, self._record_tag,
                        *record_args)

    def _record_good(self, host, silent):
        """Log a 'GOOD' status line.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        """
        self._record(host, silent, 'GOOD')

    def _record_fail(self, host, silent, exc):
        """Log a 'FAIL' status line.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        @param exc          Exception describing the cause of failure.
        """
        self._record(host, silent, 'FAIL', str(exc))

    def _verify_list(self, host, verifiers, silent):
        """
        Test a list of verifiers against a given host.

        This invokes `_verify_host()` on every verifier in the given
        list.  If any verifier in the transitive closure of dependencies
        in the list fails, an `AutoservVerifyDependencyError` is raised
        containing the description of each failed verifier.  Only
        original failures are reported; verifiers that don't run due
        to a failed dependency are omitted.

        By design, original failures are logged once in `_verify_host()`
        when `verify()` originally fails.  The additional data gathered
        here is for the debug logs to indicate why a subsequent
        operation never ran.

        @param host       The host to be tested against the verifiers.
        @param verifiers  List of verifiers to be checked.
        @param silent     If true, don't log host status records.

        @raises AutoservVerifyDependencyError   Raised when at least
                        one verifier in the list has failed.
        """
        failures = set()
        for v in verifiers:
            try:
                v._verify_host(host, silent)
            except AutoservVerifyDependencyError as e:
                # `v` never ran; pass along the original failures.
                failures.update(e.failures)
            except Exception as e:
                # `v` itself failed; this is an original failure.
                failures.add(_DependencyFailure(v.description, str(e), v.tag))
        if failures:
            raise AutoservVerifyDependencyError(self, failures)

    def _verify_dependencies(self, host, silent):
        """
        Verify that all of this node's dependencies pass for a host.

        @param host     The host to be verified.
        @param silent   If true, don't log host status records.

        @raises AutoservVerifyDependencyError   Raised (after logging
                        the failed dependencies) when at least one
                        dependency has failed.
        """
        try:
            self._verify_list(host, self._dependency_list, silent)
        except AutoservVerifyDependencyError as e:
            e.log_dependencies(
                    'Skipping this operation',
                    'The following dependencies failed')
            raise

    @property
    def tag(self):
        """
        Tag for use in logging status records.

        This is a property with a short string used to identify the node
        in the 'status.log' file and during node construction.  The tag
        should contain only letters, digits, and '_' characters.  This
        tag is not used alone, but is combined with other identifiers,
        based on the operation being logged.

        @return A short identifier-like string.
        """
        return self._tag

    @property
    def description(self):
        """
        Text description of this node for log messages.

        This string will be logged with failures, and should describe
        the condition required for success.

        N.B. Subclasses are required to override this method, but we
        _don't_ raise NotImplementedError here.  Various methods fail in
        inscrutable ways if this method raises any exception, so for
        debugging purposes, it's better to return a default value.

        @return A descriptive string.
        """
        return ('Class %s fails to implement description().' %
                type(self).__name__)


class Verifier(_DependencyNode):
    """
    Abstract class embodying one verification check.

    A concrete subclass of `Verifier` provides a simple check that can
    determine a host's fitness for testing.  Failure indicates that the
    check found a problem that can cause at least one test to fail.

    `Verifier` objects are organized in a DAG identifying dependencies
    among operations.  The DAG controls ordering and prevents wasted
    effort:  If verification operation V2 requires that verification
    operation V1 pass, then a) V1 will run before V2, and b) if V1
    fails, V2 won't run at all.  The `_verify_host()` method ensures
    that all dependencies run and pass before invoking the `verify()`
    method.

    A `Verifier` object caches its result the first time it calls
    `verify()`.  Subsequent calls return the cached result, without
    re-running the check code.  The `_reverify()` method clears the
    cached result in the current node, and in all dependencies.

    Subclasses must supply these properties and methods:
      * `verify()`: This is the method to perform the actual
        verification check.
      * `description`:  A one-line summary of the verification check for
        debug log messages.

    Subclasses must override all of the above attributes; subclasses
    should not override or extend any other attributes of this class.

    The description string should be a simple sentence explaining what
    must be true for the verifier to pass.  Do not include a terminating
    period.  For example:

        Host is available via ssh

    The base class manages the following private data:
      * `_result`:  The cached result of verification.
      * `_dependency_list`:  The list of dependencies.
    Subclasses should not use these attributes.

    @property _result           Cached result of verification.
    """

    def __init__(self, tag, dependencies):
        """
        @param tag           Short identifier for this verifier.
        @param dependencies  List of verifiers that must pass before
                             this verifier runs.
        """
        super(Verifier, self).__init__(tag, 'verify', dependencies)
        # `_result` is one of:
        #   None      - no result cached.
        #   False     - verification was started but `verify()` did not
        #               complete (e.g. a dependency failed).
        #   True      - `verify()` passed.
        #   Exception - `verify()` failed with this exception.
        self._result = None

    def _reverify(self):
        """
        Discard cached verification results.

        Reset the cached verification result for this node, and for the
        transitive closure of all dependencies.
        """
        # A node with no cached result is not descended into, so a
        # shared dependency is only walked once per reset.
        if self._result is not None:
            self._result = None
            for v in self._dependency_list:
                v._reverify()

    def _verify_host(self, host, silent):
        """
        Determine the result of verification, and log results.

        If this verifier does not have a cached verification result,
        check dependencies, and if they pass, run `verify()`.  Log
        informational messages regarding failed dependencies.  If we
        call `verify()`, log the result in `status.log`.

        If we already have a cached result, return that result without
        logging any message.

        @param host     The host to be tested for a problem.
        @param silent   If true, don't log host status records.
        """
        if self._result is not None:
            if isinstance(self._result, Exception):
                raise self._result  # cached failure
            elif self._result:
                return              # cached success
        # Mark this node as visited so that `_reverify()` will reset it
        # (and its dependencies) even if a dependency fails below.
        self._result = False
        self._verify_dependencies(host, silent)
        logging.info('Verifying this condition: %s', self.description)
        try:
            self.verify(host)
            self._record_good(host, silent)
        except Exception as e:
            logging.exception('Failed: %s', self.description)
            self._result = e
            self._record_fail(host, silent, e)
            raise
        self._result = True

    def verify(self, host):
        """
        Unconditionally perform a verification check.

        This method is responsible for testing for a single problem on a
        host.  Implementations should follow these guidelines:
          * The check should find a problem that will cause testing to
            fail.
          * Verification checks on a working system should run quickly
            and should be optimized for success; a check that passes
            should finish within seconds.
          * Verification checks are not expected to have side effects,
            but may apply trivial fixes if they will finish within the
            time constraints above.

        A verification check should normally trigger a single set of
        repair actions.  If two different failures can require two
        different repairs, ideally they should use two different
        subclasses of `Verifier`.

        Implementations indicate failure by raising an exception.  The
        exception text should be a short, 1-line summary of the error.
        The text should be concise and diagnostic, as it will appear in
        `status.log` files.

        If this method finds no problems, it returns without raising any
        exception.

        Implementations should avoid most logging actions, but can log
        DEBUG level messages if they provide significant information for
        diagnosing failures.

        @param host   The host to be tested for a problem.
        """
        raise NotImplementedError('Class %s does not implement '
                                  'verify()' % type(self).__name__)


class RepairAction(_DependencyNode):
    """
    Abstract class embodying one repair procedure.

    A `RepairAction` is responsible for fixing one or more failed
    `Verifier` checks, in order to make those checks pass.

    Each repair action includes one or more verifier triggers that
    determine when the repair action should run.  A repair action
    will call its `repair()` method if one or more of its triggers
    fails.  A repair action is successful if all of its triggers pass
    after calling `repair()`.

    A `RepairAction` is a subclass of `_DependencyNode`; if any of a
    repair action's dependencies fail, the action does not check its
    triggers, and doesn't call `repair()`.

    Subclasses must supply these attributes:
      * `repair()`: This is the method to perform the necessary
        repair.  The method should avoid most logging actions, but
        can log DEBUG level messages if they provide significant
        information for diagnosing failures.
      * `description`:  A one-line summary of the repair action for
        debug log messages.

    Subclasses must override both of the above attributes and should
    not override any other attributes of this class.

    The description string should be a simple sentence explaining the
    operation that will be performed.  Do not include a terminating
    period.  For example:

        Re-install the stable build via AU

    @property _trigger_list   List of verification checks that will
                              trigger this repair when they fail.
    @property host_class      A string identifier that will be
                              used as a field to send repair metrics.
    @property status          Outcome of the most recent call to
                              `_repair_host()`; one of 'blocked',
                              'skipped', 'repaired', 'repair_failure',
                              'verify_failure' or 'unknown'.  Not set
                              until `_repair_host()` has been called.
    """

    def __init__(self, tag, dependencies, triggers, host_class):
        """
        @param tag           Short identifier for this repair action.
        @param dependencies  List of verifiers that must pass before
                             this action may run.
        @param triggers      List of verifiers whose failure triggers
                             this action.
        @param host_class    String identifier used as a field when
                             sending repair metrics.
        """
        super(RepairAction, self).__init__(tag, 'repair', dependencies)
        self._trigger_list = triggers
        self._failure_modes_counter = metrics.Counter(
            'chromeos/autotest/repair/failure_modes')
        self._failure_detail_counter = metrics.Counter(
            'chromeos/autotest/repair/failure_detail')
        self.host_class = host_class

    def _record_start(self, host, silent):
        """Log a 'START' status line.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        """
        self._record(host, silent, 'START')

    def _record_end_good(self, host, silent):
        """Log an 'END GOOD' status line.

        Also sets `self.status` to 'repaired'.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        """
        self._record(host, silent, 'END GOOD')
        self.status = 'repaired'

    def _record_end_fail(self, host, silent, status, *args):
        """Log an 'END FAIL' status line.

        Also sets `self.status` to the given `status`.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        @param status       Value to be assigned to `self.status`.
        @param args         Extra arguments to `self._record()`
        """
        self._record(host, silent, 'END FAIL', *args)
        self.status = status

    def _send_failure_metrics(self, host, error, stage):
        """Send failure mode metrics to monarch

        @param host         Host which this RepairAction targeted to.
        @param error        An exception that was caught in
                            `_repair_host()`.
        @param stage        In which stage we caught above exception.
                            Can be one of below value:
                                'dep'    during verify dependencies
                                'pre'    during pre-repair trigger
                                         verification
                                'repair' during repair() process itself
                                'post'   during post-repair trigger
                                         verification
        """

        def get_fields(vf_tag):
            """Build the metric fields for one verifier tag."""
            fields = {
                'ra_tag': self.tag,
                'vf_tag': vf_tag,
                'hostname': _filter_metrics_hostname(host),
                'stage': stage,
                'host_class': self.host_class
            }
            return fields

        if isinstance(error, AutoservVerifyDependencyError):
            # We'll catch all failure tags here for a dependencies error
            for f in error.failures:
                self._failure_modes_counter.increment(fields=get_fields(f.tag))
        else:
            # When there is a failure during repair, or an unknown
            # failure, there will be no Verifier, so vf_tag is set to
            # 'unknown'.
            self._failure_modes_counter.increment(fields=get_fields('unknown'))

        if stage == 'repair':
            self._send_failure_detail(error)

    def _send_failure_detail(self, error):
        """Send reason of failure inside repair() to monarch.

        @param error    The exception caught inside repair().
        """
        # Only `AutoservRepairError` carries a failure tag.
        tag = error.tag if isinstance(error, AutoservRepairError) else 'unknown'
        fields = {'repair_action_tag': self.tag, 'repair_failure_tag': tag}
        self._failure_detail_counter.increment(fields=fields)

    def _repair_host(self, host, silent):
        """
        Apply this repair action if any triggers fail.

        Repair is triggered when all dependencies are successful, and at
        least one trigger fails.

        If the `repair()` method triggers, the success or failure of
        this operation is logged in `status.log` bracketed by 'START'
        and 'END' records.  Details of whether or why `repair()`
        triggered are written to the debug logs.   If repair doesn't
        trigger, nothing is logged to `status.log`.

        @param host     The host to be repaired.
        @param silent   If true, don't log host status records.

        @raises AutoservVerifyDependencyError   Raised when a dependency
                        of this action fails.
        @raises AutoservRepairError   Raised when `repair()` returns
                        normally but some trigger still fails.
        """
        # Note:  Every exit path from the method must set `self.status`.
        # There's a lot of exit paths, so be careful.
        #
        # If we're blocked by a failed dependency, we exit with an
        # exception.  So set status to 'blocked' first.
        self.status = 'blocked'
        try:
            self._verify_dependencies(host, silent)
        except Exception as e:
            self._send_failure_metrics(host, e, 'dep')
            raise
        # This is a defensive action.  Every path below should overwrite
        # this setting, but if it doesn't, we want our status to reflect
        # a coding error.
        self.status = 'unknown'
        try:
            self._verify_list(host, self._trigger_list, silent)
        except AutoservVerifyDependencyError as e:
            e.log_dependencies(
                    'Attempting this repair action',
                    'Repairing because these triggers failed')
            self._send_failure_metrics(host, e, 'pre')
            self._record_start(host, silent)
            try:
                self.repair(host)
            except Exception as e:
                logging.exception('Repair failed: %s', self.description)
                self._record_fail(host, silent, e)
                self._record_end_fail(host, silent, 'repair_failure')
                self._send_failure_metrics(host, e, 'repair')
                raise
            try:
                # Clear cached results so the triggers really re-run.
                for v in self._trigger_list:
                    v._reverify()
                self._verify_list(host, self._trigger_list, silent)
                self._record_end_good(host, silent)
            except AutoservVerifyDependencyError as e:
                e.log_dependencies(
                        'This repair action reported success',
                        'However, these triggers still fail')
                self._record_end_fail(host, silent, 'verify_failure')
                self._send_failure_metrics(host, e, 'post')
                raise AutoservRepairError(
                        'Some verification checks still fail', 'post_verify')
            except Exception as e:
                # The specification for `self._verify_list()` says
                # that this can't happen; this is a defensive
                # precaution.
                #
                # The exception must be bound here: otherwise `e` would
                # still refer to the pre-repair trigger failure from the
                # enclosing handler, and the metrics would report the
                # wrong failure.
                self._record_end_fail(host, silent, 'unknown',
                                      'Internal error in repair')
                self._send_failure_metrics(host, e, 'post')
                raise
        else:
            self.status = 'skipped'
            logging.info('No failed triggers, skipping repair:  %s',
                         self.description)

    def repair(self, host):
        """
        Apply this repair action to the given host.

        This method is responsible for applying changes to fix failures
        in one or more verification checks.  The repair is considered
        successful if the DUT passes the specific checks after this
        method completes.

        Implementations indicate failure by raising an exception.  The
        exception text should be a short, 1-line summary of the error.
        The text should be concise and diagnostic, as it will appear in
        `status.log` files.

        If this method completes successfully, it returns without
        raising any exception.

        Implementations should avoid most logging actions, but can log
        DEBUG level messages if they provide significant information for
        diagnosing failures.

        @param host   The host to be repaired.
        """
        raise NotImplementedError('Class %s does not implement '
                                  'repair()' % type(self).__name__)


class _RootVerifier(Verifier):
    """
    Utility class used by `RepairStrategy`.

    A node of this class by itself does nothing; it always passes (if it
    can run).  This class exists merely to be the root of a DAG of
    dependencies in an instance of `RepairStrategy`.
    """

    def verify(self, host):
        """Always pass; the root only exists to run its dependencies."""
        pass

    @property
    def description(self):
        """Description logged when the whole verifier DAG is checked."""
        return 'All host verification checks pass'


class RepairStrategy(object):
    r"""
    A class for organizing `Verifier` and `RepairAction` objects.

    An instance of `RepairStrategy` is organized as a DAG of `Verifier`
    objects, plus a list of `RepairAction` objects.  The class provides
    methods for invoking those objects in the required order, when
    needed:
      * The `verify()` method walks the verifier DAG in dependency
        order.
      * The `repair()` method invokes the repair actions in list order.
        Each repair action will invoke its dependencies and triggers as
        needed.

    # The Verifier DAG
    The verifier DAG is constructed from the first argument passed to
    the `RepairStrategy` constructor.  That argument is an iterable
    consisting of three-element tuples in the form
    `(constructor, tag, deps)`:
      * The `constructor` value is a callable that creates a `Verifier`
        as for the interface of the class constructor.  For classes
        that inherit the default constructor from `Verifier`, this can
        be the class itself.
      * The `tag` value is the tag to be associated with the constructed
        verifier.
      * The `deps` value is an iterable (e.g. list or tuple) of strings.
        Each string corresponds to the `tag` member of a `Verifier`
        dependency.

    The tag names of verifiers in the constructed DAG must all be
    unique.  The tag name defined by `RepairStrategy.ROOT_TAG` is
    reserved and may not be used by any verifier.

    In the input data for the constructor, dependencies must appear
    before the nodes that depend on them.  Thus:

        ((A, 'a', ()), (B, 'b', ('a',)))     # This is valid
        ((B, 'b', ('a',)), (A, 'a', ()))     # This will fail!

    Internally, the DAG of verifiers is given a unique root node.  So,
    given this input:

        ((C, 'c', ()),
         (A, 'a', ('c',)),
         (B, 'b', ('c',)))

    The following DAG is constructed:

          Root
          /  \
         A    B
          \  /
           C

    Since nothing depends on `A` or `B`, the root node guarantees that
    these two verifiers will both be called and properly logged.

    The root node is not directly accessible; however repair actions can
    trigger on it by using `RepairStrategy.ROOT_TAG`.  Additionally, the
    node will be logged in `status.log` whenever `verify()` succeeds.

    # The Repair Actions List
    The list of repair actions is constructed from the second argument
    passed to the `RepairStrategy` constructor.  That argument is an
    iterable consisting of four-element tuples in the form
    `(constructor, tag, deps, triggers)`:
      * The `constructor` value is a callable that creates a
        `RepairAction` as for the interface of the class constructor.
        For classes that inherit the default constructor from
        `RepairAction`, this can be the class itself.
      * The `tag` value is the tag to be associated with the constructed
        repair action.
      * The `deps` value is an iterable (e.g. list or tuple) of strings.
        Each string corresponds to the `tag` member of a `Verifier` that
        the repair action depends on.
      * The `triggers` value is an iterable (e.g. list or tuple) of
        strings.  Each string corresponds to the `tag` member of a
        `Verifier` that can trigger the repair action.

    `RepairStrategy` deps and triggers can only refer to verifiers,
    not to other repair actions.
    """

    # This name is reserved; clients may not use it.
    ROOT_TAG = 'PASS'

    @staticmethod
    def _add_verifier(verifiers, constructor, tag, dep_tags):
        """
        Construct and remember a verifier.

        Create a `Verifier` using `constructor` and `tag`.  Dependencies
        for construction are found by looking up `dep_tags` in the
        `verifiers` dictionary.

        After construction, the new verifier is added to `verifiers`.

        @param verifiers    Dictionary of verifiers, indexed by tag.
        @param constructor  Verifier construction function.
        @param tag          Tag parameter for the construction function.
        @param dep_tags     Tags of dependencies for the constructor, to
                            be found in `verifiers`.
        """
        assert tag not in verifiers
        deps = [verifiers[d] for d in dep_tags]
        verifiers[tag] = constructor(tag, deps)

    def __init__(self, verifier_data, repair_data, host_class):
        """
        Construct a `RepairStrategy` from simplified DAG data.

        The input `verifier_data` object describes how to construct
        verify nodes and the dependencies that relate them, as detailed
        above.

        The input `repair_data` object describes how to construct repair
        actions and their dependencies and triggers, as detailed above.

        @param verifier_data  Iterable value with constructors for the
                              elements of the verification DAG and their
                              dependencies.
        @param repair_data    Iterable value with constructors for the
                              elements of the repair action list, and
                              their dependencies and triggers.
        @param host_class     A string identifier that identifies what
                              class of host this repair strategy
                              targets; it is used as a field when
                              sending repair metrics.
        """
        # Metrics - we report on 'actions' for every repair action
        # we execute; we report on 'strategy' for every complete
        # repair operation.
        self._strategy_counter = metrics.Counter(
            'chromeos/autotest/repair/repair_strategy_v2')
        self._actions_counter = metrics.Counter(
            'chromeos/autotest/repair/repair_actions')
        self.host_class = host_class
        # We use the `all_tags` list to guarantee that our root
        # verifier will execute its dependencies in the order provided
        # to us by our caller.
        verifier_map = {}
        all_tags = []
        dependencies = set()
        for constructor, tag, deps in verifier_data:
            self._add_verifier(verifier_map, constructor, tag, deps)
            dependencies.update(deps)
            all_tags.append(tag)
        # Capture all the verifiers that have nothing depending on them.
        root_tags = [t for t in all_tags if t not in dependencies]
        self._add_verifier(verifier_map, _RootVerifier,
                           self.ROOT_TAG, root_tags)
        self._verify_root = verifier_map[self.ROOT_TAG]
        self._repair_actions = []
        for constructor, tag, deps, triggers in repair_data:
            r = constructor(tag,
                            [verifier_map[d] for d in deps],
                            [verifier_map[t] for t in triggers],
                            self.host_class)
            self._repair_actions.append(r)

    def _send_strategy_metrics(self, host, result):
        """Send repair strategy metrics to monarch

        @param host     The target to be repaired.
        @param result   A string that describes the final result for
                        the RepairStrategy.
        """
        info = host.host_info_store.get()
        board = info.board if info.board else 'unknown'
        model = info.model if info.model else 'unknown'
        fields = {
            'board': board,
            'host_class': self.host_class,
            'hostname': _filter_metrics_hostname(host),
            'model': model,
            'result': result,
        }
        self._strategy_counter.increment(fields=fields)

    def _send_action_metrics(self, host, ra):
        """Send repair action metrics to monarch

        @param host     The target to be repaired.
        @param ra       A `RepairAction` instance.
        """
        fields = {
            'tag': ra.tag,
            'status': ra.status,
            'hostname': _filter_metrics_hostname(host),
            'host_class': self.host_class
        }
        self._actions_counter.increment(fields=fields)

    def verify(self, host, silent=False):
        """
        Run the verifier DAG on the given host.

        @param host     The target to be verified.
        @param silent   If true, don't log host status records.
        """
        self._verify_root._reverify()
        self._verify_root._verify_host(host, silent)

    def repair(self, host, silent=False):
        """
        Run the repair list on the given host.

        Every repair action is given a chance to run, then the full
        verifier DAG is checked.  Any failure of that final check is
        propagated to the caller.

        @param host     The target to be repaired.
        @param silent   If true, don't log host status records.
        """
        self._verify_root._reverify()
        attempted = False
        for ra in self._repair_actions:
            try:
                ra._repair_host(host, silent)
            except Exception:
                # all logging and exception handling was done at
                # lower levels
                pass
            finally:
                self._send_action_metrics(host, ra)
                if ra.status not in ('skipped', 'blocked'):
                    attempted = True

        result = 'failure'
        try:
            self._verify_root._verify_host(host, silent)
            result = 'success' if attempted else 'not_attempted'
        except:
            # Deliberately bare: the exception is always re-raised; we
            # only adjust the metric result on the way out.
            if not attempted:
                result = 'attempt_blocked'
            raise
        finally:
            self._send_strategy_metrics(host, result)


def _filter_metrics_hostname(host):
    """
    Restrict format of hostnames we'll send to monarch

    @param host    A host instance (i.e. ServoHost, CrosHost)

    @return `host.hostname` if it starts with a match for
            `_HOSTNAME_PATTERN`, otherwise `_DISALLOWED_HOSTNAME`.
    """
    if re.match(_HOSTNAME_PATTERN, host.hostname):
        return host.hostname
    else:
        return _DISALLOWED_HOSTNAME