1# Copyright 2016 The Chromium OS Authors. All rights reserved. 2# Use of this source code is governed by a BSD-style license that can be 3# found in the LICENSE file. 4 5""" 6Framework for host verification and repair in Autotest. 7 8The framework provides implementation code in support of `Host.verify()` 9and `Host.repair()` used in Verify and Repair special tasks. 10 11The framework consists of these classes: 12 * `Verifier`: A class representing a single verification check. 13 * `RepairAction`: A class representing a repair operation that can fix 14 a failed verification check. 15 * `RepairStrategy`: A class for organizing a collection of `Verifier` 16 and `RepairAction` instances, and invoking them in order. 17 18Individual operations during verification and repair are handled by 19instances of `Verifier` and `RepairAction`. `Verifier` objects are 20meant to test for specific conditions that may cause tests to fail. 21`RepairAction` objects provide operations designed to fix one or 22more failures identified by a `Verifier` object. 23""" 24 25import collections 26import logging 27import re 28 29import common 30from autotest_lib.client.common_lib import error 31 32try: 33 from chromite.lib import metrics 34except ImportError: 35 from autotest_lib.client.bin.utils import metrics_mock as metrics 36 37#Regular experssion pattern to filter out unwanted hostname. 38_HOSTNAME_PATTERN = 'chromeos[0-9]-row[0-9]+[a-z]?-rack[0-9]+[a-z]?-host[0-9]+' 39_DISALLOWED_HOSTNAME = 'disallowed_hostname' 40 41# States of verifiers 42# True - verifier run and passed 43# False - verifier run and failed 44# None - verifier did not run or dependency failed 45VERIFY_SUCCESS = True 46VERIFY_FAILED = False 47VERIFY_NOT_RUN = None 48 49 50class AutoservVerifyError(error.AutoservError): 51 """ 52 Generic Exception for failures from `Verifier` objects. 53 54 Instances of this exception can be raised when a `verify()` 55 method fails, if no more specific exception is available. 
class AutoservVerifyDependencyError(error.AutoservError):
    """
    Exception raised for failures in dependencies.

    This exception is used to distinguish an original failure from a
    failure being passed back from a verification dependency.  That is,
    if 'B' depends on 'A', and 'A' fails, 'B' will raise this exception
    to signal that the original failure is further down the dependency
    chain.

    The `failures` argument to the constructor for this class is a set
    of instances of `_DependencyFailure` or
    `_NonCriticalDependencyFailure`, each corresponding to one failed
    dependency:
      * The `dependency` attribute of each failure is the description
        of the failed dependency.
      * The `error` attribute of each failure is the string value of
        the exception from the failed dependency.
      * The `tag` attribute of each failure is the tag of the failed
        dependency.

    Multiple methods in this module recognize and handle this exception
    specially.

    @property failures  Set of failures passed to the constructor.
    @property _node     Instance of `_DependencyNode` reporting the
                        failed dependencies.
    """

    def __init__(self, node, failures):
        """
        Constructor for `AutoservVerifyDependencyError`.

        The exception message is the `error` text of every failure,
        joined with newlines.

        @param node       Instance of _DependencyNode reporting the
                          failed dependencies.
        @param failures   Collection (a set when raised from within
                          this module) of failure tuples as described
                          above.
        """
        super(AutoservVerifyDependencyError, self).__init__(
                '\n'.join([f.error for f in failures]))
        self.failures = failures
        self._node = node

    def log_dependencies(self, action, deps):
        """
        Log an `AutoservVerifyDependencyError`.

        This writes a short summary of the dependency failures captured
        in this exception, using standard Python logging.

        The passed in `action` string plus `self._node.description`
        are logged at INFO level.  The `action` argument should
        introduce or describe an action relative to `self._node`.

        The passed in `deps` string and the description of each failed
        dependency in `self` are logged at DEBUG level.  The `deps`
        argument is used to introduce the various failed dependencies.

        @param action   A string mentioning the action being logged
                        relative to `self._node`.
        @param deps     A string introducing the dependencies that
                        failed.
        """
        logging.info('%s: %s', action, self._node.description)
        logging.debug('%s:', deps)
        for failure in self.failures:
            logging.debug('    %s', failure.dependency)

    def is_critical(self, silent=False):
        """
        Check if the error is considered to be critical to repair process.

        The error is critical as soon as one recorded failure is not a
        `_NonCriticalDependencyFailure`.  Non-critical failures visited
        before that point are reported with a warning unless `silent`
        is true.

        @param silent   If true, don't log the forgiven failures.
        @return True if at least one failure is critical, else False.
        """
        # The loop variable is deliberately not called `error`: that
        # name is the exception module imported at the top of the file.
        for failure in self.failures:
            if not isinstance(failure, _NonCriticalDependencyFailure):
                return True
            if not silent:
                logging.warning("%s is still failing but forgiven because"
                                " it raised a non-critical error.",
                                failure.tag)
        return False
164 """ 165 super(AutoservRepairError, self).__init__(description) 166 self.tag = tag 167 168 169class _DependencyNode(object): 170 """ 171 An object that can depend on verifiers. 172 173 Both repair and verify operations have the notion of dependencies 174 that must pass before the operation proceeds. This class captures 175 the shared behaviors required by both classes. 176 177 @property tag Short identifier to be used in logging. 178 @property description Text summary of this node's action, to be 179 used in debug logs. 180 @property _dependency_list Dependency pre-requisites. 181 """ 182 183 def __init__(self, tag, record_type, dependencies): 184 self._dependency_list = dependencies 185 self._tag = tag 186 self._record_tag = record_type + '.' + tag 187 188 def _is_applicable(self, host): 189 """ 190 Check if the action is applicable to target host. Subclasses 191 can override this method per their need. 192 193 @param host Target host to check. 194 @return A bool value. 195 """ 196 return True 197 198 def _record(self, host, silent, status_code, *record_args): 199 """ 200 Log a status record for `host`. 201 202 Call `host.record()` using the given status_code, and 203 operation tag `self._record_tag`, plus any extra arguments in 204 `record_args`. Do nothing if `silent` is a true value. 205 206 @param host Host which will record the status record. 207 @param silent Don't record the event if this is a true 208 value. 209 @param status_code Value for the `status_code` parameter to 210 `host.record()`. 211 @param record_args Additional arguments to pass to 212 `host.record()`. 213 """ 214 if not silent: 215 host.record(status_code, None, self._record_tag, 216 *record_args) 217 218 def _record_good(self, host, silent): 219 """Log a 'GOOD' status line. 220 221 @param host Host which will record the status record. 222 @param silent Don't record the event if this is a true 223 value. 
224 """ 225 self._record(host, silent, 'GOOD') 226 227 def _record_fail(self, host, silent, exc): 228 """Log a 'FAIL' status line. 229 230 @param host Host which will record the status record. 231 @param silent Don't record the event if this is a true 232 value. 233 @param exc Exception describing the cause of failure. 234 """ 235 self._record(host, silent, 'FAIL', str(exc)) 236 237 def _verify_list(self, host, verifiers, silent): 238 """ 239 Test a list of verifiers against a given host. 240 241 This invokes `_verify_host()` on every verifier in the given 242 list. If any verifier in the transitive closure of dependencies 243 in the list fails, an `AutoservVerifyDependencyError` is raised 244 containing the description of each failed verifier. Only 245 original failures are reported; verifiers that don't run due 246 to a failed dependency are omitted. 247 248 By design, original failures are logged once in `_verify_host()` 249 when `verify()` originally fails. The additional data gathered 250 here is for the debug logs to indicate why a subsequent 251 operation never ran. 252 253 @param host The host to be tested against the verifiers. 254 @param verifiers List of verifiers to be checked. 255 @param silent If true, don't log host status records. 256 257 @raises AutoservVerifyDependencyError Raised when at least 258 one verifier in the list has failed. 259 """ 260 failures = set() 261 for v in verifiers: 262 try: 263 v._verify_host(host, silent) 264 except AutoservNonCriticalVerifyError as e: 265 failures.add(_NonCriticalDependencyFailure(v.description, 266 str(e), v.tag)) 267 except AutoservVerifyDependencyError as e: 268 failures.update(e.failures) 269 except Exception as e: 270 failures.add(_DependencyFailure(v.description, str(e), v.tag)) 271 if failures: 272 raise AutoservVerifyDependencyError(self, failures) 273 274 def _verify_dependencies(self, host, silent): 275 """ 276 Verify that all of this node's dependencies pass for a host. 
277 278 @param host The host to be verified. 279 @param silent If true, don't log host status records. 280 """ 281 try: 282 self._verify_list(host, self._dependency_list, silent) 283 except AutoservVerifyDependencyError as e: 284 e.log_dependencies( 285 'Skipping this operation', 286 'The following dependencies failed') 287 raise 288 289 @property 290 def tag(self): 291 """ 292 Tag for use in logging status records. 293 294 This is a property with a short string used to identify the node 295 in the 'status.log' file and during node construction. The tag 296 should contain only letters, digits, and '_' characters. This 297 tag is not used alone, but is combined with other identifiers, 298 based on the operation being logged. 299 300 @return A short identifier-like string. 301 """ 302 return self._tag 303 304 @property 305 def description(self): 306 """ 307 Text description of this node for log messages. 308 309 This string will be logged with failures, and should describe 310 the condition required for success. 311 312 N.B. Subclasses are required to override this method, but we 313 _don't_ raise NotImplementedError here. Various methods fail in 314 inscrutable ways if this method raises any exception, so for 315 debugging purposes, it's better to return a default value. 316 317 @return A descriptive string. 318 """ 319 return ('Class %s fails to implement description().' % 320 type(self).__name__) 321 322 def _get_node_by_tag(self, tag): 323 """Find verifier by tag, recursive.""" 324 if self._tag == tag: 325 return self 326 for child in self._dependency_list: 327 node = child._get_node_by_tag(tag) 328 if node is not None: 329 return node 330 return None 331 332 333class Verifier(_DependencyNode): 334 """ 335 Abstract class embodying one verification check. 336 337 A concrete subclass of `Verifier` provides a simple check that can 338 determine a host's fitness for testing. 
Failure indicates that the 339 check found a problem that can cause at least one test to fail. 340 341 `Verifier` objects are organized in a DAG identifying dependencies 342 among operations. The DAG controls ordering and prevents wasted 343 effort: If verification operation V2 requires that verification 344 operation V1 pass, then a) V1 will run before V2, and b) if V1 345 fails, V2 won't run at all. The `_verify_host()` method ensures 346 that all dependencies run and pass before invoking the `verify()` 347 method. 348 349 A `Verifier` object caches its result the first time it calls 350 `verify()`. Subsequent calls return the cached result, without 351 re-running the check code. The `_reverify()` method clears the 352 cached result in the current node, and in all dependencies. 353 354 Subclasses must supply these properties and methods: 355 * `verify()`: This is the method to perform the actual 356 verification check. 357 * `description`: A one-line summary of the verification check for 358 debug log messages. 359 360 Subclasses must override all of the above attributes; subclasses 361 should not override or extend any other attributes of this class. 362 363 The description string should be a simple sentence explaining what 364 must be true for the verifier to pass. Do not include a terminating 365 period. For example: 366 367 Host is available via ssh 368 369 The base class manages the following private data: 370 * `_result`: The cached result of verification. 371 None - did not run 372 True - successful pass 373 Exception - fail during execution 374 * `_dependency_list`: The list of dependencies. 375 Subclasses should not use these attributes. 376 377 @property _result Cached result of verification. 378 """ 379 380 def __init__(self, tag, dependencies): 381 super(Verifier, self).__init__(tag, 'verify', dependencies) 382 self._result = None 383 384 def _reverify(self): 385 """ 386 Discard cached verification results. 
387 388 Reset the cached verification result for this node, and for the 389 transitive closure of all dependencies. 390 """ 391 self._result = None 392 for v in self._dependency_list: 393 v._reverify() 394 395 def _verify_host(self, host, silent): 396 """ 397 Determine the result of verification, and log results. 398 399 If this verifier does not have a cached verification result, 400 check dependencies, and if they pass, run `verify()`. Log 401 informational messages regarding failed dependencies. If we 402 call `verify()`, log the result in `status.log`. 403 404 If we already have a cached result, return that result without 405 logging any message. 406 407 @param host The host to be tested for a problem. 408 @param silent If true, don't log host status records. 409 """ 410 try: 411 if not self._is_applicable(host): 412 logging.info('Verify %s is not applicable to %s, skipping...', 413 self.description, host.hostname) 414 return 415 except Exception as e: 416 logging.error('Skipping %s verifier due to unexpect error during' 417 ' check applicability; %s', self.tag, e) 418 return 419 420 if self._result is not None: 421 if isinstance(self._result, Exception): 422 raise self._result # cached failure 423 elif self._result: 424 return # cached success 425 426 self._verify_dependencies(host, silent) 427 logging.info('Verifying this condition: %s', self.description) 428 try: 429 logging.debug('Start verify task: %s.', type(self).__name__) 430 self.verify(host) 431 self._record_good(host, silent) 432 except Exception as e: 433 message = 'Failed: %s' 434 if isinstance(e, AutoservNonCriticalVerifyError): 435 message = '(Non-critical)Failed: %s' 436 logging.exception(message, self.description) 437 self._result = e 438 self._record_fail(host, silent, e) 439 # Increase verifier fail count if device health profile is 440 # available to the host class. 
441 if hasattr(host, 'health_profile') and host.health_profile: 442 host.health_profile.insert_failed_verifier(self.tag) 443 raise 444 finally: 445 logging.debug('Finished verify task: %s.', type(self).__name__) 446 447 self._result = True 448 449 def verify(self, host): 450 """ 451 Unconditionally perform a verification check. 452 453 This method is responsible for testing for a single problem on a 454 host. Implementations should follow these guidelines: 455 * The check should find a problem that will cause testing to 456 fail. 457 * Verification checks on a working system should run quickly 458 and should be optimized for success; a check that passes 459 should finish within seconds. 460 * Verification checks are not expected have side effects, but 461 may apply trivial fixes if they will finish within the time 462 constraints above. 463 464 A verification check should normally trigger a single set of 465 repair actions. If two different failures can require two 466 different repairs, ideally they should use two different 467 subclasses of `Verifier`. 468 469 Implementations indicate failure by raising an exception. The 470 exception text should be a short, 1-line summary of the error. 471 The text should be concise and diagnostic, as it will appear in 472 `status.log` files. 473 474 If this method finds no problems, it returns without raising any 475 exception. 476 477 Implementations should avoid most logging actions, but can log 478 DEBUG level messages if they provide significant information for 479 diagnosing failures. 480 481 @param host The host to be tested for a problem. 
482 """ 483 raise NotImplementedError('Class %s does not implement ' 484 'verify()' % type(self).__name__) 485 486 def _is_good(self): 487 """Provide result of the verifier 488 489 @returns: a boolean or None value: 490 True - verifier passed 491 False - verifier did not pass 492 None - verifier did not run because it is not applicable 493 or blocked due to dependency failure 494 """ 495 if type(self._result) == type(True): 496 return self._result 497 elif isinstance(self._result, Exception): 498 return False 499 return None 500 501 502class RepairAction(_DependencyNode): 503 """ 504 Abstract class embodying one repair procedure. 505 506 A `RepairAction` is responsible for fixing one or more failed 507 `Verifier` checks, in order to make those checks pass. 508 509 Each repair action includes one or more verifier triggers that 510 determine when the repair action should run. A repair action 511 will call its `repair()` method if one or more of its triggers 512 fails. A repair action is successful if all of its triggers pass 513 after calling `repair()`. 514 515 A `RepairAction` is a subclass of `_DependencyNode`; if any of a 516 repair action's dependencies fail, the action does not check its 517 triggers, and doesn't call `repair()`. 518 519 Subclasses must supply these attributes: 520 * `repair()`: This is the method to perform the necessary 521 repair. The method should avoid most logging actions, but 522 can log DEBUG level messages if they provide significant 523 information for diagnosing failures. 524 * `description`: A one-line summary of the repair action for 525 debug log messages. 526 527 Subclasses must override both of the above attributes and should 528 not override any other attributes of this class. 529 530 The description string should be a simple sentence explaining the 531 operation that will be performed. Do not include a terminating 532 period. 
For example: 533 534 Re-install the stable build via AU 535 536 @property _trigger_list List of verification checks that will 537 trigger this repair when they fail. 538 @property host_class A string identifier that will be 539 used as a field to send repair metrics. 540 """ 541 542 def __init__(self, tag, dependencies, triggers, host_class): 543 super(RepairAction, self).__init__(tag, 'repair', dependencies) 544 self._trigger_list = triggers 545 self._failure_modes_counter = metrics.Counter( 546 'chromeos/autotest/repair/failure_modes') 547 self._failure_detail_counter = metrics.Counter( 548 'chromeos/autotest/repair/failure_detail') 549 self.host_class = host_class 550 551 def _record_start(self, host, silent): 552 """Log a 'START' status line. 553 554 @param host Host which will record the status record. 555 @param silent Don't record the event if this is a true 556 value. 557 """ 558 self._record(host, silent, 'START') 559 560 def _record_end_good(self, host, silent): 561 """Log an 'END GOOD' status line. 562 563 @param host Host which will record the status record. 564 @param silent Don't record the event if this is a true 565 value. 566 """ 567 self._record(host, silent, 'END GOOD') 568 self.status = 'repaired' 569 570 def _record_end_fail(self, host, silent, status, *args): 571 """Log an 'END FAIL' status line. 572 573 @param host Host which will record the status record. 574 @param silent Don't record the event if this is a true 575 value. 576 @param args Extra arguments to `self._record()` 577 """ 578 self._record(host, silent, 'END FAIL', *args) 579 self.status = status 580 581 def _send_failure_metrics(self, host, error, stage): 582 """Send failure mode metrics to monarch 583 584 @param host Host which this RepairAction targeted to. 585 @param error An exception that caught in _repair_host. 586 @param stage In which stage we caught above exception. 
587 Can be one of below value: 588 'dep' during verify dependencies 589 'pre' during pre-repair trigger verification 590 'repair' during repair() process itself 591 'post' during post-repair trigger verification 592 """ 593 594 def get_fields(vf_tag): 595 fields = { 596 'ra_tag': self.tag, 597 'vf_tag': vf_tag, 598 'hostname': _filter_metrics_hostname(host), 599 'stage': stage, 600 'host_class': self.host_class 601 } 602 return fields 603 604 if isinstance(error, AutoservVerifyDependencyError): 605 # We'll catch all failure tags here for a dependencies error 606 for f in error.failures: 607 self._failure_modes_counter.increment(fields=get_fields(f.tag)) 608 else: 609 # When there is failure during repair or unknown failure. there 610 # will be no Verifier, so vf_tag set to 'unknown'. 611 self._failure_modes_counter.increment(fields=get_fields('unknown')) 612 613 if stage == 'repair': 614 self._send_failure_detail(error) 615 616 def _send_failure_detail(self, error): 617 """Send reason of failure inside repair() to monarch. 618 619 @param error The exception caught inside repair(). 620 """ 621 tag = error.tag if isinstance(error, AutoservRepairError) else 'unknown' 622 fields = {'repair_action_tag': self.tag, 'repair_failure_tag': tag} 623 self._failure_detail_counter.increment(fields=fields) 624 625 def _repair_host(self, host, silent): 626 """ 627 Apply this repair action if any triggers fail. 628 629 Repair is triggered when all dependencies are successful, and at 630 least one trigger fails. 631 632 If the `repair()` method triggers, the success or failure of 633 this operation is logged in `status.log` bracketed by 'START' 634 and 'END' records. Details of whether or why `repair()` 635 triggered are written to the debug logs. If repair doesn't 636 trigger, nothing is logged to `status.log`. 637 638 @param host The host to be repaired. 639 @param silent If true, don't log host status records. 
640 """ 641 # Note: Every exit path from the method must set `self.status`. 642 # There's a lot of exit paths, so be careful. 643 # 644 # If we're blocked by a failed dependency, we exit with an 645 # exception. So set status to 'blocked' first. 646 self.status = 'skipped' 647 try: 648 if not self._is_applicable(host): 649 logging.info('RepairAction is not applicable, skipping repair: %s', 650 self.description) 651 return 652 except Exception as e: 653 logging.error('Skipping %s repair action due to unexpect error' 654 ' during check applicability; %s', self.tag, e) 655 return 656 657 self.status = 'blocked' 658 try: 659 self._verify_dependencies(host, silent) 660 except Exception as e: 661 self._send_failure_metrics(host, e, 'dep') 662 raise 663 # This is a defensive action. Every path below should overwrite 664 # this setting, but if it doesn't, we want our status to reflect 665 # a coding error. 666 self.status = 'unknown' 667 try: 668 self._verify_list(host, self._trigger_list, silent) 669 except AutoservVerifyDependencyError as e: 670 e.log_dependencies( 671 'Attempting this repair action', 672 'Repairing because these triggers failed') 673 self._send_failure_metrics(host, e, 'pre') 674 self._record_start(host, silent) 675 try: 676 self.repair(host) 677 # Increase action success count if device health profile is 678 # available to the host class. 679 if hasattr(host, 'health_profile') and host.health_profile: 680 host.health_profile.insert_succeed_repair_action(self.tag) 681 except Exception as e: 682 logging.exception('Repair failed: %s', self.description) 683 self._record_fail(host, silent, e) 684 self._record_end_fail(host, silent, 'repair_failure') 685 self._send_failure_metrics(host, e, 'repair') 686 # Increase action fail count if device health profile is 687 # available to the host class. 
688 if hasattr(host, 'health_profile') and host.health_profile: 689 host.health_profile.insert_failed_repair_action(self.tag) 690 raise 691 try: 692 for v in self._trigger_list: 693 v._reverify() 694 self._verify_list(host, self._trigger_list, silent) 695 self._record_end_good(host, silent) 696 except AutoservVerifyDependencyError as e: 697 e.log_dependencies( 698 'This repair action reported success', 699 'However, these triggers still fail') 700 self._record_end_fail(host, silent, 'verify_failure') 701 self._send_failure_metrics(host, e, 'post') 702 raise AutoservRepairError( 703 'Some verification checks still fail', 'post_verify') 704 except Exception: 705 # The specification for `self._verify_list()` says 706 # that this can't happen; this is a defensive 707 # precaution. 708 self._record_end_fail(host, silent, 'unknown', 709 'Internal error in repair') 710 self._send_failure_metrics(host, e, 'post') 711 raise 712 else: 713 self.status = 'skipped' 714 logging.info('No failed triggers, skipping repair: %s', 715 self.description) 716 717 def repair(self, host): 718 """ 719 Apply this repair action to the given host. 720 721 This method is responsible for applying changes to fix failures 722 in one or more verification checks. The repair is considered 723 successful if the DUT passes the specific checks after this 724 method completes. 725 726 Implementations indicate failure by raising an exception. The 727 exception text should be a short, 1-line summary of the error. 728 The text should be concise and diagnostic, as it will appear in 729 `status.log` files. 730 731 If this method completes successfully, it returns without 732 raising any exception. 733 734 Implementations should avoid most logging actions, but can log 735 DEBUG level messages if they provide significant information for 736 diagnosing failures. 737 738 @param host The host to be repaired. 
class _RootVerifier(Verifier):
    """
    Utility class used by `RepairStrategy`.

    The check performed by this verifier is empty, so whenever it gets
    to run it passes.  Its only purpose is to serve as the single root
    of the dependency DAG in an instance of `RepairStrategy`.
    """

    @property
    def description(self):
        """Summary logged for the root of the verifier DAG."""
        return 'All host verification checks pass'

    def verify(self, host):
        """Do nothing; the root node has no check of its own.

        @param host     Ignored.
        """
        return None
The tag name defined by `RepairStrategy.ROOT_TAG` is 792 reserved and may not be used by any verifier. 793 794 In the input data for the constructor, dependencies must appear 795 before the nodes that depend on them. Thus: 796 797 ((A, 'a', ()), (B, 'b', ('a',))) # This is valid 798 ((B, 'b', ('a',)), (A, 'a', ())) # This will fail! 799 800 Internally, the DAG of verifiers is given unique root node. So, 801 given this input: 802 803 ((C, 'c', ()), 804 (A, 'a', ('c',)), 805 (B, 'b', ('c',))) 806 807 The following DAG is constructed: 808 809 Root 810 / \ 811 A B 812 \ / 813 C 814 815 Since nothing depends on `A` or `B`, the root node guarantees that 816 these two verifiers will both be called and properly logged. 817 818 The root node is not directly accessible; however repair actions can 819 trigger on it by using `RepairStrategy.ROOT_TAG`. Additionally, the 820 node will be logged in `status.log` whenever `verify()` succeeds. 821 822 # The Repair Actions List 823 The list of repair actions is constructed from the second argument 824 passed to the passed to the `RepairStrategy` constructor. That 825 argument is an iterable consisting of four-element tuples in the 826 form `(constructor, tag, deps, triggers)`: 827 * The `constructor` value is a callable that creates a 828 `RepairAction` as for the interface of the class constructor. 829 For classes that inherit the default constructor from 830 `RepairAction`, this can be the class itself. 831 * The `tag` value is the tag to be associated with the constructed 832 repair action. 833 * The `deps` value is an iterable (e.g. list or tuple) of strings. 834 Each string corresponds to the `tag` member of a `Verifier` that 835 the repair action depends on. 836 * The `triggers` value is an iterable (e.g. list or tuple) of 837 strings. Each string corresponds to the `tag` member of a 838 `Verifier` that can trigger the repair action. 
839 840 `RepairStrategy` deps and triggers can only refer to verifiers, 841 not to other repair actions. 842 """ 843 844 # This name is reserved; clients may not use it. 845 ROOT_TAG = 'PASS' 846 847 @staticmethod 848 def _add_verifier(verifiers, constructor, tag, dep_tags): 849 """ 850 Construct and remember a verifier. 851 852 Create a `Verifier` using `constructor` and `tag`. Dependencies 853 for construction are found by looking up `dep_tags` in the 854 `verifiers` dictionary. 855 856 After construction, the new verifier is added to `verifiers`. 857 858 @param verifiers Dictionary of verifiers, indexed by tag. 859 @param constructor Verifier construction function. 860 @param tag Tag parameter for the construction function. 861 @param dep_tags Tags of dependencies for the constructor, to 862 be found in `verifiers`. 863 """ 864 assert tag not in verifiers 865 deps = [verifiers[d] for d in dep_tags] 866 verifiers[tag] = constructor(tag, deps) 867 868 def __init__(self, verifier_data, repair_data, host_class): 869 """ 870 Construct a `RepairStrategy` from simplified DAG data. 871 872 The input `verifier_data` object describes how to construct 873 verify nodes and the dependencies that relate them, as detailed 874 above. 875 876 The input `repair_data` object describes how to construct repair 877 actions and their dependencies and triggers, as detailed above. 878 879 @param verifier_data Iterable value with constructors for the 880 elements of the verification DAG and their 881 dependencies. 882 @param repair_data Iterable value with constructors for the 883 elements of the repair action list, and 884 their dependencies and triggers. 885 @property host_class A string identifier that identify what 886 class of host this repair strategy target 887 on, will be used as a field to send repair 888 metrics. 889 """ 890 # Metrics - we report on 'actions' for every repair action 891 # we execute; we report on 'strategy' for every complete 892 # repair operation. 
893 self._strategy_counter = metrics.Counter( 894 'chromeos/autotest/repair/repair_strategy_v2') 895 self._actions_counter = metrics.Counter( 896 'chromeos/autotest/repair/repair_actions') 897 self.host_class = host_class 898 # We use the `all_verifiers` list to guarantee that our root 899 # verifier will execute its dependencies in the order provided 900 # to us by our caller. 901 verifier_map = {} 902 all_tags = [] 903 dependencies = set() 904 for constructor, tag, deps in verifier_data: 905 self._add_verifier(verifier_map, constructor, tag, deps) 906 dependencies.update(deps) 907 all_tags.append(tag) 908 # Capture all the verifiers that have nothing depending on them. 909 root_tags = [t for t in all_tags if t not in dependencies] 910 self._add_verifier(verifier_map, _RootVerifier, 911 self.ROOT_TAG, root_tags) 912 self._verify_root = verifier_map[self.ROOT_TAG] 913 self._repair_actions = [] 914 for constructor, tag, deps, triggers in repair_data: 915 r = constructor(tag, 916 [verifier_map[d] for d in deps], 917 [verifier_map[t] for t in triggers], 918 self.host_class) 919 self._repair_actions.append(r) 920 921 def _send_strategy_metrics(self, host, result): 922 """Send repair strategy metrics to monarch 923 924 @param host The target to be repaired. 925 @param result A String that describe a final result for the 926 RepairStrategy. 927 """ 928 info = host.host_info_store.get() 929 board = info.board if info.board else 'unknown' 930 model = info.model if info.model else 'unknown' 931 fields = { 932 'board': board, 933 'host_class': self.host_class, 934 'hostname': _filter_metrics_hostname(host), 935 'model': model, 936 'result': result, 937 } 938 self._strategy_counter.increment(fields=fields) 939 940 def _send_action_metrics(self, host, ra): 941 """Send repair action metrics to monarch 942 943 @param host The target to be repaired. 944 @param ra an RepairAction instance. 
945 """ 946 fields = { 947 'tag': ra.tag, 948 'status': ra.status, 949 'hostname': _filter_metrics_hostname(host), 950 'host_class': self.host_class 951 } 952 self._actions_counter.increment(fields=fields) 953 954 def verify(self, host, silent=False): 955 """ 956 Run the verifier DAG on the given host. 957 958 @param host The target to be verified. 959 @param silent If true, don't log host status records. 960 """ 961 self._verify_root._reverify() 962 self._verify_root._verify_host(host, silent) 963 964 def repair(self, host, silent=False): 965 """ 966 Run the repair list on the given host. 967 968 @param host The target to be repaired. 969 @param silent If true, don't log host status records. 970 """ 971 self._verify_root._reverify() 972 attempted = False 973 for ra in self._repair_actions: 974 try: 975 logging.debug('Start repair task: %s.', type(ra).__name__) 976 ra._repair_host(host, silent) 977 except Exception as e: 978 # all logging and exception handling was done at 979 # lower levels 980 pass 981 finally: 982 self._send_action_metrics(host, ra) 983 logging.debug('Finished repair task: %s.', type(ra).__name__) 984 if ra.status not in ('skipped', 'blocked'): 985 attempted = True 986 987 result = 'failure' 988 try: 989 self._verify_root._verify_host(host, silent) 990 result = 'success' if attempted else 'not_attempted' 991 except: 992 if not attempted: 993 result = 'attempt_blocked' 994 raise 995 finally: 996 self._send_strategy_metrics(host, result) 997 998 def verifier_is_good(self, tag): 999 """Find and return result of a verifier. 
1000 1001 @param tag: key to be associated with verifier 1002 1003 @returns: a boolean or None value: 1004 True - verifier passed 1005 False - verifier did not pass 1006 None - verifier did not run because it is not applicable 1007 or blocked due to dependency failure 1008 """ 1009 verifier = self._verify_root._get_node_by_tag(tag) 1010 if verifier is not None: 1011 result = verifier._is_good() 1012 logging.debug('Verifier with associated tag: %s found', tag) 1013 if result is None: 1014 logging.debug('%s did not run; it is not applicable to run ' 1015 'or blocked due to dependency failure', tag) 1016 elif result == True: 1017 logging.debug('Cached result of %s verifier is pass', tag) 1018 else: 1019 logging.debug('Cached result of %s verifier is fail', tag) 1020 return result 1021 logging.debug('Verifier with associated tag: %s not found', tag) 1022 return None 1023 1024 1025def _filter_metrics_hostname(host): 1026 """ 1027 Restrict format of hostnames we'll send to monarch 1028 1029 @param host An host instance(i.e. ServoHost, CrosHost) 1030 """ 1031 if re.match(_HOSTNAME_PATTERN, host.hostname): 1032 return host.hostname 1033 else: 1034 return _DISALLOWED_HOSTNAME 1035