# Copyright 2016 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""
Framework for host verification and repair in Autotest.

The framework provides implementation code in support of `Host.verify()`
and `Host.repair()` used in Verify and Repair special tasks.

The framework consists of these classes:
  * `Verifier`: A class representing a single verification check.
  * `RepairAction`: A class representing a repair operation that can fix
    a failed verification check.
  * `RepairStrategy`: A class for organizing a collection of `Verifier`
    and `RepairAction` instances, and invoking them in order.

Individual operations during verification and repair are handled by
instances of `Verifier` and `RepairAction`.  `Verifier` objects are
meant to test for specific conditions that may cause tests to fail.
`RepairAction` objects provide operations designed to fix one or
more failures identified by a `Verifier` object.
"""

import collections
import logging

import common
from autotest_lib.client.common_lib import error


class AutoservVerifyError(error.AutoservError):
    """
    Generic Exception for failures from `Verifier` objects.

    Instances of this exception can be raised when a `verify()`
    method fails, if no more specific exception is available.
    """
    pass


_DependencyFailure = collections.namedtuple(
        '_DependencyFailure', ('dependency', 'error'))


def _sorted_failures(failures):
    """
    Return the given failures as a list in a stable order.

    Failures are normally collected in a set, whose iteration order is
    arbitrary.  Sorting makes exception messages and debug logs
    reproducible from run to run.

    @param failures  Iterable of `_DependencyFailure` tuples.

    @return A list holding the same failures, sorted when possible.
    """
    try:
        return sorted(failures)
    except TypeError:
        # Members that can't be compared with each other; keep
        # whatever order the container gives us rather than failing
        # while reporting a failure.
        return list(failures)


class AutoservVerifyDependencyError(error.AutoservError):
    """
    Exception raised for failures in dependencies.

    This exception is used to distinguish an original failure from a
    failure being passed back from a verification dependency.  That is,
    if 'B' depends on 'A', and 'A' fails, 'B' will raise this exception
    to signal that the original failure is further down the dependency
    chain.

    The `failures` argument to the constructor for this class is a set
    of instances of `_DependencyFailure`, each corresponding to one
    failed dependency:
      * The `dependency` attribute of each failure is the description
        of the failed dependency.
      * The `error` attribute of each failure is the string value of
        the exception from the failed dependency.

    Multiple methods in this module recognize and handle this exception
    specially.

    @property failures  Set of failures passed to the constructor.
    @property _node     Instance of `_DependencyNode` reporting the
                        failed dependencies.
    """
    def __init__(self, node, failures):
        """
        Constructor for `AutoservVerifyDependencyError`.

        The exception message is the `error` text of every failure,
        one per line, in sorted order of the failures.

        @param node      Instance of _DependencyNode reporting the
                         failed dependencies.
        @param failures  Set of failure tuples as described above.
        """
        super(AutoservVerifyDependencyError, self).__init__(
                '\n'.join([f.error for f in _sorted_failures(failures)]))
        self.failures = failures
        self._node = node


    def log_dependencies(self, action, deps):
        """
        Log an `AutoservVerifyDependencyError`.

        This writes a short summary of the dependency failures captured
        in this exception, using standard Python logging.

        The passed in `action` string plus `self._node.description`
        are logged at INFO level.  The `action` argument should
        introduce or describe an action relative to `self._node`.

        The passed in `deps` string and the description of each failed
        dependency in `self` are logged at DEBUG level.  The `deps`
        argument is used to introduce the various failed dependencies.

        @param action   A string mentioning the action being logged
                        relative to `self._node`.
        @param deps     A string introducing the dependencies that
                        failed.
        """
        logging.info('%s: %s', action, self._node.description)
        logging.debug('%s:', deps)
        # Sorted so that the listing is the same from run to run.
        for failure in _sorted_failures(self.failures):
            logging.debug(' %s', failure.dependency)


class AutoservRepairError(error.AutoservError):
    """
    Generic Exception for failures from `RepairAction` objects.

    Instances of this exception can be raised when a `repair()`
    method fails, if no more specific exception is available.
    """
    pass


class _DependencyNode(object):
    """
    An object that can depend on verifiers.

    Both repair and verify operations have the notion of dependencies
    that must pass before the operation proceeds.  This class captures
    the shared behaviors required by both classes.

    @property tag               Short identifier to be used in logging.
    @property description       Text summary of this node's action, to be
                                used in debug logs.
    @property _dependency_list  Dependency pre-requisites.
    """

    def __init__(self, tag, dependencies):
        """
        Remember this node's tag and its dependencies.

        @param tag           Short identifier to be used in logging.
        @param dependencies  Verifiers that must pass before this node
                             may perform its own operation.
        """
        self._dependency_list = dependencies
        self._tag = tag


    def _record(self, host, silent, *record_args):
        """
        Log a status record for `host`.

        Call `host.record()` with the given `record_args`, unless
        requested to skip by `silent`.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        @param record_args  Arguments to pass to `host.record()`.
        """
        if not silent:
            host.record(*record_args)


    def _verify_list(self, host, verifiers, silent):
        """
        Test a list of verifiers against a given host.

        This invokes `_verify_host()` on every verifier in the given
        list.  If any verifier in the transitive closure of dependencies
        in the list fails, an `AutoservVerifyDependencyError` is raised
        containing the description of each failed verifier.  Only
        original failures are reported; verifiers that don't run due
        to a failed dependency are omitted.

        By design, original failures are logged once in `_verify_host()`
        when `verify()` originally fails.  The additional data gathered
        here is for the debug logs to indicate why a subsequent
        operation never ran.

        @param host       The host to be tested against the verifiers.
        @param verifiers  List of verifiers to be checked.
        @param silent     If true, don't log host status records.

        @raises AutoservVerifyDependencyError   Raised when at least
                        one verifier in the list has failed.
        """
        failures = set()
        for v in verifiers:
            try:
                v._verify_host(host, silent)
            except AutoservVerifyDependencyError as e:
                # `v` itself never ran; pass along the original
                # failures from further down the chain.
                failures.update(e.failures)
            except Exception as e:
                # `v` is the origin of this failure.
                failures.add(_DependencyFailure(v.description, str(e)))
        if failures:
            raise AutoservVerifyDependencyError(self, failures)


    def _verify_dependencies(self, host, silent):
        """
        Verify that all of this node's dependencies pass for a host.

        @param host     The host to be verified.
        @param silent   If true, don't log host status records.

        @raises AutoservVerifyDependencyError   Raised (after logging)
                        when at least one dependency has failed.
        """
        try:
            self._verify_list(host, self._dependency_list, silent)
        except AutoservVerifyDependencyError as e:
            e.log_dependencies(
                    'Skipping this operation',
                    'The following dependencies failed')
            raise


    @property
    def tag(self):
        """
        Tag for use in logging status records.

        This is a property with a short string used to identify the node
        in the 'status.log' file and during node construction.  The tag
        should contain only letters, digits, and '_' characters.  This
        tag is not used alone, but is combined with other identifiers,
        based on the operation being logged.

        @return A short identifier-like string.
        """
        return self._tag


    @property
    def description(self):
        """
        Text description of this node for log messages.

        This string will be logged with failures, and should describe
        the condition required for success.

        N.B. Subclasses are required to override this method, but we
        _don't_ raise NotImplementedError here.  Various methods fail in
        inscrutable ways if this method raises any exception, so for
        debugging purposes, it's better to return a default value.

        @return A descriptive string.
        """
        return ('Class %s fails to implement description().' %
                type(self).__name__)


class Verifier(_DependencyNode):
    """
    Abstract class embodying one verification check.

    A concrete subclass of `Verifier` provides a simple check that can
    determine a host's fitness for testing.  Failure indicates that the
    check found a problem that can cause at least one test to fail.

    `Verifier` objects are organized in a DAG identifying dependencies
    among operations.  The DAG controls ordering and prevents wasted
    effort:  If verification operation V2 requires that verification
    operation V1 pass, then a) V1 will run before V2, and b) if V1
    fails, V2 won't run at all.  The `_verify_host()` method ensures
    that all dependencies run and pass before invoking the `verify()`
    method.

    A `Verifier` object caches its result the first time it calls
    `verify()`.  Subsequent calls return the cached result, without
    re-running the check code.  The `_reverify()` method clears the
    cached result in the current node, and in all dependencies.

    Subclasses must supply these properties and methods:
      * `verify()`: This is the method to perform the actual
        verification check.
      * `description`:  A one-line summary of the verification check for
        debug log messages.

    Subclasses must override all of the above attributes; subclasses
    should not override or extend any other attributes of this class.

    The description string should be a simple sentence explaining what
    must be true for the verifier to pass.  Do not include a terminating
    period.  For example:

        Host is available via ssh

    The base class manages the following private data:
      * `_result`:  The cached result of verification.
      * `_dependency_list`:  The list of dependencies.
    Subclasses should not use these attributes.

    @property _result           Cached result of verification.
    """

    def __init__(self, tag, dependencies):
        """
        Construct a verifier with no cached result.

        @param tag           Short identifier to be used in logging.
        @param dependencies  Verifiers that must pass before this
                             check is run.
        """
        super(Verifier, self).__init__(tag, dependencies)
        # `_result` is one of:
        #   None       - no verification attempted since last reset.
        #   False      - attempted, but `verify()` didn't complete
        #                (e.g. a dependency failed).
        #   True       - `verify()` passed.
        #   Exception  - `verify()` failed with this exception.
        self._result = None
        self._verify_tag = 'verify.' + self.tag


    def _reverify(self):
        """
        Discard cached verification results.

        Reset the cached verification result for this node, and for the
        transitive closure of all dependencies.
        """
        # A node with no cached result is not descended into; this
        # keeps shared sub-DAGs from being walked over and over.
        if self._result is not None:
            self._result = None
            for v in self._dependency_list:
                v._reverify()


    def _verify_host(self, host, silent):
        """
        Determine the result of verification, and log results.

        If this verifier does not have a cached verification result,
        check dependencies, and if they pass, run `verify()`.  Log
        informational messages regarding failed dependencies.  If we
        call `verify()`, log the result in `status.log`.

        If we already have a cached result, return that result without
        logging any message.

        @param host     The host to be tested for a problem.
        @param silent   If true, don't log host status records.

        @raises AutoservVerifyDependencyError   Raised when a dependency
                        failed, so that `verify()` was not called.
        @raises Exception   Whatever `verify()` raised, either just now
                        or when the cached result was recorded.
        """
        if self._result is not None:
            if isinstance(self._result, Exception):
                raise self._result  # cached failure
            elif self._result:
                return              # cached success
        # Mark the attempt before checking dependencies; if they fail,
        # the node is left as "attempted, not completed".
        self._result = False
        self._verify_dependencies(host, silent)
        logging.info('Verifying this condition: %s', self.description)
        try:
            self.verify(host)
            self._record(host, silent, 'GOOD', None, self._verify_tag)
        except Exception as e:
            logging.exception('Failed: %s', self.description)
            self._result = e
            self._record(host, silent,
                         'FAIL', None, self._verify_tag, str(e))
            raise
        self._result = True


    def verify(self, host):
        """
        Unconditionally perform a verification check.

        This method is responsible for testing for a single problem on a
        host.  Implementations should follow these guidelines:
          * The check should find a problem that will cause testing to
            fail.
          * Verification checks on a working system should run quickly
            and should be optimized for success; a check that passes
            should finish within seconds.
          * Verification checks are not expected to have side effects,
            but may apply trivial fixes if they will finish within the
            time constraints above.

        A verification check should normally trigger a single set of
        repair actions.  If two different failures can require two
        different repairs, ideally they should use two different
        subclasses of `Verifier`.

        Implementations indicate failure by raising an exception.  The
        exception text should be a short, 1-line summary of the error.
        The text should be concise and diagnostic, as it will appear in
        `status.log` files.

        If this method finds no problems, it returns without raising any
        exception.

        Implementations should avoid most logging actions, but can log
        DEBUG level messages if they provide significant information for
        diagnosing failures.

        @param host   The host to be tested for a problem.
        """
        raise NotImplementedError('Class %s does not implement '
                                  'verify()' % type(self).__name__)


class RepairAction(_DependencyNode):
    """
    Abstract class embodying one repair procedure.

    A `RepairAction` is responsible for fixing one or more failed
    `Verifier` checks, in order to make those checks pass.

    Each repair action includes one or more verifier triggers that
    determine when the repair action should run.  A repair action
    will call its `repair()` method if one or more of its triggers
    fails.  A repair action is successful if all of its triggers pass
    after calling `repair()`.

    A `RepairAction` is a subclass of `_DependencyNode`; if any of a
    repair action's dependencies fail, the action does not check its
    triggers, and doesn't call `repair()`.

    Subclasses must supply these attributes:
      * `repair()`: This is the method to perform the necessary
        repair.  The method should avoid most logging actions, but
        can log DEBUG level messages if they provide significant
        information for diagnosing failures.
      * `description`:  A one-line summary of the repair action for
        debug log messages.

    Subclasses must override both of the above attributes and should
    not override any other attributes of this class.

    The description string should be a simple sentence explaining the
    operation that will be performed.  Do not include a terminating
    period.  For example:

        Re-install the stable build via AU

    @property _trigger_list   List of verification checks that will
                              trigger this repair when they fail.
    """

    def __init__(self, tag, dependencies, triggers):
        """
        Construct a repair action.

        @param tag           Short identifier to be used in logging.
        @param dependencies  Verifiers that must pass before this
                             repair may be attempted.
        @param triggers      Verifiers whose failure triggers this
                             repair.
        """
        super(RepairAction, self).__init__(tag, dependencies)
        self._trigger_list = triggers
        self._repair_tag = 'repair.' + self.tag


    def _repair_host(self, host, silent):
        """
        Apply this repair action if any triggers fail.

        Repair is triggered when all dependencies are successful, and at
        least one trigger fails.

        If the `repair()` method triggers, the success or failure of
        this operation is logged in `status.log` bracketed by 'START'
        and 'END' records.  Details of whether or why `repair()`
        triggered are written to the debug logs.  If repair doesn't
        trigger, nothing is logged to `status.log`.

        @param host     The host to be repaired.
        @param silent   If true, don't log host status records.

        @raises AutoservVerifyDependencyError   Raised when a dependency
                        failed, so that repair was not attempted.
        @raises AutoservRepairError   Raised when `repair()` returned
                        normally, but some trigger still fails.
        @raises Exception   Whatever `repair()` raised.
        """
        self._verify_dependencies(host, silent)
        try:
            self._verify_list(host, self._trigger_list, silent)
        except AutoservVerifyDependencyError as e:
            # At least one trigger failed:  attempt the repair.
            e.log_dependencies(
                    'Attempting this repair action',
                    'Repairing because these triggers failed')
            self._record(host, silent, 'START', None, self._repair_tag)
            try:
                self.repair(host)
            except Exception as e:
                logging.exception('Repair failed: %s', self.description)
                self._record(host, silent,
                             'FAIL', None, self._repair_tag, str(e))
                self._record(host, silent,
                             'END FAIL', None, self._repair_tag)
                raise
            try:
                # Forget the cached failures, then check whether the
                # repair actually fixed the triggers.
                for v in self._trigger_list:
                    v._reverify()
                self._verify_list(host, self._trigger_list, silent)
                self._record(host, silent,
                             'END GOOD', None, self._repair_tag)
            except AutoservVerifyDependencyError as e:
                e.log_dependencies(
                        'This repair action reported success',
                        'However, these triggers still fail')
                self._record(host, silent,
                             'END FAIL', None, self._repair_tag)
                raise AutoservRepairError(
                        'Some verification checks still fail')
            except Exception:
                # The specification for `self._verify_list()` says
                # that this can't happen; this is a defensive
                # precaution.
                self._record(host, silent,
                             'END FAIL', None, self._repair_tag,
                             'Internal error in repair')
                raise
        else:
            logging.info('No failed triggers, skipping repair: %s',
                         self.description)


    def repair(self, host):
        """
        Apply this repair action to the given host.

        This method is responsible for applying changes to fix failures
        in one or more verification checks.  The repair is considered
        successful if the DUT passes the specific checks after this
        method completes.

        Implementations indicate failure by raising an exception.  The
        exception text should be a short, 1-line summary of the error.
        The text should be concise and diagnostic, as it will appear in
        `status.log` files.

        If this method completes successfully, it returns without
        raising any exception.

        Implementations should avoid most logging actions, but can log
        DEBUG level messages if they provide significant information for
        diagnosing failures.

        @param host   The host to be repaired.
        """
        raise NotImplementedError('Class %s does not implement '
                                  'repair()' % type(self).__name__)


class _RootVerifier(Verifier):
    """
    Utility class used by `RepairStrategy`.

    A node of this class by itself does nothing; it always passes (if it
    can run).  This class exists merely to be the root of a DAG of
    dependencies in an instance of `RepairStrategy`.
    """

    def verify(self, host):
        """Pass unconditionally; only the dependencies matter."""
        pass


    @property
    def description(self):
        """Text description of the root node for log messages."""
        return 'All host verification checks pass'



class RepairStrategy(object):
    # The docstring is a raw string because the diagram below contains
    # backslashes that must be kept literally.
    r"""
    A class for organizing `Verifier` and `RepairAction` objects.

    An instance of `RepairStrategy` is organized as a DAG of `Verifier`
    objects, plus a list of `RepairAction` objects.  The class provides
    methods for invoking those objects in the required order, when
    needed:
      * The `verify()` method walks the verifier DAG in dependency
        order.
      * The `repair()` method invokes the repair actions in list order.
        Each repair action will invoke its dependencies and triggers as
        needed.

    # The Verifier DAG
    The verifier DAG is constructed from the first argument passed to
    the `RepairStrategy` constructor.  That argument is an iterable
    consisting of three-element tuples in the form
    `(constructor, tag, deps)`:
      * The `constructor` value is a callable that creates a `Verifier`
        as for the interface of the class constructor.  For classes
        that inherit the default constructor from `Verifier`, this can
        be the class itself.
      * The `tag` value is the tag to be associated with the constructed
        verifier.
      * The `deps` value is an iterable (e.g. list or tuple) of strings.
        Each string corresponds to the `tag` member of a `Verifier`
        dependency.

    The tag names of verifiers in the constructed DAG must all be
    unique.  The tag name defined by `RepairStrategy.ROOT_TAG` is
    reserved and may not be used by any verifier.

    In the input data for the constructor, dependencies must appear
    before the nodes that depend on them.  Thus:

        ((A, 'a', ()), (B, 'b', ('a',)))     # This is valid
        ((B, 'b', ('a',)), (A, 'a', ()))     # This will fail!

    Internally, the DAG of verifiers is given a unique root node.  So,
    given this input:

        ((C, 'c', ()),
         (A, 'a', ('c',)),
         (B, 'b', ('c',)))

    The following DAG is constructed:

          Root
          /  \
         A    B
          \  /
           C

    Since nothing depends on `A` or `B`, the root node guarantees that
    these two verifiers will both be called and properly logged.

    The root node is not directly accessible; however repair actions can
    trigger on it by using `RepairStrategy.ROOT_TAG`.  Additionally, the
    node will be logged in `status.log` whenever `verify()` succeeds.

    # The Repair Actions List
    The list of repair actions is constructed from the second argument
    passed to the `RepairStrategy` constructor.  That argument is an
    iterable consisting of four-element tuples in the form
    `(constructor, tag, deps, triggers)`:
      * The `constructor` value is a callable that creates a
        `RepairAction` as for the interface of the class constructor.
        For classes that inherit the default constructor from
        `RepairAction`, this can be the class itself.
      * The `tag` value is the tag to be associated with the constructed
        repair action.
      * The `deps` value is an iterable (e.g. list or tuple) of strings.
        Each string corresponds to the `tag` member of a `Verifier` that
        the repair action depends on.
      * The `triggers` value is an iterable (e.g. list or tuple) of
        strings.  Each string corresponds to the `tag` member of a
        `Verifier` that can trigger the repair action.

    `RepairStrategy` deps and triggers can only refer to verifiers,
    not to other repair actions.
    """

    # This name is reserved; clients may not use it.
    ROOT_TAG = 'PASS'

    @staticmethod
    def _add_verifier(verifiers, constructor, tag, dep_tags):
        """
        Construct and remember a verifier.

        Create a `Verifier` using `constructor` and `tag`.  Dependencies
        for construction are found by looking up `dep_tags` in the
        `verifiers` dictionary.

        After construction, the new verifier is added to `verifiers`.

        @param verifiers    Dictionary of verifiers, indexed by tag.
        @param constructor  Verifier construction function.
        @param tag          Tag parameter for the construction function.
        @param dep_tags     Tags of dependencies for the constructor, to
                            be found in `verifiers`.

        @raises AssertionError  Raised when `tag` is already in use.
        @raises KeyError        Raised when a tag in `dep_tags` is not
                                (yet) present in `verifiers`.
        """
        # Raised explicitly rather than with an `assert` statement, so
        # that the check still happens when Python runs with -O; a
        # duplicate would otherwise silently replace the earlier node.
        if tag in verifiers:
            raise AssertionError('Duplicate verifier tag: %r' % (tag,))
        deps = [verifiers[d] for d in dep_tags]
        verifiers[tag] = constructor(tag, deps)


    def __init__(self, verifier_data, repair_data):
        """
        Construct a `RepairStrategy` from simplified DAG data.

        The input `verifier_data` object describes how to construct
        verify nodes and the dependencies that relate them, as detailed
        above.

        The input `repair_data` object describes how to construct repair
        actions and their dependencies and triggers, as detailed above.

        @param verifier_data  Iterable value with constructors for the
                              elements of the verification DAG and their
                              dependencies.
        @param repair_data    Iterable value with constructors for the
                              elements of the repair action list, and
                              their dependencies and triggers.
        """
        # We use the `all_tags` list to guarantee that our root
        # verifier will execute its dependencies in the order provided
        # to us by our caller.
        verifier_map = {}
        all_tags = []
        dependencies = set()
        for constructor, tag, deps in verifier_data:
            self._add_verifier(verifier_map, constructor, tag, deps)
            dependencies.update(deps)
            all_tags.append(tag)
        # Capture all the verifiers that have nothing depending on them.
        root_tags = [t for t in all_tags if t not in dependencies]
        self._add_verifier(verifier_map, _RootVerifier,
                           self.ROOT_TAG, root_tags)
        self._verify_root = verifier_map[self.ROOT_TAG]
        self._repair_actions = []
        for constructor, tag, deps, triggers in repair_data:
            r = constructor(tag,
                            [verifier_map[d] for d in deps],
                            [verifier_map[t] for t in triggers])
            self._repair_actions.append(r)


    def verify(self, host, silent=False):
        """
        Run the verifier DAG on the given host.

        Cached results are discarded first, so that every check is
        performed afresh.

        @param host     The target to be verified.
        @param silent   If true, don't log host status records.
        """
        self._verify_root._reverify()
        self._verify_root._verify_host(host, silent)


    def repair(self, host, silent=False):
        """
        Run the repair list on the given host.

        Every repair action is given its chance to run, whether or not
        earlier actions failed.  Afterwards, the full verifier DAG is
        checked; this raises if the host still fails verification.

        @param host     The target to be repaired.
        @param silent   If true, don't log host status records.
        """
        self._verify_root._reverify()
        for ra in self._repair_actions:
            try:
                ra._repair_host(host, silent)
            except Exception:
                # all logging and exception handling was done at
                # lower levels
                pass
        self._verify_root._verify_host(host, silent)